diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
index 1610b583..77bcdb32 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
@@ -54,7 +54,7 @@ def _print_heading(message):
 @serve.deployment(ray_actor_options={"num_gpus": 1})
 @serve.ingress(app)
 class BaseDeployment:
-    def __init__(self):
+    def __init__(self, use_torch_compile=True):
         self._image_size = 512
         self._model_id = "runwayml/stable-diffusion-v1-5"
         from diffusers import StableDiffusionPipeline
@@ -63,6 +63,22 @@ def __init__(self):
             self._model_id, revision="fp16", torch_dtype=torch.float16
         )
         self._pipeline = self._pipeline.to("cuda")
+        if use_torch_compile:
+            import torch_tensorrt
+
+            backend = "torch_tensorrt"
+            print("compiling")
+            print(torch._dynamo.list_backends())
+            self._pipeline.unet = torch.compile(
+                self._pipeline.unet,
+                backend=backend,
+                options={
+                    "truncate_long_and_double": True,
+                    "precision": torch.float16,
+                },
+                dynamic=False,
+            )
+            self.generate("temp")
 
     @app.get("/generate")
     def generate(self, prompt: str, filename: Optional[str] = None) -> None:
@@ -153,7 +169,10 @@ def tritonserver_deployment(_args):
 
 
 def base_deployment(_args):
-    return BaseDeployment.bind()
+    if "use_torch_compile" in _args:
+        return BaseDeployment.bind(use_torch_compile=True)
+    else:
+        return BaseDeployment.bind(use_torch_compile=False)
 
 
 if __name__ == "__main__":
diff --git a/Triton_Inference_Server_Python_API/run.sh b/Triton_Inference_Server_Python_API/run.sh
index c465e7f5..040e1610 100755
--- a/Triton_Inference_Server_Python_API/run.sh
+++ b/Triton_Inference_Server_Python_API/run.sh
@@ -137,7 +137,7 @@ fi
 
 $RUN_PREFIX mkdir -p backend/diffusion
 
-$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion $IMAGE
+$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion -v/tmp:/tmp $IMAGE
 
 { set +x; } 2>/dev/null
 
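For reference, a minimal standalone sketch of the pattern the patch adds to `BaseDeployment.__init__`: importing `torch_tensorrt` registers its `torch.compile` backend, only the pipeline's UNet is compiled, and a first call triggers compilation up front (the patch warms up via `self.generate("temp")`). The model id, backend name, and compile options come from the diff above; running it outside Ray Serve, the 512x512 image size, and the direct pipeline warm-up call are assumptions for illustration.

```python
import torch
import torch_tensorrt  # noqa: F401 -- import registers the "torch_tensorrt" backend
from diffusers import StableDiffusionPipeline

# Same model and precision as BaseDeployment.__init__ in the diff.
pipeline = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
).to("cuda")

# Compile only the UNet, which dominates the denoising-loop runtime;
# options mirror the patch.
pipeline.unet = torch.compile(
    pipeline.unet,
    backend="torch_tensorrt",
    options={
        "truncate_long_and_double": True,
        "precision": torch.float16,
    },
    dynamic=False,
)

# First call compiles the UNet; 512x512 matches self._image_size (assumed call form).
pipeline("temp", height=512, width=512)
```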