diff --git a/Makefile b/Makefile
index bf3a4260a352..36df776d4aa5 100644
--- a/Makefile
+++ b/Makefile
@@ -424,7 +424,11 @@ docker-build-rerankers:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rerankers -f backend/Dockerfile.python --build-arg BACKEND=rerankers .
 
 docker-build-vllm:
+ifeq ($(BUILD_TYPE),)
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vllm -f backend/Dockerfile.vllmcpu --build-arg BACKEND=vllm .
+else
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vllm -f backend/Dockerfile.python --build-arg BACKEND=vllm .
+endif
 
 docker-build-transformers:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:transformers -f backend/Dockerfile.python --build-arg BACKEND=transformers .
diff --git a/backend/Dockerfile.vllmcpu b/backend/Dockerfile.vllmcpu
new file mode 100644
index 000000000000..61c4c75b3b3f
--- /dev/null
+++ b/backend/Dockerfile.vllmcpu
@@ -0,0 +1,64 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM ${BASE_IMAGE} AS builder
+ARG BACKEND=vllm
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG FROM_SOURCE=true
+ENV FROM_SOURCE=${FROM_SOURCE}
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+ARG SKIP_DRIVERS=false
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
+ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ccache \
+        ca-certificates \
+        espeak-ng \
+        curl \
+        libssl-dev \
+        git \
+        git-lfs \
+        unzip \
+        upx-ucl \
+        curl python3-pip \
+        python-is-python3 \
+        python3-dev llvm \
+        python3-venv make \
+        wget \
+        gcc-12 g++-12 \
+        libtcmalloc-minimal4 \
+        libnuma-dev \
+        ffmpeg \
+        libsm6 libxext6 \
+        libgl1 \
+        jq lsof && \
+    apt-get clean && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --upgrade pip
+
+# Install uv as a system package
+RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+# Install grpcio-tools (the version in 22.04 is too old)
+RUN pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
+
+COPY python/${BACKEND} /${BACKEND}
+COPY backend.proto /${BACKEND}/backend.proto
+COPY python/common/ /${BACKEND}/common
+
+RUN cd /${BACKEND} && make
+
+FROM scratch
+ARG BACKEND=vllm
+COPY --from=builder /${BACKEND}/ /
diff --git a/backend/python/bark/requirements-cpu.txt b/backend/python/bark/requirements-cpu.txt
index 12e376adeb15..ce631e836e4d 100644
--- a/backend/python/bark/requirements-cpu.txt
+++ b/backend/python/bark/requirements-cpu.txt
@@ -1,4 +1,6 @@
+bark==0.1.5
 transformers
 accelerate
-torch==2.4.1
-torchaudio==2.4.1
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.4.1+cpu
+torchaudio==2.4.1+cpu
diff --git a/backend/python/bark/requirements-cublas11.txt b/backend/python/bark/requirements-cublas11.txt
index 9f8fe9ff87a3..2955e15bd915 100644
--- a/backend/python/bark/requirements-cublas11.txt
+++ b/backend/python/bark/requirements-cublas11.txt
@@ -1,5 +1,6 @@
+bark==0.1.5
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 torchaudio==2.4.1+cu118
 transformers
-accelerate
\ No newline at end of file
+accelerate
diff --git a/backend/python/bark/requirements-cublas12.txt b/backend/python/bark/requirements-cublas12.txt
index 537169495d1e..f3020ade49e1 100644
--- a/backend/python/bark/requirements-cublas12.txt
+++ b/backend/python/bark/requirements-cublas12.txt
@@ -1,4 +1,5 @@
+bark==0.1.5
 torch==2.4.1
 torchaudio==2.4.1
 transformers
-accelerate
\ No newline at end of file
+accelerate
diff --git a/backend/python/bark/requirements-hipblas.txt b/backend/python/bark/requirements-hipblas.txt
index 1d54fb165974..1650a754aee6 100644
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -1,5 +1,6 @@
+bark==0.1.5
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 torchaudio==2.4.1+rocm6.0
 transformers
-accelerate
\ No newline at end of file
+accelerate
diff --git a/backend/python/bark/requirements-intel.txt b/backend/python/bark/requirements-intel.txt
index f24bd166e4b5..6b67d5862a70 100644
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -1,3 +1,4 @@
+bark==0.1.5
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
@@ -6,4 +7,4 @@ oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools
 transformers
-accelerate
\ No newline at end of file
+accelerate
diff --git a/backend/python/bark/requirements.txt b/backend/python/bark/requirements.txt
index 2f40b3208b1f..874913ee456a 100644
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,3 @@
-bark==0.1.5
 grpcio==1.71.0
 protobuf
-certifi
\ No newline at end of file
+certifi
diff --git a/backend/python/chatterbox/requirements-cpu.txt b/backend/python/chatterbox/requirements-cpu.txt
index 7c87f8803115..405b9d59f46c 100644
--- a/backend/python/chatterbox/requirements-cpu.txt
+++ b/backend/python/chatterbox/requirements-cpu.txt
@@ -1,5 +1,6 @@
 accelerate
-torch==2.6.0
-torchaudio==2.6.0
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.6.0+cpu
+torchaudio==2.6.0+cpu
 transformers==4.46.3
 chatterbox-tts
diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh
index daa47c3c2080..3be723c838da 100644
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -74,8 +74,8 @@ function getBuildProfile() {
         return 0
     fi
 
-    # If /opt/intel exists, then we are doing an intel/ARC build
-    if [ -d "/opt/intel" ]; then
+    # If /opt/intel exists and BUILD_TYPE is one of the Intel ones, then we are doing an intel/ARC build
+    if [[ -d "/opt/intel" && ( x"${BUILD_TYPE}" == "xintel" || ( x"${BUILD_TYPE}" == "xsycl_f16" || x"${BUILD_TYPE}" == "xsycl_f32" ) ) ]]; then
         echo "intel"
         return 0
     fi
diff --git a/backend/python/coqui/requirements-cpu.txt b/backend/python/coqui/requirements-cpu.txt
index 787877bd8439..3cb78b550b7e 100644
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -1,4 +1,5 @@
 transformers==4.48.3
 accelerate
-torch==2.4.1
-coqui-tts
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.4.1+cpu
+coqui-tts
diff --git a/backend/python/diffusers/backend.py b/backend/python/diffusers/backend.py
index 17a71694a453..ec7aff7c7435 100755
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -29,7 +29,8 @@
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 COMPEL = os.environ.get("COMPEL", "0") == "1"
-XPU = os.environ.get("XPU", "0") == "1"
+# Attempt to use XPU only if Torch says it is available when asking for it
+XPU = ((os.environ.get("XPU", "0") == "1") & (torch.xpu.is_available()))
 CLIPSKIP = os.environ.get("CLIPSKIP", "1") == "1"
 SAFETENSORS = os.environ.get("SAFETENSORS", "1") == "1"
 CHUNK_SIZE = os.environ.get("CHUNK_SIZE", "8")
@@ -37,6 +38,11 @@
 DISABLE_CPU_OFFLOAD = os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES = os.environ.get("FRAMES", "64")
 
+# Configure Torch threading for CPU mode: half of the logical cores for intra-op work, all of them for inter-op parallelism
+num_cores = os.cpu_count()
+torch.set_num_threads(max(1, num_cores // 2))
+torch.set_num_interop_threads(num_cores)
+
 if XPU:
     print(torch.xpu.get_device_name(0))
 
@@ -166,7 +172,8 @@ def LoadModel(self, request, context):
         torchType = torch.float32
         variant = None
 
-        if request.F16Memory:
+        # Only use f16 if not running on CPU - forcing f16 on CPU causes freezes (https://github.com/pytorch/pytorch/issues/75458)
+        if (request.F16Memory & ((request.CUDA & torch.cuda.is_available()) | XPU)):
             torchType = torch.float16
             variant = "fp16"
 
@@ -189,12 +196,18 @@ def LoadModel(self, request, context):
                 value = int(value)
             self.options[key] = value
 
-        # From options, extract if present "torch_dtype" and set it to the appropriate type
+        # From options, extract if present "torch_dtype" and set it to the appropriate type; if on CPU, always force float32
         if "torch_dtype" in self.options:
             if self.options["torch_dtype"] == "fp16":
-                torchType = torch.float16
+                if not ((request.CUDA & torch.cuda.is_available()) | XPU):
+                    torchType = torch.float32
+                else:
+                    torchType = torch.float16
             elif self.options["torch_dtype"] == "bf16":
-                torchType = torch.bfloat16
+                if not ((request.CUDA & torch.cuda.is_available()) | XPU):
+                    torchType = torch.float32
+                else:
+                    torchType = torch.bfloat16
             elif self.options["torch_dtype"] == "fp32":
                 torchType = torch.float32
             # remove it from options
@@ -290,6 +303,8 @@ def LoadModel(self, request, context):
                                                           use_safetensors=True,
                                                           variant=variant)
         elif request.PipelineType == "FluxPipeline":
+            if not ((request.CUDA & torch.cuda.is_available()) | XPU):
+                raise RuntimeError("Flux requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
             if fromSingleFile:
                 self.pipe = FluxPipeline.from_single_file(modelFile,
                                                           torch_dtype=torchType,
@@ -301,6 +316,8 @@ def LoadModel(self, request, context):
             if request.LowVRAM:
                 self.pipe.enable_model_cpu_offload()
         elif request.PipelineType == "FluxTransformer2DModel":
+            if not ((request.CUDA & torch.cuda.is_available()) | XPU):
+                raise RuntimeError("Flux requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
             dtype = torch.bfloat16
             # specify from environment or default to "ChuckMcSneed/FLUX.1-dev"
             bfl_repo = os.environ.get("BFL_REPO", "ChuckMcSneed/FLUX.1-dev")
@@ -319,12 +336,16 @@ def LoadModel(self, request, context):
             if request.LowVRAM:
                 self.pipe.enable_model_cpu_offload()
         elif request.PipelineType == "Lumina2Text2ImgPipeline":
+            if not ((request.CUDA & torch.cuda.is_available()) | XPU):
+                raise RuntimeError("Lumina requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
             self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
                 request.Model,
                 torch_dtype=torch.bfloat16)
             if request.LowVRAM:
                 self.pipe.enable_model_cpu_offload()
         elif request.PipelineType == "SanaPipeline":
+            if not ((request.CUDA & torch.cuda.is_available()) | XPU):
+                raise RuntimeError("Sana requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
             self.pipe = SanaPipeline.from_pretrained(
                 request.Model,
                 variant="bf16",
@@ -362,7 +383,7 @@ def LoadModel(self, request, context):
             # modify LoraAdapter to be relative to modelFileBase
             request.LoraAdapter = os.path.join(request.ModelPath, request.LoraAdapter)
 
-        device = "cpu" if not request.CUDA else "cuda"
+        device = "cpu" if not (request.CUDA & torch.cuda.is_available()) else "cuda"
         if XPU:
             device = "xpu"
         self.device = device
@@ -392,6 +413,8 @@ def LoadModel(self, request, context):
                 self.pipe.to(device)
                 if self.controlnet:
                     self.controlnet.to(device)
+            else:
+                self.pipe.to("cpu")
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
 
diff --git a/backend/python/diffusers/requirements-cpu.txt b/backend/python/diffusers/requirements-cpu.txt
index 7b19bf2192ca..378a162cdadf 100644
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -5,5 +5,6 @@ accelerate
 compel
 peft
 sentencepiece
-torch==2.7.1
-optimum-quanto
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.7.1+cpu
+optimum-quanto
diff --git a/backend/python/diffusers/run.sh b/backend/python/diffusers/run.sh
index ee730f21f5a9..5028271170d8 100755
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@@ -6,6 +6,10 @@ else
     source $backend_dir/../common/libbackend.sh
 fi
 
+# Set thread counts for CPU mode
+export OMP_NUM_THREADS=$(nproc)
+export MKL_NUM_THREADS=$(nproc)
+
 if [ -d "/opt/intel" ]; then
     # Assumes we are using the Intel oneAPI container image
     # https://github.com/intel/intel-extension-for-pytorch/issues/538
diff --git a/backend/python/faster-whisper/backend.py b/backend/python/faster-whisper/backend.py
index b73664ab88f7..16f9b5f5e6ce 100755
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -8,6 +8,7 @@
 import signal
 import sys
 import os
+import torch
 
 import backend_pb2
 import backend_pb2_grpc
@@ -31,14 +32,17 @@ def Health(self, request, context):
         return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
 
     def LoadModel(self, request, context):
         device = "cpu"
+        precision = "float32"
         # Get device
         # device = "cuda" if request.CUDA else "cpu"
-        if request.CUDA:
+        # Detecting CUDA availability using Torch.
+        if (request.CUDA & torch.cuda.is_available()):
             device = "cuda"
+            precision="float16"
         try:
             print("Preparing models, please wait", file=sys.stderr)
-            self.model = WhisperModel(request.Model, device=device, compute_type="float16")
+            self.model = WhisperModel(request.Model, device=device, compute_type=precision)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
diff --git a/backend/python/faster-whisper/requirements-cpu.txt b/backend/python/faster-whisper/requirements-cpu.txt
index 3e03f3adffdd..f0bcde0194de 100644
--- a/backend/python/faster-whisper/requirements-cpu.txt
+++ b/backend/python/faster-whisper/requirements-cpu.txt
@@ -4,5 +4,6 @@ accelerate
 compel
 peft
 sentencepiece
-torch==2.4.1
-optimum-quanto
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.4.1+cpu
+optimum-quanto
diff --git a/backend/python/kokoro/requirements-cpu.txt b/backend/python/kokoro/requirements-cpu.txt
index 1a1abb2f2d56..b4684b15ab92 100644
--- a/backend/python/kokoro/requirements-cpu.txt
+++ b/backend/python/kokoro/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 transformers
 accelerate
-torch
+torch==2.7.1+cpu
 kokoro
-soundfile
\ No newline at end of file
+soundfile
diff --git a/backend/python/rerankers/requirements-cpu.txt b/backend/python/rerankers/requirements-cpu.txt
index e27a47263797..396c3eda3661 100644
--- a/backend/python/rerankers/requirements-cpu.txt
+++ b/backend/python/rerankers/requirements-cpu.txt
@@ -1,4 +1,5 @@
 transformers
 accelerate
-torch==2.4.1
-rerankers[transformers]
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.4.1+cpu
+rerankers[transformers]
diff --git a/backend/python/rfdetr/requirements-cpu.txt b/backend/python/rfdetr/requirements-cpu.txt
index d0d1f4afaa94..041ecc26e3fc 100644
--- a/backend/python/rfdetr/requirements-cpu.txt
+++ b/backend/python/rfdetr/requirements-cpu.txt
@@ -3,5 +3,6 @@ opencv-python
 accelerate
 peft
 inference
-torch==2.7.1
-optimum-quanto
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.7.1+cpu
+optimum-quanto
diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py
index ef8a2fd40b6e..80fdcd9bd885 100644
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -19,8 +19,8 @@
 import torch
 import torch.cuda
 
-
-XPU=os.environ.get("XPU", "0") == "1"
+# Attempt to use XPU only if Torch says it is available when asking for it
+XPU = ((os.environ.get("XPU", "0") == "1") & (torch.xpu.is_available()))
 from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, MambaConfig, MambaForCausalLM
 from transformers import AutoProcessor, MusicgenForConditionalGeneration, DiaForConditionalGeneration
 from scipy.io import wavfile
@@ -83,8 +83,14 @@ def LoadModel(self, request, context):
         if os.path.exists(request.ModelFile):
             model_name = request.ModelFile
 
-        compute = torch.float16
-        if request.F16Memory == True:
+        # Use float32 for CPU inference
+        if (torch.cuda.is_available() | XPU):
+            compute = torch.float16
+        else:
+            compute = torch.float32
+
+        # Only use f16 if not running on CPU - forcing f16 on CPU causes freezes (https://github.com/pytorch/pytorch/issues/75458)
+        if (request.F16Memory & (torch.cuda.is_available() | XPU)) == True:
             compute=torch.bfloat16
 
         self.CUDA = torch.cuda.is_available()
@@ -122,6 +128,9 @@ def LoadModel(self, request, context):
 
             print(f"Parsed options: {self.options}", file=sys.stderr)
 
+        if not (self.CUDA | XPU):
+            from transformers import BitsAndBytesConfig, AutoModelForCausalLM
+
         if self.CUDA:
             from transformers import BitsAndBytesConfig, AutoModelForCausalLM
             if request.MainGPU:
diff --git a/backend/python/transformers/requirements-cpu.txt b/backend/python/transformers/requirements-cpu.txt
index c944de271065..e533d3177d8b 100644
--- a/backend/python/transformers/requirements-cpu.txt
+++ b/backend/python/transformers/requirements-cpu.txt
@@ -1,4 +1,5 @@
-torch==2.7.1
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.7.1+cpu
 llvmlite==0.43.0
 numba==0.60.0
 accelerate
@@ -6,4 +7,4 @@ transformers
 bitsandbytes
 outetts
 sentence-transformers==5.0.0
-protobuf==6.31.0
\ No newline at end of file
+protobuf==6.31.0
diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh
index 88ea4c7cb050..53cde03cb8b3 100755
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -28,8 +28,10 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
         git clone https://github.com/vllm-project/vllm
     fi
     pushd vllm
-    uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes
-    uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    uv pip install --upgrade pip
+    uv pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match
+    uv pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match
+    uv pip install numpy typing-extensions pillow grpcio==1.71.0 protobuf certifi accelerate transformers bitsandbytes
     VLLM_TARGET_DEVICE=cpu python setup.py install
     popd
     rm -rf vllm
diff --git a/backend/python/vllm/requirements-cpu.txt b/backend/python/vllm/requirements-cpu.txt
index 16c7cbac50c0..d1e8822452c6 100644
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,3 +1,4 @@
 accelerate
-torch==2.7.0
-transformers
\ No newline at end of file
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.7.0+cpu
+transformers