Closed

23 commits
416f212  Fix requirements install order for bark to prevent pulling CUDA depen… (rampa3, Aug 7, 2025)
b50cdd2  Use CPU Torch for CPU build of Chatterbox (rampa3, Aug 7, 2025)
a572ddf  Patch libbackend to build Intel builds only if one is requested by bu… (rampa3, Aug 7, 2025)
9e65421  Use CPU Torch for CPU build of Coqui (rampa3, Aug 7, 2025)
09a32ed  Use CPU Torch for CPU build of diffusers (rampa3, Aug 7, 2025)
704753d  Only use XPU in diffusers if available when requested (rampa3, Aug 7, 2025)
6ba3b94  Ensure CPU mode usage if running diffusers on CPU (rampa3, Aug 7, 2025)
e16f605  Force diffusers to use float32 only if running on CPU no matter what … (rampa3, Aug 7, 2025)
64d4b70  Block bfloat16-only diffusers pipelines from running on CPU - deadloc… (rampa3, Aug 7, 2025)
a25ff94  Extra CPU optimizations in diffusers (rampa3, Aug 7, 2025)
131a590  Use CPU Torch for CPU build of faster-whisper (rampa3, Aug 7, 2025)
1d94f2d  Add device type switching logic using Torch into faster-whisper (rampa3, Aug 7, 2025)
39f32b0  Use CPU Torch for CPU build of kokoro (rampa3, Aug 7, 2025)
2644b31  Use CPU Torch for CPU build of rerankers (rampa3, Aug 7, 2025)
6938d6d  Use CPU Torch for CPU build of rfdetr (rampa3, Aug 7, 2025)
3c09e79  Use CPU Torch for CPU build of transformers (rampa3, Aug 7, 2025)
076fd9c  Create lib import code for CPU mode & only use XPU if available in tr… (rampa3, Aug 7, 2025)
c77851d  Force transformers to use float32 only if running on CPU - deadlock p… (rampa3, Aug 7, 2025)
5d4aad5  Update vLLM files for building CPU version from source & add CPU vLLM… (rampa3, Aug 7, 2025)
fd5656b  Add CPU vLLM build logic into main Makefile (rampa3, Aug 7, 2025)
e19f8b7  Revert "Use CPU Torch for CPU build of kokoro" (rampa3, Aug 7, 2025)
7986a67  Pin kokoro Torch in CPU requirements to CPU version of the same relea… (rampa3, Aug 7, 2025)
515ab68  Resolve conflicts and merge branch 'master' into python_builds_pr (rampa3, Aug 9, 2025)
4 changes: 4 additions & 0 deletions Makefile
@@ -424,7 +424,11 @@ docker-build-rerankers:
docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rerankers -f backend/Dockerfile.python --build-arg BACKEND=rerankers .

docker-build-vllm:
ifeq ($(BUILD_TYPE),)
docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vllm -f backend/Dockerfile.vllmcpu --build-arg BACKEND=vllm .
else
docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vllm -f backend/Dockerfile.python --build-arg BACKEND=vllm .
endif

docker-build-transformers:
docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:transformers -f backend/Dockerfile.python --build-arg BACKEND=transformers .
64 changes: 64 additions & 0 deletions backend/Dockerfile.vllmcpu
@@ -0,0 +1,64 @@
ARG BASE_IMAGE=ubuntu:22.04
Owner:
why having a separate Dockerfile? when build-type is empty we already treat it as a CPU build

Author (@rampa3, Aug 7, 2025):
> why having a separate Dockerfile? when build-type is empty we already treat it as a CPU build

This Dockerfile is supposed to build vLLM from source, because on PyPI, vLLM, like Torch, only has a CUDA release. The aim behind the CPU builds is to use CPU-specific builds of libraries wherever no other changes are needed: just installing torch from the default PyPI index on a CPU image adds more than 4 GB of NVIDIA CUDA dependencies that the package pulls in. It is clearly visible on the master CI build of Kitten TTS right now - the TTS itself is not GPU accelerated, but because one of its libs wants Torch, you get more than 5 GB of extra dependencies in Torch + CUDA. Just for fun I built it locally with edited requirements that preinstall CPU Torch, and the image size fell to 1.16 GB. That is why I went and blanket-added an extra index pointing to the CPU releases of Torch everywhere.

With vLLM it is a bit more complicated - to get a CPU release, it has to be built from source. We have a part of install.sh for that, but that part never runs with the normal Dockerfile, because the FROM_SOURCE argument was removed at some point. Since the build also has its own specific deps, I made an extra Dockerfile that installs the build deps according to the vLLM docs on building the CPU version. It builds successfully, but for some reason it crashes on init when called by LocalAI, and I have no idea how to properly get the whole stack trace - gRPC returns only part of it. This is one of the reasons why the PR is a draft rather than a regular PR - I want to try to get this CPU build working first.
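
As a minimal illustration of the CPU-index pattern described above (versions taken from this PR's bark requirements-cpu.txt change; other backends substitute their own pins), installing Torch against the CPU wheel index looks like:

# Point pip at the PyTorch CPU wheel index so the +cpu builds are selected
# instead of the default CUDA-enabled wheels.
pip install --extra-index-url https://download.pytorch.org/whl/cpu \
    torch==2.4.1+cpu torchaudio==2.4.1+cpu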

Author (@rampa3, Aug 7, 2025):
In the end, that Dockerfile could become .vllm instead of .vllmcpu - for every GPU vendor other than NVIDIA, vLLM needs to be built from source as well. But for a start, I focused on CPU, as that is the only platform I can reliably test.

(two images attached)

Author (@rampa3):
Should I rename the file in preparation for potential addition of ROCm and XPU parts into it?

Owner:
I get the point of building vLLM for CPU, but I've just run a diff manually here against the two Dockerfiles (Dockerfile.python and Dockerfile.vllmcpu) and I don't see notable differences. My point is more that I think we can still use the same Dockerfile and handle the installation bits directly in the make/install of the backend, unless I am missing something?

--- backend/Dockerfile.vllmcpu	2025-08-08 16:43:25.145194390 +0200
+++ backend/Dockerfile.python	2025-08-08 16:43:15.812600946 +0200
@@ -1,11 +1,9 @@
 ARG BASE_IMAGE=ubuntu:22.04
 
 FROM ${BASE_IMAGE} AS builder
-ARG BACKEND=vllm
+ARG BACKEND=rerankers
 ARG BUILD_TYPE
 ENV BUILD_TYPE=${BUILD_TYPE}
-ARG FROM_SOURCE=true
-ENV FROM_SOURCE=${FROM_SOURCE}
 ARG CUDA_MAJOR_VERSION
 ARG CUDA_MINOR_VERSION
 ARG SKIP_DRIVERS=false
@@ -30,20 +28,81 @@ RUN apt-get update && \
         curl python3-pip \
         python-is-python3 \
         python3-dev llvm \
-        python3-venv make \
-        wget \
-        gcc-12 g++-12 \
-        libtcmalloc-minimal4 \
-        libnuma-dev \
-        ffmpeg \
-        libsm6 libxext6 \
-        libgl1 \
-        jq lsof && \
+        python3-venv make && \
     apt-get clean && \
-    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && \
     rm -rf /var/lib/apt/lists/* && \
     pip install --upgrade pip
 
+
+# Cuda
+ENV PATH=/usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH=/opt/rocm/bin:${PATH}
+
+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
 # Install uv as a system package
 RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
 ENV PATH="/root/.cargo/bin:${PATH}"
@@ -60,5 +119,5 @@ COPY python/common/ /${BACKEND}/common
 RUN cd /${BACKEND} && make
 
 FROM scratch
-ARG BACKEND=vllm
-COPY --from=builder /${BACKEND}/ /
+ARG BACKEND=rerankers
+COPY --from=builder /${BACKEND}/ /

Author (@rampa3, Aug 11, 2025):
Well, the dependency block I am talking about consists of APT dependencies, as listed in the vLLM docs. That means requirements-cpu.txt is not the way to handle them. They are GCC, the C++ libraries required to compile vLLM, and a few tools vLLM uses in its makefiles. Here is the block from the vLLM docs that dictates the extra dependencies:

sudo apt-get update  -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

We already have some, but we are missing these:

  • python3-venv
  • make
  • wget
  • gcc-12
  • g++-12
  • libtcmalloc-minimal4
  • libnuma-dev
  • ffmpeg
  • libsm6
  • libxext6
  • libgl1
  • jq
  • lsof

all of which have to be installed as APT packages, since vLLM is compiled from C++ code. Normally we just pull Python dependencies, because even CPU Torch already ships pre-compiled binaries for its C++ parts. The CPU build of vLLM is compiled fully from scratch, so unless we decide not to ship CPU vLLM, we have to provide these somehow.

I can see if I can make it work with install.sh; since it is just a shell script and the builder runs as root, it should work. The only thing is, if I put them there, those building from custom Dockerfiles won't thank me - people build on Arch builders, for example, and putting it there limits the build platform to Debian and derivative distros only (without manual intervention). The Dockerfile was chosen not only as an experimentation shortcut (only the fact that it was a separate file was a testing shortcut), but also to keep the backend source directory platform agnostic.

Owner:
Fair enough, I think it's OK to put it in the Dockerfile.python builder, especially because at the end of the day that container is used only for building, so in the worst case we would have to copy the libraries to the final backend during the packaging phase.
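
For illustration only, a minimal sketch of how those packages could be gated inside the shared Dockerfile.python builder, assuming a RUN <<EOT bash block like the existing driver-specific ones and an empty BUILD_TYPE meaning a CPU build (the package list is the one quoted from the vLLM docs above; this is a sketch, not code from this PR):

# Sketch: install the vLLM CPU build dependencies only for CPU builds,
# i.e. when BUILD_TYPE is empty, mirroring the existing BUILD_TYPE-gated blocks.
if [ "${BUILD_TYPE}" = "" ]; then
    apt-get update && \
    apt-get install -y --no-install-recommends \
        wget gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev \
        ffmpeg libsm6 libxext6 libgl1 jq lsof && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 \
        --slave /usr/bin/g++ g++ /usr/bin/g++-12 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
fi

Because the deps would live only in the builder stage, the final scratch-based backend image would stay unchanged, which matches the point above about the container being used only for building.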

Owner:
my suggestion here probably would be to do this step-by-step for each backend, or at least treat vLLM separately to not make this PR go stale.

Author (@rampa3):
> my suggestion here probably would be to do this step-by-step for each backend, or at least treat vLLM separately to not make this PR go stale.

I agree. I think splitting it backend by backend will be the best way. I will prepare per-backend branches and PRs for the ready ones ASAP. For the working ones I will just have to figure out the CI; the rest will be opened whenever I get a moment to sit down and finish them. The last few weeks have been a bit busy, as I am in the middle of the autumn term of my bachelor finals. With that, I think I will be closing this one then?

Owner:
Yes, sounds good to me, we can follow up on the other PRs. Thanks! (And good luck with your finals!)


FROM ${BASE_IMAGE} AS builder
ARG BACKEND=vllm
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG FROM_SOURCE=true
ENV FROM_SOURCE=${FROM_SOURCE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT

RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
espeak-ng \
curl \
libssl-dev \
git \
git-lfs \
unzip \
upx-ucl \
curl python3-pip \
python-is-python3 \
python3-dev llvm \
python3-venv make \
wget \
gcc-12 g++-12 \
libtcmalloc-minimal4 \
libnuma-dev \
ffmpeg \
libsm6 libxext6 \
libgl1 \
jq lsof && \
apt-get clean && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && \
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip

# Install uv as a system package
RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
ENV PATH="/root/.cargo/bin:${PATH}"

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools==1.71.0 grpcio==1.71.0

COPY python/${BACKEND} /${BACKEND}
COPY backend.proto /${BACKEND}/backend.proto
COPY python/common/ /${BACKEND}/common

RUN cd /${BACKEND} && make

FROM scratch
ARG BACKEND=vllm
COPY --from=builder /${BACKEND}/ /
6 changes: 4 additions & 2 deletions backend/python/bark/requirements-cpu.txt
@@ -1,4 +1,6 @@
bark==0.1.5
transformers
accelerate
torch==2.4.1
torchaudio==2.4.1
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.4.1+cpu
torchaudio==2.4.1+cpu
3 changes: 2 additions & 1 deletion backend/python/bark/requirements-cublas11.txt
@@ -1,5 +1,6 @@
bark==0.1.5
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
torchaudio==2.4.1+cu118
transformers
accelerate
accelerate
3 changes: 2 additions & 1 deletion backend/python/bark/requirements-cublas12.txt
@@ -1,4 +1,5 @@
bark==0.1.5
torch==2.4.1
torchaudio==2.4.1
transformers
accelerate
accelerate
3 changes: 2 additions & 1 deletion backend/python/bark/requirements-hipblas.txt
@@ -1,5 +1,6 @@
bark==0.1.5
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0
torchaudio==2.4.1+rocm6.0
transformers
accelerate
accelerate
3 changes: 2 additions & 1 deletion backend/python/bark/requirements-intel.txt
@@ -1,3 +1,4 @@
bark==0.1.5
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
@@ -6,4 +7,4 @@ oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools
transformers
accelerate
accelerate
3 changes: 1 addition & 2 deletions backend/python/bark/requirements.txt
@@ -1,4 +1,3 @@
bark==0.1.5
grpcio==1.71.0
protobuf
certifi
certifi
5 changes: 3 additions & 2 deletions backend/python/chatterbox/requirements-cpu.txt
@@ -1,5 +1,6 @@
accelerate
torch==2.6.0
torchaudio==2.6.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu
torchaudio==2.6.0+cpu
transformers==4.46.3
chatterbox-tts
4 changes: 2 additions & 2 deletions backend/python/common/libbackend.sh
@@ -74,8 +74,8 @@ function getBuildProfile() {
return 0
fi

# If /opt/intel exists, then we are doing an intel/ARC build
if [ -d "/opt/intel" ]; then
# If /opt/intel exists and BUILD_TYPE is one of the Intel ones, then we are doing an intel/ARC build
if [[ -d "/opt/intel" && ( x"${BUILD_TYPE}" == "xintel" || ( x"${BUILD_TYPE}" == "xsycl_f16" || x"${BUILD_TYPE}" == "xsycl_f32" ) ) ]]; then
echo "intel"
return 0
fi
5 changes: 3 additions & 2 deletions backend/python/coqui/requirements-cpu.txt
@@ -1,4 +1,5 @@
transformers==4.48.3
accelerate
torch==2.4.1
coqui-tts
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.4.1+cpu
coqui-tts
35 changes: 29 additions & 6 deletions backend/python/diffusers/backend.py
@@ -29,14 +29,20 @@

_ONE_DAY_IN_SECONDS = 60 * 60 * 24
COMPEL = os.environ.get("COMPEL", "0") == "1"
XPU = os.environ.get("XPU", "0") == "1"
# Attempt to use XPU only if Torch says it is available when asking for it
XPU = ((os.environ.get("XPU", "0") == "1") & (torch.xpu.is_available()))
CLIPSKIP = os.environ.get("CLIPSKIP", "1") == "1"
SAFETENSORS = os.environ.get("SAFETENSORS", "1") == "1"
CHUNK_SIZE = os.environ.get("CHUNK_SIZE", "8")
FPS = os.environ.get("FPS", "7")
DISABLE_CPU_OFFLOAD = os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
FRAMES = os.environ.get("FRAMES", "64")

# Set Torch to use all logical CPU cores for CPU mode
num_cores = os.cpu_count()
torch.set_num_threads(max(1, num_cores // 2))
torch.set_num_interop_threads(num_cores)

if XPU:
print(torch.xpu.get_device_name(0))

@@ -166,7 +172,8 @@ def LoadModel(self, request, context):
torchType = torch.float32
variant = None

if request.F16Memory:
# Only use f16 if not running on CPU - forcing f16 on CPU causes freezes (https://github.com/pytorch/pytorch/issues/75458)
if (request.F16Memory & ((request.CUDA & torch.cuda.is_available()) | XPU)):
torchType = torch.float16
variant = "fp16"

@@ -189,12 +196,18 @@ def LoadModel(self, request, context):
value = int(value)
self.options[key] = value

# From options, extract if present "torch_dtype" and set it to the appropriate type
# From options, extract if present "torch_dtype" and set it to the appropriate type; if on CPU, always force float32
if "torch_dtype" in self.options:
if self.options["torch_dtype"] == "fp16":
torchType = torch.float16
if not ((request.CUDA & torch.cuda.is_available()) | XPU):
torchType = torch.float32
else:
torchType = torch.float16
elif self.options["torch_dtype"] == "bf16":
torchType = torch.bfloat16
if not ((request.CUDA & torch.cuda.is_available()) | XPU):
torchType = torch.float32
else:
torchType = torch.bfloat16
elif self.options["torch_dtype"] == "fp32":
torchType = torch.float32
# remove it from options
@@ -290,6 +303,8 @@ def LoadModel(self, request, context):
use_safetensors=True,
variant=variant)
elif request.PipelineType == "FluxPipeline":
if not ((request.CUDA & torch.cuda.is_available()) | XPU):
raise RuntimeError("Flux requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
if fromSingleFile:
self.pipe = FluxPipeline.from_single_file(modelFile,
torch_dtype=torchType,
@@ -301,6 +316,8 @@ def LoadModel(self, request, context):
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "FluxTransformer2DModel":
if not ((request.CUDA & torch.cuda.is_available()) | XPU):
raise RuntimeError("Flux requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
dtype = torch.bfloat16
# specify from environment or default to "ChuckMcSneed/FLUX.1-dev"
bfl_repo = os.environ.get("BFL_REPO", "ChuckMcSneed/FLUX.1-dev")
@@ -319,12 +336,16 @@ def LoadModel(self, request, context):
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "Lumina2Text2ImgPipeline":
if not ((request.CUDA & torch.cuda.is_available()) | XPU):
raise RuntimeError("Lumina requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
request.Model,
torch_dtype=torch.bfloat16)
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "SanaPipeline":
if not ((request.CUDA & torch.cuda.is_available()) | XPU):
raise RuntimeError("Sana requires f16. Cannot run diffusers using f16 on CPU - doing so causes deadlocks. Refer to: https://github.com/pytorch/pytorch/issues/75458")
self.pipe = SanaPipeline.from_pretrained(
request.Model,
variant="bf16",
@@ -362,7 +383,7 @@ def LoadModel(self, request, context):
# modify LoraAdapter to be relative to modelFileBase
request.LoraAdapter = os.path.join(request.ModelPath, request.LoraAdapter)

device = "cpu" if not request.CUDA else "cuda"
device = "cpu" if not (request.CUDA & torch.cuda.is_available()) else "cuda"
if XPU:
device = "xpu"
self.device = device
@@ -392,6 +413,8 @@ def LoadModel(self, request, context):
self.pipe.to(device)
if self.controlnet:
self.controlnet.to(device)
else:
self.pipe.to("cpu")

except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
5 changes: 3 additions & 2 deletions backend/python/diffusers/requirements-cpu.txt
@@ -5,5 +5,6 @@ accelerate
compel
peft
sentencepiece
torch==2.7.1
optimum-quanto
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.1+cpu
optimum-quanto
4 changes: 4 additions & 0 deletions backend/python/diffusers/run.sh
@@ -6,6 +6,10 @@ else
source $backend_dir/../common/libbackend.sh
fi

# Set thread counts for CPU mode
export OMP_NUM_THREADS=$(nproc)
export MKL_NUM_THREADS=$(nproc)

if [ -d "/opt/intel" ]; then
# Assumes we are using the Intel oneAPI container image
# https://github.com/intel/intel-extension-for-pytorch/issues/538
8 changes: 6 additions & 2 deletions backend/python/faster-whisper/backend.py
@@ -8,6 +8,7 @@
import signal
import sys
import os
import torch
import backend_pb2
import backend_pb2_grpc

@@ -31,14 +32,17 @@ def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
device = "cpu"
precision = "float32"
# Get device
# device = "cuda" if request.CUDA else "cpu"
if request.CUDA:
# Detecting CUDA availability using Torch.
if (request.CUDA & torch.cuda.is_available()):
device = "cuda"
precision="float16"

try:
print("Preparing models, please wait", file=sys.stderr)
self.model = WhisperModel(request.Model, device=device, compute_type="float16")
self.model = WhisperModel(request.Model, device=device, compute_type=precision)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
5 changes: 3 additions & 2 deletions backend/python/faster-whisper/requirements-cpu.txt
@@ -4,5 +4,6 @@ accelerate
compel
peft
sentencepiece
torch==2.4.1
optimum-quanto
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.4.1+cpu
optimum-quanto
4 changes: 2 additions & 2 deletions backend/python/kokoro/requirements-cpu.txt
@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
transformers
accelerate
torch
torch==2.7.1+cpu
kokoro
soundfile
soundfile
5 changes: 3 additions & 2 deletions backend/python/rerankers/requirements-cpu.txt
@@ -1,4 +1,5 @@
transformers
accelerate
torch==2.4.1
rerankers[transformers]
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.4.1+cpu
rerankers[transformers]
5 changes: 3 additions & 2 deletions backend/python/rfdetr/requirements-cpu.txt
@@ -3,5 +3,6 @@ opencv-python
accelerate
peft
inference
torch==2.7.1
optimum-quanto
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.1+cpu
optimum-quanto
17 changes: 13 additions & 4 deletions backend/python/transformers/backend.py
@@ -19,8 +19,8 @@
import torch
import torch.cuda


XPU=os.environ.get("XPU", "0") == "1"
# Attempt to use XPU only if Torch says it is available when asking for it
XPU = ((os.environ.get("XPU", "0") == "1") & (torch.xpu.is_available()))
from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, MambaConfig, MambaForCausalLM
from transformers import AutoProcessor, MusicgenForConditionalGeneration, DiaForConditionalGeneration
from scipy.io import wavfile
@@ -83,8 +83,14 @@ def LoadModel(self, request, context):
if os.path.exists(request.ModelFile):
model_name = request.ModelFile

compute = torch.float16
if request.F16Memory == True:
# Use float32 for CPU inference
if (torch.cuda.is_available() | XPU):
compute = torch.float16
else:
compute = torch.float32

# Only use f16 if not running on CPU - forcing f16 on CPU causes freezes (https://github.com/pytorch/pytorch/issues/75458)
if (request.F16Memory & (torch.cuda.is_available() | XPU)) == True:
compute=torch.bfloat16

self.CUDA = torch.cuda.is_available()
@@ -122,6 +128,9 @@ def LoadModel(self, request, context):

print(f"Parsed options: {self.options}", file=sys.stderr)

if not (self.CUDA | XPU):
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

if self.CUDA:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
if request.MainGPU: