From 6d0af39634936701b48ec638a111c7116dcdc968 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 12 Aug 2025 11:29:53 +0200 Subject: [PATCH 01/14] chore: allow to install with pip Signed-off-by: Ettore Di Giacinto --- backend/python/common/libbackend.sh | 56 +++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index daa47c3c2080..d98924fe3550 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,9 +17,17 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # +# You can switch between uv (conda-like) and pip installation methods by setting USE_PIP: +# USE_PIP=true source $(dirname $0)/../common/libbackend.sh +# PYTHON_VERSION="3.10" +# Default to uv if USE_PIP is not set +if [ "x${USE_PIP}" == "x" ]; then + USE_PIP=false +fi + function init() { # Name of the backend (directory name) BACKEND_NAME=${PWD##*/} @@ -48,6 +56,11 @@ function init() { fi echo "Initializing libbackend for ${BACKEND_NAME}" + if [ "x${USE_PIP}" == "xtrue" ]; then + echo "Using pip and Python virtual environments" + else + echo "Using uv package manager" + fi } # getBuildProfile will inspect the system to determine which build profile is appropriate: @@ -95,18 +108,33 @@ function getBuildProfile() { # This function is idempotent, so you can call it as many times as you want and it will # always result in an activated virtual environment function ensureVenv() { - if [ ! -d "${EDIR}/venv" ]; then - uv venv --python ${PYTHON_VERSION} ${EDIR}/venv - echo "virtualenv created" - fi + if [ "x${USE_PIP}" == "xtrue" ]; then + # Use Python virtual environment with pip + if [ ! -d "${EDIR}/venv" ]; then + python${PYTHON_VERSION} -m venv ${EDIR}/venv + echo "Python virtual environment created" + fi - # Source if we are not already in a Virtual env - if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then - source ${EDIR}/venv/bin/activate - echo "virtualenv activated" + # Source if we are not already in a Virtual env + if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then + source ${EDIR}/venv/bin/activate + echo "Python virtual environment activated" + fi + else + # Use uv (conda-like) + if [ ! -d "${EDIR}/venv" ]; then + uv venv --python ${PYTHON_VERSION} ${EDIR}/venv + echo "uv virtual environment created" + fi + + # Source if we are not already in a Virtual env + if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then + source ${EDIR}/venv/bin/activate + echo "uv virtual environment activated" + fi fi - echo "activated virtualenv has been ensured" + echo "activated virtual environment has been ensured" } # installRequirements looks for several requirements files and if they exist runs the install for them in order @@ -116,7 +144,7 @@ function ensureVenv() { # - requirements-${BUILD_TYPE}.txt # - requirements-${BUILD_PROFILE}.txt # -# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda-11 or cuda-12 +# BUILD_PROFILE is a more specific version of BUILD_TYPE, ex: cuda-11 or cuda-12 # it can also include some options that we do not have BUILD_TYPES for, ex: intel # # NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index. 
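For reference, the pip path introduced above is selected by setting USE_PIP before libbackend.sh is sourced, which is the pattern the install.sh scripts later in this series follow. A minimal install.sh along those lines might look like the sketch below; it mirrors the install scripts in this patch set, and the explicit PYTHON_VERSION is only an assumption (libbackend.sh's pip branch runs python${PYTHON_VERSION} -m venv, so the variable must resolve to an installed interpreter).

#!/bin/bash
set -e

# Opt into pip + `python -m venv` instead of uv; this must be set before
# libbackend.sh is sourced so ensureVenv/installRequirements pick it up.
USE_PIP=true
PYTHON_VERSION="3.10"   # assumed default; adjust to the interpreter available on the host

backend_dir=$(dirname $0)

# Prefer a vendored common/ directory and fall back to the shared one,
# matching the lookup used by the existing backend install scripts.
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

installRequirements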
@@ -158,7 +186,13 @@ function installRequirements() { for reqFile in ${requirementFiles[@]}; do if [ -f ${reqFile} ]; then echo "starting requirements install for ${reqFile}" - uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + if [ "x${USE_PIP}" == "xtrue" ]; then + # Use pip for installation + pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + else + # Use uv for installation + uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + fi echo "finished requirements install for ${reqFile}" fi done From 44eaf6c20d85ff6ee411e81b00edf19c54aa0416 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 12 Aug 2025 22:18:27 +0200 Subject: [PATCH 02/14] WIP Signed-off-by: Ettore Di Giacinto --- backend/python/mlx-vlm/Makefile | 29 ++ backend/python/mlx-vlm/backend.py | 367 ++++++++++++++++++++++++ backend/python/mlx-vlm/install.sh | 14 + backend/python/mlx-vlm/requirements.txt | 4 + backend/python/mlx-vlm/run.sh | 11 + backend/python/mlx-vlm/test.py | 146 ++++++++++ backend/python/mlx-vlm/test.sh | 12 + backend/python/mlx/Makefile | 29 ++ backend/python/mlx/backend.py | 367 ++++++++++++++++++++++++ backend/python/mlx/install.sh | 14 + backend/python/mlx/requirements.txt | 4 + backend/python/mlx/run.sh | 11 + backend/python/mlx/test.py | 146 ++++++++++ backend/python/mlx/test.sh | 12 + 14 files changed, 1166 insertions(+) create mode 100644 backend/python/mlx-vlm/Makefile create mode 100644 backend/python/mlx-vlm/backend.py create mode 100755 backend/python/mlx-vlm/install.sh create mode 100644 backend/python/mlx-vlm/requirements.txt create mode 100755 backend/python/mlx-vlm/run.sh create mode 100644 backend/python/mlx-vlm/test.py create mode 100755 backend/python/mlx-vlm/test.sh create mode 100644 backend/python/mlx/Makefile create mode 100644 backend/python/mlx/backend.py create mode 100755 backend/python/mlx/install.sh create mode 100644 backend/python/mlx/requirements.txt create mode 100755 backend/python/mlx/run.sh create mode 100644 backend/python/mlx/test.py create mode 100755 backend/python/mlx/test.sh diff --git a/backend/python/mlx-vlm/Makefile b/backend/python/mlx-vlm/Makefile new file mode 100644 index 000000000000..c4c18bee55b0 --- /dev/null +++ b/backend/python/mlx-vlm/Makefile @@ -0,0 +1,29 @@ +.PHONY: mlx +mlx: protogen + bash install.sh + +.PHONY: run +run: protogen + @echo "Running mlx..." + bash run.sh + @echo "mlx run." + +.PHONY: test +test: protogen + @echo "Testing mlx..." + bash test.sh + @echo "mlx tested." + +.PHONY: protogen +protogen: backend_pb2_grpc.py backend_pb2.py + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +backend_pb2_grpc.py backend_pb2.py: + python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py new file mode 100644 index 000000000000..56698a54e5f5 --- /dev/null +++ b/backend/python/mlx-vlm/backend.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +import asyncio +from concurrent import futures +import argparse +import signal +import sys +import os +from typing import List +from PIL import Image + +import backend_pb2 +import backend_pb2_grpc + +import grpc +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.multimodal.utils import fetch_image +from vllm.assets.video import VideoAsset +import base64 +import io + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + A gRPC servicer that implements the Backend service defined in backend.proto. + """ + def generate(self,prompt, max_new_tokens): + """ + Generates text based on the given prompt and maximum number of new tokens. + + Args: + prompt (str): The prompt to generate text from. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. + """ + self.generator.end_beam_search() + + # Tokenizing the input + ids = self.generator.tokenizer.encode(prompt) + + self.generator.gen_begin_reuse(ids) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False + decoded_text = '' + for i in range(max_new_tokens): + token = self.generator.gen_single_token() + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + if token.item() == self.generator.tokenizer.eos_token_id: + break + return decoded_text + + def Health(self, request, context): + """ + Returns a health check message. + + Args: + request: The health check request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The health check reply. + """ + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + async def LoadModel(self, request, context): + """ + Loads a language model. + + Args: + request: The load model request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The load model result. 
+ """ + engine_args = AsyncEngineArgs( + model=request.Model, + ) + + if request.Quantization != "": + engine_args.quantization = request.Quantization + if request.LoadFormat != "": + engine_args.load_format = request.LoadFormat + if request.GPUMemoryUtilization != 0: + engine_args.gpu_memory_utilization = request.GPUMemoryUtilization + if request.TrustRemoteCode: + engine_args.trust_remote_code = request.TrustRemoteCode + if request.EnforceEager: + engine_args.enforce_eager = request.EnforceEager + if request.TensorParallelSize: + engine_args.tensor_parallel_size = request.TensorParallelSize + if request.SwapSpace != 0: + engine_args.swap_space = request.SwapSpace + if request.MaxModelLen != 0: + engine_args.max_model_len = request.MaxModelLen + if request.DisableLogStatus: + engine_args.disable_log_status = request.DisableLogStatus + if request.DType != "": + engine_args.dtype = request.DType + if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: + # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs + engine_args.limit_mm_per_prompt = { + "image": max(request.LimitImagePerPrompt, 1), + "video": max(request.LimitVideoPerPrompt, 1), + "audio": max(request.LimitAudioPerPrompt, 1) + } + + try: + self.llm = AsyncLLMEngine.from_engine_args(engine_args) + except Exception as err: + print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + try: + engine_model_config = await self.llm.get_model_config() + self.tokenizer = get_tokenizer( + engine_model_config.tokenizer, + tokenizer_mode=engine_model_config.tokenizer_mode, + trust_remote_code=engine_model_config.trust_remote_code, + truncation_side="left", + ) + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print("Model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="Model loaded successfully", success=True) + + async def Predict(self, request, context): + """ + Generates text based on the given prompt and sampling parameters. + + Args: + request: The predict request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The predict result. + """ + gen = self._predict(request, context, streaming=False) + res = await gen.__anext__() + return res + + def Embedding(self, request, context): + """ + A gRPC method that calculates embeddings for a given sentence. + + Args: + request: An EmbeddingRequest object that contains the request parameters. + context: A grpc.ServicerContext object that provides information about the RPC. + + Returns: + An EmbeddingResult object that contains the calculated embeddings. + """ + print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) + outputs = self.model.encode(request.Embeddings) + # Check if we have one result at least + if len(outputs) == 0: + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details("No embeddings were calculated.") + return backend_pb2.EmbeddingResult() + return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + + async def PredictStream(self, request, context): + """ + Generates text based on the given prompt and sampling parameters, and streams the results. + + Args: + request: The predict stream request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The predict stream result. 
+ """ + iterations = self._predict(request, context, streaming=True) + try: + async for iteration in iterations: + yield iteration + finally: + await iterations.aclose() + + async def _predict(self, request, context, streaming=False): + # Build the sampling parameters + # NOTE: this must stay in sync with the vllm backend + request_to_sampling_params = { + "N": "n", + "PresencePenalty": "presence_penalty", + "FrequencyPenalty": "frequency_penalty", + "RepetitionPenalty": "repetition_penalty", + "Temperature": "temperature", + "TopP": "top_p", + "TopK": "top_k", + "MinP": "min_p", + "Seed": "seed", + "StopPrompts": "stop", + "StopTokenIds": "stop_token_ids", + "BadWords": "bad_words", + "IncludeStopStrInOutput": "include_stop_str_in_output", + "IgnoreEOS": "ignore_eos", + "Tokens": "max_tokens", + "MinTokens": "min_tokens", + "Logprobs": "logprobs", + "PromptLogprobs": "prompt_logprobs", + "SkipSpecialTokens": "skip_special_tokens", + "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", + "TruncatePromptTokens": "truncate_prompt_tokens", + "GuidedDecoding": "guided_decoding", + } + + sampling_params = SamplingParams(top_p=0.9, max_tokens=200) + + for request_field, param_field in request_to_sampling_params.items(): + if hasattr(request, request_field): + value = getattr(request, request_field) + if value not in (None, 0, [], False): + setattr(sampling_params, param_field, value) + + # Extract image paths and process images + prompt = request.Prompt + + image_paths = request.Images + image_data = [self.load_image(img_path) for img_path in image_paths] + + videos_path = request.Videos + video_data = [self.load_video(video_path) for video_path in videos_path] + + # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template + if not request.Prompt and request.UseTokenizerTemplate and request.Messages: + prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) + + # Generate text using the LLM engine + request_id = random_uuid() + print(f"Generating text with request_id: {request_id}", file=sys.stderr) + multi_modal_data = {} + if image_data: + multi_modal_data["image"] = image_data + if video_data: + multi_modal_data["video"] = video_data + outputs = self.llm.generate( + { + "prompt": prompt, + "multi_modal_data": multi_modal_data if multi_modal_data else None, + }, + sampling_params=sampling_params, + request_id=request_id, + ) + + # Stream the results + generated_text = "" + try: + async for request_output in outputs: + iteration_text = request_output.outputs[0].text + + if streaming: + # Remove text already sent as vllm concatenates the text from previous yields + delta_iteration_text = iteration_text.removeprefix(generated_text) + # Send the partial result + yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) + + # Keep track of text generated + generated_text = iteration_text + finally: + await outputs.aclose() + + # If streaming, we already sent everything + if streaming: + return + + # Remove the image files from /tmp folder + for img_path in image_paths: + try: + os.remove(img_path) + except Exception as e: + print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) + + # Sending the final generated text + yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + + def load_image(self, image_path: str): + """ + Load an image from the given file path or base64 encoded data. 
+ + Args: + image_path (str): The path to the image file or base64 encoded data. + + Returns: + Image: The loaded image. + """ + try: + + image_data = base64.b64decode(image_path) + image = Image.open(io.BytesIO(image_data)) + return image + except Exception as e: + print(f"Error loading image {image_path}: {e}", file=sys.stderr) + return None + + def load_video(self, video_path: str): + """ + Load a video from the given file path. + + Args: + video_path (str): The path to the image file. + + Returns: + Video: The loaded video. + """ + try: + timestamp = str(int(time.time() * 1000)) # Generate timestamp + p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename + with open(p, "wb") as f: + f.write(base64.b64decode(video_path)) + video = VideoAsset(name=p).np_ndarrays + os.remove(p) + return video + except Exception as e: + print(f"Error loading video {video_path}: {e}", file=sys.stderr) + return None + +async def serve(address): + # Start asyncio gRPC server + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + # Add the servicer to the server + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + # Bind the server to the address + server.add_insecure_port(address) + + # Gracefully shutdown the server on SIGTERM or SIGINT + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler( + sig, lambda: asyncio.ensure_future(server.stop(5)) + ) + + # Start the server + await server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + # Wait for the server to be terminated + await server.wait_for_termination() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + asyncio.run(serve(args.addr)) diff --git a/backend/python/mlx-vlm/install.sh b/backend/python/mlx-vlm/install.sh new file mode 100755 index 000000000000..b8ee48552490 --- /dev/null +++ b/backend/python/mlx-vlm/install.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +USE_PIP=true + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +installRequirements diff --git a/backend/python/mlx-vlm/requirements.txt b/backend/python/mlx-vlm/requirements.txt new file mode 100644 index 000000000000..f1771cc4adb4 --- /dev/null +++ b/backend/python/mlx-vlm/requirements.txt @@ -0,0 +1,4 @@ +grpcio==1.71.0 +protobuf +certifi +setuptools \ No newline at end of file diff --git a/backend/python/mlx-vlm/run.sh b/backend/python/mlx-vlm/run.sh new file mode 100755 index 000000000000..fc88f97da712 --- /dev/null +++ b/backend/python/mlx-vlm/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.py b/backend/python/mlx-vlm/test.py new file mode 100644 index 000000000000..827aa71a3e33 --- /dev/null +++ b/backend/python/mlx-vlm/test.py @@ -0,0 +1,146 @@ +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + +import unittest +import subprocess +import time +import grpc +import backend_pb2_grpc +import backend_pb2 + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service. + + This class contains methods to test the startup and shutdown of the gRPC service. 
+ """ + def setUp(self): + self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) + time.sleep(10) + + def tearDown(self) -> None: + self.service.terminate() + self.service.wait() + + def test_server_startup(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.Health(backend_pb2.HealthMessage()) + self.assertEqual(response.message, b'OK') + except Exception as err: + print(err) + self.fail("Server failed to start") + finally: + self.tearDown() + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() + + def test_text(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + req = backend_pb2.PredictOptions(Prompt="The capital of France is") + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + except Exception as err: + print(err) + self.fail("text service failed") + finally: + self.tearDown() + + def test_sampling_params(self): + """ + This method tests if all sampling parameters are correctly processed + NOTE: this does NOT test for correctness, just that we received a compatible response + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + + req = backend_pb2.PredictOptions( + Prompt="The capital of France is", + TopP=0.8, + Tokens=50, + Temperature=0.7, + TopK=40, + PresencePenalty=0.1, + FrequencyPenalty=0.2, + RepetitionPenalty=1.1, + MinP=0.05, + Seed=42, + StopPrompts=["\n"], + StopTokenIds=[50256], + BadWords=["badword"], + IncludeStopStrInOutput=True, + IgnoreEOS=True, + MinTokens=5, + Logprobs=5, + PromptLogprobs=5, + SkipSpecialTokens=True, + SpacesBetweenSpecialTokens=True, + TruncatePromptTokens=10, + GuidedDecoding=True, + N=2, + ) + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + self.assertIsNotNone(resp.logprobs) + except Exception as err: + print(err) + self.fail("sampling params service failed") + finally: + self.tearDown() + + + def test_embedding(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) + self.assertTrue(response.success) + embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") + embedding_response = stub.Embedding(embedding_request) + self.assertIsNotNone(embedding_response.embeddings) + # assert that is a list of floats + self.assertIsInstance(embedding_response.embeddings, list) + # assert that 
the list is not empty + self.assertTrue(len(embedding_response.embeddings) > 0) + except Exception as err: + print(err) + self.fail("Embedding service failed") + finally: + self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.sh b/backend/python/mlx-vlm/test.sh new file mode 100755 index 000000000000..f31ae54e47dc --- /dev/null +++ b/backend/python/mlx-vlm/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/backend/python/mlx/Makefile b/backend/python/mlx/Makefile new file mode 100644 index 000000000000..c4c18bee55b0 --- /dev/null +++ b/backend/python/mlx/Makefile @@ -0,0 +1,29 @@ +.PHONY: mlx +mlx: protogen + bash install.sh + +.PHONY: run +run: protogen + @echo "Running mlx..." + bash run.sh + @echo "mlx run." + +.PHONY: test +test: protogen + @echo "Testing mlx..." + bash test.sh + @echo "mlx tested." + +.PHONY: protogen +protogen: backend_pb2_grpc.py backend_pb2.py + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +backend_pb2_grpc.py backend_pb2.py: + python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py new file mode 100644 index 000000000000..56698a54e5f5 --- /dev/null +++ b/backend/python/mlx/backend.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +import asyncio +from concurrent import futures +import argparse +import signal +import sys +import os +from typing import List +from PIL import Image + +import backend_pb2 +import backend_pb2_grpc + +import grpc +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.multimodal.utils import fetch_image +from vllm.assets.video import VideoAsset +import base64 +import io + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + A gRPC servicer that implements the Backend service defined in backend.proto. + """ + def generate(self,prompt, max_new_tokens): + """ + Generates text based on the given prompt and maximum number of new tokens. + + Args: + prompt (str): The prompt to generate text from. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. 
+ """ + self.generator.end_beam_search() + + # Tokenizing the input + ids = self.generator.tokenizer.encode(prompt) + + self.generator.gen_begin_reuse(ids) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False + decoded_text = '' + for i in range(max_new_tokens): + token = self.generator.gen_single_token() + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + if token.item() == self.generator.tokenizer.eos_token_id: + break + return decoded_text + + def Health(self, request, context): + """ + Returns a health check message. + + Args: + request: The health check request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The health check reply. + """ + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + async def LoadModel(self, request, context): + """ + Loads a language model. + + Args: + request: The load model request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The load model result. + """ + engine_args = AsyncEngineArgs( + model=request.Model, + ) + + if request.Quantization != "": + engine_args.quantization = request.Quantization + if request.LoadFormat != "": + engine_args.load_format = request.LoadFormat + if request.GPUMemoryUtilization != 0: + engine_args.gpu_memory_utilization = request.GPUMemoryUtilization + if request.TrustRemoteCode: + engine_args.trust_remote_code = request.TrustRemoteCode + if request.EnforceEager: + engine_args.enforce_eager = request.EnforceEager + if request.TensorParallelSize: + engine_args.tensor_parallel_size = request.TensorParallelSize + if request.SwapSpace != 0: + engine_args.swap_space = request.SwapSpace + if request.MaxModelLen != 0: + engine_args.max_model_len = request.MaxModelLen + if request.DisableLogStatus: + engine_args.disable_log_status = request.DisableLogStatus + if request.DType != "": + engine_args.dtype = request.DType + if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: + # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs + engine_args.limit_mm_per_prompt = { + "image": max(request.LimitImagePerPrompt, 1), + "video": max(request.LimitVideoPerPrompt, 1), + "audio": max(request.LimitAudioPerPrompt, 1) + } + + try: + self.llm = AsyncLLMEngine.from_engine_args(engine_args) + except Exception as err: + print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + try: + engine_model_config = await self.llm.get_model_config() + self.tokenizer = get_tokenizer( + engine_model_config.tokenizer, + tokenizer_mode=engine_model_config.tokenizer_mode, + trust_remote_code=engine_model_config.trust_remote_code, + truncation_side="left", + ) + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print("Model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="Model loaded successfully", success=True) + + async def Predict(self, request, context): + """ + Generates text based on the given prompt and sampling parameters. + + Args: + request: The predict request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The predict result. 
+ """ + gen = self._predict(request, context, streaming=False) + res = await gen.__anext__() + return res + + def Embedding(self, request, context): + """ + A gRPC method that calculates embeddings for a given sentence. + + Args: + request: An EmbeddingRequest object that contains the request parameters. + context: A grpc.ServicerContext object that provides information about the RPC. + + Returns: + An EmbeddingResult object that contains the calculated embeddings. + """ + print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) + outputs = self.model.encode(request.Embeddings) + # Check if we have one result at least + if len(outputs) == 0: + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details("No embeddings were calculated.") + return backend_pb2.EmbeddingResult() + return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + + async def PredictStream(self, request, context): + """ + Generates text based on the given prompt and sampling parameters, and streams the results. + + Args: + request: The predict stream request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The predict stream result. + """ + iterations = self._predict(request, context, streaming=True) + try: + async for iteration in iterations: + yield iteration + finally: + await iterations.aclose() + + async def _predict(self, request, context, streaming=False): + # Build the sampling parameters + # NOTE: this must stay in sync with the vllm backend + request_to_sampling_params = { + "N": "n", + "PresencePenalty": "presence_penalty", + "FrequencyPenalty": "frequency_penalty", + "RepetitionPenalty": "repetition_penalty", + "Temperature": "temperature", + "TopP": "top_p", + "TopK": "top_k", + "MinP": "min_p", + "Seed": "seed", + "StopPrompts": "stop", + "StopTokenIds": "stop_token_ids", + "BadWords": "bad_words", + "IncludeStopStrInOutput": "include_stop_str_in_output", + "IgnoreEOS": "ignore_eos", + "Tokens": "max_tokens", + "MinTokens": "min_tokens", + "Logprobs": "logprobs", + "PromptLogprobs": "prompt_logprobs", + "SkipSpecialTokens": "skip_special_tokens", + "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", + "TruncatePromptTokens": "truncate_prompt_tokens", + "GuidedDecoding": "guided_decoding", + } + + sampling_params = SamplingParams(top_p=0.9, max_tokens=200) + + for request_field, param_field in request_to_sampling_params.items(): + if hasattr(request, request_field): + value = getattr(request, request_field) + if value not in (None, 0, [], False): + setattr(sampling_params, param_field, value) + + # Extract image paths and process images + prompt = request.Prompt + + image_paths = request.Images + image_data = [self.load_image(img_path) for img_path in image_paths] + + videos_path = request.Videos + video_data = [self.load_video(video_path) for video_path in videos_path] + + # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template + if not request.Prompt and request.UseTokenizerTemplate and request.Messages: + prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) + + # Generate text using the LLM engine + request_id = random_uuid() + print(f"Generating text with request_id: {request_id}", file=sys.stderr) + multi_modal_data = {} + if image_data: + multi_modal_data["image"] = image_data + if video_data: + multi_modal_data["video"] = video_data + outputs = self.llm.generate( + { + "prompt": prompt, + "multi_modal_data": 
multi_modal_data if multi_modal_data else None, + }, + sampling_params=sampling_params, + request_id=request_id, + ) + + # Stream the results + generated_text = "" + try: + async for request_output in outputs: + iteration_text = request_output.outputs[0].text + + if streaming: + # Remove text already sent as vllm concatenates the text from previous yields + delta_iteration_text = iteration_text.removeprefix(generated_text) + # Send the partial result + yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) + + # Keep track of text generated + generated_text = iteration_text + finally: + await outputs.aclose() + + # If streaming, we already sent everything + if streaming: + return + + # Remove the image files from /tmp folder + for img_path in image_paths: + try: + os.remove(img_path) + except Exception as e: + print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) + + # Sending the final generated text + yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + + def load_image(self, image_path: str): + """ + Load an image from the given file path or base64 encoded data. + + Args: + image_path (str): The path to the image file or base64 encoded data. + + Returns: + Image: The loaded image. + """ + try: + + image_data = base64.b64decode(image_path) + image = Image.open(io.BytesIO(image_data)) + return image + except Exception as e: + print(f"Error loading image {image_path}: {e}", file=sys.stderr) + return None + + def load_video(self, video_path: str): + """ + Load a video from the given file path. + + Args: + video_path (str): The path to the image file. + + Returns: + Video: The loaded video. + """ + try: + timestamp = str(int(time.time() * 1000)) # Generate timestamp + p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename + with open(p, "wb") as f: + f.write(base64.b64decode(video_path)) + video = VideoAsset(name=p).np_ndarrays + os.remove(p) + return video + except Exception as e: + print(f"Error loading video {video_path}: {e}", file=sys.stderr) + return None + +async def serve(address): + # Start asyncio gRPC server + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + # Add the servicer to the server + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + # Bind the server to the address + server.add_insecure_port(address) + + # Gracefully shutdown the server on SIGTERM or SIGINT + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler( + sig, lambda: asyncio.ensure_future(server.stop(5)) + ) + + # Start the server + await server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + # Wait for the server to be terminated + await server.wait_for_termination() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + asyncio.run(serve(args.addr)) diff --git a/backend/python/mlx/install.sh b/backend/python/mlx/install.sh new file mode 100755 index 000000000000..b8ee48552490 --- /dev/null +++ b/backend/python/mlx/install.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +USE_PIP=true + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +installRequirements diff --git a/backend/python/mlx/requirements.txt b/backend/python/mlx/requirements.txt new file mode 100644 index 000000000000..f1771cc4adb4 --- /dev/null +++ b/backend/python/mlx/requirements.txt @@ -0,0 +1,4 @@ +grpcio==1.71.0 +protobuf +certifi +setuptools \ No newline at end of file diff --git a/backend/python/mlx/run.sh b/backend/python/mlx/run.sh new file mode 100755 index 000000000000..fc88f97da712 --- /dev/null +++ b/backend/python/mlx/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx/test.py b/backend/python/mlx/test.py new file mode 100644 index 000000000000..827aa71a3e33 --- /dev/null +++ b/backend/python/mlx/test.py @@ -0,0 +1,146 @@ +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + +import unittest +import subprocess +import time +import grpc +import backend_pb2_grpc +import backend_pb2 + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service. + + This class contains methods to test the startup and shutdown of the gRPC service. 
+ """ + def setUp(self): + self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) + time.sleep(10) + + def tearDown(self) -> None: + self.service.terminate() + self.service.wait() + + def test_server_startup(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.Health(backend_pb2.HealthMessage()) + self.assertEqual(response.message, b'OK') + except Exception as err: + print(err) + self.fail("Server failed to start") + finally: + self.tearDown() + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() + + def test_text(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + req = backend_pb2.PredictOptions(Prompt="The capital of France is") + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + except Exception as err: + print(err) + self.fail("text service failed") + finally: + self.tearDown() + + def test_sampling_params(self): + """ + This method tests if all sampling parameters are correctly processed + NOTE: this does NOT test for correctness, just that we received a compatible response + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + + req = backend_pb2.PredictOptions( + Prompt="The capital of France is", + TopP=0.8, + Tokens=50, + Temperature=0.7, + TopK=40, + PresencePenalty=0.1, + FrequencyPenalty=0.2, + RepetitionPenalty=1.1, + MinP=0.05, + Seed=42, + StopPrompts=["\n"], + StopTokenIds=[50256], + BadWords=["badword"], + IncludeStopStrInOutput=True, + IgnoreEOS=True, + MinTokens=5, + Logprobs=5, + PromptLogprobs=5, + SkipSpecialTokens=True, + SpacesBetweenSpecialTokens=True, + TruncatePromptTokens=10, + GuidedDecoding=True, + N=2, + ) + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + self.assertIsNotNone(resp.logprobs) + except Exception as err: + print(err) + self.fail("sampling params service failed") + finally: + self.tearDown() + + + def test_embedding(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) + self.assertTrue(response.success) + embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") + embedding_response = stub.Embedding(embedding_request) + self.assertIsNotNone(embedding_response.embeddings) + # assert that is a list of floats + self.assertIsInstance(embedding_response.embeddings, list) + # assert that 
the list is not empty + self.assertTrue(len(embedding_response.embeddings) > 0) + except Exception as err: + print(err) + self.fail("Embedding service failed") + finally: + self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx/test.sh b/backend/python/mlx/test.sh new file mode 100755 index 000000000000..f31ae54e47dc --- /dev/null +++ b/backend/python/mlx/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests From d933847f04b94f96ee52a78c8b94e4a3ae2fbce8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:21:35 +0200 Subject: [PATCH 03/14] Make the backend to build and actually work Signed-off-by: Ettore Di Giacinto --- Makefile | 84 ++-- backend/python/mlx/Makefile | 12 +- backend/python/mlx/backend.py | 463 +++++++++--------- backend/python/mlx/install.sh | 1 + backend/python/mlx/requirements-mps.txt | 1 + .../llama-cpp-darwin.sh} | 0 6 files changed, 288 insertions(+), 273 deletions(-) create mode 100644 backend/python/mlx/requirements-mps.txt rename scripts/{build-llama-cpp-darwin.sh => build/llama-cpp-darwin.sh} (100%) diff --git a/Makefile b/Makefile index a050f84f8d7c..9ce4c079e73e 100644 --- a/Makefile +++ b/Makefile @@ -132,43 +132,6 @@ test: test-models/testmodel.ggml protogen-go $(MAKE) test-tts $(MAKE) test-stablediffusion -backends/diffusers: docker-build-diffusers docker-save-diffusers build - ./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)" - -backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build - ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" - -backends/piper: docker-build-piper docker-save-piper build - ./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)" - -backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build - ./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)" - -backends/whisper: docker-build-whisper docker-save-whisper build - ./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)" - -backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build - ./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)" - -backends/local-store: docker-build-local-store docker-save-local-store build - ./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)" - -backends/huggingface: docker-build-huggingface docker-save-huggingface build - ./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)" - -backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build - ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" - -backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build - ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" - -backends/kokoro: docker-build-kokoro docker-save-kokoro build - ./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)" - -backends/llama-cpp-darwin: build - bash ./scripts/build-llama-cpp-darwin.sh - ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" - ######################################################## ## AIO tests ######################################################## @@ -361,6 +324,47 @@ 
docker-image-intel: ## Backends ######################################################## + +backends/diffusers: docker-build-diffusers docker-save-diffusers build + ./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)" + +backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" + +backends/piper: docker-build-piper docker-save-piper build + ./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)" + +backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build + ./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)" + +backends/whisper: docker-build-whisper docker-save-whisper build + ./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)" + +backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build + ./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)" + +backends/local-store: docker-build-local-store docker-save-local-store build + ./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)" + +backends/huggingface: docker-build-huggingface docker-save-huggingface build + ./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)" + +backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build + ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" + +backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" + +backends/kokoro: docker-build-kokoro docker-save-kokoro build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)" + +backends/llama-cpp-darwin: build + bash ./scripts/build/llama-cpp-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" + +backends/nemo: docker-build-nemo docker-save-nemo build + ./local-ai backends install "ocifile://$(abspath ./backend-images/nemo.tar)" + backend-images: mkdir -p backend-images @@ -391,6 +395,12 @@ docker-save-kitten-tts: backend-images docker-build-kokoro: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend +docker-build-nemo: + docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:nemo -f backend/Dockerfile.python --build-arg BACKEND=nemo ./backend + +docker-save-nemo: backend-images + docker save local-ai-backend:nemo -o backend-images/nemo.tar + docker-save-kokoro: backend-images docker save local-ai-backend:kokoro -o backend-images/kokoro.tar diff --git a/backend/python/mlx/Makefile b/backend/python/mlx/Makefile index c4c18bee55b0..06f3bf614854 100644 --- a/backend/python/mlx/Makefile +++ b/backend/python/mlx/Makefile @@ -1,29 +1,23 @@ .PHONY: mlx -mlx: protogen +mlx: bash install.sh .PHONY: run -run: protogen +run: @echo "Running mlx..." bash run.sh @echo "mlx run." .PHONY: test -test: protogen +test: @echo "Testing mlx..." bash test.sh @echo "mlx tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py index 56698a54e5f5..84024b387f29 100644 --- a/backend/python/mlx/backend.py +++ b/backend/python/mlx/backend.py @@ -6,19 +6,16 @@ import sys import os from typing import List -from PIL import Image +import time import backend_pb2 import backend_pb2_grpc import grpc -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.multimodal.utils import fetch_image -from vllm.assets.video import VideoAsset +from mlx_lm import load, generate, stream_generate +from mlx_lm.sample_utils import make_sampler +from mlx_lm.models.cache import make_prompt_cache +import mlx.core as mx import base64 import io @@ -32,38 +29,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): """ A gRPC servicer that implements the Backend service defined in backend.proto. """ - def generate(self,prompt, max_new_tokens): - """ - Generates text based on the given prompt and maximum number of new tokens. - - Args: - prompt (str): The prompt to generate text from. - max_new_tokens (int): The maximum number of new tokens to generate. - - Returns: - str: The generated text. - """ - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text + def _is_float(self, s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text + def _is_int(self, s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False def Health(self, request, context): """ @@ -80,7 +61,7 @@ def Health(self, request, context): async def LoadModel(self, request, context): """ - Loads a language model. + Loads a language model using MLX. Args: request: The load model request. @@ -89,60 +70,70 @@ async def LoadModel(self, request, context): Returns: backend_pb2.Result: The load model result. 
""" - engine_args = AsyncEngineArgs( - model=request.Model, - ) - - if request.Quantization != "": - engine_args.quantization = request.Quantization - if request.LoadFormat != "": - engine_args.load_format = request.LoadFormat - if request.GPUMemoryUtilization != 0: - engine_args.gpu_memory_utilization = request.GPUMemoryUtilization - if request.TrustRemoteCode: - engine_args.trust_remote_code = request.TrustRemoteCode - if request.EnforceEager: - engine_args.enforce_eager = request.EnforceEager - if request.TensorParallelSize: - engine_args.tensor_parallel_size = request.TensorParallelSize - if request.SwapSpace != 0: - engine_args.swap_space = request.SwapSpace - if request.MaxModelLen != 0: - engine_args.max_model_len = request.MaxModelLen - if request.DisableLogStatus: - engine_args.disable_log_status = request.DisableLogStatus - if request.DType != "": - engine_args.dtype = request.DType - if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: - # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs - engine_args.limit_mm_per_prompt = { - "image": max(request.LimitImagePerPrompt, 1), - "video": max(request.LimitVideoPerPrompt, 1), - "audio": max(request.LimitAudioPerPrompt, 1) - } - try: - self.llm = AsyncLLMEngine.from_engine_args(engine_args) + print(f"Loading MLX model: {request.Model}", file=sys.stderr) + print(f"Request: {request}", file=sys.stderr) + + # Parse options like in the diffusers backend + options = request.Options + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We store all the options in a dict for later use + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon to handle values with colons + + # Convert numeric values to appropriate types + if self._is_float(value): + value = float(value) + elif self._is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + + self.options[key] = value + + print(f"Options: {self.options}", file=sys.stderr) + + # Build tokenizer config for MLX using options + tokenizer_config = {} + + # Handle trust_remote_code from request or options + if request.TrustRemoteCode or self.options.get("trust_remote_code", False): + tokenizer_config["trust_remote_code"] = True + + # Handle EOS token from options + if "eos_token" in self.options: + tokenizer_config["eos_token"] = self.options["eos_token"] + + # Handle other tokenizer config options + for key in ["pad_token", "bos_token", "unk_token", "sep_token", "cls_token", "mask_token"]: + if key in self.options: + tokenizer_config[key] = self.options[key] + + # Load model and tokenizer using MLX + if tokenizer_config: + print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr) + self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config) + else: + self.model, self.tokenizer = load(request.Model) + + # Initialize prompt cache for efficient generation + max_kv_size = self.options.get("max_kv_size", None) + self.prompt_cache = make_prompt_cache(self.model, max_kv_size) + except Exception as err: - print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print(f"Error loading MLX model {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Error loading MLX model: {err}") - try: - engine_model_config = await 
self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - print("Model loaded successfully", file=sys.stderr) - return backend_pb2.Result(message="Model loaded successfully", success=True) + print("MLX model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="MLX model loaded successfully", success=True) async def Predict(self, request, context): """ - Generates text based on the given prompt and sampling parameters. + Generates text based on the given prompt and sampling parameters using MLX. Args: request: The predict request. @@ -151,13 +142,42 @@ async def Predict(self, request, context): Returns: backend_pb2.Reply: The predict result. """ - gen = self._predict(request, context, streaming=False) - res = await gen.__anext__() - return res + try: + # Prepare the prompt + prompt = self._prepare_prompt(request) + + # Build generation parameters using request attributes and options + max_tokens, sampler_params = self._build_generation_params(request) + + print(f"Generating text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr) + + # Create sampler with parameters + sampler = make_sampler(**sampler_params) + + # Generate text using MLX with proper parameters + response = generate( + self.model, + self.tokenizer, + prompt=prompt, + max_tokens=max_tokens, + sampler=sampler, + prompt_cache=self.prompt_cache, + verbose=False + ) + + return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX Predict: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Generation failed: {str(e)}") + return backend_pb2.Reply(message=bytes("", encoding='utf-8')) def Embedding(self, request, context): """ A gRPC method that calculates embeddings for a given sentence. + + Note: MLX-LM doesn't support embeddings directly. This method returns an error. Args: request: An EmbeddingRequest object that contains the request parameters. @@ -166,170 +186,159 @@ def Embedding(self, request, context): Returns: An EmbeddingResult object that contains the calculated embeddings. """ - print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) - outputs = self.model.encode(request.Embeddings) - # Check if we have one result at least - if len(outputs) == 0: - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details("No embeddings were calculated.") - return backend_pb2.EmbeddingResult() - return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + print("Embeddings not supported in MLX backend", file=sys.stderr) + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Embeddings are not supported in the MLX backend.") + return backend_pb2.EmbeddingResult() async def PredictStream(self, request, context): """ - Generates text based on the given prompt and sampling parameters, and streams the results. + Generates text based on the given prompt and sampling parameters, and streams the results using MLX. Args: request: The predict stream request. context: The gRPC context. - Returns: - backend_pb2.Result: The predict stream result. + Yields: + backend_pb2.Reply: Streaming predict results. 
""" - iterations = self._predict(request, context, streaming=True) try: - async for iteration in iterations: - yield iteration - finally: - await iterations.aclose() - - async def _predict(self, request, context, streaming=False): - # Build the sampling parameters - # NOTE: this must stay in sync with the vllm backend - request_to_sampling_params = { - "N": "n", - "PresencePenalty": "presence_penalty", - "FrequencyPenalty": "frequency_penalty", - "RepetitionPenalty": "repetition_penalty", - "Temperature": "temperature", - "TopP": "top_p", - "TopK": "top_k", - "MinP": "min_p", - "Seed": "seed", - "StopPrompts": "stop", - "StopTokenIds": "stop_token_ids", - "BadWords": "bad_words", - "IncludeStopStrInOutput": "include_stop_str_in_output", - "IgnoreEOS": "ignore_eos", - "Tokens": "max_tokens", - "MinTokens": "min_tokens", - "Logprobs": "logprobs", - "PromptLogprobs": "prompt_logprobs", - "SkipSpecialTokens": "skip_special_tokens", - "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", - "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", - } - - sampling_params = SamplingParams(top_p=0.9, max_tokens=200) - - for request_field, param_field in request_to_sampling_params.items(): - if hasattr(request, request_field): - value = getattr(request, request_field) - if value not in (None, 0, [], False): - setattr(sampling_params, param_field, value) - - # Extract image paths and process images - prompt = request.Prompt + # Prepare the prompt + prompt = self._prepare_prompt(request) + + # Build generation parameters using request attributes and options + max_tokens, sampler_params = self._build_generation_params(request, default_max_tokens=512) + + print(f"Streaming text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr) + + # Create sampler with parameters + sampler = make_sampler(**sampler_params) + + # Stream text generation using MLX with proper parameters + for response in stream_generate( + self.model, + self.tokenizer, + prompt=prompt, + max_tokens=max_tokens, + sampler=sampler, + prompt_cache=self.prompt_cache, + ): + yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX PredictStream: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Streaming generation failed: {str(e)}") + yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) - image_paths = request.Images - image_data = [self.load_image(img_path) for img_path in image_paths] + def _prepare_prompt(self, request): + """ + Prepare the prompt for MLX generation, handling chat templates if needed. - videos_path = request.Videos - video_data = [self.load_video(video_path) for video_path in videos_path] + Args: + request: The gRPC request containing prompt and message information. + Returns: + str: The prepared prompt. 
+ """ # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) - - # Generate text using the LLM engine - request_id = random_uuid() - print(f"Generating text with request_id: {request_id}", file=sys.stderr) - multi_modal_data = {} - if image_data: - multi_modal_data["image"] = image_data - if video_data: - multi_modal_data["video"] = video_data - outputs = self.llm.generate( - { - "prompt": prompt, - "multi_modal_data": multi_modal_data if multi_modal_data else None, - }, - sampling_params=sampling_params, - request_id=request_id, - ) + # Convert gRPC messages to the format expected by apply_chat_template + messages = [] + for msg in request.Messages: + messages.append({"role": msg.role, "content": msg.content}) + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + return prompt + else: + return request.Prompt - # Stream the results - generated_text = "" - try: - async for request_output in outputs: - iteration_text = request_output.outputs[0].text - - if streaming: - # Remove text already sent as vllm concatenates the text from previous yields - delta_iteration_text = iteration_text.removeprefix(generated_text) - # Send the partial result - yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) - - # Keep track of text generated - generated_text = iteration_text - finally: - await outputs.aclose() - - # If streaming, we already sent everything - if streaming: - return - - # Remove the image files from /tmp folder - for img_path in image_paths: - try: - os.remove(img_path) - except Exception as e: - print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) - - # Sending the final generated text - yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) - - def load_image(self, image_path: str): - """ - Load an image from the given file path or base64 encoded data. - Args: - image_path (str): The path to the image file or base64 encoded data. - Returns: - Image: The loaded image. - """ - try: - image_data = base64.b64decode(image_path) - image = Image.open(io.BytesIO(image_data)) - return image - except Exception as e: - print(f"Error loading image {image_path}: {e}", file=sys.stderr) - return None - def load_video(self, video_path: str): + def _build_generation_params(self, request, default_max_tokens=200): """ - Load a video from the given file path. + Build generation parameters from request attributes and options. Args: - video_path (str): The path to the image file. + request: The gRPC request. + default_max_tokens: Default max_tokens if not specified. Returns: - Video: The loaded video. 
+ tuple: (max_tokens, sampler_params dict) """ + # Extract max_tokens + max_tokens = getattr(request, 'Tokens', default_max_tokens) + if max_tokens == 0: + max_tokens = default_max_tokens + + # Extract sampler parameters from request attributes + temp = getattr(request, 'Temperature', 0.0) + if temp == 0.0: + temp = 0.6 # Default temperature + + top_p = getattr(request, 'TopP', 0.0) + if top_p == 0.0: + top_p = 1.0 # Default top_p + + # Initialize sampler parameters + sampler_params = { + 'temp': temp, + 'top_p': top_p, + 'xtc_threshold': 0.0, + 'xtc_probability': 0.0, + } + + # Add seed if specified + seed = getattr(request, 'Seed', 0) + if seed != 0: + mx.random.seed(seed) + + # Override with options if available + if hasattr(self, 'options'): + # Max tokens from options + if 'max_tokens' in self.options: + max_tokens = self.options['max_tokens'] + + # Sampler parameters from options + sampler_option_mapping = { + 'temp': 'temp', + 'temperature': 'temp', # alias + 'top_p': 'top_p', + 'xtc_threshold': 'xtc_threshold', + 'xtc_probability': 'xtc_probability', + } + + for option_key, param_key in sampler_option_mapping.items(): + if option_key in self.options: + sampler_params[param_key] = self.options[option_key] + + # Handle seed from options + if 'seed' in self.options: + mx.random.seed(self.options['seed']) + + # Special tokens for XTC sampling (if tokenizer has eos_token_ids) + xtc_special_tokens = [] + if hasattr(self.tokenizer, 'eos_token_ids') and self.tokenizer.eos_token_ids: + xtc_special_tokens = list(self.tokenizer.eos_token_ids) + elif hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + xtc_special_tokens = [self.tokenizer.eos_token_id] + + # Add newline token if available try: - timestamp = str(int(time.time() * 1000)) # Generate timestamp - p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename - with open(p, "wb") as f: - f.write(base64.b64decode(video_path)) - video = VideoAsset(name=p).np_ndarrays - os.remove(p) - return video - except Exception as e: - print(f"Error loading video {video_path}: {e}", file=sys.stderr) - return None + newline_tokens = self.tokenizer.encode("\n") + xtc_special_tokens.extend(newline_tokens) + except: + pass # Skip if encoding fails + + sampler_params['xtc_special_tokens'] = xtc_special_tokens + + return max_tokens, sampler_params async def serve(address): # Start asyncio gRPC server diff --git a/backend/python/mlx/install.sh b/backend/python/mlx/install.sh index b8ee48552490..253ee0c13f1b 100755 --- a/backend/python/mlx/install.sh +++ b/backend/python/mlx/install.sh @@ -2,6 +2,7 @@ set -e USE_PIP=true +PYTHON_VERSION="" backend_dir=$(dirname $0) diff --git a/backend/python/mlx/requirements-mps.txt b/backend/python/mlx/requirements-mps.txt new file mode 100644 index 000000000000..22737f5fdda7 --- /dev/null +++ b/backend/python/mlx/requirements-mps.txt @@ -0,0 +1 @@ +mlx-lm \ No newline at end of file diff --git a/scripts/build-llama-cpp-darwin.sh b/scripts/build/llama-cpp-darwin.sh similarity index 100% rename from scripts/build-llama-cpp-darwin.sh rename to scripts/build/llama-cpp-darwin.sh From 8ed272ec0a23a679f1cb6bfae1b86a59c1876aca Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:22:42 +0200 Subject: [PATCH 04/14] List models from system only Signed-off-by: Ettore Di Giacinto --- Makefile | 11 +++-------- core/gallery/gallery.go | 13 +++++++------ core/http/endpoints/localai/welcome.go | 10 +++------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/Makefile 
b/Makefile index 9ce4c079e73e..af95e2374575 100644 --- a/Makefile +++ b/Makefile @@ -362,8 +362,9 @@ backends/llama-cpp-darwin: build bash ./scripts/build/llama-cpp-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" -backends/nemo: docker-build-nemo docker-save-nemo build - ./local-ai backends install "ocifile://$(abspath ./backend-images/nemo.tar)" +backends/mlx: build + BACKEND=mlx BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" backend-images: mkdir -p backend-images @@ -395,12 +396,6 @@ docker-save-kitten-tts: backend-images docker-build-kokoro: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend -docker-build-nemo: - docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:nemo -f backend/Dockerfile.python --build-arg BACKEND=nemo ./backend - -docker-save-nemo: backend-images - docker save local-ai-backend:nemo -o backend-images/nemo.tar - docker-save-kokoro: backend-images docker save local-ai-backend:kokoro -o backend-images/kokoro.tar diff --git a/core/gallery/gallery.go b/core/gallery/gallery.go index a80550102b17..e746f71a347f 100644 --- a/core/gallery/gallery.go +++ b/core/gallery/gallery.go @@ -141,14 +141,15 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) { var backends []*GalleryBackend + systemBackends, err := ListSystemBackends(systemState) + if err != nil { + return nil, err + } + // Get backends from galleries for _, gallery := range galleries { - galleryBackends, err := getGalleryElements[*GalleryBackend](gallery, systemState.Backend.BackendsPath, func(backend *GalleryBackend) bool { - backends, err := ListSystemBackends(systemState) - if err != nil { - return false - } - return backends.Exists(backend.GetName()) + galleryBackends, err := getGalleryElements(gallery, systemState.Backend.BackendsPath, func(backend *GalleryBackend) bool { + return systemBackends.Exists(backend.GetName()) }) if err != nil { return nil, err diff --git a/core/http/endpoints/localai/welcome.go b/core/http/endpoints/localai/welcome.go index 23efd0788dc5..04f72743e34e 100644 --- a/core/http/endpoints/localai/welcome.go +++ b/core/http/endpoints/localai/welcome.go @@ -16,13 +16,9 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig, modelConfigs := cl.GetAllModelsConfigs() galleryConfigs := map[string]*gallery.ModelConfig{} - backends, _ := gallery.AvailableBackends(appConfig.BackendGalleries, appConfig.SystemState) - - installedBackends := gallery.GalleryElements[*gallery.GalleryBackend]{} - for _, b := range backends { - if b.Installed { - installedBackends = append(installedBackends, b) - } + installedBackends, err := gallery.ListSystemBackends(appConfig.SystemState) + if err != nil { + return err } for _, m := range modelConfigs { From 4662a9f65145f3152a7d43d8c5bae24cf69419b7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:23:20 +0200 Subject: [PATCH 05/14] Add script to build darwin python backends Signed-off-by: Ettore Di Giacinto --- scripts/build/python-darwin.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 scripts/build/python-darwin.sh diff --git 
a/scripts/build/python-darwin.sh b/scripts/build/python-darwin.sh new file mode 100644 index 000000000000..6166a2630322 --- /dev/null +++ b/scripts/build/python-darwin.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -ex + +IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-darwin}" +mkdir -p backend-images +make -C backend/python/${BACKEND} + +cp -rfv backend/python/common backend/python/${BACKEND}/ + +PLATFORMARCH="${PLATFORMARCH:-darwin/arm64}" + +./local-ai util create-oci-image \ + backend/python/${BACKEND}/. \ + --output ./backend-images/${BACKEND}.tar \ + --image-name $IMAGE_NAME \ + --platform $PLATFORMARCH + +make -C backend/python/${BACKEND} clean + From 1c86dfd2c8c941a2a4beefc3a65aecbbb5ed0475 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:24:15 +0200 Subject: [PATCH 06/14] Run protogen in libbackend Signed-off-by: Ettore Di Giacinto --- backend/python/bark/Makefile | 12 +--- backend/python/chatterbox/Makefile | 14 ++-- backend/python/common/libbackend.sh | 75 ++++++++++++---------- backend/python/common/template/Makefile | 7 -- backend/python/common/template/protogen.sh | 4 +- backend/python/coqui/Makefile | 12 +--- backend/python/diffusers/Makefile | 12 +--- backend/python/exllama2/Makefile | 10 +-- backend/python/faster-whisper/Makefile | 7 -- backend/python/kitten-tts/Makefile | 12 +--- backend/python/kokoro/Makefile | 6 -- backend/python/rerankers/Makefile | 12 +--- backend/python/rfdetr/Makefile | 7 -- backend/python/rfdetr/protogen.sh | 13 ---- backend/python/transformers/Makefile | 12 +--- backend/python/vllm/Makefile | 12 +--- 16 files changed, 68 insertions(+), 159 deletions(-) delete mode 100644 backend/python/rfdetr/protogen.sh diff --git a/backend/python/bark/Makefile b/backend/python/bark/Makefile index ef4fff1bef9d..da996aabeef0 100644 --- a/backend/python/bark/Makefile +++ b/backend/python/bark/Makefile @@ -1,29 +1,23 @@ .PHONY: ttsbark -ttsbark: protogen +ttsbark: bash install.sh .PHONY: run -run: protogen +run: ttsbark @echo "Running bark..." bash run.sh @echo "bark run." .PHONY: test -test: protogen +test: ttsbark @echo "Testing bark..." bash test.sh @echo "bark tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/chatterbox/Makefile b/backend/python/chatterbox/Makefile index a69c0bcf58ca..be9330f8eac9 100644 --- a/backend/python/chatterbox/Makefile +++ b/backend/python/chatterbox/Makefile @@ -1,29 +1,23 @@ -.PHONY: coqui -coqui: protogen +.PHONY: chatterbox +chatterbox: bash install.sh .PHONY: run -run: protogen +run: chatterbox @echo "Running coqui..." bash run.sh @echo "coqui run." .PHONY: test -test: protogen +test: chatterbox @echo "Testing coqui..." bash test.sh @echo "coqui tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
\ No newline at end of file
diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh
index d98924fe3550..409d20665932 100644
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -21,7 +21,7 @@
 # USE_PIP=true source $(dirname $0)/../common/libbackend.sh
 #
-PYTHON_VERSION="3.10"
+PYTHON_VERSION="${PYTHON_VERSION:-3.10}"
 
 # Default to uv if USE_PIP is not set
 if [ "x${USE_PIP}" == "x" ]; then
@@ -56,11 +56,6 @@ function init() {
     fi
 
     echo "Initializing libbackend for ${BACKEND_NAME}"
-    if [ "x${USE_PIP}" == "xtrue" ]; then
-        echo "Using pip and Python virtual environments"
-    else
-        echo "Using uv package manager"
-    fi
 }
 
 # getBuildProfile will inspect the system to determine which build profile is appropriate:
@@ -70,11 +65,6 @@ function init() {
 # - hipblas
 # - intel
 function getBuildProfile() {
-    if [ "x${BUILD_TYPE}" == "xl4t" ]; then
-        echo "l4t"
-        return 0
-    fi
-
     # First check if we are a cublas build, and if so report the correct build profile
     if [ x"${BUILD_TYPE}" == "xcublas" ]; then
         if [ ! -z ${CUDA_MAJOR_VERSION} ]; then
@@ -94,7 +84,7 @@ function getBuildProfile() {
     fi
 
     # If for any other values of BUILD_TYPE, we don't need any special handling/discovery
-    if [ ! -z ${BUILD_TYPE} ]; then
+    if [ -n "${BUILD_TYPE}" ]; then
         echo ${BUILD_TYPE}
         return 0
     fi
@@ -108,35 +98,48 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ "x${USE_PIP}" == "xtrue" ]; then
-        # Use Python virtual environment with pip
-        if [ ! -d "${EDIR}/venv" ]; then
-            python${PYTHON_VERSION} -m venv ${EDIR}/venv
-            echo "Python virtual environment created"
-        fi
-
-        # Source if we are not already in a Virtual env
-        if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-            source ${EDIR}/venv/bin/activate
-            echo "Python virtual environment activated"
-        fi
-    else
-        # Use uv (conda-like)
-        if [ ! -d "${EDIR}/venv" ]; then
-            uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
-            echo "uv virtual environment created"
-        fi
-
-        # Source if we are not already in a Virtual env
-        if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-            source ${EDIR}/venv/bin/activate
-            echo "uv virtual environment activated"
+    if [ ! -d "${EDIR}/venv" ]; then
+        if [ "x${USE_PIP}" == "xtrue" ]; then
+            echo "Using pip and Python virtual environments"
+
+            # Use Python virtual environment with pip
+            interpreter="python3"
+            # Prefer python${PYTHON_VERSION} when it is available, otherwise fall back to python3
+
+            if command -v python${PYTHON_VERSION} &> /dev/null; then
+                interpreter="python${PYTHON_VERSION}"
+            fi
+            echo "Using interpreter: ${interpreter}"
+            ${interpreter} -m venv ${EDIR}/venv
+            echo "Python virtual environment created"
+        else
+            echo "Using uv package manager"
+            uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+            echo "uv virtual environment created"
         fi
     fi
+    # Source if we are not already in a Virtual env
+    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
+        source ${EDIR}/venv/bin/activate
+        echo "Python virtual environment activated"
+    fi
     echo "activated virtual environment has been ensured"
 }
 
+function runProtogen() {
+    ensureVenv
+
+    if [ "x${USE_PIP}" == "xtrue" ]; then
+        pip install grpcio-tools
+    else
+        uv pip install grpcio-tools
+    fi
+    pushd ${EDIR}
+    python3 -m grpc_tools.protoc -I../../ -I./ --python_out=. --grpc_python_out=. 
backend.proto + popd +} + # installRequirements looks for several requirements files and if they exist runs the install for them in order # # - requirements-install.txt @@ -196,6 +199,8 @@ function installRequirements() { echo "finished requirements install for ${reqFile}" fi done + + runProtogen } # startBackend discovers and runs the backend GRPC server diff --git a/backend/python/common/template/Makefile b/backend/python/common/template/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/common/template/Makefile +++ b/backend/python/common/template/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/common/template/protogen.sh b/backend/python/common/template/protogen.sh index 0569b6c6e4b3..cba7791cbce3 100644 --- a/backend/python/common/template/protogen.sh +++ b/backend/python/common/template/protogen.sh @@ -8,6 +8,4 @@ else source $backend_dir/../common/libbackend.sh fi -ensureVenv - -python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto \ No newline at end of file +runProtogen \ No newline at end of file diff --git a/backend/python/coqui/Makefile b/backend/python/coqui/Makefile index a69c0bcf58ca..6915b0f9f896 100644 --- a/backend/python/coqui/Makefile +++ b/backend/python/coqui/Makefile @@ -1,29 +1,23 @@ .PHONY: coqui -coqui: protogen +coqui: bash install.sh .PHONY: run -run: protogen +run: coqui @echo "Running coqui..." bash run.sh @echo "coqui run." .PHONY: test -test: protogen +test: coqui @echo "Testing coqui..." bash test.sh @echo "coqui tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/diffusers/Makefile b/backend/python/diffusers/Makefile index 01156f876f00..f9ded4a1cff7 100644 --- a/backend/python/diffusers/Makefile +++ b/backend/python/diffusers/Makefile @@ -12,28 +12,22 @@ export SKIP_CONDA=1 endif .PHONY: diffusers -diffusers: protogen +diffusers: bash install.sh .PHONY: run -run: protogen +run: diffusers @echo "Running diffusers..." bash run.sh @echo "Diffusers run." -test: protogen +test: diffusers bash test.sh -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/exllama2/Makefile b/backend/python/exllama2/Makefile index 68a18f3aa855..15ba38d120f3 100644 --- a/backend/python/exllama2/Makefile +++ b/backend/python/exllama2/Makefile @@ -1,23 +1,17 @@ .PHONY: exllama2 -exllama2: protogen +exllama2: bash install.sh .PHONY: run -run: protogen +run: exllama2 @echo "Running exllama2..." bash run.sh @echo "exllama2 run." 
-.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/faster-whisper/Makefile b/backend/python/faster-whisper/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/faster-whisper/Makefile +++ b/backend/python/faster-whisper/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/kitten-tts/Makefile b/backend/python/kitten-tts/Makefile index f05fc191698f..021a9679bfd2 100644 --- a/backend/python/kitten-tts/Makefile +++ b/backend/python/kitten-tts/Makefile @@ -1,29 +1,23 @@ .PHONY: kitten-tts -kitten-tts: protogen +kitten-tts: bash install.sh .PHONY: run -run: protogen +run: kitten-tts @echo "Running kitten-tts..." bash run.sh @echo "kitten-tts run." .PHONY: test -test: protogen +test: kitten-tts @echo "Testing kitten-tts..." bash test.sh @echo "kitten-tts tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/kokoro/Makefile b/backend/python/kokoro/Makefile index 660aabc34cec..29fc84b53159 100644 --- a/backend/python/kokoro/Makefile +++ b/backend/python/kokoro/Makefile @@ -14,16 +14,10 @@ test: protogen bash test.sh @echo "kokoro tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rerankers/Makefile b/backend/python/rerankers/Makefile index 82de822ff326..c9a1d30104b4 100644 --- a/backend/python/rerankers/Makefile +++ b/backend/python/rerankers/Makefile @@ -1,30 +1,24 @@ .PHONY: rerankers -rerankers: protogen +rerankers: bash install.sh .PHONY: run -run: protogen +run: rerankers @echo "Running rerankers..." bash run.sh @echo "rerankers run." # It is not working well by using command line. It only6 works with IDE like VSCode. .PHONY: test -test: protogen +test: rerankers @echo "Testing rerankers..." bash test.sh @echo "rerankers tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rfdetr/Makefile b/backend/python/rfdetr/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/rfdetr/Makefile +++ b/backend/python/rfdetr/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rfdetr/protogen.sh b/backend/python/rfdetr/protogen.sh deleted file mode 100644 index 0569b6c6e4b3..000000000000 --- a/backend/python/rfdetr/protogen.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -backend_dir=$(dirname $0) -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -ensureVenv - -python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto \ No newline at end of file diff --git a/backend/python/transformers/Makefile b/backend/python/transformers/Makefile index 8d3f7fd73c9d..6897baf0c9b4 100644 --- a/backend/python/transformers/Makefile +++ b/backend/python/transformers/Makefile @@ -1,30 +1,24 @@ .PHONY: transformers -transformers: protogen +transformers: bash install.sh .PHONY: run -run: protogen +run: transformers @echo "Running transformers..." bash run.sh @echo "transformers run." # It is not working well by using command line. It only6 works with IDE like VSCode. .PHONY: test -test: protogen +test: transformers @echo "Testing transformers..." bash test.sh @echo "transformers tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/vllm/Makefile b/backend/python/vllm/Makefile index bb57a19266bd..c7c1b6869c02 100644 --- a/backend/python/vllm/Makefile +++ b/backend/python/vllm/Makefile @@ -1,29 +1,23 @@ .PHONY: vllm -vllm: protogen +vllm: bash install.sh .PHONY: run -run: protogen +run: vllm @echo "Running vllm..." bash run.sh @echo "vllm run." .PHONY: test -test: protogen +test: vllm @echo "Testing vllm..." bash test.sh @echo "vllm tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file From 1fbe95f1d70c2fb9d4fe8cdadbf392824d756fb8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:24:40 +0200 Subject: [PATCH 07/14] Detect if mps is available across python backends Signed-off-by: Ettore Di Giacinto --- backend/python/chatterbox/backend.py | 4 +++- backend/python/coqui/backend.py | 4 +++- backend/python/diffusers/backend.py | 3 +++ backend/python/faster-whisper/backend.py | 6 ++++-- backend/python/kitten-tts/backend.py | 12 ------------ backend/python/kokoro/backend.py | 11 ----------- backend/python/transformers/backend.py | 4 +++- 7 files changed, 16 insertions(+), 28 deletions(-) diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py index d7d241c302ac..0944202b9457 100644 --- a/backend/python/chatterbox/backend.py +++ b/backend/python/chatterbox/backend.py @@ -41,7 +41,9 @@ def LoadModel(self, request, context): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") diff --git a/backend/python/coqui/backend.py b/backend/python/coqui/backend.py index f940f8e0a403..df115adb5030 100644 --- a/backend/python/coqui/backend.py +++ b/backend/python/coqui/backend.py @@ -40,7 +40,9 @@ def LoadModel(self, request, context): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") diff --git a/backend/python/diffusers/backend.py b/backend/python/diffusers/backend.py index 185838209895..ef5f1b5c07ce 100755 --- a/backend/python/diffusers/backend.py +++ b/backend/python/diffusers/backend.py @@ -368,6 +368,9 @@ def LoadModel(self, request, context): device = "cpu" if not request.CUDA else "cuda" if XPU: device = "xpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" self.device = device if request.LoraAdapter: # Check if its a local file and not a directory ( we load lora differently for a safetensor file ) diff --git a/backend/python/faster-whisper/backend.py b/backend/python/faster-whisper/backend.py index b73664ab88f7..808f29238207 100755 --- a/backend/python/faster-whisper/backend.py +++ b/backend/python/faster-whisper/backend.py @@ -10,7 +10,7 @@ import os import backend_pb2 import backend_pb2_grpc - +import torch from faster_whisper import WhisperModel import grpc @@ -35,7 +35,9 @@ def LoadModel(self, request, context): # device = "cuda" if request.CUDA else "cpu" if request.CUDA: device = "cuda" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" try: print("Preparing models, please wait", file=sys.stderr) self.model = WhisperModel(request.Model, device=device, compute_type="float16") diff --git a/backend/python/kitten-tts/backend.py b/backend/python/kitten-tts/backend.py index 775f85f57d0b..b31023c8cac6 100644 --- a/backend/python/kitten-tts/backend.py +++ b/backend/python/kitten-tts/backend.py @@ -33,18 +33,6 @@ def Health(self, request, context): 
return backend_pb2.Reply(message=bytes("OK", 'utf-8')) def LoadModel(self, request, context): - # Get device - # device = "cuda" if request.CUDA else "cpu" - if torch.cuda.is_available(): - print("CUDA is available", file=sys.stderr) - device = "cuda" - else: - print("CUDA is not available", file=sys.stderr) - device = "cpu" - - if not torch.cuda.is_available() and request.CUDA: - return backend_pb2.Result(success=False, message="CUDA is not available") - self.AudioPath = None # List available KittenTTS models print("Available KittenTTS voices: expr-voice-2-m, expr-voice-2-f, expr-voice-3-m, expr-voice-3-f, expr-voice-4-m, expr-voice-4-f, expr-voice-5-m, expr-voice-5-f") diff --git a/backend/python/kokoro/backend.py b/backend/python/kokoro/backend.py index 83a3f3326fbd..32aefa558e8a 100644 --- a/backend/python/kokoro/backend.py +++ b/backend/python/kokoro/backend.py @@ -33,17 +33,6 @@ def Health(self, request, context): return backend_pb2.Reply(message=bytes("OK", 'utf-8')) def LoadModel(self, request, context): - # Get device - if torch.cuda.is_available(): - print("CUDA is available", file=sys.stderr) - device = "cuda" - else: - print("CUDA is not available", file=sys.stderr) - device = "cpu" - - if not torch.cuda.is_available() and request.CUDA: - return backend_pb2.Result(success=False, message="CUDA is not available") - try: print("Preparing Kokoro TTS pipeline, please wait", file=sys.stderr) # empty dict diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py index ef8a2fd40b6e..05713b917d2a 100644 --- a/backend/python/transformers/backend.py +++ b/backend/python/transformers/backend.py @@ -94,7 +94,9 @@ def LoadModel(self, request, context): self.SentenceTransformer = False device_map="cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device_map = "mps" quantization = None autoTokenizer = True From cfaecbaaf582f23996d3c4178217960fb1e41fe2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:24:53 +0200 Subject: [PATCH 08/14] CI: try to build backend Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 86 +++++++++++++++++++++++++++++++++++ backend/index.yaml | 20 ++++++++ 2 files changed, 106 insertions(+) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 3ff701d76864..704c90322575 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -1119,3 +1119,89 @@ jobs: for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do crane push llama-cpp.tar $tag done + mlx-darwin: + runs-on: macOS-14 + strategy: + matrix: + go-version: ['1.24.x'] + steps: + - name: Clone + uses: actions/checkout@v5 + with: + submodules: true + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + cache: false + # You can test your matrix by printing the current Go version + - name: Display Go version + run: go version + - name: Dependencies + run: | + brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm + - name: Build mlx-darwin + run: | + make protogen-go + make backends/mlx + - name: Upload mlx.tar + uses: actions/upload-artifact@v4 + with: + name: mlx-tar + path: backend-images/mlx.tar + mlx-darwin-publish: + needs: mlx-darwin + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Download mlx.tar + uses: actions/download-artifact@v5 + with: + name: mlx-tar + path: . 
+ - name: Install crane + run: | + curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz + sudo mv crane /usr/local/bin/ + - name: Log in to DockerHub + run: | + echo "${{ secrets.DOCKERHUB_PASSWORD }}" | crane auth login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin + - name: Log in to quay.io + run: | + echo "${{ secrets.LOCALAI_REGISTRY_PASSWORD }}" | crane auth login quay.io -u "${{ secrets.LOCALAI_REGISTRY_USERNAME }}" --password-stdin + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + localai/localai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=-metal-darwin-arm64-mlx,onlatest=true + - name: Docker meta + id: quaymeta + uses: docker/metadata-action@v5 + with: + images: | + quay.io/go-skynet/local-ai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=-metal-darwin-arm64-mlx,onlatest=true + - name: Push Docker image (DockerHub) + run: | + for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do + crane push mlx.tar $tag + done + - name: Push Docker image (Quay) + run: | + for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do + crane push mlx.tar $tag + done \ No newline at end of file diff --git a/backend/index.yaml b/backend/index.yaml index 8bedccb67d42..3fed08f275d4 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -127,6 +127,21 @@ nvidia: "cuda12-vllm" amd: "rocm-vllm" intel: "intel-vllm" +- &mlx + name: "mlx" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx" + icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 + urls: + - https://github.com/ml-explore/mlx-lm + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-mlx + license: MIT + description: | + Run LLMs with MLX + tags: + - text-to-text + - LLM + - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -371,6 +386,11 @@ - text-to-speech - TTS license: apache-2.0 +- !!merge <<: *mlx + name: "mlx-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-mlx - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" From d9d0439e0a79828292f398c5cb2bc2cf317b6cc8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:29:34 +0200 Subject: [PATCH 09/14] Debug CI Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 704c90322575..f2f102082e3c 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -7,6 +7,7 @@ on: - master tags: - '*' + pull_request: concurrency: group: ci-backends-${{ github.head_ref || github.ref }}-${{ github.repository }} From 0a8975d9319d8815fa11b6e5688ac99c8d7e1c91 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:39:46 +0200 Subject: [PATCH 10/14] Fixups Signed-off-by: Ettore Di Giacinto --- backend/python/kokoro/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/python/kokoro/Makefile b/backend/python/kokoro/Makefile index 29fc84b53159..7e1b238228b1 100644 --- a/backend/python/kokoro/Makefile +++ b/backend/python/kokoro/Makefile @@ -1,15 +1,15 @@ 
.PHONY: kokoro -kokoro: protogen +kokoro: bash install.sh .PHONY: run -run: protogen +run: kokoro @echo "Running kokoro..." bash run.sh @echo "kokoro run." .PHONY: test -test: protogen +test: kokoro @echo "Testing kokoro..." bash test.sh @echo "kokoro tested." From ab9ab20d53c3600ffb6daa87413a6f35612e96a4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 23:44:56 +0200 Subject: [PATCH 11/14] Fixups Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 99 +--- .github/workflows/backend_build_darwin.yml | 136 ++++++ Makefile | 5 +- backend/python/common/libbackend.sh | 2 + backend/python/mlx-vlm/Makefile | 18 +- backend/python/mlx-vlm/backend.py | 544 +++++++++++++-------- 6 files changed, 488 insertions(+), 316 deletions(-) create mode 100644 .github/workflows/backend_build_darwin.yml diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index f2f102082e3c..eac2e2e83635 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -946,6 +946,19 @@ jobs: backend: "kitten-tts" dockerfile: "./backend/Dockerfile.python" context: "./backend" + mlx-darwin: + uses: ./.github/workflows/backend_build_darwin.yml + with: + backend: "mlx" + build-type: "mps" + go-version: "1.24.x" + tag-suffix: "-metal-darwin-arm64-mlx" + runs-on: "macOS-14" + secrets: + dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }} + dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }} + quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }} + quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }} llama-cpp-darwin: runs-on: macOS-14 strategy: @@ -1119,90 +1132,4 @@ jobs: run: | for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do crane push llama-cpp.tar $tag - done - mlx-darwin: - runs-on: macOS-14 - strategy: - matrix: - go-version: ['1.24.x'] - steps: - - name: Clone - uses: actions/checkout@v5 - with: - submodules: true - - name: Setup Go ${{ matrix.go-version }} - uses: actions/setup-go@v5 - with: - go-version: ${{ matrix.go-version }} - cache: false - # You can test your matrix by printing the current Go version - - name: Display Go version - run: go version - - name: Dependencies - run: | - brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm - - name: Build mlx-darwin - run: | - make protogen-go - make backends/mlx - - name: Upload mlx.tar - uses: actions/upload-artifact@v4 - with: - name: mlx-tar - path: backend-images/mlx.tar - mlx-darwin-publish: - needs: mlx-darwin - if: github.event_name != 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Download mlx.tar - uses: actions/download-artifact@v5 - with: - name: mlx-tar - path: . 
- - name: Install crane - run: | - curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz - sudo mv crane /usr/local/bin/ - - name: Log in to DockerHub - run: | - echo "${{ secrets.DOCKERHUB_PASSWORD }}" | crane auth login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin - - name: Log in to quay.io - run: | - echo "${{ secrets.LOCALAI_REGISTRY_PASSWORD }}" | crane auth login quay.io -u "${{ secrets.LOCALAI_REGISTRY_USERNAME }}" --password-stdin - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: | - localai/localai-backends - tags: | - type=ref,event=branch - type=semver,pattern={{raw}} - type=sha - flavor: | - latest=auto - suffix=-metal-darwin-arm64-mlx,onlatest=true - - name: Docker meta - id: quaymeta - uses: docker/metadata-action@v5 - with: - images: | - quay.io/go-skynet/local-ai-backends - tags: | - type=ref,event=branch - type=semver,pattern={{raw}} - type=sha - flavor: | - latest=auto - suffix=-metal-darwin-arm64-mlx,onlatest=true - - name: Push Docker image (DockerHub) - run: | - for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do - crane push mlx.tar $tag - done - - name: Push Docker image (Quay) - run: | - for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do - crane push mlx.tar $tag done \ No newline at end of file diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml new file mode 100644 index 000000000000..e6a2b4d388e4 --- /dev/null +++ b/.github/workflows/backend_build_darwin.yml @@ -0,0 +1,136 @@ +--- +name: 'build darwin python backend container images (reusable)' + +on: + workflow_call: + inputs: + backend: + description: 'Backend to build' + required: true + type: string + build-type: + description: 'Build type (e.g., mps)' + default: '' + type: string + go-version: + description: 'Go version to use' + default: '1.24.x' + type: string + tag-suffix: + description: 'Tag suffix for the built image' + required: true + type: string + runs-on: + description: 'Runner to use' + default: 'macOS-14' + type: string + secrets: + dockerUsername: + required: false + dockerPassword: + required: false + quayUsername: + required: true + quayPassword: + required: true + +jobs: + darwin-backend-build: + runs-on: ${{ inputs.runs-on }} + strategy: + matrix: + go-version: ['${{ inputs.go-version }}'] + steps: + - name: Clone + uses: actions/checkout@v5 + with: + submodules: true + + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + cache: false + + # You can test your matrix by printing the current Go version + - name: Display Go version + run: go version + + - name: Dependencies + run: | + brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm + + - name: Build ${{ inputs.backend }}-darwin + run: | + make protogen-go + BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} make build-darwin-python-backend + + - name: Upload ${{ inputs.backend }}.tar + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.backend }}-tar + path: backend-images/${{ inputs.backend }}.tar + + darwin-backend-publish: + needs: darwin-backend-build + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Download ${{ inputs.backend }}.tar + uses: actions/download-artifact@v5 + with: + name: ${{ inputs.backend }}-tar + path: . 
+ + - name: Install crane + run: | + curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz + sudo mv crane /usr/local/bin/ + + - name: Log in to DockerHub + run: | + echo "${{ secrets.dockerPassword }}" | crane auth login docker.io -u "${{ secrets.dockerUsername }}" --password-stdin + + - name: Log in to quay.io + run: | + echo "${{ secrets.quayPassword }}" | crane auth login quay.io -u "${{ secrets.quayUsername }}" --password-stdin + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + localai/localai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=${{ inputs.tag-suffix }},onlatest=true + + - name: Docker meta + id: quaymeta + uses: docker/metadata-action@v5 + with: + images: | + quay.io/go-skynet/local-ai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=${{ inputs.tag-suffix }},onlatest=true + + - name: Push Docker image (DockerHub) + run: | + for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do + crane push ${{ inputs.backend }}.tar $tag + done + + - name: Push Docker image (Quay) + run: | + for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do + crane push ${{ inputs.backend }}.tar $tag + done diff --git a/Makefile b/Makefile index af95e2374575..5be5bca1ca01 100644 --- a/Makefile +++ b/Makefile @@ -362,8 +362,11 @@ backends/llama-cpp-darwin: build bash ./scripts/build/llama-cpp-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" +build-darwin-python-backend: + bash ./scripts/build/python-darwin.sh + backends/mlx: build - BACKEND=mlx BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh + BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" backend-images: diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 409d20665932..79430ad2d4f1 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -111,6 +111,8 @@ function ensureVenv() { fi echo "Using interpreter: ${interpreter}" ${interpreter} -m venv ${EDIR}/venv + source ${EDIR}/venv/bin/activate + ${interpreter} -m pip install --upgrade pip echo "Python virtual environment created" else echo "Using uv package manager" diff --git a/backend/python/mlx-vlm/Makefile b/backend/python/mlx-vlm/Makefile index c4c18bee55b0..804031aa970d 100644 --- a/backend/python/mlx-vlm/Makefile +++ b/backend/python/mlx-vlm/Makefile @@ -1,29 +1,23 @@ -.PHONY: mlx -mlx: protogen +.PHONY: mlx-vlm +mlx-vlm: bash install.sh .PHONY: run -run: protogen - @echo "Running mlx..." +run: mlx-vlm + @echo "Running mlx-vlm..." bash run.sh @echo "mlx run." .PHONY: test -test: protogen - @echo "Testing mlx..." +test: mlx-vlm + @echo "Testing mlx-vlm..." bash test.sh @echo "mlx tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py index 56698a54e5f5..02730c814965 100644 --- a/backend/python/mlx-vlm/backend.py +++ b/backend/python/mlx-vlm/backend.py @@ -6,21 +6,20 @@ import sys import os from typing import List -from PIL import Image +import time import backend_pb2 import backend_pb2_grpc import grpc -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.multimodal.utils import fetch_image -from vllm.assets.video import VideoAsset +from mlx_vlm import load, generate, stream_generate +from mlx_vlm.prompt_utils import apply_chat_template +from mlx_vlm.utils import load_config, load_image +import mlx.core as mx import base64 import io +from PIL import Image +import tempfile _ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -32,38 +31,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): """ A gRPC servicer that implements the Backend service defined in backend.proto. """ - def generate(self,prompt, max_new_tokens): - """ - Generates text based on the given prompt and maximum number of new tokens. - - Args: - prompt (str): The prompt to generate text from. - max_new_tokens (int): The maximum number of new tokens to generate. - - Returns: - str: The generated text. - """ - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text + def _is_float(self, s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text + def _is_int(self, s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False def Health(self, request, context): """ @@ -80,7 +63,7 @@ def Health(self, request, context): async def LoadModel(self, request, context): """ - Loads a language model. + Loads a multimodal vision-language model using MLX-VLM. Args: request: The load model request. @@ -89,60 +72,50 @@ async def LoadModel(self, request, context): Returns: backend_pb2.Result: The load model result. 
""" - engine_args = AsyncEngineArgs( - model=request.Model, - ) - - if request.Quantization != "": - engine_args.quantization = request.Quantization - if request.LoadFormat != "": - engine_args.load_format = request.LoadFormat - if request.GPUMemoryUtilization != 0: - engine_args.gpu_memory_utilization = request.GPUMemoryUtilization - if request.TrustRemoteCode: - engine_args.trust_remote_code = request.TrustRemoteCode - if request.EnforceEager: - engine_args.enforce_eager = request.EnforceEager - if request.TensorParallelSize: - engine_args.tensor_parallel_size = request.TensorParallelSize - if request.SwapSpace != 0: - engine_args.swap_space = request.SwapSpace - if request.MaxModelLen != 0: - engine_args.max_model_len = request.MaxModelLen - if request.DisableLogStatus: - engine_args.disable_log_status = request.DisableLogStatus - if request.DType != "": - engine_args.dtype = request.DType - if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: - # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs - engine_args.limit_mm_per_prompt = { - "image": max(request.LimitImagePerPrompt, 1), - "video": max(request.LimitVideoPerPrompt, 1), - "audio": max(request.LimitAudioPerPrompt, 1) - } - try: - self.llm = AsyncLLMEngine.from_engine_args(engine_args) + print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr) + print(f"Request: {request}", file=sys.stderr) + + # Parse options like in the diffusers backend + options = request.Options + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We store all the options in a dict for later use + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon to handle values with colons + + # Convert numeric values to appropriate types + if self._is_float(value): + value = float(value) + elif self._is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + + self.options[key] = value + + print(f"Options: {self.options}", file=sys.stderr) + + # Load model and processor using MLX-VLM + # mlx-vlm load function returns (model, processor) instead of (model, tokenizer) + self.model, self.processor = load(request.Model) + + # Load model config for chat template support + self.config = load_config(request.Model) + except Exception as err: - print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print(f"Error loading MLX-VLM model {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Error loading MLX-VLM model: {err}") - try: - engine_model_config = await self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - print("Model loaded successfully", file=sys.stderr) - return backend_pb2.Result(message="Model loaded successfully", success=True) + print("MLX-VLM model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="MLX-VLM model loaded successfully", success=True) async def Predict(self, request, context): """ - Generates text based on the given prompt and sampling parameters. 
+ Generates text based on the given prompt and sampling parameters using MLX-VLM with multimodal support. Args: request: The predict request. @@ -151,13 +124,66 @@ async def Predict(self, request, context): Returns: backend_pb2.Reply: The predict result. """ - gen = self._predict(request, context, streaming=False) - res = await gen.__anext__() - return res + temp_files = [] + try: + # Process images and audios from request + image_paths = [] + audio_paths = [] + + # Process images + if request.Images: + for img_data in request.Images: + img_path = self.load_image_from_base64(img_data) + if img_path: + image_paths.append(img_path) + temp_files.append(img_path) + + # Process audios + if request.Audios: + for audio_data in request.Audios: + audio_path = self.load_audio_from_base64(audio_data) + if audio_path: + audio_paths.append(audio_path) + temp_files.append(audio_path) + + # Prepare the prompt with multimodal information + prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) + + # Build generation parameters using request attributes and options + max_tokens, generation_params = self._build_generation_params(request) + + print(f"Generating text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) + print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) + + # Generate text using MLX-VLM with multimodal inputs + response = generate( + model=self.model, + processor=self.processor, + prompt=prompt, + image=image_paths if image_paths else None, + audio=audio_paths if audio_paths else None, + max_tokens=max_tokens, + temperature=generation_params.get('temp', 0.6), + top_p=generation_params.get('top_p', 1.0), + verbose=False + ) + + return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX-VLM Predict: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Generation failed: {str(e)}") + return backend_pb2.Reply(message=bytes("", encoding='utf-8')) + finally: + # Clean up temporary files + self.cleanup_temp_files(temp_files) def Embedding(self, request, context): """ A gRPC method that calculates embeddings for a given sentence. + + Note: MLX-VLM doesn't support embeddings directly. This method returns an error. Args: request: An EmbeddingRequest object that contains the request parameters. @@ -166,171 +192,255 @@ def Embedding(self, request, context): Returns: An EmbeddingResult object that contains the calculated embeddings. """ - print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) - outputs = self.model.encode(request.Embeddings) - # Check if we have one result at least - if len(outputs) == 0: - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details("No embeddings were calculated.") - return backend_pb2.EmbeddingResult() - return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + print("Embeddings not supported in MLX-VLM backend", file=sys.stderr) + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Embeddings are not supported in the MLX-VLM backend.") + return backend_pb2.EmbeddingResult() async def PredictStream(self, request, context): """ - Generates text based on the given prompt and sampling parameters, and streams the results. + Generates text based on the given prompt and sampling parameters, and streams the results using MLX-VLM with multimodal support. Args: request: The predict stream request. 
context: The gRPC context. - Returns: - backend_pb2.Result: The predict stream result. + Yields: + backend_pb2.Reply: Streaming predict results. """ - iterations = self._predict(request, context, streaming=True) + temp_files = [] try: - async for iteration in iterations: - yield iteration + # Process images and audios from request + image_paths = [] + audio_paths = [] + + # Process images + if request.Images: + for img_data in request.Images: + img_path = self.load_image_from_base64(img_data) + if img_path: + image_paths.append(img_path) + temp_files.append(img_path) + + # Process audios + if request.Audios: + for audio_data in request.Audios: + audio_path = self.load_audio_from_base64(audio_data) + if audio_path: + audio_paths.append(audio_path) + temp_files.append(audio_path) + + # Prepare the prompt with multimodal information + prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) + + # Build generation parameters using request attributes and options + max_tokens, generation_params = self._build_generation_params(request, default_max_tokens=512) + + print(f"Streaming text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) + print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) + + # Stream text generation using MLX-VLM with multimodal inputs + for response in stream_generate( + model=self.model, + processor=self.processor, + prompt=prompt, + image=image_paths if image_paths else None, + audio=audio_paths if audio_paths else None, + max_tokens=max_tokens, + temperature=generation_params.get('temp', 0.6), + top_p=generation_params.get('top_p', 1.0), + ): + yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX-VLM PredictStream: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Streaming generation failed: {str(e)}") + yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) finally: - await iterations.aclose() - - async def _predict(self, request, context, streaming=False): - # Build the sampling parameters - # NOTE: this must stay in sync with the vllm backend - request_to_sampling_params = { - "N": "n", - "PresencePenalty": "presence_penalty", - "FrequencyPenalty": "frequency_penalty", - "RepetitionPenalty": "repetition_penalty", - "Temperature": "temperature", - "TopP": "top_p", - "TopK": "top_k", - "MinP": "min_p", - "Seed": "seed", - "StopPrompts": "stop", - "StopTokenIds": "stop_token_ids", - "BadWords": "bad_words", - "IncludeStopStrInOutput": "include_stop_str_in_output", - "IgnoreEOS": "ignore_eos", - "Tokens": "max_tokens", - "MinTokens": "min_tokens", - "Logprobs": "logprobs", - "PromptLogprobs": "prompt_logprobs", - "SkipSpecialTokens": "skip_special_tokens", - "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", - "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", - } - - sampling_params = SamplingParams(top_p=0.9, max_tokens=200) + # Clean up temporary files + self.cleanup_temp_files(temp_files) - for request_field, param_field in request_to_sampling_params.items(): - if hasattr(request, request_field): - value = getattr(request, request_field) - if value not in (None, 0, [], False): - setattr(sampling_params, param_field, value) - - # Extract image paths and process images - prompt = request.Prompt - - image_paths = request.Images - image_data = [self.load_image(img_path) for img_path in image_paths] + def 
_prepare_prompt(self, request, num_images=0, num_audios=0): + """ + Prepare the prompt for MLX-VLM generation, handling chat templates and multimodal inputs. - videos_path = request.Videos - video_data = [self.load_video(video_path) for video_path in videos_path] + Args: + request: The gRPC request containing prompt and message information. + num_images: Number of images in the request. + num_audios: Number of audio files in the request. + Returns: + str: The prepared prompt. + """ # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) - - # Generate text using the LLM engine - request_id = random_uuid() - print(f"Generating text with request_id: {request_id}", file=sys.stderr) - multi_modal_data = {} - if image_data: - multi_modal_data["image"] = image_data - if video_data: - multi_modal_data["video"] = video_data - outputs = self.llm.generate( - { - "prompt": prompt, - "multi_modal_data": multi_modal_data if multi_modal_data else None, - }, - sampling_params=sampling_params, - request_id=request_id, - ) - - # Stream the results - generated_text = "" - try: - async for request_output in outputs: - iteration_text = request_output.outputs[0].text - - if streaming: - # Remove text already sent as vllm concatenates the text from previous yields - delta_iteration_text = iteration_text.removeprefix(generated_text) - # Send the partial result - yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) - - # Keep track of text generated - generated_text = iteration_text - finally: - await outputs.aclose() - - # If streaming, we already sent everything - if streaming: - return - - # Remove the image files from /tmp folder - for img_path in image_paths: - try: - os.remove(img_path) - except Exception as e: - print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) + # Convert gRPC messages to the format expected by apply_chat_template + messages = [] + for msg in request.Messages: + messages.append({"role": msg.role, "content": msg.content}) + + # Use mlx-vlm's apply_chat_template which handles multimodal inputs + prompt = apply_chat_template( + self.processor, + self.config, + messages, + num_images=num_images, + num_audios=num_audios + ) + return prompt + elif request.Prompt: + # If we have a direct prompt but also have images/audio, we need to format it properly + if num_images > 0 or num_audios > 0: + # Create a simple message structure for multimodal prompt + messages = [{"role": "user", "content": request.Prompt}] + prompt = apply_chat_template( + self.processor, + self.config, + messages, + num_images=num_images, + num_audios=num_audios + ) + return prompt + else: + return request.Prompt + else: + # Fallback to empty prompt with multimodal template if we have media + if num_images > 0 or num_audios > 0: + messages = [{"role": "user", "content": ""}] + prompt = apply_chat_template( + self.processor, + self.config, + messages, + num_images=num_images, + num_audios=num_audios + ) + return prompt + else: + return "" + + + + + + def _build_generation_params(self, request, default_max_tokens=200): + """ + Build generation parameters from request attributes and options for MLX-VLM. - # Sending the final generated text - yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + Args: + request: The gRPC request. 
+ default_max_tokens: Default max_tokens if not specified. - def load_image(self, image_path: str): + Returns: + tuple: (max_tokens, generation_params dict) """ - Load an image from the given file path or base64 encoded data. + # Extract max_tokens + max_tokens = getattr(request, 'Tokens', default_max_tokens) + if max_tokens == 0: + max_tokens = default_max_tokens + + # Extract generation parameters from request attributes + temp = getattr(request, 'Temperature', 0.0) + if temp == 0.0: + temp = 0.6 # Default temperature + + top_p = getattr(request, 'TopP', 0.0) + if top_p == 0.0: + top_p = 1.0 # Default top_p + + # Initialize generation parameters for MLX-VLM + generation_params = { + 'temp': temp, + 'top_p': top_p, + } + + # Add seed if specified + seed = getattr(request, 'Seed', 0) + if seed != 0: + mx.random.seed(seed) + + # Override with options if available + if hasattr(self, 'options'): + # Max tokens from options + if 'max_tokens' in self.options: + max_tokens = self.options['max_tokens'] + + # Generation parameters from options + param_option_mapping = { + 'temp': 'temp', + 'temperature': 'temp', # alias + 'top_p': 'top_p', + } + + for option_key, param_key in param_option_mapping.items(): + if option_key in self.options: + generation_params[param_key] = self.options[option_key] + + # Handle seed from options + if 'seed' in self.options: + mx.random.seed(self.options['seed']) + + return max_tokens, generation_params + + def load_image_from_base64(self, image_data: str): + """ + Load an image from base64 encoded data. Args: - image_path (str): The path to the image file or base64 encoded data. + image_data (str): Base64 encoded image data. Returns: - Image: The loaded image. + PIL.Image or str: The loaded image or path to the image. """ try: - - image_data = base64.b64decode(image_path) - image = Image.open(io.BytesIO(image_data)) - return image + decoded_data = base64.b64decode(image_data) + image = Image.open(io.BytesIO(decoded_data)) + + # Save to temporary file for mlx-vlm + with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file: + image.save(tmp_file.name, format='JPEG') + return tmp_file.name + except Exception as e: - print(f"Error loading image {image_path}: {e}", file=sys.stderr) + print(f"Error loading image from base64: {e}", file=sys.stderr) return None - def load_video(self, video_path: str): + def load_audio_from_base64(self, audio_data: str): """ - Load a video from the given file path. + Load audio from base64 encoded data. Args: - video_path (str): The path to the image file. + audio_data (str): Base64 encoded audio data. Returns: - Video: The loaded video. + str: Path to the loaded audio file. """ try: - timestamp = str(int(time.time() * 1000)) # Generate timestamp - p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename - with open(p, "wb") as f: - f.write(base64.b64decode(video_path)) - video = VideoAsset(name=p).np_ndarrays - os.remove(p) - return video + decoded_data = base64.b64decode(audio_data) + + # Save to temporary file for mlx-vlm + with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: + tmp_file.write(decoded_data) + return tmp_file.name + except Exception as e: - print(f"Error loading video {video_path}: {e}", file=sys.stderr) + print(f"Error loading audio from base64: {e}", file=sys.stderr) return None + def cleanup_temp_files(self, file_paths: List[str]): + """ + Clean up temporary files. + + Args: + file_paths (List[str]): List of file paths to clean up. 
+ """ + for file_path in file_paths: + try: + if file_path and os.path.exists(file_path): + os.remove(file_path) + except Exception as e: + print(f"Error removing temporary file {file_path}: {e}", file=sys.stderr) + async def serve(address): # Start asyncio gRPC server server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), From c59b6a951dbf38426fdd2b510a1304aa0ea0d429 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 23:45:12 +0200 Subject: [PATCH 12/14] Index mlx-vlm Signed-off-by: Ettore Di Giacinto --- Makefile | 4 ++++ backend/index.yaml | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Makefile b/Makefile index 5be5bca1ca01..aebff8599dc9 100644 --- a/Makefile +++ b/Makefile @@ -369,6 +369,10 @@ backends/mlx: build BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" +backends/mlx-vlm: build + BACKEND=mlx-vlm BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx-vlm.tar)" + backend-images: mkdir -p backend-images diff --git a/backend/index.yaml b/backend/index.yaml index 3fed08f275d4..b7f1efc1a9f9 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -142,6 +142,23 @@ - text-to-text - LLM - MLX +- &mlx-vlm + name: "mlx-vlm" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm" + icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 + urls: + - https://github.com/ml-explore/mlx-vlm + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm + license: MIT + description: | + Run Vision-Language Models with MLX + tags: + - text-to-text + - multimodal + - vision-language + - LLM + - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -391,6 +408,11 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" mirrors: - localai/localai-backends:master-metal-darwin-arm64-mlx +- !!merge <<: *mlx-vlm + name: "mlx-vlm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-vlm" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-mlx-vlm - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" From 92456f59c4c0ea43a73b82e8138ded87c891b921 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 23:45:52 +0200 Subject: [PATCH 13/14] Remove mlx-vlm Signed-off-by: Ettore Di Giacinto --- Makefile | 4 - backend/index.yaml | 22 -- backend/python/mlx-vlm/Makefile | 23 -- backend/python/mlx-vlm/backend.py | 477 ------------------------ backend/python/mlx-vlm/install.sh | 14 - backend/python/mlx-vlm/requirements.txt | 4 - backend/python/mlx-vlm/run.sh | 11 - backend/python/mlx-vlm/test.py | 146 -------- backend/python/mlx-vlm/test.sh | 12 - 9 files changed, 713 deletions(-) delete mode 100644 backend/python/mlx-vlm/Makefile delete mode 100644 backend/python/mlx-vlm/backend.py delete mode 100755 backend/python/mlx-vlm/install.sh delete mode 100644 backend/python/mlx-vlm/requirements.txt delete mode 100755 backend/python/mlx-vlm/run.sh delete mode 100644 backend/python/mlx-vlm/test.py delete mode 100755 backend/python/mlx-vlm/test.sh diff --git a/Makefile b/Makefile index aebff8599dc9..5be5bca1ca01 100644 --- a/Makefile +++ b/Makefile @@ -369,10 +369,6 @@ backends/mlx: build BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend ./local-ai backends install 
"ocifile://$(abspath ./backend-images/mlx.tar)" -backends/mlx-vlm: build - BACKEND=mlx-vlm BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh - ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx-vlm.tar)" - backend-images: mkdir -p backend-images diff --git a/backend/index.yaml b/backend/index.yaml index b7f1efc1a9f9..3fed08f275d4 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -142,23 +142,6 @@ - text-to-text - LLM - MLX -- &mlx-vlm - name: "mlx-vlm" - uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm" - icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 - urls: - - https://github.com/ml-explore/mlx-vlm - mirrors: - - localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm - license: MIT - description: | - Run Vision-Language Models with MLX - tags: - - text-to-text - - multimodal - - vision-language - - LLM - - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -408,11 +391,6 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" mirrors: - localai/localai-backends:master-metal-darwin-arm64-mlx -- !!merge <<: *mlx-vlm - name: "mlx-vlm-development" - uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-vlm" - mirrors: - - localai/localai-backends:master-metal-darwin-arm64-mlx-vlm - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" diff --git a/backend/python/mlx-vlm/Makefile b/backend/python/mlx-vlm/Makefile deleted file mode 100644 index 804031aa970d..000000000000 --- a/backend/python/mlx-vlm/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -.PHONY: mlx-vlm -mlx-vlm: - bash install.sh - -.PHONY: run -run: mlx-vlm - @echo "Running mlx-vlm..." - bash run.sh - @echo "mlx run." - -.PHONY: test -test: mlx-vlm - @echo "Testing mlx-vlm..." - bash test.sh - @echo "mlx tested." - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -.PHONY: clean -clean: protogen-clean - rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py deleted file mode 100644 index 02730c814965..000000000000 --- a/backend/python/mlx-vlm/backend.py +++ /dev/null @@ -1,477 +0,0 @@ -#!/usr/bin/env python3 -import asyncio -from concurrent import futures -import argparse -import signal -import sys -import os -from typing import List -import time - -import backend_pb2 -import backend_pb2_grpc - -import grpc -from mlx_vlm import load, generate, stream_generate -from mlx_vlm.prompt_utils import apply_chat_template -from mlx_vlm.utils import load_config, load_image -import mlx.core as mx -import base64 -import io -from PIL import Image -import tempfile - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - """ - A gRPC servicer that implements the Backend service defined in backend.proto. - """ - - def _is_float(self, s): - """Check if a string can be converted to float.""" - try: - float(s) - return True - except ValueError: - return False - - def _is_int(self, s): - """Check if a string can be converted to int.""" - try: - int(s) - return True - except ValueError: - return False - - def Health(self, request, context): - """ - Returns a health check message. 
- - Args: - request: The health check request. - context: The gRPC context. - - Returns: - backend_pb2.Reply: The health check reply. - """ - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - - async def LoadModel(self, request, context): - """ - Loads a multimodal vision-language model using MLX-VLM. - - Args: - request: The load model request. - context: The gRPC context. - - Returns: - backend_pb2.Result: The load model result. - """ - try: - print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr) - print(f"Request: {request}", file=sys.stderr) - - # Parse options like in the diffusers backend - options = request.Options - self.options = {} - - # The options are a list of strings in this form optname:optvalue - # We store all the options in a dict for later use - for opt in options: - if ":" not in opt: - continue - key, value = opt.split(":", 1) # Split only on first colon to handle values with colons - - # Convert numeric values to appropriate types - if self._is_float(value): - value = float(value) - elif self._is_int(value): - value = int(value) - elif value.lower() in ["true", "false"]: - value = value.lower() == "true" - - self.options[key] = value - - print(f"Options: {self.options}", file=sys.stderr) - - # Load model and processor using MLX-VLM - # mlx-vlm load function returns (model, processor) instead of (model, tokenizer) - self.model, self.processor = load(request.Model) - - # Load model config for chat template support - self.config = load_config(request.Model) - - except Exception as err: - print(f"Error loading MLX-VLM model {err=}, {type(err)=}", file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Error loading MLX-VLM model: {err}") - - print("MLX-VLM model loaded successfully", file=sys.stderr) - return backend_pb2.Result(message="MLX-VLM model loaded successfully", success=True) - - async def Predict(self, request, context): - """ - Generates text based on the given prompt and sampling parameters using MLX-VLM with multimodal support. - - Args: - request: The predict request. - context: The gRPC context. - - Returns: - backend_pb2.Reply: The predict result. 
- """ - temp_files = [] - try: - # Process images and audios from request - image_paths = [] - audio_paths = [] - - # Process images - if request.Images: - for img_data in request.Images: - img_path = self.load_image_from_base64(img_data) - if img_path: - image_paths.append(img_path) - temp_files.append(img_path) - - # Process audios - if request.Audios: - for audio_data in request.Audios: - audio_path = self.load_audio_from_base64(audio_data) - if audio_path: - audio_paths.append(audio_path) - temp_files.append(audio_path) - - # Prepare the prompt with multimodal information - prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) - - # Build generation parameters using request attributes and options - max_tokens, generation_params = self._build_generation_params(request) - - print(f"Generating text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) - print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) - - # Generate text using MLX-VLM with multimodal inputs - response = generate( - model=self.model, - processor=self.processor, - prompt=prompt, - image=image_paths if image_paths else None, - audio=audio_paths if audio_paths else None, - max_tokens=max_tokens, - temperature=generation_params.get('temp', 0.6), - top_p=generation_params.get('top_p', 1.0), - verbose=False - ) - - return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) - - except Exception as e: - print(f"Error in MLX-VLM Predict: {e}", file=sys.stderr) - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(f"Generation failed: {str(e)}") - return backend_pb2.Reply(message=bytes("", encoding='utf-8')) - finally: - # Clean up temporary files - self.cleanup_temp_files(temp_files) - - def Embedding(self, request, context): - """ - A gRPC method that calculates embeddings for a given sentence. - - Note: MLX-VLM doesn't support embeddings directly. This method returns an error. - - Args: - request: An EmbeddingRequest object that contains the request parameters. - context: A grpc.ServicerContext object that provides information about the RPC. - - Returns: - An EmbeddingResult object that contains the calculated embeddings. - """ - print("Embeddings not supported in MLX-VLM backend", file=sys.stderr) - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Embeddings are not supported in the MLX-VLM backend.") - return backend_pb2.EmbeddingResult() - - async def PredictStream(self, request, context): - """ - Generates text based on the given prompt and sampling parameters, and streams the results using MLX-VLM with multimodal support. - - Args: - request: The predict stream request. - context: The gRPC context. - - Yields: - backend_pb2.Reply: Streaming predict results. 
- """ - temp_files = [] - try: - # Process images and audios from request - image_paths = [] - audio_paths = [] - - # Process images - if request.Images: - for img_data in request.Images: - img_path = self.load_image_from_base64(img_data) - if img_path: - image_paths.append(img_path) - temp_files.append(img_path) - - # Process audios - if request.Audios: - for audio_data in request.Audios: - audio_path = self.load_audio_from_base64(audio_data) - if audio_path: - audio_paths.append(audio_path) - temp_files.append(audio_path) - - # Prepare the prompt with multimodal information - prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) - - # Build generation parameters using request attributes and options - max_tokens, generation_params = self._build_generation_params(request, default_max_tokens=512) - - print(f"Streaming text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) - print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) - - # Stream text generation using MLX-VLM with multimodal inputs - for response in stream_generate( - model=self.model, - processor=self.processor, - prompt=prompt, - image=image_paths if image_paths else None, - audio=audio_paths if audio_paths else None, - max_tokens=max_tokens, - temperature=generation_params.get('temp', 0.6), - top_p=generation_params.get('top_p', 1.0), - ): - yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) - - except Exception as e: - print(f"Error in MLX-VLM PredictStream: {e}", file=sys.stderr) - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(f"Streaming generation failed: {str(e)}") - yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) - finally: - # Clean up temporary files - self.cleanup_temp_files(temp_files) - - def _prepare_prompt(self, request, num_images=0, num_audios=0): - """ - Prepare the prompt for MLX-VLM generation, handling chat templates and multimodal inputs. - - Args: - request: The gRPC request containing prompt and message information. - num_images: Number of images in the request. - num_audios: Number of audio files in the request. - - Returns: - str: The prepared prompt. 
- """ - # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template - if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - # Convert gRPC messages to the format expected by apply_chat_template - messages = [] - for msg in request.Messages: - messages.append({"role": msg.role, "content": msg.content}) - - # Use mlx-vlm's apply_chat_template which handles multimodal inputs - prompt = apply_chat_template( - self.processor, - self.config, - messages, - num_images=num_images, - num_audios=num_audios - ) - return prompt - elif request.Prompt: - # If we have a direct prompt but also have images/audio, we need to format it properly - if num_images > 0 or num_audios > 0: - # Create a simple message structure for multimodal prompt - messages = [{"role": "user", "content": request.Prompt}] - prompt = apply_chat_template( - self.processor, - self.config, - messages, - num_images=num_images, - num_audios=num_audios - ) - return prompt - else: - return request.Prompt - else: - # Fallback to empty prompt with multimodal template if we have media - if num_images > 0 or num_audios > 0: - messages = [{"role": "user", "content": ""}] - prompt = apply_chat_template( - self.processor, - self.config, - messages, - num_images=num_images, - num_audios=num_audios - ) - return prompt - else: - return "" - - - - - - def _build_generation_params(self, request, default_max_tokens=200): - """ - Build generation parameters from request attributes and options for MLX-VLM. - - Args: - request: The gRPC request. - default_max_tokens: Default max_tokens if not specified. - - Returns: - tuple: (max_tokens, generation_params dict) - """ - # Extract max_tokens - max_tokens = getattr(request, 'Tokens', default_max_tokens) - if max_tokens == 0: - max_tokens = default_max_tokens - - # Extract generation parameters from request attributes - temp = getattr(request, 'Temperature', 0.0) - if temp == 0.0: - temp = 0.6 # Default temperature - - top_p = getattr(request, 'TopP', 0.0) - if top_p == 0.0: - top_p = 1.0 # Default top_p - - # Initialize generation parameters for MLX-VLM - generation_params = { - 'temp': temp, - 'top_p': top_p, - } - - # Add seed if specified - seed = getattr(request, 'Seed', 0) - if seed != 0: - mx.random.seed(seed) - - # Override with options if available - if hasattr(self, 'options'): - # Max tokens from options - if 'max_tokens' in self.options: - max_tokens = self.options['max_tokens'] - - # Generation parameters from options - param_option_mapping = { - 'temp': 'temp', - 'temperature': 'temp', # alias - 'top_p': 'top_p', - } - - for option_key, param_key in param_option_mapping.items(): - if option_key in self.options: - generation_params[param_key] = self.options[option_key] - - # Handle seed from options - if 'seed' in self.options: - mx.random.seed(self.options['seed']) - - return max_tokens, generation_params - - def load_image_from_base64(self, image_data: str): - """ - Load an image from base64 encoded data. - - Args: - image_data (str): Base64 encoded image data. - - Returns: - PIL.Image or str: The loaded image or path to the image. 
- """ - try: - decoded_data = base64.b64decode(image_data) - image = Image.open(io.BytesIO(decoded_data)) - - # Save to temporary file for mlx-vlm - with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file: - image.save(tmp_file.name, format='JPEG') - return tmp_file.name - - except Exception as e: - print(f"Error loading image from base64: {e}", file=sys.stderr) - return None - - def load_audio_from_base64(self, audio_data: str): - """ - Load audio from base64 encoded data. - - Args: - audio_data (str): Base64 encoded audio data. - - Returns: - str: Path to the loaded audio file. - """ - try: - decoded_data = base64.b64decode(audio_data) - - # Save to temporary file for mlx-vlm - with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: - tmp_file.write(decoded_data) - return tmp_file.name - - except Exception as e: - print(f"Error loading audio from base64: {e}", file=sys.stderr) - return None - - def cleanup_temp_files(self, file_paths: List[str]): - """ - Clean up temporary files. - - Args: - file_paths (List[str]): List of file paths to clean up. - """ - for file_path in file_paths: - try: - if file_path and os.path.exists(file_path): - os.remove(file_path) - except Exception as e: - print(f"Error removing temporary file {file_path}: {e}", file=sys.stderr) - -async def serve(address): - # Start asyncio gRPC server - server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), - options=[ - ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB - ]) - # Add the servicer to the server - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - # Bind the server to the address - server.add_insecure_port(address) - - # Gracefully shutdown the server on SIGTERM or SIGINT - loop = asyncio.get_event_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler( - sig, lambda: asyncio.ensure_future(server.stop(5)) - ) - - # Start the server - await server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - # Wait for the server to be terminated - await server.wait_for_termination() - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." 
- ) - args = parser.parse_args() - - asyncio.run(serve(args.addr)) diff --git a/backend/python/mlx-vlm/install.sh b/backend/python/mlx-vlm/install.sh deleted file mode 100755 index b8ee48552490..000000000000 --- a/backend/python/mlx-vlm/install.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - -USE_PIP=true - -backend_dir=$(dirname $0) - -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -installRequirements diff --git a/backend/python/mlx-vlm/requirements.txt b/backend/python/mlx-vlm/requirements.txt deleted file mode 100644 index f1771cc4adb4..000000000000 --- a/backend/python/mlx-vlm/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -grpcio==1.71.0 -protobuf -certifi -setuptools \ No newline at end of file diff --git a/backend/python/mlx-vlm/run.sh b/backend/python/mlx-vlm/run.sh deleted file mode 100755 index fc88f97da712..000000000000 --- a/backend/python/mlx-vlm/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -backend_dir=$(dirname $0) - -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.py b/backend/python/mlx-vlm/test.py deleted file mode 100644 index 827aa71a3e33..000000000000 --- a/backend/python/mlx-vlm/test.py +++ /dev/null @@ -1,146 +0,0 @@ -import unittest -import subprocess -import time -import backend_pb2 -import backend_pb2_grpc - -import grpc - -import unittest -import subprocess -import time -import grpc -import backend_pb2_grpc -import backend_pb2 - -class TestBackendServicer(unittest.TestCase): - """ - TestBackendServicer is the class that tests the gRPC service. - - This class contains methods to test the startup and shutdown of the gRPC service. 
- """ - def setUp(self): - self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) - time.sleep(10) - - def tearDown(self) -> None: - self.service.terminate() - self.service.wait() - - def test_server_startup(self): - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.Health(backend_pb2.HealthMessage()) - self.assertEqual(response.message, b'OK') - except Exception as err: - print(err) - self.fail("Server failed to start") - finally: - self.tearDown() - def test_load_model(self): - """ - This method tests if the model is loaded successfully - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) - self.assertTrue(response.success) - self.assertEqual(response.message, "Model loaded successfully") - except Exception as err: - print(err) - self.fail("LoadModel service failed") - finally: - self.tearDown() - - def test_text(self): - """ - This method tests if the embeddings are generated successfully - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) - self.assertTrue(response.success) - req = backend_pb2.PredictOptions(Prompt="The capital of France is") - resp = stub.Predict(req) - self.assertIsNotNone(resp.message) - except Exception as err: - print(err) - self.fail("text service failed") - finally: - self.tearDown() - - def test_sampling_params(self): - """ - This method tests if all sampling parameters are correctly processed - NOTE: this does NOT test for correctness, just that we received a compatible response - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) - self.assertTrue(response.success) - - req = backend_pb2.PredictOptions( - Prompt="The capital of France is", - TopP=0.8, - Tokens=50, - Temperature=0.7, - TopK=40, - PresencePenalty=0.1, - FrequencyPenalty=0.2, - RepetitionPenalty=1.1, - MinP=0.05, - Seed=42, - StopPrompts=["\n"], - StopTokenIds=[50256], - BadWords=["badword"], - IncludeStopStrInOutput=True, - IgnoreEOS=True, - MinTokens=5, - Logprobs=5, - PromptLogprobs=5, - SkipSpecialTokens=True, - SpacesBetweenSpecialTokens=True, - TruncatePromptTokens=10, - GuidedDecoding=True, - N=2, - ) - resp = stub.Predict(req) - self.assertIsNotNone(resp.message) - self.assertIsNotNone(resp.logprobs) - except Exception as err: - print(err) - self.fail("sampling params service failed") - finally: - self.tearDown() - - - def test_embedding(self): - """ - This method tests if the embeddings are generated successfully - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) - self.assertTrue(response.success) - embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") - embedding_response = stub.Embedding(embedding_request) - self.assertIsNotNone(embedding_response.embeddings) - # assert that is a list of floats - self.assertIsInstance(embedding_response.embeddings, list) - # assert that 
the list is not empty - self.assertTrue(len(embedding_response.embeddings) > 0) - except Exception as err: - print(err) - self.fail("Embedding service failed") - finally: - self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.sh b/backend/python/mlx-vlm/test.sh deleted file mode 100755 index f31ae54e47dc..000000000000 --- a/backend/python/mlx-vlm/test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -e - -backend_dir=$(dirname $0) - -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -runUnittests From a4bdaab172ddac85c34dbea6fcd7862c7788e357 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 22 Aug 2025 08:40:38 +0200 Subject: [PATCH 14/14] Drop CI test Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index eac2e2e83635..965427f4013c 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -7,7 +7,6 @@ on: - master tags: - '*' - pull_request: concurrency: group: ci-backends-${{ github.head_ref || github.ref }}-${{ github.repository }}