From 6d0af39634936701b48ec638a111c7116dcdc968 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 12 Aug 2025 11:29:53 +0200 Subject: [PATCH 01/14] chore: allow to install with pip Signed-off-by: Ettore Di Giacinto --- backend/python/common/libbackend.sh | 56 +++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index daa47c3c2080..d98924fe3550 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,9 +17,17 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # +# You can switch between uv (conda-like) and pip installation methods by setting USE_PIP: +# USE_PIP=true source $(dirname $0)/../common/libbackend.sh +# PYTHON_VERSION="3.10" +# Default to uv if USE_PIP is not set +if [ "x${USE_PIP}" == "x" ]; then + USE_PIP=false +fi + function init() { # Name of the backend (directory name) BACKEND_NAME=${PWD##*/} @@ -48,6 +56,11 @@ function init() { fi echo "Initializing libbackend for ${BACKEND_NAME}" + if [ "x${USE_PIP}" == "xtrue" ]; then + echo "Using pip and Python virtual environments" + else + echo "Using uv package manager" + fi } # getBuildProfile will inspect the system to determine which build profile is appropriate: @@ -95,18 +108,33 @@ function getBuildProfile() { # This function is idempotent, so you can call it as many times as you want and it will # always result in an activated virtual environment function ensureVenv() { - if [ ! -d "${EDIR}/venv" ]; then - uv venv --python ${PYTHON_VERSION} ${EDIR}/venv - echo "virtualenv created" - fi + if [ "x${USE_PIP}" == "xtrue" ]; then + # Use Python virtual environment with pip + if [ ! -d "${EDIR}/venv" ]; then + python${PYTHON_VERSION} -m venv ${EDIR}/venv + echo "Python virtual environment created" + fi - # Source if we are not already in a Virtual env - if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then - source ${EDIR}/venv/bin/activate - echo "virtualenv activated" + # Source if we are not already in a Virtual env + if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then + source ${EDIR}/venv/bin/activate + echo "Python virtual environment activated" + fi + else + # Use uv (conda-like) + if [ ! -d "${EDIR}/venv" ]; then + uv venv --python ${PYTHON_VERSION} ${EDIR}/venv + echo "uv virtual environment created" + fi + + # Source if we are not already in a Virtual env + if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then + source ${EDIR}/venv/bin/activate + echo "uv virtual environment activated" + fi fi - echo "activated virtualenv has been ensured" + echo "activated virtual environment has been ensured" } # installRequirements looks for several requirements files and if they exist runs the install for them in order @@ -116,7 +144,7 @@ function ensureVenv() { # - requirements-${BUILD_TYPE}.txt # - requirements-${BUILD_PROFILE}.txt # -# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda-11 or cuda-12 +# BUILD_PROFILE is a more specific version of BUILD_TYPE, ex: cuda-11 or cuda-12 # it can also include some options that we do not have BUILD_TYPES for, ex: intel # # NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index. 
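For reference, the pip path introduced above is selected by setting USE_PIP before libbackend.sh is sourced, which is the pattern the install.sh scripts later in this series follow. A minimal install.sh along those lines might look like the sketch below; it mirrors the install scripts in this patch set, and the explicit PYTHON_VERSION is only an assumption (libbackend.sh's pip branch runs python${PYTHON_VERSION} -m venv, so the variable must resolve to an installed interpreter).

#!/bin/bash
set -e

# Opt into pip + `python -m venv` instead of uv; this must be set before
# libbackend.sh is sourced so ensureVenv/installRequirements pick it up.
USE_PIP=true
PYTHON_VERSION="3.10"   # assumed default; adjust to the interpreter available on the host

backend_dir=$(dirname $0)

# Prefer a vendored common/ directory and fall back to the shared one,
# matching the lookup used by the existing backend install scripts.
if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

installRequirements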
@@ -158,7 +186,13 @@ function installRequirements() { for reqFile in ${requirementFiles[@]}; do if [ -f ${reqFile} ]; then echo "starting requirements install for ${reqFile}" - uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + if [ "x${USE_PIP}" == "xtrue" ]; then + # Use pip for installation + pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + else + # Use uv for installation + uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + fi echo "finished requirements install for ${reqFile}" fi done From 44eaf6c20d85ff6ee411e81b00edf19c54aa0416 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 12 Aug 2025 22:18:27 +0200 Subject: [PATCH 02/14] WIP Signed-off-by: Ettore Di Giacinto --- backend/python/mlx-vlm/Makefile | 29 ++ backend/python/mlx-vlm/backend.py | 367 ++++++++++++++++++++++++ backend/python/mlx-vlm/install.sh | 14 + backend/python/mlx-vlm/requirements.txt | 4 + backend/python/mlx-vlm/run.sh | 11 + backend/python/mlx-vlm/test.py | 146 ++++++++++ backend/python/mlx-vlm/test.sh | 12 + backend/python/mlx/Makefile | 29 ++ backend/python/mlx/backend.py | 367 ++++++++++++++++++++++++ backend/python/mlx/install.sh | 14 + backend/python/mlx/requirements.txt | 4 + backend/python/mlx/run.sh | 11 + backend/python/mlx/test.py | 146 ++++++++++ backend/python/mlx/test.sh | 12 + 14 files changed, 1166 insertions(+) create mode 100644 backend/python/mlx-vlm/Makefile create mode 100644 backend/python/mlx-vlm/backend.py create mode 100755 backend/python/mlx-vlm/install.sh create mode 100644 backend/python/mlx-vlm/requirements.txt create mode 100755 backend/python/mlx-vlm/run.sh create mode 100644 backend/python/mlx-vlm/test.py create mode 100755 backend/python/mlx-vlm/test.sh create mode 100644 backend/python/mlx/Makefile create mode 100644 backend/python/mlx/backend.py create mode 100755 backend/python/mlx/install.sh create mode 100644 backend/python/mlx/requirements.txt create mode 100755 backend/python/mlx/run.sh create mode 100644 backend/python/mlx/test.py create mode 100755 backend/python/mlx/test.sh diff --git a/backend/python/mlx-vlm/Makefile b/backend/python/mlx-vlm/Makefile new file mode 100644 index 000000000000..c4c18bee55b0 --- /dev/null +++ b/backend/python/mlx-vlm/Makefile @@ -0,0 +1,29 @@ +.PHONY: mlx +mlx: protogen + bash install.sh + +.PHONY: run +run: protogen + @echo "Running mlx..." + bash run.sh + @echo "mlx run." + +.PHONY: test +test: protogen + @echo "Testing mlx..." + bash test.sh + @echo "mlx tested." + +.PHONY: protogen +protogen: backend_pb2_grpc.py backend_pb2.py + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +backend_pb2_grpc.py backend_pb2.py: + python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py new file mode 100644 index 000000000000..56698a54e5f5 --- /dev/null +++ b/backend/python/mlx-vlm/backend.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +import asyncio +from concurrent import futures +import argparse +import signal +import sys +import os +from typing import List +from PIL import Image + +import backend_pb2 +import backend_pb2_grpc + +import grpc +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.multimodal.utils import fetch_image +from vllm.assets.video import VideoAsset +import base64 +import io + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + A gRPC servicer that implements the Backend service defined in backend.proto. + """ + def generate(self,prompt, max_new_tokens): + """ + Generates text based on the given prompt and maximum number of new tokens. + + Args: + prompt (str): The prompt to generate text from. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. + """ + self.generator.end_beam_search() + + # Tokenizing the input + ids = self.generator.tokenizer.encode(prompt) + + self.generator.gen_begin_reuse(ids) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False + decoded_text = '' + for i in range(max_new_tokens): + token = self.generator.gen_single_token() + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + if token.item() == self.generator.tokenizer.eos_token_id: + break + return decoded_text + + def Health(self, request, context): + """ + Returns a health check message. + + Args: + request: The health check request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The health check reply. + """ + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + async def LoadModel(self, request, context): + """ + Loads a language model. + + Args: + request: The load model request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The load model result. 
+ """ + engine_args = AsyncEngineArgs( + model=request.Model, + ) + + if request.Quantization != "": + engine_args.quantization = request.Quantization + if request.LoadFormat != "": + engine_args.load_format = request.LoadFormat + if request.GPUMemoryUtilization != 0: + engine_args.gpu_memory_utilization = request.GPUMemoryUtilization + if request.TrustRemoteCode: + engine_args.trust_remote_code = request.TrustRemoteCode + if request.EnforceEager: + engine_args.enforce_eager = request.EnforceEager + if request.TensorParallelSize: + engine_args.tensor_parallel_size = request.TensorParallelSize + if request.SwapSpace != 0: + engine_args.swap_space = request.SwapSpace + if request.MaxModelLen != 0: + engine_args.max_model_len = request.MaxModelLen + if request.DisableLogStatus: + engine_args.disable_log_status = request.DisableLogStatus + if request.DType != "": + engine_args.dtype = request.DType + if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: + # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs + engine_args.limit_mm_per_prompt = { + "image": max(request.LimitImagePerPrompt, 1), + "video": max(request.LimitVideoPerPrompt, 1), + "audio": max(request.LimitAudioPerPrompt, 1) + } + + try: + self.llm = AsyncLLMEngine.from_engine_args(engine_args) + except Exception as err: + print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + try: + engine_model_config = await self.llm.get_model_config() + self.tokenizer = get_tokenizer( + engine_model_config.tokenizer, + tokenizer_mode=engine_model_config.tokenizer_mode, + trust_remote_code=engine_model_config.trust_remote_code, + truncation_side="left", + ) + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print("Model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="Model loaded successfully", success=True) + + async def Predict(self, request, context): + """ + Generates text based on the given prompt and sampling parameters. + + Args: + request: The predict request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The predict result. + """ + gen = self._predict(request, context, streaming=False) + res = await gen.__anext__() + return res + + def Embedding(self, request, context): + """ + A gRPC method that calculates embeddings for a given sentence. + + Args: + request: An EmbeddingRequest object that contains the request parameters. + context: A grpc.ServicerContext object that provides information about the RPC. + + Returns: + An EmbeddingResult object that contains the calculated embeddings. + """ + print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) + outputs = self.model.encode(request.Embeddings) + # Check if we have one result at least + if len(outputs) == 0: + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details("No embeddings were calculated.") + return backend_pb2.EmbeddingResult() + return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + + async def PredictStream(self, request, context): + """ + Generates text based on the given prompt and sampling parameters, and streams the results. + + Args: + request: The predict stream request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The predict stream result. 
+ """ + iterations = self._predict(request, context, streaming=True) + try: + async for iteration in iterations: + yield iteration + finally: + await iterations.aclose() + + async def _predict(self, request, context, streaming=False): + # Build the sampling parameters + # NOTE: this must stay in sync with the vllm backend + request_to_sampling_params = { + "N": "n", + "PresencePenalty": "presence_penalty", + "FrequencyPenalty": "frequency_penalty", + "RepetitionPenalty": "repetition_penalty", + "Temperature": "temperature", + "TopP": "top_p", + "TopK": "top_k", + "MinP": "min_p", + "Seed": "seed", + "StopPrompts": "stop", + "StopTokenIds": "stop_token_ids", + "BadWords": "bad_words", + "IncludeStopStrInOutput": "include_stop_str_in_output", + "IgnoreEOS": "ignore_eos", + "Tokens": "max_tokens", + "MinTokens": "min_tokens", + "Logprobs": "logprobs", + "PromptLogprobs": "prompt_logprobs", + "SkipSpecialTokens": "skip_special_tokens", + "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", + "TruncatePromptTokens": "truncate_prompt_tokens", + "GuidedDecoding": "guided_decoding", + } + + sampling_params = SamplingParams(top_p=0.9, max_tokens=200) + + for request_field, param_field in request_to_sampling_params.items(): + if hasattr(request, request_field): + value = getattr(request, request_field) + if value not in (None, 0, [], False): + setattr(sampling_params, param_field, value) + + # Extract image paths and process images + prompt = request.Prompt + + image_paths = request.Images + image_data = [self.load_image(img_path) for img_path in image_paths] + + videos_path = request.Videos + video_data = [self.load_video(video_path) for video_path in videos_path] + + # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template + if not request.Prompt and request.UseTokenizerTemplate and request.Messages: + prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) + + # Generate text using the LLM engine + request_id = random_uuid() + print(f"Generating text with request_id: {request_id}", file=sys.stderr) + multi_modal_data = {} + if image_data: + multi_modal_data["image"] = image_data + if video_data: + multi_modal_data["video"] = video_data + outputs = self.llm.generate( + { + "prompt": prompt, + "multi_modal_data": multi_modal_data if multi_modal_data else None, + }, + sampling_params=sampling_params, + request_id=request_id, + ) + + # Stream the results + generated_text = "" + try: + async for request_output in outputs: + iteration_text = request_output.outputs[0].text + + if streaming: + # Remove text already sent as vllm concatenates the text from previous yields + delta_iteration_text = iteration_text.removeprefix(generated_text) + # Send the partial result + yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) + + # Keep track of text generated + generated_text = iteration_text + finally: + await outputs.aclose() + + # If streaming, we already sent everything + if streaming: + return + + # Remove the image files from /tmp folder + for img_path in image_paths: + try: + os.remove(img_path) + except Exception as e: + print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) + + # Sending the final generated text + yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + + def load_image(self, image_path: str): + """ + Load an image from the given file path or base64 encoded data. 
+ + Args: + image_path (str): The path to the image file or base64 encoded data. + + Returns: + Image: The loaded image. + """ + try: + + image_data = base64.b64decode(image_path) + image = Image.open(io.BytesIO(image_data)) + return image + except Exception as e: + print(f"Error loading image {image_path}: {e}", file=sys.stderr) + return None + + def load_video(self, video_path: str): + """ + Load a video from the given file path. + + Args: + video_path (str): The path to the image file. + + Returns: + Video: The loaded video. + """ + try: + timestamp = str(int(time.time() * 1000)) # Generate timestamp + p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename + with open(p, "wb") as f: + f.write(base64.b64decode(video_path)) + video = VideoAsset(name=p).np_ndarrays + os.remove(p) + return video + except Exception as e: + print(f"Error loading video {video_path}: {e}", file=sys.stderr) + return None + +async def serve(address): + # Start asyncio gRPC server + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + # Add the servicer to the server + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + # Bind the server to the address + server.add_insecure_port(address) + + # Gracefully shutdown the server on SIGTERM or SIGINT + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler( + sig, lambda: asyncio.ensure_future(server.stop(5)) + ) + + # Start the server + await server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + # Wait for the server to be terminated + await server.wait_for_termination() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + asyncio.run(serve(args.addr)) diff --git a/backend/python/mlx-vlm/install.sh b/backend/python/mlx-vlm/install.sh new file mode 100755 index 000000000000..b8ee48552490 --- /dev/null +++ b/backend/python/mlx-vlm/install.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +USE_PIP=true + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +installRequirements diff --git a/backend/python/mlx-vlm/requirements.txt b/backend/python/mlx-vlm/requirements.txt new file mode 100644 index 000000000000..f1771cc4adb4 --- /dev/null +++ b/backend/python/mlx-vlm/requirements.txt @@ -0,0 +1,4 @@ +grpcio==1.71.0 +protobuf +certifi +setuptools \ No newline at end of file diff --git a/backend/python/mlx-vlm/run.sh b/backend/python/mlx-vlm/run.sh new file mode 100755 index 000000000000..fc88f97da712 --- /dev/null +++ b/backend/python/mlx-vlm/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.py b/backend/python/mlx-vlm/test.py new file mode 100644 index 000000000000..827aa71a3e33 --- /dev/null +++ b/backend/python/mlx-vlm/test.py @@ -0,0 +1,146 @@ +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + +import unittest +import subprocess +import time +import grpc +import backend_pb2_grpc +import backend_pb2 + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service. + + This class contains methods to test the startup and shutdown of the gRPC service. 
+ """ + def setUp(self): + self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) + time.sleep(10) + + def tearDown(self) -> None: + self.service.terminate() + self.service.wait() + + def test_server_startup(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.Health(backend_pb2.HealthMessage()) + self.assertEqual(response.message, b'OK') + except Exception as err: + print(err) + self.fail("Server failed to start") + finally: + self.tearDown() + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() + + def test_text(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + req = backend_pb2.PredictOptions(Prompt="The capital of France is") + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + except Exception as err: + print(err) + self.fail("text service failed") + finally: + self.tearDown() + + def test_sampling_params(self): + """ + This method tests if all sampling parameters are correctly processed + NOTE: this does NOT test for correctness, just that we received a compatible response + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + + req = backend_pb2.PredictOptions( + Prompt="The capital of France is", + TopP=0.8, + Tokens=50, + Temperature=0.7, + TopK=40, + PresencePenalty=0.1, + FrequencyPenalty=0.2, + RepetitionPenalty=1.1, + MinP=0.05, + Seed=42, + StopPrompts=["\n"], + StopTokenIds=[50256], + BadWords=["badword"], + IncludeStopStrInOutput=True, + IgnoreEOS=True, + MinTokens=5, + Logprobs=5, + PromptLogprobs=5, + SkipSpecialTokens=True, + SpacesBetweenSpecialTokens=True, + TruncatePromptTokens=10, + GuidedDecoding=True, + N=2, + ) + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + self.assertIsNotNone(resp.logprobs) + except Exception as err: + print(err) + self.fail("sampling params service failed") + finally: + self.tearDown() + + + def test_embedding(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) + self.assertTrue(response.success) + embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") + embedding_response = stub.Embedding(embedding_request) + self.assertIsNotNone(embedding_response.embeddings) + # assert that is a list of floats + self.assertIsInstance(embedding_response.embeddings, list) + # assert that 
the list is not empty + self.assertTrue(len(embedding_response.embeddings) > 0) + except Exception as err: + print(err) + self.fail("Embedding service failed") + finally: + self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.sh b/backend/python/mlx-vlm/test.sh new file mode 100755 index 000000000000..f31ae54e47dc --- /dev/null +++ b/backend/python/mlx-vlm/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/backend/python/mlx/Makefile b/backend/python/mlx/Makefile new file mode 100644 index 000000000000..c4c18bee55b0 --- /dev/null +++ b/backend/python/mlx/Makefile @@ -0,0 +1,29 @@ +.PHONY: mlx +mlx: protogen + bash install.sh + +.PHONY: run +run: protogen + @echo "Running mlx..." + bash run.sh + @echo "mlx run." + +.PHONY: test +test: protogen + @echo "Testing mlx..." + bash test.sh + @echo "mlx tested." + +.PHONY: protogen +protogen: backend_pb2_grpc.py backend_pb2.py + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +backend_pb2_grpc.py backend_pb2.py: + python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py new file mode 100644 index 000000000000..56698a54e5f5 --- /dev/null +++ b/backend/python/mlx/backend.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +import asyncio +from concurrent import futures +import argparse +import signal +import sys +import os +from typing import List +from PIL import Image + +import backend_pb2 +import backend_pb2_grpc + +import grpc +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.multimodal.utils import fetch_image +from vllm.assets.video import VideoAsset +import base64 +import io + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + A gRPC servicer that implements the Backend service defined in backend.proto. + """ + def generate(self,prompt, max_new_tokens): + """ + Generates text based on the given prompt and maximum number of new tokens. + + Args: + prompt (str): The prompt to generate text from. + max_new_tokens (int): The maximum number of new tokens to generate. + + Returns: + str: The generated text. 
+ """ + self.generator.end_beam_search() + + # Tokenizing the input + ids = self.generator.tokenizer.encode(prompt) + + self.generator.gen_begin_reuse(ids) + initial_len = self.generator.sequence[0].shape[0] + has_leading_space = False + decoded_text = '' + for i in range(max_new_tokens): + token = self.generator.gen_single_token() + if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): + has_leading_space = True + + decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) + if has_leading_space: + decoded_text = ' ' + decoded_text + + if token.item() == self.generator.tokenizer.eos_token_id: + break + return decoded_text + + def Health(self, request, context): + """ + Returns a health check message. + + Args: + request: The health check request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The health check reply. + """ + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + async def LoadModel(self, request, context): + """ + Loads a language model. + + Args: + request: The load model request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The load model result. + """ + engine_args = AsyncEngineArgs( + model=request.Model, + ) + + if request.Quantization != "": + engine_args.quantization = request.Quantization + if request.LoadFormat != "": + engine_args.load_format = request.LoadFormat + if request.GPUMemoryUtilization != 0: + engine_args.gpu_memory_utilization = request.GPUMemoryUtilization + if request.TrustRemoteCode: + engine_args.trust_remote_code = request.TrustRemoteCode + if request.EnforceEager: + engine_args.enforce_eager = request.EnforceEager + if request.TensorParallelSize: + engine_args.tensor_parallel_size = request.TensorParallelSize + if request.SwapSpace != 0: + engine_args.swap_space = request.SwapSpace + if request.MaxModelLen != 0: + engine_args.max_model_len = request.MaxModelLen + if request.DisableLogStatus: + engine_args.disable_log_status = request.DisableLogStatus + if request.DType != "": + engine_args.dtype = request.DType + if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: + # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs + engine_args.limit_mm_per_prompt = { + "image": max(request.LimitImagePerPrompt, 1), + "video": max(request.LimitVideoPerPrompt, 1), + "audio": max(request.LimitAudioPerPrompt, 1) + } + + try: + self.llm = AsyncLLMEngine.from_engine_args(engine_args) + except Exception as err: + print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + try: + engine_model_config = await self.llm.get_model_config() + self.tokenizer = get_tokenizer( + engine_model_config.tokenizer, + tokenizer_mode=engine_model_config.tokenizer_mode, + trust_remote_code=engine_model_config.trust_remote_code, + truncation_side="left", + ) + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print("Model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="Model loaded successfully", success=True) + + async def Predict(self, request, context): + """ + Generates text based on the given prompt and sampling parameters. + + Args: + request: The predict request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The predict result. 
+ """ + gen = self._predict(request, context, streaming=False) + res = await gen.__anext__() + return res + + def Embedding(self, request, context): + """ + A gRPC method that calculates embeddings for a given sentence. + + Args: + request: An EmbeddingRequest object that contains the request parameters. + context: A grpc.ServicerContext object that provides information about the RPC. + + Returns: + An EmbeddingResult object that contains the calculated embeddings. + """ + print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) + outputs = self.model.encode(request.Embeddings) + # Check if we have one result at least + if len(outputs) == 0: + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details("No embeddings were calculated.") + return backend_pb2.EmbeddingResult() + return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + + async def PredictStream(self, request, context): + """ + Generates text based on the given prompt and sampling parameters, and streams the results. + + Args: + request: The predict stream request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The predict stream result. + """ + iterations = self._predict(request, context, streaming=True) + try: + async for iteration in iterations: + yield iteration + finally: + await iterations.aclose() + + async def _predict(self, request, context, streaming=False): + # Build the sampling parameters + # NOTE: this must stay in sync with the vllm backend + request_to_sampling_params = { + "N": "n", + "PresencePenalty": "presence_penalty", + "FrequencyPenalty": "frequency_penalty", + "RepetitionPenalty": "repetition_penalty", + "Temperature": "temperature", + "TopP": "top_p", + "TopK": "top_k", + "MinP": "min_p", + "Seed": "seed", + "StopPrompts": "stop", + "StopTokenIds": "stop_token_ids", + "BadWords": "bad_words", + "IncludeStopStrInOutput": "include_stop_str_in_output", + "IgnoreEOS": "ignore_eos", + "Tokens": "max_tokens", + "MinTokens": "min_tokens", + "Logprobs": "logprobs", + "PromptLogprobs": "prompt_logprobs", + "SkipSpecialTokens": "skip_special_tokens", + "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", + "TruncatePromptTokens": "truncate_prompt_tokens", + "GuidedDecoding": "guided_decoding", + } + + sampling_params = SamplingParams(top_p=0.9, max_tokens=200) + + for request_field, param_field in request_to_sampling_params.items(): + if hasattr(request, request_field): + value = getattr(request, request_field) + if value not in (None, 0, [], False): + setattr(sampling_params, param_field, value) + + # Extract image paths and process images + prompt = request.Prompt + + image_paths = request.Images + image_data = [self.load_image(img_path) for img_path in image_paths] + + videos_path = request.Videos + video_data = [self.load_video(video_path) for video_path in videos_path] + + # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template + if not request.Prompt and request.UseTokenizerTemplate and request.Messages: + prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) + + # Generate text using the LLM engine + request_id = random_uuid() + print(f"Generating text with request_id: {request_id}", file=sys.stderr) + multi_modal_data = {} + if image_data: + multi_modal_data["image"] = image_data + if video_data: + multi_modal_data["video"] = video_data + outputs = self.llm.generate( + { + "prompt": prompt, + "multi_modal_data": 
multi_modal_data if multi_modal_data else None, + }, + sampling_params=sampling_params, + request_id=request_id, + ) + + # Stream the results + generated_text = "" + try: + async for request_output in outputs: + iteration_text = request_output.outputs[0].text + + if streaming: + # Remove text already sent as vllm concatenates the text from previous yields + delta_iteration_text = iteration_text.removeprefix(generated_text) + # Send the partial result + yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) + + # Keep track of text generated + generated_text = iteration_text + finally: + await outputs.aclose() + + # If streaming, we already sent everything + if streaming: + return + + # Remove the image files from /tmp folder + for img_path in image_paths: + try: + os.remove(img_path) + except Exception as e: + print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) + + # Sending the final generated text + yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + + def load_image(self, image_path: str): + """ + Load an image from the given file path or base64 encoded data. + + Args: + image_path (str): The path to the image file or base64 encoded data. + + Returns: + Image: The loaded image. + """ + try: + + image_data = base64.b64decode(image_path) + image = Image.open(io.BytesIO(image_data)) + return image + except Exception as e: + print(f"Error loading image {image_path}: {e}", file=sys.stderr) + return None + + def load_video(self, video_path: str): + """ + Load a video from the given file path. + + Args: + video_path (str): The path to the image file. + + Returns: + Video: The loaded video. + """ + try: + timestamp = str(int(time.time() * 1000)) # Generate timestamp + p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename + with open(p, "wb") as f: + f.write(base64.b64decode(video_path)) + video = VideoAsset(name=p).np_ndarrays + os.remove(p) + return video + except Exception as e: + print(f"Error loading video {video_path}: {e}", file=sys.stderr) + return None + +async def serve(address): + # Start asyncio gRPC server + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + # Add the servicer to the server + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + # Bind the server to the address + server.add_insecure_port(address) + + # Gracefully shutdown the server on SIGTERM or SIGINT + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler( + sig, lambda: asyncio.ensure_future(server.stop(5)) + ) + + # Start the server + await server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + # Wait for the server to be terminated + await server.wait_for_termination() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + asyncio.run(serve(args.addr)) diff --git a/backend/python/mlx/install.sh b/backend/python/mlx/install.sh new file mode 100755 index 000000000000..b8ee48552490 --- /dev/null +++ b/backend/python/mlx/install.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +USE_PIP=true + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +installRequirements diff --git a/backend/python/mlx/requirements.txt b/backend/python/mlx/requirements.txt new file mode 100644 index 000000000000..f1771cc4adb4 --- /dev/null +++ b/backend/python/mlx/requirements.txt @@ -0,0 +1,4 @@ +grpcio==1.71.0 +protobuf +certifi +setuptools \ No newline at end of file diff --git a/backend/python/mlx/run.sh b/backend/python/mlx/run.sh new file mode 100755 index 000000000000..fc88f97da712 --- /dev/null +++ b/backend/python/mlx/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx/test.py b/backend/python/mlx/test.py new file mode 100644 index 000000000000..827aa71a3e33 --- /dev/null +++ b/backend/python/mlx/test.py @@ -0,0 +1,146 @@ +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + +import unittest +import subprocess +import time +import grpc +import backend_pb2_grpc +import backend_pb2 + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service. + + This class contains methods to test the startup and shutdown of the gRPC service. 
+ """ + def setUp(self): + self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) + time.sleep(10) + + def tearDown(self) -> None: + self.service.terminate() + self.service.wait() + + def test_server_startup(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.Health(backend_pb2.HealthMessage()) + self.assertEqual(response.message, b'OK') + except Exception as err: + print(err) + self.fail("Server failed to start") + finally: + self.tearDown() + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() + + def test_text(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + req = backend_pb2.PredictOptions(Prompt="The capital of France is") + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + except Exception as err: + print(err) + self.fail("text service failed") + finally: + self.tearDown() + + def test_sampling_params(self): + """ + This method tests if all sampling parameters are correctly processed + NOTE: this does NOT test for correctness, just that we received a compatible response + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + + req = backend_pb2.PredictOptions( + Prompt="The capital of France is", + TopP=0.8, + Tokens=50, + Temperature=0.7, + TopK=40, + PresencePenalty=0.1, + FrequencyPenalty=0.2, + RepetitionPenalty=1.1, + MinP=0.05, + Seed=42, + StopPrompts=["\n"], + StopTokenIds=[50256], + BadWords=["badword"], + IncludeStopStrInOutput=True, + IgnoreEOS=True, + MinTokens=5, + Logprobs=5, + PromptLogprobs=5, + SkipSpecialTokens=True, + SpacesBetweenSpecialTokens=True, + TruncatePromptTokens=10, + GuidedDecoding=True, + N=2, + ) + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + self.assertIsNotNone(resp.logprobs) + except Exception as err: + print(err) + self.fail("sampling params service failed") + finally: + self.tearDown() + + + def test_embedding(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) + self.assertTrue(response.success) + embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") + embedding_response = stub.Embedding(embedding_request) + self.assertIsNotNone(embedding_response.embeddings) + # assert that is a list of floats + self.assertIsInstance(embedding_response.embeddings, list) + # assert that 
the list is not empty + self.assertTrue(len(embedding_response.embeddings) > 0) + except Exception as err: + print(err) + self.fail("Embedding service failed") + finally: + self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx/test.sh b/backend/python/mlx/test.sh new file mode 100755 index 000000000000..f31ae54e47dc --- /dev/null +++ b/backend/python/mlx/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests From d933847f04b94f96ee52a78c8b94e4a3ae2fbce8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:21:35 +0200 Subject: [PATCH 03/14] Make the backend to build and actually work Signed-off-by: Ettore Di Giacinto --- Makefile | 84 ++-- backend/python/mlx/Makefile | 12 +- backend/python/mlx/backend.py | 463 +++++++++--------- backend/python/mlx/install.sh | 1 + backend/python/mlx/requirements-mps.txt | 1 + .../llama-cpp-darwin.sh} | 0 6 files changed, 288 insertions(+), 273 deletions(-) create mode 100644 backend/python/mlx/requirements-mps.txt rename scripts/{build-llama-cpp-darwin.sh => build/llama-cpp-darwin.sh} (100%) diff --git a/Makefile b/Makefile index a050f84f8d7c..9ce4c079e73e 100644 --- a/Makefile +++ b/Makefile @@ -132,43 +132,6 @@ test: test-models/testmodel.ggml protogen-go $(MAKE) test-tts $(MAKE) test-stablediffusion -backends/diffusers: docker-build-diffusers docker-save-diffusers build - ./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)" - -backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build - ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" - -backends/piper: docker-build-piper docker-save-piper build - ./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)" - -backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build - ./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)" - -backends/whisper: docker-build-whisper docker-save-whisper build - ./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)" - -backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build - ./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)" - -backends/local-store: docker-build-local-store docker-save-local-store build - ./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)" - -backends/huggingface: docker-build-huggingface docker-save-huggingface build - ./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)" - -backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build - ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" - -backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build - ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" - -backends/kokoro: docker-build-kokoro docker-save-kokoro build - ./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)" - -backends/llama-cpp-darwin: build - bash ./scripts/build-llama-cpp-darwin.sh - ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" - ######################################################## ## AIO tests ######################################################## @@ -361,6 +324,47 @@ 
docker-image-intel: ## Backends ######################################################## + +backends/diffusers: docker-build-diffusers docker-save-diffusers build + ./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)" + +backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" + +backends/piper: docker-build-piper docker-save-piper build + ./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)" + +backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build + ./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)" + +backends/whisper: docker-build-whisper docker-save-whisper build + ./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)" + +backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build + ./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)" + +backends/local-store: docker-build-local-store docker-save-local-store build + ./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)" + +backends/huggingface: docker-build-huggingface docker-save-huggingface build + ./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)" + +backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build + ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" + +backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" + +backends/kokoro: docker-build-kokoro docker-save-kokoro build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)" + +backends/llama-cpp-darwin: build + bash ./scripts/build/llama-cpp-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" + +backends/nemo: docker-build-nemo docker-save-nemo build + ./local-ai backends install "ocifile://$(abspath ./backend-images/nemo.tar)" + backend-images: mkdir -p backend-images @@ -391,6 +395,12 @@ docker-save-kitten-tts: backend-images docker-build-kokoro: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend +docker-build-nemo: + docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:nemo -f backend/Dockerfile.python --build-arg BACKEND=nemo ./backend + +docker-save-nemo: backend-images + docker save local-ai-backend:nemo -o backend-images/nemo.tar + docker-save-kokoro: backend-images docker save local-ai-backend:kokoro -o backend-images/kokoro.tar diff --git a/backend/python/mlx/Makefile b/backend/python/mlx/Makefile index c4c18bee55b0..06f3bf614854 100644 --- a/backend/python/mlx/Makefile +++ b/backend/python/mlx/Makefile @@ -1,29 +1,23 @@ .PHONY: mlx -mlx: protogen +mlx: bash install.sh .PHONY: run -run: protogen +run: @echo "Running mlx..." bash run.sh @echo "mlx run." .PHONY: test -test: protogen +test: @echo "Testing mlx..." bash test.sh @echo "mlx tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py index 56698a54e5f5..84024b387f29 100644 --- a/backend/python/mlx/backend.py +++ b/backend/python/mlx/backend.py @@ -6,19 +6,16 @@ import sys import os from typing import List -from PIL import Image +import time import backend_pb2 import backend_pb2_grpc import grpc -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.multimodal.utils import fetch_image -from vllm.assets.video import VideoAsset +from mlx_lm import load, generate, stream_generate +from mlx_lm.sample_utils import make_sampler +from mlx_lm.models.cache import make_prompt_cache +import mlx.core as mx import base64 import io @@ -32,38 +29,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): """ A gRPC servicer that implements the Backend service defined in backend.proto. """ - def generate(self,prompt, max_new_tokens): - """ - Generates text based on the given prompt and maximum number of new tokens. - - Args: - prompt (str): The prompt to generate text from. - max_new_tokens (int): The maximum number of new tokens to generate. - - Returns: - str: The generated text. - """ - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text + def _is_float(self, s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text + def _is_int(self, s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False def Health(self, request, context): """ @@ -80,7 +61,7 @@ def Health(self, request, context): async def LoadModel(self, request, context): """ - Loads a language model. + Loads a language model using MLX. Args: request: The load model request. @@ -89,60 +70,70 @@ async def LoadModel(self, request, context): Returns: backend_pb2.Result: The load model result. 
""" - engine_args = AsyncEngineArgs( - model=request.Model, - ) - - if request.Quantization != "": - engine_args.quantization = request.Quantization - if request.LoadFormat != "": - engine_args.load_format = request.LoadFormat - if request.GPUMemoryUtilization != 0: - engine_args.gpu_memory_utilization = request.GPUMemoryUtilization - if request.TrustRemoteCode: - engine_args.trust_remote_code = request.TrustRemoteCode - if request.EnforceEager: - engine_args.enforce_eager = request.EnforceEager - if request.TensorParallelSize: - engine_args.tensor_parallel_size = request.TensorParallelSize - if request.SwapSpace != 0: - engine_args.swap_space = request.SwapSpace - if request.MaxModelLen != 0: - engine_args.max_model_len = request.MaxModelLen - if request.DisableLogStatus: - engine_args.disable_log_status = request.DisableLogStatus - if request.DType != "": - engine_args.dtype = request.DType - if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: - # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs - engine_args.limit_mm_per_prompt = { - "image": max(request.LimitImagePerPrompt, 1), - "video": max(request.LimitVideoPerPrompt, 1), - "audio": max(request.LimitAudioPerPrompt, 1) - } - try: - self.llm = AsyncLLMEngine.from_engine_args(engine_args) + print(f"Loading MLX model: {request.Model}", file=sys.stderr) + print(f"Request: {request}", file=sys.stderr) + + # Parse options like in the diffusers backend + options = request.Options + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We store all the options in a dict for later use + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon to handle values with colons + + # Convert numeric values to appropriate types + if self._is_float(value): + value = float(value) + elif self._is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + + self.options[key] = value + + print(f"Options: {self.options}", file=sys.stderr) + + # Build tokenizer config for MLX using options + tokenizer_config = {} + + # Handle trust_remote_code from request or options + if request.TrustRemoteCode or self.options.get("trust_remote_code", False): + tokenizer_config["trust_remote_code"] = True + + # Handle EOS token from options + if "eos_token" in self.options: + tokenizer_config["eos_token"] = self.options["eos_token"] + + # Handle other tokenizer config options + for key in ["pad_token", "bos_token", "unk_token", "sep_token", "cls_token", "mask_token"]: + if key in self.options: + tokenizer_config[key] = self.options[key] + + # Load model and tokenizer using MLX + if tokenizer_config: + print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr) + self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config) + else: + self.model, self.tokenizer = load(request.Model) + + # Initialize prompt cache for efficient generation + max_kv_size = self.options.get("max_kv_size", None) + self.prompt_cache = make_prompt_cache(self.model, max_kv_size) + except Exception as err: - print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print(f"Error loading MLX model {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Error loading MLX model: {err}") - try: - engine_model_config = await 
self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - print("Model loaded successfully", file=sys.stderr) - return backend_pb2.Result(message="Model loaded successfully", success=True) + print("MLX model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="MLX model loaded successfully", success=True) async def Predict(self, request, context): """ - Generates text based on the given prompt and sampling parameters. + Generates text based on the given prompt and sampling parameters using MLX. Args: request: The predict request. @@ -151,13 +142,42 @@ async def Predict(self, request, context): Returns: backend_pb2.Reply: The predict result. """ - gen = self._predict(request, context, streaming=False) - res = await gen.__anext__() - return res + try: + # Prepare the prompt + prompt = self._prepare_prompt(request) + + # Build generation parameters using request attributes and options + max_tokens, sampler_params = self._build_generation_params(request) + + print(f"Generating text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr) + + # Create sampler with parameters + sampler = make_sampler(**sampler_params) + + # Generate text using MLX with proper parameters + response = generate( + self.model, + self.tokenizer, + prompt=prompt, + max_tokens=max_tokens, + sampler=sampler, + prompt_cache=self.prompt_cache, + verbose=False + ) + + return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX Predict: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Generation failed: {str(e)}") + return backend_pb2.Reply(message=bytes("", encoding='utf-8')) def Embedding(self, request, context): """ A gRPC method that calculates embeddings for a given sentence. + + Note: MLX-LM doesn't support embeddings directly. This method returns an error. Args: request: An EmbeddingRequest object that contains the request parameters. @@ -166,170 +186,159 @@ def Embedding(self, request, context): Returns: An EmbeddingResult object that contains the calculated embeddings. """ - print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) - outputs = self.model.encode(request.Embeddings) - # Check if we have one result at least - if len(outputs) == 0: - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details("No embeddings were calculated.") - return backend_pb2.EmbeddingResult() - return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + print("Embeddings not supported in MLX backend", file=sys.stderr) + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Embeddings are not supported in the MLX backend.") + return backend_pb2.EmbeddingResult() async def PredictStream(self, request, context): """ - Generates text based on the given prompt and sampling parameters, and streams the results. + Generates text based on the given prompt and sampling parameters, and streams the results using MLX. Args: request: The predict stream request. context: The gRPC context. - Returns: - backend_pb2.Result: The predict stream result. + Yields: + backend_pb2.Reply: Streaming predict results. 
""" - iterations = self._predict(request, context, streaming=True) try: - async for iteration in iterations: - yield iteration - finally: - await iterations.aclose() - - async def _predict(self, request, context, streaming=False): - # Build the sampling parameters - # NOTE: this must stay in sync with the vllm backend - request_to_sampling_params = { - "N": "n", - "PresencePenalty": "presence_penalty", - "FrequencyPenalty": "frequency_penalty", - "RepetitionPenalty": "repetition_penalty", - "Temperature": "temperature", - "TopP": "top_p", - "TopK": "top_k", - "MinP": "min_p", - "Seed": "seed", - "StopPrompts": "stop", - "StopTokenIds": "stop_token_ids", - "BadWords": "bad_words", - "IncludeStopStrInOutput": "include_stop_str_in_output", - "IgnoreEOS": "ignore_eos", - "Tokens": "max_tokens", - "MinTokens": "min_tokens", - "Logprobs": "logprobs", - "PromptLogprobs": "prompt_logprobs", - "SkipSpecialTokens": "skip_special_tokens", - "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", - "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", - } - - sampling_params = SamplingParams(top_p=0.9, max_tokens=200) - - for request_field, param_field in request_to_sampling_params.items(): - if hasattr(request, request_field): - value = getattr(request, request_field) - if value not in (None, 0, [], False): - setattr(sampling_params, param_field, value) - - # Extract image paths and process images - prompt = request.Prompt + # Prepare the prompt + prompt = self._prepare_prompt(request) + + # Build generation parameters using request attributes and options + max_tokens, sampler_params = self._build_generation_params(request, default_max_tokens=512) + + print(f"Streaming text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr) + + # Create sampler with parameters + sampler = make_sampler(**sampler_params) + + # Stream text generation using MLX with proper parameters + for response in stream_generate( + self.model, + self.tokenizer, + prompt=prompt, + max_tokens=max_tokens, + sampler=sampler, + prompt_cache=self.prompt_cache, + ): + yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX PredictStream: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Streaming generation failed: {str(e)}") + yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) - image_paths = request.Images - image_data = [self.load_image(img_path) for img_path in image_paths] + def _prepare_prompt(self, request): + """ + Prepare the prompt for MLX generation, handling chat templates if needed. - videos_path = request.Videos - video_data = [self.load_video(video_path) for video_path in videos_path] + Args: + request: The gRPC request containing prompt and message information. + Returns: + str: The prepared prompt. 
+ """ # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) - - # Generate text using the LLM engine - request_id = random_uuid() - print(f"Generating text with request_id: {request_id}", file=sys.stderr) - multi_modal_data = {} - if image_data: - multi_modal_data["image"] = image_data - if video_data: - multi_modal_data["video"] = video_data - outputs = self.llm.generate( - { - "prompt": prompt, - "multi_modal_data": multi_modal_data if multi_modal_data else None, - }, - sampling_params=sampling_params, - request_id=request_id, - ) + # Convert gRPC messages to the format expected by apply_chat_template + messages = [] + for msg in request.Messages: + messages.append({"role": msg.role, "content": msg.content}) + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + return prompt + else: + return request.Prompt - # Stream the results - generated_text = "" - try: - async for request_output in outputs: - iteration_text = request_output.outputs[0].text - - if streaming: - # Remove text already sent as vllm concatenates the text from previous yields - delta_iteration_text = iteration_text.removeprefix(generated_text) - # Send the partial result - yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) - - # Keep track of text generated - generated_text = iteration_text - finally: - await outputs.aclose() - - # If streaming, we already sent everything - if streaming: - return - - # Remove the image files from /tmp folder - for img_path in image_paths: - try: - os.remove(img_path) - except Exception as e: - print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) - - # Sending the final generated text - yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) - - def load_image(self, image_path: str): - """ - Load an image from the given file path or base64 encoded data. - Args: - image_path (str): The path to the image file or base64 encoded data. - Returns: - Image: The loaded image. - """ - try: - image_data = base64.b64decode(image_path) - image = Image.open(io.BytesIO(image_data)) - return image - except Exception as e: - print(f"Error loading image {image_path}: {e}", file=sys.stderr) - return None - def load_video(self, video_path: str): + def _build_generation_params(self, request, default_max_tokens=200): """ - Load a video from the given file path. + Build generation parameters from request attributes and options. Args: - video_path (str): The path to the image file. + request: The gRPC request. + default_max_tokens: Default max_tokens if not specified. Returns: - Video: The loaded video. 
+ tuple: (max_tokens, sampler_params dict) """ + # Extract max_tokens + max_tokens = getattr(request, 'Tokens', default_max_tokens) + if max_tokens == 0: + max_tokens = default_max_tokens + + # Extract sampler parameters from request attributes + temp = getattr(request, 'Temperature', 0.0) + if temp == 0.0: + temp = 0.6 # Default temperature + + top_p = getattr(request, 'TopP', 0.0) + if top_p == 0.0: + top_p = 1.0 # Default top_p + + # Initialize sampler parameters + sampler_params = { + 'temp': temp, + 'top_p': top_p, + 'xtc_threshold': 0.0, + 'xtc_probability': 0.0, + } + + # Add seed if specified + seed = getattr(request, 'Seed', 0) + if seed != 0: + mx.random.seed(seed) + + # Override with options if available + if hasattr(self, 'options'): + # Max tokens from options + if 'max_tokens' in self.options: + max_tokens = self.options['max_tokens'] + + # Sampler parameters from options + sampler_option_mapping = { + 'temp': 'temp', + 'temperature': 'temp', # alias + 'top_p': 'top_p', + 'xtc_threshold': 'xtc_threshold', + 'xtc_probability': 'xtc_probability', + } + + for option_key, param_key in sampler_option_mapping.items(): + if option_key in self.options: + sampler_params[param_key] = self.options[option_key] + + # Handle seed from options + if 'seed' in self.options: + mx.random.seed(self.options['seed']) + + # Special tokens for XTC sampling (if tokenizer has eos_token_ids) + xtc_special_tokens = [] + if hasattr(self.tokenizer, 'eos_token_ids') and self.tokenizer.eos_token_ids: + xtc_special_tokens = list(self.tokenizer.eos_token_ids) + elif hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + xtc_special_tokens = [self.tokenizer.eos_token_id] + + # Add newline token if available try: - timestamp = str(int(time.time() * 1000)) # Generate timestamp - p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename - with open(p, "wb") as f: - f.write(base64.b64decode(video_path)) - video = VideoAsset(name=p).np_ndarrays - os.remove(p) - return video - except Exception as e: - print(f"Error loading video {video_path}: {e}", file=sys.stderr) - return None + newline_tokens = self.tokenizer.encode("\n") + xtc_special_tokens.extend(newline_tokens) + except: + pass # Skip if encoding fails + + sampler_params['xtc_special_tokens'] = xtc_special_tokens + + return max_tokens, sampler_params async def serve(address): # Start asyncio gRPC server diff --git a/backend/python/mlx/install.sh b/backend/python/mlx/install.sh index b8ee48552490..253ee0c13f1b 100755 --- a/backend/python/mlx/install.sh +++ b/backend/python/mlx/install.sh @@ -2,6 +2,7 @@ set -e USE_PIP=true +PYTHON_VERSION="" backend_dir=$(dirname $0) diff --git a/backend/python/mlx/requirements-mps.txt b/backend/python/mlx/requirements-mps.txt new file mode 100644 index 000000000000..22737f5fdda7 --- /dev/null +++ b/backend/python/mlx/requirements-mps.txt @@ -0,0 +1 @@ +mlx-lm \ No newline at end of file diff --git a/scripts/build-llama-cpp-darwin.sh b/scripts/build/llama-cpp-darwin.sh similarity index 100% rename from scripts/build-llama-cpp-darwin.sh rename to scripts/build/llama-cpp-darwin.sh From 8ed272ec0a23a679f1cb6bfae1b86a59c1876aca Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:22:42 +0200 Subject: [PATCH 04/14] List models from system only Signed-off-by: Ettore Di Giacinto --- Makefile | 11 +++-------- core/gallery/gallery.go | 13 +++++++------ core/http/endpoints/localai/welcome.go | 10 +++------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/Makefile 
b/Makefile index 9ce4c079e73e..af95e2374575 100644 --- a/Makefile +++ b/Makefile @@ -362,8 +362,9 @@ backends/llama-cpp-darwin: build bash ./scripts/build/llama-cpp-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" -backends/nemo: docker-build-nemo docker-save-nemo build - ./local-ai backends install "ocifile://$(abspath ./backend-images/nemo.tar)" +backends/mlx: build + BACKEND=mlx BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" backend-images: mkdir -p backend-images @@ -395,12 +396,6 @@ docker-save-kitten-tts: backend-images docker-build-kokoro: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend -docker-build-nemo: - docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:nemo -f backend/Dockerfile.python --build-arg BACKEND=nemo ./backend - -docker-save-nemo: backend-images - docker save local-ai-backend:nemo -o backend-images/nemo.tar - docker-save-kokoro: backend-images docker save local-ai-backend:kokoro -o backend-images/kokoro.tar diff --git a/core/gallery/gallery.go b/core/gallery/gallery.go index a80550102b17..e746f71a347f 100644 --- a/core/gallery/gallery.go +++ b/core/gallery/gallery.go @@ -141,14 +141,15 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) { var backends []*GalleryBackend + systemBackends, err := ListSystemBackends(systemState) + if err != nil { + return nil, err + } + // Get backends from galleries for _, gallery := range galleries { - galleryBackends, err := getGalleryElements[*GalleryBackend](gallery, systemState.Backend.BackendsPath, func(backend *GalleryBackend) bool { - backends, err := ListSystemBackends(systemState) - if err != nil { - return false - } - return backends.Exists(backend.GetName()) + galleryBackends, err := getGalleryElements(gallery, systemState.Backend.BackendsPath, func(backend *GalleryBackend) bool { + return systemBackends.Exists(backend.GetName()) }) if err != nil { return nil, err diff --git a/core/http/endpoints/localai/welcome.go b/core/http/endpoints/localai/welcome.go index 23efd0788dc5..04f72743e34e 100644 --- a/core/http/endpoints/localai/welcome.go +++ b/core/http/endpoints/localai/welcome.go @@ -16,13 +16,9 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig, modelConfigs := cl.GetAllModelsConfigs() galleryConfigs := map[string]*gallery.ModelConfig{} - backends, _ := gallery.AvailableBackends(appConfig.BackendGalleries, appConfig.SystemState) - - installedBackends := gallery.GalleryElements[*gallery.GalleryBackend]{} - for _, b := range backends { - if b.Installed { - installedBackends = append(installedBackends, b) - } + installedBackends, err := gallery.ListSystemBackends(appConfig.SystemState) + if err != nil { + return err } for _, m := range modelConfigs { From 4662a9f65145f3152a7d43d8c5bae24cf69419b7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:23:20 +0200 Subject: [PATCH 05/14] Add script to build darwin python backends Signed-off-by: Ettore Di Giacinto --- scripts/build/python-darwin.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 scripts/build/python-darwin.sh diff --git 
a/scripts/build/python-darwin.sh b/scripts/build/python-darwin.sh new file mode 100644 index 000000000000..6166a2630322 --- /dev/null +++ b/scripts/build/python-darwin.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -ex + +IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-darwin}" +mkdir -p backend-images +make -C backend/python/${BACKEND} + +cp -rfv backend/python/common backend/python/${BACKEND}/ + +PLATFORMARCH="${PLATFORMARCH:-darwin/arm64}" + +./local-ai util create-oci-image \ + backend/python/${BACKEND}/. \ + --output ./backend-images/${BACKEND}.tar \ + --image-name $IMAGE_NAME \ + --platform $PLATFORMARCH + +make -C backend/python/${BACKEND} clean + From 1c86dfd2c8c941a2a4beefc3a65aecbbb5ed0475 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:24:15 +0200 Subject: [PATCH 06/14] Run protogen in libbackend Signed-off-by: Ettore Di Giacinto --- backend/python/bark/Makefile | 12 +--- backend/python/chatterbox/Makefile | 14 ++-- backend/python/common/libbackend.sh | 75 ++++++++++++---------- backend/python/common/template/Makefile | 7 -- backend/python/common/template/protogen.sh | 4 +- backend/python/coqui/Makefile | 12 +--- backend/python/diffusers/Makefile | 12 +--- backend/python/exllama2/Makefile | 10 +-- backend/python/faster-whisper/Makefile | 7 -- backend/python/kitten-tts/Makefile | 12 +--- backend/python/kokoro/Makefile | 6 -- backend/python/rerankers/Makefile | 12 +--- backend/python/rfdetr/Makefile | 7 -- backend/python/rfdetr/protogen.sh | 13 ---- backend/python/transformers/Makefile | 12 +--- backend/python/vllm/Makefile | 12 +--- 16 files changed, 68 insertions(+), 159 deletions(-) delete mode 100644 backend/python/rfdetr/protogen.sh diff --git a/backend/python/bark/Makefile b/backend/python/bark/Makefile index ef4fff1bef9d..da996aabeef0 100644 --- a/backend/python/bark/Makefile +++ b/backend/python/bark/Makefile @@ -1,29 +1,23 @@ .PHONY: ttsbark -ttsbark: protogen +ttsbark: bash install.sh .PHONY: run -run: protogen +run: ttsbark @echo "Running bark..." bash run.sh @echo "bark run." .PHONY: test -test: protogen +test: ttsbark @echo "Testing bark..." bash test.sh @echo "bark tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/chatterbox/Makefile b/backend/python/chatterbox/Makefile index a69c0bcf58ca..be9330f8eac9 100644 --- a/backend/python/chatterbox/Makefile +++ b/backend/python/chatterbox/Makefile @@ -1,29 +1,23 @@ -.PHONY: coqui -coqui: protogen +.PHONY: chatterbox +chatterbox: bash install.sh .PHONY: run -run: protogen +run: chatterbox @echo "Running coqui..." bash run.sh @echo "coqui run." .PHONY: test -test: protogen +test: chatterbox @echo "Testing coqui..." bash test.sh @echo "coqui tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
\ No newline at end of file
diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh
index d98924fe3550..409d20665932 100644
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -21,7 +21,7 @@
 # USE_PIP=true source $(dirname $0)/../common/libbackend.sh
 #
-PYTHON_VERSION="3.10"
+PYTHON_VERSION="${PYTHON_VERSION:-3.10}"
 
 # Default to uv if USE_PIP is not set
 if [ "x${USE_PIP}" == "x" ]; then
@@ -56,11 +56,6 @@ function init() {
     fi
 
     echo "Initializing libbackend for ${BACKEND_NAME}"
-    if [ "x${USE_PIP}" == "xtrue" ]; then
-        echo "Using pip and Python virtual environments"
-    else
-        echo "Using uv package manager"
-    fi
 }
 
 # getBuildProfile will inspect the system to determine which build profile is appropriate:
@@ -70,11 +65,6 @@ function init() {
 # - hipblas
 # - intel
 function getBuildProfile() {
-    if [ "x${BUILD_TYPE}" == "xl4t" ]; then
-        echo "l4t"
-        return 0
-    fi
-
     # First check if we are a cublas build, and if so report the correct build profile
     if [ x"${BUILD_TYPE}" == "xcublas" ]; then
         if [ ! -z ${CUDA_MAJOR_VERSION} ]; then
@@ -94,7 +84,7 @@ function getBuildProfile() {
     fi
 
     # If for any other values of BUILD_TYPE, we don't need any special handling/discovery
-    if [ ! -z ${BUILD_TYPE} ]; then
+    if [ -n "${BUILD_TYPE}" ]; then
         echo ${BUILD_TYPE}
         return 0
     fi
@@ -108,35 +98,48 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ "x${USE_PIP}" == "xtrue" ]; then
-        # Use Python virtual environment with pip
-        if [ ! -d "${EDIR}/venv" ]; then
-            python${PYTHON_VERSION} -m venv ${EDIR}/venv
-            echo "Python virtual environment created"
-        fi
-
-        # Source if we are not already in a Virtual env
-        if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-            source ${EDIR}/venv/bin/activate
-            echo "Python virtual environment activated"
-        fi
-    else
-        # Use uv (conda-like)
-        if [ ! -d "${EDIR}/venv" ]; then
-            uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
-            echo "uv virtual environment created"
-        fi
-
-        # Source if we are not already in a Virtual env
-        if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-            source ${EDIR}/venv/bin/activate
-            echo "uv virtual environment activated"
+    if [ ! -d "${EDIR}/venv" ]; then
+        if [ "x${USE_PIP}" == "xtrue" ]; then
+            echo "Using pip and Python virtual environments"
+
+            # Use Python virtual environment with pip
+            interpreter="python3"
+            # Prefer python${PYTHON_VERSION} when it is available, otherwise fall back to python3
+
+            if command -v python${PYTHON_VERSION} &> /dev/null; then
+                interpreter="python${PYTHON_VERSION}"
+            fi
+            echo "Using interpreter: ${interpreter}"
+            ${interpreter} -m venv ${EDIR}/venv
+            echo "Python virtual environment created"
+        else
+            echo "Using uv package manager"
+            uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+            echo "uv virtual environment created"
         fi
     fi
+    # Source if we are not already in a Virtual env
+    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
+        source ${EDIR}/venv/bin/activate
+        echo "Python virtual environment activated"
+    fi
     echo "activated virtual environment has been ensured"
 }
 
+function runProtogen() {
+    ensureVenv
+
+    if [ "x${USE_PIP}" == "xtrue" ]; then
+        pip install grpcio-tools
+    else
+        uv pip install grpcio-tools
+    fi
+    pushd ${EDIR}
+    python3 -m grpc_tools.protoc -I../../ -I./ --python_out=. --grpc_python_out=. 
backend.proto + popd +} + # installRequirements looks for several requirements files and if they exist runs the install for them in order # # - requirements-install.txt @@ -196,6 +199,8 @@ function installRequirements() { echo "finished requirements install for ${reqFile}" fi done + + runProtogen } # startBackend discovers and runs the backend GRPC server diff --git a/backend/python/common/template/Makefile b/backend/python/common/template/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/common/template/Makefile +++ b/backend/python/common/template/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/common/template/protogen.sh b/backend/python/common/template/protogen.sh index 0569b6c6e4b3..cba7791cbce3 100644 --- a/backend/python/common/template/protogen.sh +++ b/backend/python/common/template/protogen.sh @@ -8,6 +8,4 @@ else source $backend_dir/../common/libbackend.sh fi -ensureVenv - -python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto \ No newline at end of file +runProtogen \ No newline at end of file diff --git a/backend/python/coqui/Makefile b/backend/python/coqui/Makefile index a69c0bcf58ca..6915b0f9f896 100644 --- a/backend/python/coqui/Makefile +++ b/backend/python/coqui/Makefile @@ -1,29 +1,23 @@ .PHONY: coqui -coqui: protogen +coqui: bash install.sh .PHONY: run -run: protogen +run: coqui @echo "Running coqui..." bash run.sh @echo "coqui run." .PHONY: test -test: protogen +test: coqui @echo "Testing coqui..." bash test.sh @echo "coqui tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/diffusers/Makefile b/backend/python/diffusers/Makefile index 01156f876f00..f9ded4a1cff7 100644 --- a/backend/python/diffusers/Makefile +++ b/backend/python/diffusers/Makefile @@ -12,28 +12,22 @@ export SKIP_CONDA=1 endif .PHONY: diffusers -diffusers: protogen +diffusers: bash install.sh .PHONY: run -run: protogen +run: diffusers @echo "Running diffusers..." bash run.sh @echo "Diffusers run." -test: protogen +test: diffusers bash test.sh -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/exllama2/Makefile b/backend/python/exllama2/Makefile index 68a18f3aa855..15ba38d120f3 100644 --- a/backend/python/exllama2/Makefile +++ b/backend/python/exllama2/Makefile @@ -1,23 +1,17 @@ .PHONY: exllama2 -exllama2: protogen +exllama2: bash install.sh .PHONY: run -run: protogen +run: exllama2 @echo "Running exllama2..." bash run.sh @echo "exllama2 run." 
-.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/faster-whisper/Makefile b/backend/python/faster-whisper/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/faster-whisper/Makefile +++ b/backend/python/faster-whisper/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/kitten-tts/Makefile b/backend/python/kitten-tts/Makefile index f05fc191698f..021a9679bfd2 100644 --- a/backend/python/kitten-tts/Makefile +++ b/backend/python/kitten-tts/Makefile @@ -1,29 +1,23 @@ .PHONY: kitten-tts -kitten-tts: protogen +kitten-tts: bash install.sh .PHONY: run -run: protogen +run: kitten-tts @echo "Running kitten-tts..." bash run.sh @echo "kitten-tts run." .PHONY: test -test: protogen +test: kitten-tts @echo "Testing kitten-tts..." bash test.sh @echo "kitten-tts tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/kokoro/Makefile b/backend/python/kokoro/Makefile index 660aabc34cec..29fc84b53159 100644 --- a/backend/python/kokoro/Makefile +++ b/backend/python/kokoro/Makefile @@ -14,16 +14,10 @@ test: protogen bash test.sh @echo "kokoro tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rerankers/Makefile b/backend/python/rerankers/Makefile index 82de822ff326..c9a1d30104b4 100644 --- a/backend/python/rerankers/Makefile +++ b/backend/python/rerankers/Makefile @@ -1,30 +1,24 @@ .PHONY: rerankers -rerankers: protogen +rerankers: bash install.sh .PHONY: run -run: protogen +run: rerankers @echo "Running rerankers..." bash run.sh @echo "rerankers run." # It is not working well by using command line. It only6 works with IDE like VSCode. .PHONY: test -test: protogen +test: rerankers @echo "Testing rerankers..." bash test.sh @echo "rerankers tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rfdetr/Makefile b/backend/python/rfdetr/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/rfdetr/Makefile +++ b/backend/python/rfdetr/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rfdetr/protogen.sh b/backend/python/rfdetr/protogen.sh deleted file mode 100644 index 0569b6c6e4b3..000000000000 --- a/backend/python/rfdetr/protogen.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -backend_dir=$(dirname $0) -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -ensureVenv - -python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto \ No newline at end of file diff --git a/backend/python/transformers/Makefile b/backend/python/transformers/Makefile index 8d3f7fd73c9d..6897baf0c9b4 100644 --- a/backend/python/transformers/Makefile +++ b/backend/python/transformers/Makefile @@ -1,30 +1,24 @@ .PHONY: transformers -transformers: protogen +transformers: bash install.sh .PHONY: run -run: protogen +run: transformers @echo "Running transformers..." bash run.sh @echo "transformers run." # It is not working well by using command line. It only6 works with IDE like VSCode. .PHONY: test -test: protogen +test: transformers @echo "Testing transformers..." bash test.sh @echo "transformers tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/vllm/Makefile b/backend/python/vllm/Makefile index bb57a19266bd..c7c1b6869c02 100644 --- a/backend/python/vllm/Makefile +++ b/backend/python/vllm/Makefile @@ -1,29 +1,23 @@ .PHONY: vllm -vllm: protogen +vllm: bash install.sh .PHONY: run -run: protogen +run: vllm @echo "Running vllm..." bash run.sh @echo "vllm run." .PHONY: test -test: protogen +test: vllm @echo "Testing vllm..." bash test.sh @echo "vllm tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file From 1fbe95f1d70c2fb9d4fe8cdadbf392824d756fb8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:24:40 +0200 Subject: [PATCH 07/14] Detect if mps is available across python backends Signed-off-by: Ettore Di Giacinto --- backend/python/chatterbox/backend.py | 4 +++- backend/python/coqui/backend.py | 4 +++- backend/python/diffusers/backend.py | 3 +++ backend/python/faster-whisper/backend.py | 6 ++++-- backend/python/kitten-tts/backend.py | 12 ------------ backend/python/kokoro/backend.py | 11 ----------- backend/python/transformers/backend.py | 4 +++- 7 files changed, 16 insertions(+), 28 deletions(-) diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py index d7d241c302ac..0944202b9457 100644 --- a/backend/python/chatterbox/backend.py +++ b/backend/python/chatterbox/backend.py @@ -41,7 +41,9 @@ def LoadModel(self, request, context): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") diff --git a/backend/python/coqui/backend.py b/backend/python/coqui/backend.py index f940f8e0a403..df115adb5030 100644 --- a/backend/python/coqui/backend.py +++ b/backend/python/coqui/backend.py @@ -40,7 +40,9 @@ def LoadModel(self, request, context): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") diff --git a/backend/python/diffusers/backend.py b/backend/python/diffusers/backend.py index 185838209895..ef5f1b5c07ce 100755 --- a/backend/python/diffusers/backend.py +++ b/backend/python/diffusers/backend.py @@ -368,6 +368,9 @@ def LoadModel(self, request, context): device = "cpu" if not request.CUDA else "cuda" if XPU: device = "xpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" self.device = device if request.LoraAdapter: # Check if its a local file and not a directory ( we load lora differently for a safetensor file ) diff --git a/backend/python/faster-whisper/backend.py b/backend/python/faster-whisper/backend.py index b73664ab88f7..808f29238207 100755 --- a/backend/python/faster-whisper/backend.py +++ b/backend/python/faster-whisper/backend.py @@ -10,7 +10,7 @@ import os import backend_pb2 import backend_pb2_grpc - +import torch from faster_whisper import WhisperModel import grpc @@ -35,7 +35,9 @@ def LoadModel(self, request, context): # device = "cuda" if request.CUDA else "cpu" if request.CUDA: device = "cuda" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" try: print("Preparing models, please wait", file=sys.stderr) self.model = WhisperModel(request.Model, device=device, compute_type="float16") diff --git a/backend/python/kitten-tts/backend.py b/backend/python/kitten-tts/backend.py index 775f85f57d0b..b31023c8cac6 100644 --- a/backend/python/kitten-tts/backend.py +++ b/backend/python/kitten-tts/backend.py @@ -33,18 +33,6 @@ def Health(self, request, context): 
return backend_pb2.Reply(message=bytes("OK", 'utf-8')) def LoadModel(self, request, context): - # Get device - # device = "cuda" if request.CUDA else "cpu" - if torch.cuda.is_available(): - print("CUDA is available", file=sys.stderr) - device = "cuda" - else: - print("CUDA is not available", file=sys.stderr) - device = "cpu" - - if not torch.cuda.is_available() and request.CUDA: - return backend_pb2.Result(success=False, message="CUDA is not available") - self.AudioPath = None # List available KittenTTS models print("Available KittenTTS voices: expr-voice-2-m, expr-voice-2-f, expr-voice-3-m, expr-voice-3-f, expr-voice-4-m, expr-voice-4-f, expr-voice-5-m, expr-voice-5-f") diff --git a/backend/python/kokoro/backend.py b/backend/python/kokoro/backend.py index 83a3f3326fbd..32aefa558e8a 100644 --- a/backend/python/kokoro/backend.py +++ b/backend/python/kokoro/backend.py @@ -33,17 +33,6 @@ def Health(self, request, context): return backend_pb2.Reply(message=bytes("OK", 'utf-8')) def LoadModel(self, request, context): - # Get device - if torch.cuda.is_available(): - print("CUDA is available", file=sys.stderr) - device = "cuda" - else: - print("CUDA is not available", file=sys.stderr) - device = "cpu" - - if not torch.cuda.is_available() and request.CUDA: - return backend_pb2.Result(success=False, message="CUDA is not available") - try: print("Preparing Kokoro TTS pipeline, please wait", file=sys.stderr) # empty dict diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py index ef8a2fd40b6e..05713b917d2a 100644 --- a/backend/python/transformers/backend.py +++ b/backend/python/transformers/backend.py @@ -94,7 +94,9 @@ def LoadModel(self, request, context): self.SentenceTransformer = False device_map="cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device_map = "mps" quantization = None autoTokenizer = True From cfaecbaaf582f23996d3c4178217960fb1e41fe2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:24:53 +0200 Subject: [PATCH 08/14] CI: try to build backend Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 86 +++++++++++++++++++++++++++++++++++ backend/index.yaml | 20 ++++++++ 2 files changed, 106 insertions(+) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 3ff701d76864..704c90322575 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -1119,3 +1119,89 @@ jobs: for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do crane push llama-cpp.tar $tag done + mlx-darwin: + runs-on: macOS-14 + strategy: + matrix: + go-version: ['1.24.x'] + steps: + - name: Clone + uses: actions/checkout@v5 + with: + submodules: true + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + cache: false + # You can test your matrix by printing the current Go version + - name: Display Go version + run: go version + - name: Dependencies + run: | + brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm + - name: Build mlx-darwin + run: | + make protogen-go + make backends/mlx + - name: Upload mlx.tar + uses: actions/upload-artifact@v4 + with: + name: mlx-tar + path: backend-images/mlx.tar + mlx-darwin-publish: + needs: mlx-darwin + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Download mlx.tar + uses: actions/download-artifact@v5 + with: + name: mlx-tar + path: . 
+ - name: Install crane + run: | + curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz + sudo mv crane /usr/local/bin/ + - name: Log in to DockerHub + run: | + echo "${{ secrets.DOCKERHUB_PASSWORD }}" | crane auth login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin + - name: Log in to quay.io + run: | + echo "${{ secrets.LOCALAI_REGISTRY_PASSWORD }}" | crane auth login quay.io -u "${{ secrets.LOCALAI_REGISTRY_USERNAME }}" --password-stdin + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + localai/localai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=-metal-darwin-arm64-mlx,onlatest=true + - name: Docker meta + id: quaymeta + uses: docker/metadata-action@v5 + with: + images: | + quay.io/go-skynet/local-ai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=-metal-darwin-arm64-mlx,onlatest=true + - name: Push Docker image (DockerHub) + run: | + for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do + crane push mlx.tar $tag + done + - name: Push Docker image (Quay) + run: | + for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do + crane push mlx.tar $tag + done \ No newline at end of file diff --git a/backend/index.yaml b/backend/index.yaml index 8bedccb67d42..3fed08f275d4 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -127,6 +127,21 @@ nvidia: "cuda12-vllm" amd: "rocm-vllm" intel: "intel-vllm" +- &mlx + name: "mlx" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx" + icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 + urls: + - https://github.com/ml-explore/mlx-lm + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-mlx + license: MIT + description: | + Run LLMs with MLX + tags: + - text-to-text + - LLM + - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -371,6 +386,11 @@ - text-to-speech - TTS license: apache-2.0 +- !!merge <<: *mlx + name: "mlx-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-mlx - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" From d9d0439e0a79828292f398c5cb2bc2cf317b6cc8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:29:34 +0200 Subject: [PATCH 09/14] Debug CI Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 704c90322575..f2f102082e3c 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -7,6 +7,7 @@ on: - master tags: - '*' + pull_request: concurrency: group: ci-backends-${{ github.head_ref || github.ref }}-${{ github.repository }} From 0a8975d9319d8815fa11b6e5688ac99c8d7e1c91 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 19:39:46 +0200 Subject: [PATCH 10/14] Fixups Signed-off-by: Ettore Di Giacinto --- backend/python/kokoro/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/python/kokoro/Makefile b/backend/python/kokoro/Makefile index 29fc84b53159..7e1b238228b1 100644 --- a/backend/python/kokoro/Makefile +++ b/backend/python/kokoro/Makefile @@ -1,15 +1,15 @@ 
.PHONY: kokoro -kokoro: protogen +kokoro: bash install.sh .PHONY: run -run: protogen +run: kokoro @echo "Running kokoro..." bash run.sh @echo "kokoro run." .PHONY: test -test: protogen +test: kokoro @echo "Testing kokoro..." bash test.sh @echo "kokoro tested." From ab9ab20d53c3600ffb6daa87413a6f35612e96a4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 23:44:56 +0200 Subject: [PATCH 11/14] Fixups Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 99 +--- .github/workflows/backend_build_darwin.yml | 136 ++++++ Makefile | 5 +- backend/python/common/libbackend.sh | 2 + backend/python/mlx-vlm/Makefile | 18 +- backend/python/mlx-vlm/backend.py | 544 +++++++++++++-------- 6 files changed, 488 insertions(+), 316 deletions(-) create mode 100644 .github/workflows/backend_build_darwin.yml diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index f2f102082e3c..eac2e2e83635 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -946,6 +946,19 @@ jobs: backend: "kitten-tts" dockerfile: "./backend/Dockerfile.python" context: "./backend" + mlx-darwin: + uses: ./.github/workflows/backend_build_darwin.yml + with: + backend: "mlx" + build-type: "mps" + go-version: "1.24.x" + tag-suffix: "-metal-darwin-arm64-mlx" + runs-on: "macOS-14" + secrets: + dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }} + dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }} + quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }} + quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }} llama-cpp-darwin: runs-on: macOS-14 strategy: @@ -1119,90 +1132,4 @@ jobs: run: | for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do crane push llama-cpp.tar $tag - done - mlx-darwin: - runs-on: macOS-14 - strategy: - matrix: - go-version: ['1.24.x'] - steps: - - name: Clone - uses: actions/checkout@v5 - with: - submodules: true - - name: Setup Go ${{ matrix.go-version }} - uses: actions/setup-go@v5 - with: - go-version: ${{ matrix.go-version }} - cache: false - # You can test your matrix by printing the current Go version - - name: Display Go version - run: go version - - name: Dependencies - run: | - brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm - - name: Build mlx-darwin - run: | - make protogen-go - make backends/mlx - - name: Upload mlx.tar - uses: actions/upload-artifact@v4 - with: - name: mlx-tar - path: backend-images/mlx.tar - mlx-darwin-publish: - needs: mlx-darwin - if: github.event_name != 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Download mlx.tar - uses: actions/download-artifact@v5 - with: - name: mlx-tar - path: . 
- - name: Install crane - run: | - curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz - sudo mv crane /usr/local/bin/ - - name: Log in to DockerHub - run: | - echo "${{ secrets.DOCKERHUB_PASSWORD }}" | crane auth login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin - - name: Log in to quay.io - run: | - echo "${{ secrets.LOCALAI_REGISTRY_PASSWORD }}" | crane auth login quay.io -u "${{ secrets.LOCALAI_REGISTRY_USERNAME }}" --password-stdin - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: | - localai/localai-backends - tags: | - type=ref,event=branch - type=semver,pattern={{raw}} - type=sha - flavor: | - latest=auto - suffix=-metal-darwin-arm64-mlx,onlatest=true - - name: Docker meta - id: quaymeta - uses: docker/metadata-action@v5 - with: - images: | - quay.io/go-skynet/local-ai-backends - tags: | - type=ref,event=branch - type=semver,pattern={{raw}} - type=sha - flavor: | - latest=auto - suffix=-metal-darwin-arm64-mlx,onlatest=true - - name: Push Docker image (DockerHub) - run: | - for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do - crane push mlx.tar $tag - done - - name: Push Docker image (Quay) - run: | - for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do - crane push mlx.tar $tag done \ No newline at end of file diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml new file mode 100644 index 000000000000..e6a2b4d388e4 --- /dev/null +++ b/.github/workflows/backend_build_darwin.yml @@ -0,0 +1,136 @@ +--- +name: 'build darwin python backend container images (reusable)' + +on: + workflow_call: + inputs: + backend: + description: 'Backend to build' + required: true + type: string + build-type: + description: 'Build type (e.g., mps)' + default: '' + type: string + go-version: + description: 'Go version to use' + default: '1.24.x' + type: string + tag-suffix: + description: 'Tag suffix for the built image' + required: true + type: string + runs-on: + description: 'Runner to use' + default: 'macOS-14' + type: string + secrets: + dockerUsername: + required: false + dockerPassword: + required: false + quayUsername: + required: true + quayPassword: + required: true + +jobs: + darwin-backend-build: + runs-on: ${{ inputs.runs-on }} + strategy: + matrix: + go-version: ['${{ inputs.go-version }}'] + steps: + - name: Clone + uses: actions/checkout@v5 + with: + submodules: true + + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + cache: false + + # You can test your matrix by printing the current Go version + - name: Display Go version + run: go version + + - name: Dependencies + run: | + brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm + + - name: Build ${{ inputs.backend }}-darwin + run: | + make protogen-go + BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} make build-darwin-python-backend + + - name: Upload ${{ inputs.backend }}.tar + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.backend }}-tar + path: backend-images/${{ inputs.backend }}.tar + + darwin-backend-publish: + needs: darwin-backend-build + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Download ${{ inputs.backend }}.tar + uses: actions/download-artifact@v5 + with: + name: ${{ inputs.backend }}-tar + path: . 
+ + - name: Install crane + run: | + curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz + sudo mv crane /usr/local/bin/ + + - name: Log in to DockerHub + run: | + echo "${{ secrets.dockerPassword }}" | crane auth login docker.io -u "${{ secrets.dockerUsername }}" --password-stdin + + - name: Log in to quay.io + run: | + echo "${{ secrets.quayPassword }}" | crane auth login quay.io -u "${{ secrets.quayUsername }}" --password-stdin + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + localai/localai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=${{ inputs.tag-suffix }},onlatest=true + + - name: Docker meta + id: quaymeta + uses: docker/metadata-action@v5 + with: + images: | + quay.io/go-skynet/local-ai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=${{ inputs.tag-suffix }},onlatest=true + + - name: Push Docker image (DockerHub) + run: | + for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do + crane push ${{ inputs.backend }}.tar $tag + done + + - name: Push Docker image (Quay) + run: | + for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do + crane push ${{ inputs.backend }}.tar $tag + done diff --git a/Makefile b/Makefile index af95e2374575..5be5bca1ca01 100644 --- a/Makefile +++ b/Makefile @@ -362,8 +362,11 @@ backends/llama-cpp-darwin: build bash ./scripts/build/llama-cpp-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" +build-darwin-python-backend: + bash ./scripts/build/python-darwin.sh + backends/mlx: build - BACKEND=mlx BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh + BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" backend-images: diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 409d20665932..79430ad2d4f1 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -111,6 +111,8 @@ function ensureVenv() { fi echo "Using interpreter: ${interpreter}" ${interpreter} -m venv ${EDIR}/venv + source ${EDIR}/venv/bin/activate + ${interpreter} -m pip install --upgrade pip echo "Python virtual environment created" else echo "Using uv package manager" diff --git a/backend/python/mlx-vlm/Makefile b/backend/python/mlx-vlm/Makefile index c4c18bee55b0..804031aa970d 100644 --- a/backend/python/mlx-vlm/Makefile +++ b/backend/python/mlx-vlm/Makefile @@ -1,29 +1,23 @@ -.PHONY: mlx -mlx: protogen +.PHONY: mlx-vlm +mlx-vlm: bash install.sh .PHONY: run -run: protogen - @echo "Running mlx..." +run: mlx-vlm + @echo "Running mlx-vlm..." bash run.sh @echo "mlx run." .PHONY: test -test: protogen - @echo "Testing mlx..." +test: mlx-vlm + @echo "Testing mlx-vlm..." bash test.sh @echo "mlx tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py index 56698a54e5f5..02730c814965 100644 --- a/backend/python/mlx-vlm/backend.py +++ b/backend/python/mlx-vlm/backend.py @@ -6,21 +6,20 @@ import sys import os from typing import List -from PIL import Image +import time import backend_pb2 import backend_pb2_grpc import grpc -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.multimodal.utils import fetch_image -from vllm.assets.video import VideoAsset +from mlx_vlm import load, generate, stream_generate +from mlx_vlm.prompt_utils import apply_chat_template +from mlx_vlm.utils import load_config, load_image +import mlx.core as mx import base64 import io +from PIL import Image +import tempfile _ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -32,38 +31,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): """ A gRPC servicer that implements the Backend service defined in backend.proto. """ - def generate(self,prompt, max_new_tokens): - """ - Generates text based on the given prompt and maximum number of new tokens. - - Args: - prompt (str): The prompt to generate text from. - max_new_tokens (int): The maximum number of new tokens to generate. - - Returns: - str: The generated text. - """ - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt) - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - decoded_text = '' - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text + def _is_float(self, s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False - if token.item() == self.generator.tokenizer.eos_token_id: - break - return decoded_text + def _is_int(self, s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False def Health(self, request, context): """ @@ -80,7 +63,7 @@ def Health(self, request, context): async def LoadModel(self, request, context): """ - Loads a language model. + Loads a multimodal vision-language model using MLX-VLM. Args: request: The load model request. @@ -89,60 +72,50 @@ async def LoadModel(self, request, context): Returns: backend_pb2.Result: The load model result. 
""" - engine_args = AsyncEngineArgs( - model=request.Model, - ) - - if request.Quantization != "": - engine_args.quantization = request.Quantization - if request.LoadFormat != "": - engine_args.load_format = request.LoadFormat - if request.GPUMemoryUtilization != 0: - engine_args.gpu_memory_utilization = request.GPUMemoryUtilization - if request.TrustRemoteCode: - engine_args.trust_remote_code = request.TrustRemoteCode - if request.EnforceEager: - engine_args.enforce_eager = request.EnforceEager - if request.TensorParallelSize: - engine_args.tensor_parallel_size = request.TensorParallelSize - if request.SwapSpace != 0: - engine_args.swap_space = request.SwapSpace - if request.MaxModelLen != 0: - engine_args.max_model_len = request.MaxModelLen - if request.DisableLogStatus: - engine_args.disable_log_status = request.DisableLogStatus - if request.DType != "": - engine_args.dtype = request.DType - if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0: - # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs - engine_args.limit_mm_per_prompt = { - "image": max(request.LimitImagePerPrompt, 1), - "video": max(request.LimitVideoPerPrompt, 1), - "audio": max(request.LimitAudioPerPrompt, 1) - } - try: - self.llm = AsyncLLMEngine.from_engine_args(engine_args) + print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr) + print(f"Request: {request}", file=sys.stderr) + + # Parse options like in the diffusers backend + options = request.Options + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We store all the options in a dict for later use + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon to handle values with colons + + # Convert numeric values to appropriate types + if self._is_float(value): + value = float(value) + elif self._is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + + self.options[key] = value + + print(f"Options: {self.options}", file=sys.stderr) + + # Load model and processor using MLX-VLM + # mlx-vlm load function returns (model, processor) instead of (model, tokenizer) + self.model, self.processor = load(request.Model) + + # Load model config for chat template support + self.config = load_config(request.Model) + except Exception as err: - print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + print(f"Error loading MLX-VLM model {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Error loading MLX-VLM model: {err}") - try: - engine_model_config = await self.llm.get_model_config() - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code, - truncation_side="left", - ) - except Exception as err: - return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") - print("Model loaded successfully", file=sys.stderr) - return backend_pb2.Result(message="Model loaded successfully", success=True) + print("MLX-VLM model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="MLX-VLM model loaded successfully", success=True) async def Predict(self, request, context): """ - Generates text based on the given prompt and sampling parameters. 
+ Generates text based on the given prompt and sampling parameters using MLX-VLM with multimodal support. Args: request: The predict request. @@ -151,13 +124,66 @@ async def Predict(self, request, context): Returns: backend_pb2.Reply: The predict result. """ - gen = self._predict(request, context, streaming=False) - res = await gen.__anext__() - return res + temp_files = [] + try: + # Process images and audios from request + image_paths = [] + audio_paths = [] + + # Process images + if request.Images: + for img_data in request.Images: + img_path = self.load_image_from_base64(img_data) + if img_path: + image_paths.append(img_path) + temp_files.append(img_path) + + # Process audios + if request.Audios: + for audio_data in request.Audios: + audio_path = self.load_audio_from_base64(audio_data) + if audio_path: + audio_paths.append(audio_path) + temp_files.append(audio_path) + + # Prepare the prompt with multimodal information + prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) + + # Build generation parameters using request attributes and options + max_tokens, generation_params = self._build_generation_params(request) + + print(f"Generating text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) + print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) + + # Generate text using MLX-VLM with multimodal inputs + response = generate( + model=self.model, + processor=self.processor, + prompt=prompt, + image=image_paths if image_paths else None, + audio=audio_paths if audio_paths else None, + max_tokens=max_tokens, + temperature=generation_params.get('temp', 0.6), + top_p=generation_params.get('top_p', 1.0), + verbose=False + ) + + return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX-VLM Predict: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Generation failed: {str(e)}") + return backend_pb2.Reply(message=bytes("", encoding='utf-8')) + finally: + # Clean up temporary files + self.cleanup_temp_files(temp_files) def Embedding(self, request, context): """ A gRPC method that calculates embeddings for a given sentence. + + Note: MLX-VLM doesn't support embeddings directly. This method returns an error. Args: request: An EmbeddingRequest object that contains the request parameters. @@ -166,171 +192,255 @@ def Embedding(self, request, context): Returns: An EmbeddingResult object that contains the calculated embeddings. """ - print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr) - outputs = self.model.encode(request.Embeddings) - # Check if we have one result at least - if len(outputs) == 0: - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details("No embeddings were calculated.") - return backend_pb2.EmbeddingResult() - return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding) + print("Embeddings not supported in MLX-VLM backend", file=sys.stderr) + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Embeddings are not supported in the MLX-VLM backend.") + return backend_pb2.EmbeddingResult() async def PredictStream(self, request, context): """ - Generates text based on the given prompt and sampling parameters, and streams the results. + Generates text based on the given prompt and sampling parameters, and streams the results using MLX-VLM with multimodal support. Args: request: The predict stream request. 
context: The gRPC context. - Returns: - backend_pb2.Result: The predict stream result. + Yields: + backend_pb2.Reply: Streaming predict results. """ - iterations = self._predict(request, context, streaming=True) + temp_files = [] try: - async for iteration in iterations: - yield iteration + # Process images and audios from request + image_paths = [] + audio_paths = [] + + # Process images + if request.Images: + for img_data in request.Images: + img_path = self.load_image_from_base64(img_data) + if img_path: + image_paths.append(img_path) + temp_files.append(img_path) + + # Process audios + if request.Audios: + for audio_data in request.Audios: + audio_path = self.load_audio_from_base64(audio_data) + if audio_path: + audio_paths.append(audio_path) + temp_files.append(audio_path) + + # Prepare the prompt with multimodal information + prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) + + # Build generation parameters using request attributes and options + max_tokens, generation_params = self._build_generation_params(request, default_max_tokens=512) + + print(f"Streaming text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) + print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) + + # Stream text generation using MLX-VLM with multimodal inputs + for response in stream_generate( + model=self.model, + processor=self.processor, + prompt=prompt, + image=image_paths if image_paths else None, + audio=audio_paths if audio_paths else None, + max_tokens=max_tokens, + temperature=generation_params.get('temp', 0.6), + top_p=generation_params.get('top_p', 1.0), + ): + yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX-VLM PredictStream: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Streaming generation failed: {str(e)}") + yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) finally: - await iterations.aclose() - - async def _predict(self, request, context, streaming=False): - # Build the sampling parameters - # NOTE: this must stay in sync with the vllm backend - request_to_sampling_params = { - "N": "n", - "PresencePenalty": "presence_penalty", - "FrequencyPenalty": "frequency_penalty", - "RepetitionPenalty": "repetition_penalty", - "Temperature": "temperature", - "TopP": "top_p", - "TopK": "top_k", - "MinP": "min_p", - "Seed": "seed", - "StopPrompts": "stop", - "StopTokenIds": "stop_token_ids", - "BadWords": "bad_words", - "IncludeStopStrInOutput": "include_stop_str_in_output", - "IgnoreEOS": "ignore_eos", - "Tokens": "max_tokens", - "MinTokens": "min_tokens", - "Logprobs": "logprobs", - "PromptLogprobs": "prompt_logprobs", - "SkipSpecialTokens": "skip_special_tokens", - "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", - "TruncatePromptTokens": "truncate_prompt_tokens", - "GuidedDecoding": "guided_decoding", - } - - sampling_params = SamplingParams(top_p=0.9, max_tokens=200) + # Clean up temporary files + self.cleanup_temp_files(temp_files) - for request_field, param_field in request_to_sampling_params.items(): - if hasattr(request, request_field): - value = getattr(request, request_field) - if value not in (None, 0, [], False): - setattr(sampling_params, param_field, value) - - # Extract image paths and process images - prompt = request.Prompt - - image_paths = request.Images - image_data = [self.load_image(img_path) for img_path in image_paths] + def 
_prepare_prompt(self, request, num_images=0, num_audios=0): + """ + Prepare the prompt for MLX-VLM generation, handling chat templates and multimodal inputs. - videos_path = request.Videos - video_data = [self.load_video(video_path) for video_path in videos_path] + Args: + request: The gRPC request containing prompt and message information. + num_images: Number of images in the request. + num_audios: Number of audio files in the request. + Returns: + str: The prepared prompt. + """ # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) - - # Generate text using the LLM engine - request_id = random_uuid() - print(f"Generating text with request_id: {request_id}", file=sys.stderr) - multi_modal_data = {} - if image_data: - multi_modal_data["image"] = image_data - if video_data: - multi_modal_data["video"] = video_data - outputs = self.llm.generate( - { - "prompt": prompt, - "multi_modal_data": multi_modal_data if multi_modal_data else None, - }, - sampling_params=sampling_params, - request_id=request_id, - ) - - # Stream the results - generated_text = "" - try: - async for request_output in outputs: - iteration_text = request_output.outputs[0].text - - if streaming: - # Remove text already sent as vllm concatenates the text from previous yields - delta_iteration_text = iteration_text.removeprefix(generated_text) - # Send the partial result - yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8')) - - # Keep track of text generated - generated_text = iteration_text - finally: - await outputs.aclose() - - # If streaming, we already sent everything - if streaming: - return - - # Remove the image files from /tmp folder - for img_path in image_paths: - try: - os.remove(img_path) - except Exception as e: - print(f"Error removing image file: {img_path}, {e}", file=sys.stderr) + # Convert gRPC messages to the format expected by apply_chat_template + messages = [] + for msg in request.Messages: + messages.append({"role": msg.role, "content": msg.content}) + + # Use mlx-vlm's apply_chat_template which handles multimodal inputs + prompt = apply_chat_template( + self.processor, + self.config, + messages, + num_images=num_images, + num_audios=num_audios + ) + return prompt + elif request.Prompt: + # If we have a direct prompt but also have images/audio, we need to format it properly + if num_images > 0 or num_audios > 0: + # Create a simple message structure for multimodal prompt + messages = [{"role": "user", "content": request.Prompt}] + prompt = apply_chat_template( + self.processor, + self.config, + messages, + num_images=num_images, + num_audios=num_audios + ) + return prompt + else: + return request.Prompt + else: + # Fallback to empty prompt with multimodal template if we have media + if num_images > 0 or num_audios > 0: + messages = [{"role": "user", "content": ""}] + prompt = apply_chat_template( + self.processor, + self.config, + messages, + num_images=num_images, + num_audios=num_audios + ) + return prompt + else: + return "" + + + + + + def _build_generation_params(self, request, default_max_tokens=200): + """ + Build generation parameters from request attributes and options for MLX-VLM. - # Sending the final generated text - yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) + Args: + request: The gRPC request. 
+ default_max_tokens: Default max_tokens if not specified. - def load_image(self, image_path: str): + Returns: + tuple: (max_tokens, generation_params dict) """ - Load an image from the given file path or base64 encoded data. + # Extract max_tokens + max_tokens = getattr(request, 'Tokens', default_max_tokens) + if max_tokens == 0: + max_tokens = default_max_tokens + + # Extract generation parameters from request attributes + temp = getattr(request, 'Temperature', 0.0) + if temp == 0.0: + temp = 0.6 # Default temperature + + top_p = getattr(request, 'TopP', 0.0) + if top_p == 0.0: + top_p = 1.0 # Default top_p + + # Initialize generation parameters for MLX-VLM + generation_params = { + 'temp': temp, + 'top_p': top_p, + } + + # Add seed if specified + seed = getattr(request, 'Seed', 0) + if seed != 0: + mx.random.seed(seed) + + # Override with options if available + if hasattr(self, 'options'): + # Max tokens from options + if 'max_tokens' in self.options: + max_tokens = self.options['max_tokens'] + + # Generation parameters from options + param_option_mapping = { + 'temp': 'temp', + 'temperature': 'temp', # alias + 'top_p': 'top_p', + } + + for option_key, param_key in param_option_mapping.items(): + if option_key in self.options: + generation_params[param_key] = self.options[option_key] + + # Handle seed from options + if 'seed' in self.options: + mx.random.seed(self.options['seed']) + + return max_tokens, generation_params + + def load_image_from_base64(self, image_data: str): + """ + Load an image from base64 encoded data. Args: - image_path (str): The path to the image file or base64 encoded data. + image_data (str): Base64 encoded image data. Returns: - Image: The loaded image. + PIL.Image or str: The loaded image or path to the image. """ try: - - image_data = base64.b64decode(image_path) - image = Image.open(io.BytesIO(image_data)) - return image + decoded_data = base64.b64decode(image_data) + image = Image.open(io.BytesIO(decoded_data)) + + # Save to temporary file for mlx-vlm + with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file: + image.save(tmp_file.name, format='JPEG') + return tmp_file.name + except Exception as e: - print(f"Error loading image {image_path}: {e}", file=sys.stderr) + print(f"Error loading image from base64: {e}", file=sys.stderr) return None - def load_video(self, video_path: str): + def load_audio_from_base64(self, audio_data: str): """ - Load a video from the given file path. + Load audio from base64 encoded data. Args: - video_path (str): The path to the image file. + audio_data (str): Base64 encoded audio data. Returns: - Video: The loaded video. + str: Path to the loaded audio file. """ try: - timestamp = str(int(time.time() * 1000)) # Generate timestamp - p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename - with open(p, "wb") as f: - f.write(base64.b64decode(video_path)) - video = VideoAsset(name=p).np_ndarrays - os.remove(p) - return video + decoded_data = base64.b64decode(audio_data) + + # Save to temporary file for mlx-vlm + with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: + tmp_file.write(decoded_data) + return tmp_file.name + except Exception as e: - print(f"Error loading video {video_path}: {e}", file=sys.stderr) + print(f"Error loading audio from base64: {e}", file=sys.stderr) return None + def cleanup_temp_files(self, file_paths: List[str]): + """ + Clean up temporary files. + + Args: + file_paths (List[str]): List of file paths to clean up. 
+ """ + for file_path in file_paths: + try: + if file_path and os.path.exists(file_path): + os.remove(file_path) + except Exception as e: + print(f"Error removing temporary file {file_path}: {e}", file=sys.stderr) + async def serve(address): # Start asyncio gRPC server server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), From c59b6a951dbf38426fdd2b510a1304aa0ea0d429 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 23:45:12 +0200 Subject: [PATCH 12/14] Index mlx-vlm Signed-off-by: Ettore Di Giacinto --- Makefile | 4 ++++ backend/index.yaml | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Makefile b/Makefile index 5be5bca1ca01..aebff8599dc9 100644 --- a/Makefile +++ b/Makefile @@ -369,6 +369,10 @@ backends/mlx: build BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" +backends/mlx-vlm: build + BACKEND=mlx-vlm BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx-vlm.tar)" + backend-images: mkdir -p backend-images diff --git a/backend/index.yaml b/backend/index.yaml index 3fed08f275d4..b7f1efc1a9f9 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -142,6 +142,23 @@ - text-to-text - LLM - MLX +- &mlx-vlm + name: "mlx-vlm" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm" + icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 + urls: + - https://github.com/ml-explore/mlx-vlm + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm + license: MIT + description: | + Run Vision-Language Models with MLX + tags: + - text-to-text + - multimodal + - vision-language + - LLM + - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -391,6 +408,11 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" mirrors: - localai/localai-backends:master-metal-darwin-arm64-mlx +- !!merge <<: *mlx-vlm + name: "mlx-vlm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-vlm" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-mlx-vlm - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" From 92456f59c4c0ea43a73b82e8138ded87c891b921 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 21 Aug 2025 23:45:52 +0200 Subject: [PATCH 13/14] Remove mlx-vlm Signed-off-by: Ettore Di Giacinto --- Makefile | 4 - backend/index.yaml | 22 -- backend/python/mlx-vlm/Makefile | 23 -- backend/python/mlx-vlm/backend.py | 477 ------------------------ backend/python/mlx-vlm/install.sh | 14 - backend/python/mlx-vlm/requirements.txt | 4 - backend/python/mlx-vlm/run.sh | 11 - backend/python/mlx-vlm/test.py | 146 -------- backend/python/mlx-vlm/test.sh | 12 - 9 files changed, 713 deletions(-) delete mode 100644 backend/python/mlx-vlm/Makefile delete mode 100644 backend/python/mlx-vlm/backend.py delete mode 100755 backend/python/mlx-vlm/install.sh delete mode 100644 backend/python/mlx-vlm/requirements.txt delete mode 100755 backend/python/mlx-vlm/run.sh delete mode 100644 backend/python/mlx-vlm/test.py delete mode 100755 backend/python/mlx-vlm/test.sh diff --git a/Makefile b/Makefile index aebff8599dc9..5be5bca1ca01 100644 --- a/Makefile +++ b/Makefile @@ -369,10 +369,6 @@ backends/mlx: build BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend ./local-ai backends install 
"ocifile://$(abspath ./backend-images/mlx.tar)" -backends/mlx-vlm: build - BACKEND=mlx-vlm BUILD_TYPE=mps bash ./scripts/build/python-darwin.sh - ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx-vlm.tar)" - backend-images: mkdir -p backend-images diff --git a/backend/index.yaml b/backend/index.yaml index b7f1efc1a9f9..3fed08f275d4 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -142,23 +142,6 @@ - text-to-text - LLM - MLX -- &mlx-vlm - name: "mlx-vlm" - uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm" - icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 - urls: - - https://github.com/ml-explore/mlx-vlm - mirrors: - - localai/localai-backends:latest-metal-darwin-arm64-mlx-vlm - license: MIT - description: | - Run Vision-Language Models with MLX - tags: - - text-to-text - - multimodal - - vision-language - - LLM - - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -408,11 +391,6 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" mirrors: - localai/localai-backends:master-metal-darwin-arm64-mlx -- !!merge <<: *mlx-vlm - name: "mlx-vlm-development" - uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-vlm" - mirrors: - - localai/localai-backends:master-metal-darwin-arm64-mlx-vlm - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" diff --git a/backend/python/mlx-vlm/Makefile b/backend/python/mlx-vlm/Makefile deleted file mode 100644 index 804031aa970d..000000000000 --- a/backend/python/mlx-vlm/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -.PHONY: mlx-vlm -mlx-vlm: - bash install.sh - -.PHONY: run -run: mlx-vlm - @echo "Running mlx-vlm..." - bash run.sh - @echo "mlx run." - -.PHONY: test -test: mlx-vlm - @echo "Testing mlx-vlm..." - bash test.sh - @echo "mlx tested." - -.PHONY: protogen-clean -protogen-clean: - $(RM) backend_pb2_grpc.py backend_pb2.py - -.PHONY: clean -clean: protogen-clean - rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx-vlm/backend.py b/backend/python/mlx-vlm/backend.py deleted file mode 100644 index 02730c814965..000000000000 --- a/backend/python/mlx-vlm/backend.py +++ /dev/null @@ -1,477 +0,0 @@ -#!/usr/bin/env python3 -import asyncio -from concurrent import futures -import argparse -import signal -import sys -import os -from typing import List -import time - -import backend_pb2 -import backend_pb2_grpc - -import grpc -from mlx_vlm import load, generate, stream_generate -from mlx_vlm.prompt_utils import apply_chat_template -from mlx_vlm.utils import load_config, load_image -import mlx.core as mx -import base64 -import io -from PIL import Image -import tempfile - -_ONE_DAY_IN_SECONDS = 60 * 60 * 24 - -# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 -MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) - -# Implement the BackendServicer class with the service methods -class BackendServicer(backend_pb2_grpc.BackendServicer): - """ - A gRPC servicer that implements the Backend service defined in backend.proto. - """ - - def _is_float(self, s): - """Check if a string can be converted to float.""" - try: - float(s) - return True - except ValueError: - return False - - def _is_int(self, s): - """Check if a string can be converted to int.""" - try: - int(s) - return True - except ValueError: - return False - - def Health(self, request, context): - """ - Returns a health check message. 
- - Args: - request: The health check request. - context: The gRPC context. - - Returns: - backend_pb2.Reply: The health check reply. - """ - return backend_pb2.Reply(message=bytes("OK", 'utf-8')) - - async def LoadModel(self, request, context): - """ - Loads a multimodal vision-language model using MLX-VLM. - - Args: - request: The load model request. - context: The gRPC context. - - Returns: - backend_pb2.Result: The load model result. - """ - try: - print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr) - print(f"Request: {request}", file=sys.stderr) - - # Parse options like in the diffusers backend - options = request.Options - self.options = {} - - # The options are a list of strings in this form optname:optvalue - # We store all the options in a dict for later use - for opt in options: - if ":" not in opt: - continue - key, value = opt.split(":", 1) # Split only on first colon to handle values with colons - - # Convert numeric values to appropriate types - if self._is_float(value): - value = float(value) - elif self._is_int(value): - value = int(value) - elif value.lower() in ["true", "false"]: - value = value.lower() == "true" - - self.options[key] = value - - print(f"Options: {self.options}", file=sys.stderr) - - # Load model and processor using MLX-VLM - # mlx-vlm load function returns (model, processor) instead of (model, tokenizer) - self.model, self.processor = load(request.Model) - - # Load model config for chat template support - self.config = load_config(request.Model) - - except Exception as err: - print(f"Error loading MLX-VLM model {err=}, {type(err)=}", file=sys.stderr) - return backend_pb2.Result(success=False, message=f"Error loading MLX-VLM model: {err}") - - print("MLX-VLM model loaded successfully", file=sys.stderr) - return backend_pb2.Result(message="MLX-VLM model loaded successfully", success=True) - - async def Predict(self, request, context): - """ - Generates text based on the given prompt and sampling parameters using MLX-VLM with multimodal support. - - Args: - request: The predict request. - context: The gRPC context. - - Returns: - backend_pb2.Reply: The predict result. 
- """ - temp_files = [] - try: - # Process images and audios from request - image_paths = [] - audio_paths = [] - - # Process images - if request.Images: - for img_data in request.Images: - img_path = self.load_image_from_base64(img_data) - if img_path: - image_paths.append(img_path) - temp_files.append(img_path) - - # Process audios - if request.Audios: - for audio_data in request.Audios: - audio_path = self.load_audio_from_base64(audio_data) - if audio_path: - audio_paths.append(audio_path) - temp_files.append(audio_path) - - # Prepare the prompt with multimodal information - prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) - - # Build generation parameters using request attributes and options - max_tokens, generation_params = self._build_generation_params(request) - - print(f"Generating text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) - print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) - - # Generate text using MLX-VLM with multimodal inputs - response = generate( - model=self.model, - processor=self.processor, - prompt=prompt, - image=image_paths if image_paths else None, - audio=audio_paths if audio_paths else None, - max_tokens=max_tokens, - temperature=generation_params.get('temp', 0.6), - top_p=generation_params.get('top_p', 1.0), - verbose=False - ) - - return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) - - except Exception as e: - print(f"Error in MLX-VLM Predict: {e}", file=sys.stderr) - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(f"Generation failed: {str(e)}") - return backend_pb2.Reply(message=bytes("", encoding='utf-8')) - finally: - # Clean up temporary files - self.cleanup_temp_files(temp_files) - - def Embedding(self, request, context): - """ - A gRPC method that calculates embeddings for a given sentence. - - Note: MLX-VLM doesn't support embeddings directly. This method returns an error. - - Args: - request: An EmbeddingRequest object that contains the request parameters. - context: A grpc.ServicerContext object that provides information about the RPC. - - Returns: - An EmbeddingResult object that contains the calculated embeddings. - """ - print("Embeddings not supported in MLX-VLM backend", file=sys.stderr) - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Embeddings are not supported in the MLX-VLM backend.") - return backend_pb2.EmbeddingResult() - - async def PredictStream(self, request, context): - """ - Generates text based on the given prompt and sampling parameters, and streams the results using MLX-VLM with multimodal support. - - Args: - request: The predict stream request. - context: The gRPC context. - - Yields: - backend_pb2.Reply: Streaming predict results. 
- """ - temp_files = [] - try: - # Process images and audios from request - image_paths = [] - audio_paths = [] - - # Process images - if request.Images: - for img_data in request.Images: - img_path = self.load_image_from_base64(img_data) - if img_path: - image_paths.append(img_path) - temp_files.append(img_path) - - # Process audios - if request.Audios: - for audio_data in request.Audios: - audio_path = self.load_audio_from_base64(audio_data) - if audio_path: - audio_paths.append(audio_path) - temp_files.append(audio_path) - - # Prepare the prompt with multimodal information - prompt = self._prepare_prompt(request, num_images=len(image_paths), num_audios=len(audio_paths)) - - # Build generation parameters using request attributes and options - max_tokens, generation_params = self._build_generation_params(request, default_max_tokens=512) - - print(f"Streaming text with MLX-VLM - max_tokens: {max_tokens}, params: {generation_params}", file=sys.stderr) - print(f"Images: {len(image_paths)}, Audios: {len(audio_paths)}", file=sys.stderr) - - # Stream text generation using MLX-VLM with multimodal inputs - for response in stream_generate( - model=self.model, - processor=self.processor, - prompt=prompt, - image=image_paths if image_paths else None, - audio=audio_paths if audio_paths else None, - max_tokens=max_tokens, - temperature=generation_params.get('temp', 0.6), - top_p=generation_params.get('top_p', 1.0), - ): - yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) - - except Exception as e: - print(f"Error in MLX-VLM PredictStream: {e}", file=sys.stderr) - context.set_code(grpc.StatusCode.INTERNAL) - context.set_details(f"Streaming generation failed: {str(e)}") - yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) - finally: - # Clean up temporary files - self.cleanup_temp_files(temp_files) - - def _prepare_prompt(self, request, num_images=0, num_audios=0): - """ - Prepare the prompt for MLX-VLM generation, handling chat templates and multimodal inputs. - - Args: - request: The gRPC request containing prompt and message information. - num_images: Number of images in the request. - num_audios: Number of audio files in the request. - - Returns: - str: The prepared prompt. 
- """ - # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template - if not request.Prompt and request.UseTokenizerTemplate and request.Messages: - # Convert gRPC messages to the format expected by apply_chat_template - messages = [] - for msg in request.Messages: - messages.append({"role": msg.role, "content": msg.content}) - - # Use mlx-vlm's apply_chat_template which handles multimodal inputs - prompt = apply_chat_template( - self.processor, - self.config, - messages, - num_images=num_images, - num_audios=num_audios - ) - return prompt - elif request.Prompt: - # If we have a direct prompt but also have images/audio, we need to format it properly - if num_images > 0 or num_audios > 0: - # Create a simple message structure for multimodal prompt - messages = [{"role": "user", "content": request.Prompt}] - prompt = apply_chat_template( - self.processor, - self.config, - messages, - num_images=num_images, - num_audios=num_audios - ) - return prompt - else: - return request.Prompt - else: - # Fallback to empty prompt with multimodal template if we have media - if num_images > 0 or num_audios > 0: - messages = [{"role": "user", "content": ""}] - prompt = apply_chat_template( - self.processor, - self.config, - messages, - num_images=num_images, - num_audios=num_audios - ) - return prompt - else: - return "" - - - - - - def _build_generation_params(self, request, default_max_tokens=200): - """ - Build generation parameters from request attributes and options for MLX-VLM. - - Args: - request: The gRPC request. - default_max_tokens: Default max_tokens if not specified. - - Returns: - tuple: (max_tokens, generation_params dict) - """ - # Extract max_tokens - max_tokens = getattr(request, 'Tokens', default_max_tokens) - if max_tokens == 0: - max_tokens = default_max_tokens - - # Extract generation parameters from request attributes - temp = getattr(request, 'Temperature', 0.0) - if temp == 0.0: - temp = 0.6 # Default temperature - - top_p = getattr(request, 'TopP', 0.0) - if top_p == 0.0: - top_p = 1.0 # Default top_p - - # Initialize generation parameters for MLX-VLM - generation_params = { - 'temp': temp, - 'top_p': top_p, - } - - # Add seed if specified - seed = getattr(request, 'Seed', 0) - if seed != 0: - mx.random.seed(seed) - - # Override with options if available - if hasattr(self, 'options'): - # Max tokens from options - if 'max_tokens' in self.options: - max_tokens = self.options['max_tokens'] - - # Generation parameters from options - param_option_mapping = { - 'temp': 'temp', - 'temperature': 'temp', # alias - 'top_p': 'top_p', - } - - for option_key, param_key in param_option_mapping.items(): - if option_key in self.options: - generation_params[param_key] = self.options[option_key] - - # Handle seed from options - if 'seed' in self.options: - mx.random.seed(self.options['seed']) - - return max_tokens, generation_params - - def load_image_from_base64(self, image_data: str): - """ - Load an image from base64 encoded data. - - Args: - image_data (str): Base64 encoded image data. - - Returns: - PIL.Image or str: The loaded image or path to the image. 
- """ - try: - decoded_data = base64.b64decode(image_data) - image = Image.open(io.BytesIO(decoded_data)) - - # Save to temporary file for mlx-vlm - with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file: - image.save(tmp_file.name, format='JPEG') - return tmp_file.name - - except Exception as e: - print(f"Error loading image from base64: {e}", file=sys.stderr) - return None - - def load_audio_from_base64(self, audio_data: str): - """ - Load audio from base64 encoded data. - - Args: - audio_data (str): Base64 encoded audio data. - - Returns: - str: Path to the loaded audio file. - """ - try: - decoded_data = base64.b64decode(audio_data) - - # Save to temporary file for mlx-vlm - with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: - tmp_file.write(decoded_data) - return tmp_file.name - - except Exception as e: - print(f"Error loading audio from base64: {e}", file=sys.stderr) - return None - - def cleanup_temp_files(self, file_paths: List[str]): - """ - Clean up temporary files. - - Args: - file_paths (List[str]): List of file paths to clean up. - """ - for file_path in file_paths: - try: - if file_path and os.path.exists(file_path): - os.remove(file_path) - except Exception as e: - print(f"Error removing temporary file {file_path}: {e}", file=sys.stderr) - -async def serve(address): - # Start asyncio gRPC server - server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), - options=[ - ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB - ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB - ]) - # Add the servicer to the server - backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) - # Bind the server to the address - server.add_insecure_port(address) - - # Gracefully shutdown the server on SIGTERM or SIGINT - loop = asyncio.get_event_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler( - sig, lambda: asyncio.ensure_future(server.stop(5)) - ) - - # Start the server - await server.start() - print("Server started. Listening on: " + address, file=sys.stderr) - # Wait for the server to be terminated - await server.wait_for_termination() - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the gRPC server.") - parser.add_argument( - "--addr", default="localhost:50051", help="The address to bind the server to." 
- ) - args = parser.parse_args() - - asyncio.run(serve(args.addr)) diff --git a/backend/python/mlx-vlm/install.sh b/backend/python/mlx-vlm/install.sh deleted file mode 100755 index b8ee48552490..000000000000 --- a/backend/python/mlx-vlm/install.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - -USE_PIP=true - -backend_dir=$(dirname $0) - -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -installRequirements diff --git a/backend/python/mlx-vlm/requirements.txt b/backend/python/mlx-vlm/requirements.txt deleted file mode 100644 index f1771cc4adb4..000000000000 --- a/backend/python/mlx-vlm/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -grpcio==1.71.0 -protobuf -certifi -setuptools \ No newline at end of file diff --git a/backend/python/mlx-vlm/run.sh b/backend/python/mlx-vlm/run.sh deleted file mode 100755 index fc88f97da712..000000000000 --- a/backend/python/mlx-vlm/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -backend_dir=$(dirname $0) - -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.py b/backend/python/mlx-vlm/test.py deleted file mode 100644 index 827aa71a3e33..000000000000 --- a/backend/python/mlx-vlm/test.py +++ /dev/null @@ -1,146 +0,0 @@ -import unittest -import subprocess -import time -import backend_pb2 -import backend_pb2_grpc - -import grpc - -import unittest -import subprocess -import time -import grpc -import backend_pb2_grpc -import backend_pb2 - -class TestBackendServicer(unittest.TestCase): - """ - TestBackendServicer is the class that tests the gRPC service. - - This class contains methods to test the startup and shutdown of the gRPC service. 
- """ - def setUp(self): - self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) - time.sleep(10) - - def tearDown(self) -> None: - self.service.terminate() - self.service.wait() - - def test_server_startup(self): - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.Health(backend_pb2.HealthMessage()) - self.assertEqual(response.message, b'OK') - except Exception as err: - print(err) - self.fail("Server failed to start") - finally: - self.tearDown() - def test_load_model(self): - """ - This method tests if the model is loaded successfully - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) - self.assertTrue(response.success) - self.assertEqual(response.message, "Model loaded successfully") - except Exception as err: - print(err) - self.fail("LoadModel service failed") - finally: - self.tearDown() - - def test_text(self): - """ - This method tests if the embeddings are generated successfully - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) - self.assertTrue(response.success) - req = backend_pb2.PredictOptions(Prompt="The capital of France is") - resp = stub.Predict(req) - self.assertIsNotNone(resp.message) - except Exception as err: - print(err) - self.fail("text service failed") - finally: - self.tearDown() - - def test_sampling_params(self): - """ - This method tests if all sampling parameters are correctly processed - NOTE: this does NOT test for correctness, just that we received a compatible response - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) - self.assertTrue(response.success) - - req = backend_pb2.PredictOptions( - Prompt="The capital of France is", - TopP=0.8, - Tokens=50, - Temperature=0.7, - TopK=40, - PresencePenalty=0.1, - FrequencyPenalty=0.2, - RepetitionPenalty=1.1, - MinP=0.05, - Seed=42, - StopPrompts=["\n"], - StopTokenIds=[50256], - BadWords=["badword"], - IncludeStopStrInOutput=True, - IgnoreEOS=True, - MinTokens=5, - Logprobs=5, - PromptLogprobs=5, - SkipSpecialTokens=True, - SpacesBetweenSpecialTokens=True, - TruncatePromptTokens=10, - GuidedDecoding=True, - N=2, - ) - resp = stub.Predict(req) - self.assertIsNotNone(resp.message) - self.assertIsNotNone(resp.logprobs) - except Exception as err: - print(err) - self.fail("sampling params service failed") - finally: - self.tearDown() - - - def test_embedding(self): - """ - This method tests if the embeddings are generated successfully - """ - try: - self.setUp() - with grpc.insecure_channel("localhost:50051") as channel: - stub = backend_pb2_grpc.BackendStub(channel) - response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) - self.assertTrue(response.success) - embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") - embedding_response = stub.Embedding(embedding_request) - self.assertIsNotNone(embedding_response.embeddings) - # assert that is a list of floats - self.assertIsInstance(embedding_response.embeddings, list) - # assert that 
the list is not empty - self.assertTrue(len(embedding_response.embeddings) > 0) - except Exception as err: - print(err) - self.fail("Embedding service failed") - finally: - self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx-vlm/test.sh b/backend/python/mlx-vlm/test.sh deleted file mode 100755 index f31ae54e47dc..000000000000 --- a/backend/python/mlx-vlm/test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -e - -backend_dir=$(dirname $0) - -if [ -d $backend_dir/common ]; then - source $backend_dir/common/libbackend.sh -else - source $backend_dir/../common/libbackend.sh -fi - -runUnittests From a4bdaab172ddac85c34dbea6fcd7862c7788e357 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 22 Aug 2025 08:40:38 +0200 Subject: [PATCH 14/14] Drop CI test Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index eac2e2e83635..965427f4013c 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -7,7 +7,6 @@ on: - master tags: - '*' - pull_request: concurrency: group: ci-backends-${{ github.head_ref || github.ref }}-${{ github.repository }}