diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 3ff701d76864..965427f4013c 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -945,6 +945,19 @@ jobs: backend: "kitten-tts" dockerfile: "./backend/Dockerfile.python" context: "./backend" + mlx-darwin: + uses: ./.github/workflows/backend_build_darwin.yml + with: + backend: "mlx" + build-type: "mps" + go-version: "1.24.x" + tag-suffix: "-metal-darwin-arm64-mlx" + runs-on: "macOS-14" + secrets: + dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }} + dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }} + quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }} + quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }} llama-cpp-darwin: runs-on: macOS-14 strategy: @@ -1118,4 +1131,4 @@ jobs: run: | for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do crane push llama-cpp.tar $tag - done + done \ No newline at end of file diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml new file mode 100644 index 000000000000..e6a2b4d388e4 --- /dev/null +++ b/.github/workflows/backend_build_darwin.yml @@ -0,0 +1,136 @@ +--- +name: 'build darwin python backend container images (reusable)' + +on: + workflow_call: + inputs: + backend: + description: 'Backend to build' + required: true + type: string + build-type: + description: 'Build type (e.g., mps)' + default: '' + type: string + go-version: + description: 'Go version to use' + default: '1.24.x' + type: string + tag-suffix: + description: 'Tag suffix for the built image' + required: true + type: string + runs-on: + description: 'Runner to use' + default: 'macOS-14' + type: string + secrets: + dockerUsername: + required: false + dockerPassword: + required: false + quayUsername: + required: true + quayPassword: + required: true + +jobs: + darwin-backend-build: + runs-on: ${{ inputs.runs-on }} + strategy: + matrix: + go-version: ['${{ inputs.go-version }}'] + steps: + - name: Clone + uses: actions/checkout@v5 + with: + submodules: true + + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + cache: false + + # You can test your matrix by printing the current Go version + - name: Display Go version + run: go version + + - name: Dependencies + run: | + brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm + + - name: Build ${{ inputs.backend }}-darwin + run: | + make protogen-go + BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} make build-darwin-python-backend + + - name: Upload ${{ inputs.backend }}.tar + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.backend }}-tar + path: backend-images/${{ inputs.backend }}.tar + + darwin-backend-publish: + needs: darwin-backend-build + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Download ${{ inputs.backend }}.tar + uses: actions/download-artifact@v5 + with: + name: ${{ inputs.backend }}-tar + path: . 
+ + - name: Install crane + run: | + curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz + sudo mv crane /usr/local/bin/ + + - name: Log in to DockerHub + run: | + echo "${{ secrets.dockerPassword }}" | crane auth login docker.io -u "${{ secrets.dockerUsername }}" --password-stdin + + - name: Log in to quay.io + run: | + echo "${{ secrets.quayPassword }}" | crane auth login quay.io -u "${{ secrets.quayUsername }}" --password-stdin + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + localai/localai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=${{ inputs.tag-suffix }},onlatest=true + + - name: Docker meta + id: quaymeta + uses: docker/metadata-action@v5 + with: + images: | + quay.io/go-skynet/local-ai-backends + tags: | + type=ref,event=branch + type=semver,pattern={{raw}} + type=sha + flavor: | + latest=auto + suffix=${{ inputs.tag-suffix }},onlatest=true + + - name: Push Docker image (DockerHub) + run: | + for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do + crane push ${{ inputs.backend }}.tar $tag + done + + - name: Push Docker image (Quay) + run: | + for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do + crane push ${{ inputs.backend }}.tar $tag + done diff --git a/Makefile b/Makefile index a050f84f8d7c..5be5bca1ca01 100644 --- a/Makefile +++ b/Makefile @@ -132,43 +132,6 @@ test: test-models/testmodel.ggml protogen-go $(MAKE) test-tts $(MAKE) test-stablediffusion -backends/diffusers: docker-build-diffusers docker-save-diffusers build - ./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)" - -backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build - ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" - -backends/piper: docker-build-piper docker-save-piper build - ./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)" - -backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build - ./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)" - -backends/whisper: docker-build-whisper docker-save-whisper build - ./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)" - -backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build - ./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)" - -backends/local-store: docker-build-local-store docker-save-local-store build - ./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)" - -backends/huggingface: docker-build-huggingface docker-save-huggingface build - ./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)" - -backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build - ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" - -backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build - ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" - -backends/kokoro: docker-build-kokoro docker-save-kokoro build - ./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)" - -backends/llama-cpp-darwin: build - bash ./scripts/build-llama-cpp-darwin.sh - ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" - 
######################################################## ## AIO tests ######################################################## @@ -361,6 +324,51 @@ docker-image-intel: ## Backends ######################################################## + +backends/diffusers: docker-build-diffusers docker-save-diffusers build + ./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)" + +backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" + +backends/piper: docker-build-piper docker-save-piper build + ./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)" + +backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build + ./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)" + +backends/whisper: docker-build-whisper docker-save-whisper build + ./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)" + +backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build + ./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)" + +backends/local-store: docker-build-local-store docker-save-local-store build + ./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)" + +backends/huggingface: docker-build-huggingface docker-save-huggingface build + ./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)" + +backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build + ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" + +backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" + +backends/kokoro: docker-build-kokoro docker-save-kokoro build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)" + +backends/llama-cpp-darwin: build + bash ./scripts/build/llama-cpp-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" + +build-darwin-python-backend: + bash ./scripts/build/python-darwin.sh + +backends/mlx: build + BACKEND=mlx BUILD_TYPE=mps $(MAKE) build-darwin-python-backend + ./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)" + backend-images: mkdir -p backend-images diff --git a/backend/index.yaml b/backend/index.yaml index 8bedccb67d42..3fed08f275d4 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -127,6 +127,21 @@ nvidia: "cuda12-vllm" amd: "rocm-vllm" intel: "intel-vllm" +- &mlx + name: "mlx" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx" + icon: https://avatars.githubusercontent.com/u/102832242?s=200&v=4 + urls: + - https://github.com/ml-explore/mlx-lm + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-mlx + license: MIT + description: | + Run LLMs with MLX + tags: + - text-to-text + - LLM + - MLX - &rerankers name: "rerankers" alias: "rerankers" @@ -371,6 +386,11 @@ - text-to-speech - TTS license: apache-2.0 +- !!merge <<: *mlx + name: "mlx-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-mlx - !!merge <<: *kitten-tts name: "kitten-tts-development" uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" diff --git a/backend/python/bark/Makefile b/backend/python/bark/Makefile index 
ef4fff1bef9d..da996aabeef0 100644 --- a/backend/python/bark/Makefile +++ b/backend/python/bark/Makefile @@ -1,29 +1,23 @@ .PHONY: ttsbark -ttsbark: protogen +ttsbark: bash install.sh .PHONY: run -run: protogen +run: ttsbark @echo "Running bark..." bash run.sh @echo "bark run." .PHONY: test -test: protogen +test: ttsbark @echo "Testing bark..." bash test.sh @echo "bark tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/chatterbox/Makefile b/backend/python/chatterbox/Makefile index a69c0bcf58ca..be9330f8eac9 100644 --- a/backend/python/chatterbox/Makefile +++ b/backend/python/chatterbox/Makefile @@ -1,29 +1,23 @@ -.PHONY: coqui -coqui: protogen +.PHONY: chatterbox +chatterbox: bash install.sh .PHONY: run -run: protogen +run: chatterbox @echo "Running coqui..." bash run.sh @echo "coqui run." .PHONY: test -test: protogen +test: chatterbox @echo "Testing coqui..." bash test.sh @echo "coqui tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py index d7d241c302ac..0944202b9457 100644 --- a/backend/python/chatterbox/backend.py +++ b/backend/python/chatterbox/backend.py @@ -41,7 +41,9 @@ def LoadModel(self, request, context): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index daa47c3c2080..79430ad2d4f1 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,8 +17,16 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # +# You can switch between uv (conda-like) and pip installation methods by setting USE_PIP: +# USE_PIP=true source $(dirname $0)/../common/libbackend.sh +# + +PYTHON_VERSION="${PYTHON_VERSION:-3.10}" -PYTHON_VERSION="3.10" +# Default to uv if USE_PIP is not set +if [ "x${USE_PIP}" == "x" ]; then + USE_PIP=false +fi function init() { # Name of the backend (directory name) @@ -57,11 +65,6 @@ function init() { # - hipblas # - intel function getBuildProfile() { - if [ "x${BUILD_TYPE}" == "xl4t" ]; then - echo "l4t" - return 0 - fi - # First check if we are a cublas build, and if so report the correct build profile if [ x"${BUILD_TYPE}" == "xcublas" ]; then if [ ! -z ${CUDA_MAJOR_VERSION} ]; then @@ -81,7 +84,7 @@ function getBuildProfile() { fi # If for any other values of BUILD_TYPE, we don't need any special handling/discovery - if [ ! 
-z ${BUILD_TYPE} ]; then
+    if [ -n "${BUILD_TYPE}" ]; then
         echo ${BUILD_TYPE}
         return 0
     fi
@@ -95,18 +98,48 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
-        echo "virtualenv created"
+    if [ ! -d "${EDIR}/venv" ]; then
+        if [ "x${USE_PIP}" == "xtrue" ]; then
+            echo "Using pip and Python virtual environments"
+
+            # Use Python virtual environment with pip
+            interpreter="python3"
+            # prefer python${PYTHON_VERSION} if it is available
+
+            if command -v python${PYTHON_VERSION} &> /dev/null; then
+                interpreter="python${PYTHON_VERSION}"
+            fi
+            echo "Using interpreter: ${interpreter}"
+            ${interpreter} -m venv ${EDIR}/venv
+            source ${EDIR}/venv/bin/activate
+            ${interpreter} -m pip install --upgrade pip
+            echo "Python virtual environment created"
+        else
+            echo "Using uv package manager"
+            uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+            echo "uv virtual environment created"
+        fi
     fi
-
     # Source if we are not already in a Virtual env
     if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
         source ${EDIR}/venv/bin/activate
-        echo "virtualenv activated"
+        echo "Python virtual environment activated"
     fi
-    echo "activated virtualenv has been ensured"
+    echo "activated virtual environment has been ensured"
+}
+
+function runProtogen() {
+    ensureVenv
+
+    if [ "x${USE_PIP}" == "xtrue" ]; then
+        pip install grpcio-tools
+    else
+        uv pip install grpcio-tools
+    fi
+    pushd ${EDIR}
+    python3 -m grpc_tools.protoc -I../../ -I./ --python_out=. --grpc_python_out=. backend.proto
+    popd
 }
 # installRequirements looks for several requirements files and if they exist runs the install for them in order
 # - requirements-${BUILD_TYPE}.txt
 # - requirements-${BUILD_PROFILE}.txt
 #
-# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
+# BUILD_PROFILE is a more specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
 # it can also include some options that we do not have BUILD_TYPES for, ex: intel
 #
 # NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index.
@@ -158,10 +191,18 @@ function installRequirements() { for reqFile in ${requirementFiles[@]}; do if [ -f ${reqFile} ]; then echo "starting requirements install for ${reqFile}" - uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + if [ "x${USE_PIP}" == "xtrue" ]; then + # Use pip for installation + pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + else + # Use uv for installation + uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile} + fi echo "finished requirements install for ${reqFile}" fi done + + runProtogen } # startBackend discovers and runs the backend GRPC server diff --git a/backend/python/common/template/Makefile b/backend/python/common/template/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/common/template/Makefile +++ b/backend/python/common/template/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/common/template/protogen.sh b/backend/python/common/template/protogen.sh index 0569b6c6e4b3..cba7791cbce3 100644 --- a/backend/python/common/template/protogen.sh +++ b/backend/python/common/template/protogen.sh @@ -8,6 +8,4 @@ else source $backend_dir/../common/libbackend.sh fi -ensureVenv - -python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto \ No newline at end of file +runProtogen \ No newline at end of file diff --git a/backend/python/coqui/Makefile b/backend/python/coqui/Makefile index a69c0bcf58ca..6915b0f9f896 100644 --- a/backend/python/coqui/Makefile +++ b/backend/python/coqui/Makefile @@ -1,29 +1,23 @@ .PHONY: coqui -coqui: protogen +coqui: bash install.sh .PHONY: run -run: protogen +run: coqui @echo "Running coqui..." bash run.sh @echo "coqui run." .PHONY: test -test: protogen +test: coqui @echo "Testing coqui..." bash test.sh @echo "coqui tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/coqui/backend.py b/backend/python/coqui/backend.py index f940f8e0a403..df115adb5030 100644 --- a/backend/python/coqui/backend.py +++ b/backend/python/coqui/backend.py @@ -40,7 +40,9 @@ def LoadModel(self, request, context): else: print("CUDA is not available", file=sys.stderr) device = "cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") diff --git a/backend/python/diffusers/Makefile b/backend/python/diffusers/Makefile index 01156f876f00..f9ded4a1cff7 100644 --- a/backend/python/diffusers/Makefile +++ b/backend/python/diffusers/Makefile @@ -12,28 +12,22 @@ export SKIP_CONDA=1 endif .PHONY: diffusers -diffusers: protogen +diffusers: bash install.sh .PHONY: run -run: protogen +run: diffusers @echo "Running diffusers..." bash run.sh @echo "Diffusers run." 
-test: protogen +test: diffusers bash test.sh -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/diffusers/backend.py b/backend/python/diffusers/backend.py index 185838209895..ef5f1b5c07ce 100755 --- a/backend/python/diffusers/backend.py +++ b/backend/python/diffusers/backend.py @@ -368,6 +368,9 @@ def LoadModel(self, request, context): device = "cpu" if not request.CUDA else "cuda" if XPU: device = "xpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" self.device = device if request.LoraAdapter: # Check if its a local file and not a directory ( we load lora differently for a safetensor file ) diff --git a/backend/python/exllama2/Makefile b/backend/python/exllama2/Makefile index 68a18f3aa855..15ba38d120f3 100644 --- a/backend/python/exllama2/Makefile +++ b/backend/python/exllama2/Makefile @@ -1,23 +1,17 @@ .PHONY: exllama2 -exllama2: protogen +exllama2: bash install.sh .PHONY: run -run: protogen +run: exllama2 @echo "Running exllama2..." bash run.sh @echo "exllama2 run." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean $(RM) -r venv source __pycache__ \ No newline at end of file diff --git a/backend/python/faster-whisper/Makefile b/backend/python/faster-whisper/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/faster-whisper/Makefile +++ b/backend/python/faster-whisper/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/faster-whisper/backend.py b/backend/python/faster-whisper/backend.py index b73664ab88f7..808f29238207 100755 --- a/backend/python/faster-whisper/backend.py +++ b/backend/python/faster-whisper/backend.py @@ -10,7 +10,7 @@ import os import backend_pb2 import backend_pb2_grpc - +import torch from faster_whisper import WhisperModel import grpc @@ -35,7 +35,9 @@ def LoadModel(self, request, context): # device = "cuda" if request.CUDA else "cpu" if request.CUDA: device = "cuda" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" try: print("Preparing models, please wait", file=sys.stderr) self.model = WhisperModel(request.Model, device=device, compute_type="float16") diff --git a/backend/python/kitten-tts/Makefile b/backend/python/kitten-tts/Makefile index f05fc191698f..021a9679bfd2 100644 --- a/backend/python/kitten-tts/Makefile +++ b/backend/python/kitten-tts/Makefile @@ -1,29 +1,23 @@ .PHONY: kitten-tts -kitten-tts: protogen +kitten-tts: bash install.sh .PHONY: run -run: protogen +run: kitten-tts @echo "Running kitten-tts..." bash run.sh @echo "kitten-tts run." 
.PHONY: test -test: protogen +test: kitten-tts @echo "Testing kitten-tts..." bash test.sh @echo "kitten-tts tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/kitten-tts/backend.py b/backend/python/kitten-tts/backend.py index 775f85f57d0b..b31023c8cac6 100644 --- a/backend/python/kitten-tts/backend.py +++ b/backend/python/kitten-tts/backend.py @@ -33,18 +33,6 @@ def Health(self, request, context): return backend_pb2.Reply(message=bytes("OK", 'utf-8')) def LoadModel(self, request, context): - # Get device - # device = "cuda" if request.CUDA else "cpu" - if torch.cuda.is_available(): - print("CUDA is available", file=sys.stderr) - device = "cuda" - else: - print("CUDA is not available", file=sys.stderr) - device = "cpu" - - if not torch.cuda.is_available() and request.CUDA: - return backend_pb2.Result(success=False, message="CUDA is not available") - self.AudioPath = None # List available KittenTTS models print("Available KittenTTS voices: expr-voice-2-m, expr-voice-2-f, expr-voice-3-m, expr-voice-3-f, expr-voice-4-m, expr-voice-4-f, expr-voice-5-m, expr-voice-5-f") diff --git a/backend/python/kokoro/Makefile b/backend/python/kokoro/Makefile index 660aabc34cec..7e1b238228b1 100644 --- a/backend/python/kokoro/Makefile +++ b/backend/python/kokoro/Makefile @@ -1,29 +1,23 @@ .PHONY: kokoro -kokoro: protogen +kokoro: bash install.sh .PHONY: run -run: protogen +run: kokoro @echo "Running kokoro..." bash run.sh @echo "kokoro run." .PHONY: test -test: protogen +test: kokoro @echo "Testing kokoro..." bash test.sh @echo "kokoro tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/kokoro/backend.py b/backend/python/kokoro/backend.py index 83a3f3326fbd..32aefa558e8a 100644 --- a/backend/python/kokoro/backend.py +++ b/backend/python/kokoro/backend.py @@ -33,17 +33,6 @@ def Health(self, request, context): return backend_pb2.Reply(message=bytes("OK", 'utf-8')) def LoadModel(self, request, context): - # Get device - if torch.cuda.is_available(): - print("CUDA is available", file=sys.stderr) - device = "cuda" - else: - print("CUDA is not available", file=sys.stderr) - device = "cpu" - - if not torch.cuda.is_available() and request.CUDA: - return backend_pb2.Result(success=False, message="CUDA is not available") - try: print("Preparing Kokoro TTS pipeline, please wait", file=sys.stderr) # empty dict diff --git a/backend/python/mlx/Makefile b/backend/python/mlx/Makefile new file mode 100644 index 000000000000..06f3bf614854 --- /dev/null +++ b/backend/python/mlx/Makefile @@ -0,0 +1,23 @@ +.PHONY: mlx +mlx: + bash install.sh + +.PHONY: run +run: + @echo "Running mlx..." + bash run.sh + @echo "mlx run." + +.PHONY: test +test: + @echo "Testing mlx..." + bash test.sh + @echo "mlx tested." 
+ +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/mlx/backend.py b/backend/python/mlx/backend.py new file mode 100644 index 000000000000..84024b387f29 --- /dev/null +++ b/backend/python/mlx/backend.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +import asyncio +from concurrent import futures +import argparse +import signal +import sys +import os +from typing import List +import time + +import backend_pb2 +import backend_pb2_grpc + +import grpc +from mlx_lm import load, generate, stream_generate +from mlx_lm.sample_utils import make_sampler +from mlx_lm.models.cache import make_prompt_cache +import mlx.core as mx +import base64 +import io + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + A gRPC servicer that implements the Backend service defined in backend.proto. + """ + + def _is_float(self, s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False + + def _is_int(self, s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False + + def Health(self, request, context): + """ + Returns a health check message. + + Args: + request: The health check request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The health check reply. + """ + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + async def LoadModel(self, request, context): + """ + Loads a language model using MLX. + + Args: + request: The load model request. + context: The gRPC context. + + Returns: + backend_pb2.Result: The load model result. 
+ """ + try: + print(f"Loading MLX model: {request.Model}", file=sys.stderr) + print(f"Request: {request}", file=sys.stderr) + + # Parse options like in the diffusers backend + options = request.Options + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We store all the options in a dict for later use + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon to handle values with colons + + # Convert numeric values to appropriate types + if self._is_float(value): + value = float(value) + elif self._is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + + self.options[key] = value + + print(f"Options: {self.options}", file=sys.stderr) + + # Build tokenizer config for MLX using options + tokenizer_config = {} + + # Handle trust_remote_code from request or options + if request.TrustRemoteCode or self.options.get("trust_remote_code", False): + tokenizer_config["trust_remote_code"] = True + + # Handle EOS token from options + if "eos_token" in self.options: + tokenizer_config["eos_token"] = self.options["eos_token"] + + # Handle other tokenizer config options + for key in ["pad_token", "bos_token", "unk_token", "sep_token", "cls_token", "mask_token"]: + if key in self.options: + tokenizer_config[key] = self.options[key] + + # Load model and tokenizer using MLX + if tokenizer_config: + print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr) + self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config) + else: + self.model, self.tokenizer = load(request.Model) + + # Initialize prompt cache for efficient generation + max_kv_size = self.options.get("max_kv_size", None) + self.prompt_cache = make_prompt_cache(self.model, max_kv_size) + + except Exception as err: + print(f"Error loading MLX model {err=}, {type(err)=}", file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Error loading MLX model: {err}") + + print("MLX model loaded successfully", file=sys.stderr) + return backend_pb2.Result(message="MLX model loaded successfully", success=True) + + async def Predict(self, request, context): + """ + Generates text based on the given prompt and sampling parameters using MLX. + + Args: + request: The predict request. + context: The gRPC context. + + Returns: + backend_pb2.Reply: The predict result. + """ + try: + # Prepare the prompt + prompt = self._prepare_prompt(request) + + # Build generation parameters using request attributes and options + max_tokens, sampler_params = self._build_generation_params(request) + + print(f"Generating text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr) + + # Create sampler with parameters + sampler = make_sampler(**sampler_params) + + # Generate text using MLX with proper parameters + response = generate( + self.model, + self.tokenizer, + prompt=prompt, + max_tokens=max_tokens, + sampler=sampler, + prompt_cache=self.prompt_cache, + verbose=False + ) + + return backend_pb2.Reply(message=bytes(response, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX Predict: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Generation failed: {str(e)}") + return backend_pb2.Reply(message=bytes("", encoding='utf-8')) + + def Embedding(self, request, context): + """ + A gRPC method that calculates embeddings for a given sentence. 
+ + Note: MLX-LM doesn't support embeddings directly. This method returns an error. + + Args: + request: An EmbeddingRequest object that contains the request parameters. + context: A grpc.ServicerContext object that provides information about the RPC. + + Returns: + An EmbeddingResult object that contains the calculated embeddings. + """ + print("Embeddings not supported in MLX backend", file=sys.stderr) + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Embeddings are not supported in the MLX backend.") + return backend_pb2.EmbeddingResult() + + async def PredictStream(self, request, context): + """ + Generates text based on the given prompt and sampling parameters, and streams the results using MLX. + + Args: + request: The predict stream request. + context: The gRPC context. + + Yields: + backend_pb2.Reply: Streaming predict results. + """ + try: + # Prepare the prompt + prompt = self._prepare_prompt(request) + + # Build generation parameters using request attributes and options + max_tokens, sampler_params = self._build_generation_params(request, default_max_tokens=512) + + print(f"Streaming text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr) + + # Create sampler with parameters + sampler = make_sampler(**sampler_params) + + # Stream text generation using MLX with proper parameters + for response in stream_generate( + self.model, + self.tokenizer, + prompt=prompt, + max_tokens=max_tokens, + sampler=sampler, + prompt_cache=self.prompt_cache, + ): + yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8')) + + except Exception as e: + print(f"Error in MLX PredictStream: {e}", file=sys.stderr) + context.set_code(grpc.StatusCode.INTERNAL) + context.set_details(f"Streaming generation failed: {str(e)}") + yield backend_pb2.Reply(message=bytes("", encoding='utf-8')) + + def _prepare_prompt(self, request): + """ + Prepare the prompt for MLX generation, handling chat templates if needed. + + Args: + request: The gRPC request containing prompt and message information. + + Returns: + str: The prepared prompt. + """ + # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template + if not request.Prompt and request.UseTokenizerTemplate and request.Messages: + # Convert gRPC messages to the format expected by apply_chat_template + messages = [] + for msg in request.Messages: + messages.append({"role": msg.role, "content": msg.content}) + + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + return prompt + else: + return request.Prompt + + + + + + def _build_generation_params(self, request, default_max_tokens=200): + """ + Build generation parameters from request attributes and options. + + Args: + request: The gRPC request. + default_max_tokens: Default max_tokens if not specified. 
+ + Returns: + tuple: (max_tokens, sampler_params dict) + """ + # Extract max_tokens + max_tokens = getattr(request, 'Tokens', default_max_tokens) + if max_tokens == 0: + max_tokens = default_max_tokens + + # Extract sampler parameters from request attributes + temp = getattr(request, 'Temperature', 0.0) + if temp == 0.0: + temp = 0.6 # Default temperature + + top_p = getattr(request, 'TopP', 0.0) + if top_p == 0.0: + top_p = 1.0 # Default top_p + + # Initialize sampler parameters + sampler_params = { + 'temp': temp, + 'top_p': top_p, + 'xtc_threshold': 0.0, + 'xtc_probability': 0.0, + } + + # Add seed if specified + seed = getattr(request, 'Seed', 0) + if seed != 0: + mx.random.seed(seed) + + # Override with options if available + if hasattr(self, 'options'): + # Max tokens from options + if 'max_tokens' in self.options: + max_tokens = self.options['max_tokens'] + + # Sampler parameters from options + sampler_option_mapping = { + 'temp': 'temp', + 'temperature': 'temp', # alias + 'top_p': 'top_p', + 'xtc_threshold': 'xtc_threshold', + 'xtc_probability': 'xtc_probability', + } + + for option_key, param_key in sampler_option_mapping.items(): + if option_key in self.options: + sampler_params[param_key] = self.options[option_key] + + # Handle seed from options + if 'seed' in self.options: + mx.random.seed(self.options['seed']) + + # Special tokens for XTC sampling (if tokenizer has eos_token_ids) + xtc_special_tokens = [] + if hasattr(self.tokenizer, 'eos_token_ids') and self.tokenizer.eos_token_ids: + xtc_special_tokens = list(self.tokenizer.eos_token_ids) + elif hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + xtc_special_tokens = [self.tokenizer.eos_token_id] + + # Add newline token if available + try: + newline_tokens = self.tokenizer.encode("\n") + xtc_special_tokens.extend(newline_tokens) + except: + pass # Skip if encoding fails + + sampler_params['xtc_special_tokens'] = xtc_special_tokens + + return max_tokens, sampler_params + +async def serve(address): + # Start asyncio gRPC server + server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + # Add the servicer to the server + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + # Bind the server to the address + server.add_insecure_port(address) + + # Gracefully shutdown the server on SIGTERM or SIGINT + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler( + sig, lambda: asyncio.ensure_future(server.stop(5)) + ) + + # Start the server + await server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + # Wait for the server to be terminated + await server.wait_for_termination() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." 
+ ) + args = parser.parse_args() + + asyncio.run(serve(args.addr)) diff --git a/backend/python/rfdetr/protogen.sh b/backend/python/mlx/install.sh old mode 100644 new mode 100755 similarity index 64% rename from backend/python/rfdetr/protogen.sh rename to backend/python/mlx/install.sh index 0569b6c6e4b3..253ee0c13f1b --- a/backend/python/rfdetr/protogen.sh +++ b/backend/python/mlx/install.sh @@ -1,13 +1,15 @@ #!/bin/bash set -e +USE_PIP=true +PYTHON_VERSION="" + backend_dir=$(dirname $0) + if [ -d $backend_dir/common ]; then source $backend_dir/common/libbackend.sh else source $backend_dir/../common/libbackend.sh fi -ensureVenv - -python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto \ No newline at end of file +installRequirements diff --git a/backend/python/mlx/requirements-mps.txt b/backend/python/mlx/requirements-mps.txt new file mode 100644 index 000000000000..22737f5fdda7 --- /dev/null +++ b/backend/python/mlx/requirements-mps.txt @@ -0,0 +1 @@ +mlx-lm \ No newline at end of file diff --git a/backend/python/mlx/requirements.txt b/backend/python/mlx/requirements.txt new file mode 100644 index 000000000000..f1771cc4adb4 --- /dev/null +++ b/backend/python/mlx/requirements.txt @@ -0,0 +1,4 @@ +grpcio==1.71.0 +protobuf +certifi +setuptools \ No newline at end of file diff --git a/backend/python/mlx/run.sh b/backend/python/mlx/run.sh new file mode 100755 index 000000000000..fc88f97da712 --- /dev/null +++ b/backend/python/mlx/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +startBackend $@ \ No newline at end of file diff --git a/backend/python/mlx/test.py b/backend/python/mlx/test.py new file mode 100644 index 000000000000..827aa71a3e33 --- /dev/null +++ b/backend/python/mlx/test.py @@ -0,0 +1,146 @@ +import unittest +import subprocess +import time +import backend_pb2 +import backend_pb2_grpc + +import grpc + +import unittest +import subprocess +import time +import grpc +import backend_pb2_grpc +import backend_pb2 + +class TestBackendServicer(unittest.TestCase): + """ + TestBackendServicer is the class that tests the gRPC service. + + This class contains methods to test the startup and shutdown of the gRPC service. 
+ """ + def setUp(self): + self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"]) + time.sleep(10) + + def tearDown(self) -> None: + self.service.terminate() + self.service.wait() + + def test_server_startup(self): + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.Health(backend_pb2.HealthMessage()) + self.assertEqual(response.message, b'OK') + except Exception as err: + print(err) + self.fail("Server failed to start") + finally: + self.tearDown() + def test_load_model(self): + """ + This method tests if the model is loaded successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + self.assertEqual(response.message, "Model loaded successfully") + except Exception as err: + print(err) + self.fail("LoadModel service failed") + finally: + self.tearDown() + + def test_text(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + req = backend_pb2.PredictOptions(Prompt="The capital of France is") + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + except Exception as err: + print(err) + self.fail("text service failed") + finally: + self.tearDown() + + def test_sampling_params(self): + """ + This method tests if all sampling parameters are correctly processed + NOTE: this does NOT test for correctness, just that we received a compatible response + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m")) + self.assertTrue(response.success) + + req = backend_pb2.PredictOptions( + Prompt="The capital of France is", + TopP=0.8, + Tokens=50, + Temperature=0.7, + TopK=40, + PresencePenalty=0.1, + FrequencyPenalty=0.2, + RepetitionPenalty=1.1, + MinP=0.05, + Seed=42, + StopPrompts=["\n"], + StopTokenIds=[50256], + BadWords=["badword"], + IncludeStopStrInOutput=True, + IgnoreEOS=True, + MinTokens=5, + Logprobs=5, + PromptLogprobs=5, + SkipSpecialTokens=True, + SpacesBetweenSpecialTokens=True, + TruncatePromptTokens=10, + GuidedDecoding=True, + N=2, + ) + resp = stub.Predict(req) + self.assertIsNotNone(resp.message) + self.assertIsNotNone(resp.logprobs) + except Exception as err: + print(err) + self.fail("sampling params service failed") + finally: + self.tearDown() + + + def test_embedding(self): + """ + This method tests if the embeddings are generated successfully + """ + try: + self.setUp() + with grpc.insecure_channel("localhost:50051") as channel: + stub = backend_pb2_grpc.BackendStub(channel) + response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct")) + self.assertTrue(response.success) + embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.") + embedding_response = stub.Embedding(embedding_request) + self.assertIsNotNone(embedding_response.embeddings) + # assert that is a list of floats + self.assertIsInstance(embedding_response.embeddings, list) + # assert that 
the list is not empty + self.assertTrue(len(embedding_response.embeddings) > 0) + except Exception as err: + print(err) + self.fail("Embedding service failed") + finally: + self.tearDown() \ No newline at end of file diff --git a/backend/python/mlx/test.sh b/backend/python/mlx/test.sh new file mode 100755 index 000000000000..f31ae54e47dc --- /dev/null +++ b/backend/python/mlx/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) + +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/backend/python/rerankers/Makefile b/backend/python/rerankers/Makefile index 82de822ff326..c9a1d30104b4 100644 --- a/backend/python/rerankers/Makefile +++ b/backend/python/rerankers/Makefile @@ -1,30 +1,24 @@ .PHONY: rerankers -rerankers: protogen +rerankers: bash install.sh .PHONY: run -run: protogen +run: rerankers @echo "Running rerankers..." bash run.sh @echo "rerankers run." # It is not working well by using command line. It only6 works with IDE like VSCode. .PHONY: test -test: protogen +test: rerankers @echo "Testing rerankers..." bash test.sh @echo "rerankers tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/rfdetr/Makefile b/backend/python/rfdetr/Makefile index c0e5169f75c4..f6b9ddc6c888 100644 --- a/backend/python/rfdetr/Makefile +++ b/backend/python/rfdetr/Makefile @@ -3,18 +3,11 @@ .PHONY: install install: bash install.sh - $(MAKE) protogen - -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - bash protogen.sh - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/transformers/Makefile b/backend/python/transformers/Makefile index 8d3f7fd73c9d..6897baf0c9b4 100644 --- a/backend/python/transformers/Makefile +++ b/backend/python/transformers/Makefile @@ -1,30 +1,24 @@ .PHONY: transformers -transformers: protogen +transformers: bash install.sh .PHONY: run -run: protogen +run: transformers @echo "Running transformers..." bash run.sh @echo "transformers run." # It is not working well by using command line. It only6 works with IDE like VSCode. .PHONY: test -test: protogen +test: transformers @echo "Testing transformers..." bash test.sh @echo "transformers tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py index ef8a2fd40b6e..05713b917d2a 100644 --- a/backend/python/transformers/backend.py +++ b/backend/python/transformers/backend.py @@ -94,7 +94,9 @@ def LoadModel(self, request, context): self.SentenceTransformer = False device_map="cpu" - + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device_map = "mps" quantization = None autoTokenizer = True diff --git a/backend/python/vllm/Makefile b/backend/python/vllm/Makefile index bb57a19266bd..c7c1b6869c02 100644 --- a/backend/python/vllm/Makefile +++ b/backend/python/vllm/Makefile @@ -1,29 +1,23 @@ .PHONY: vllm -vllm: protogen +vllm: bash install.sh .PHONY: run -run: protogen +run: vllm @echo "Running vllm..." bash run.sh @echo "vllm run." .PHONY: test -test: protogen +test: vllm @echo "Testing vllm..." bash test.sh @echo "vllm tested." -.PHONY: protogen -protogen: backend_pb2_grpc.py backend_pb2.py - .PHONY: protogen-clean protogen-clean: $(RM) backend_pb2_grpc.py backend_pb2.py -backend_pb2_grpc.py backend_pb2.py: - python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto - .PHONY: clean clean: protogen-clean rm -rf venv __pycache__ \ No newline at end of file diff --git a/core/gallery/gallery.go b/core/gallery/gallery.go index a80550102b17..e746f71a347f 100644 --- a/core/gallery/gallery.go +++ b/core/gallery/gallery.go @@ -141,14 +141,15 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) { var backends []*GalleryBackend + systemBackends, err := ListSystemBackends(systemState) + if err != nil { + return nil, err + } + // Get backends from galleries for _, gallery := range galleries { - galleryBackends, err := getGalleryElements[*GalleryBackend](gallery, systemState.Backend.BackendsPath, func(backend *GalleryBackend) bool { - backends, err := ListSystemBackends(systemState) - if err != nil { - return false - } - return backends.Exists(backend.GetName()) + galleryBackends, err := getGalleryElements(gallery, systemState.Backend.BackendsPath, func(backend *GalleryBackend) bool { + return systemBackends.Exists(backend.GetName()) }) if err != nil { return nil, err diff --git a/core/http/endpoints/localai/welcome.go b/core/http/endpoints/localai/welcome.go index 23efd0788dc5..04f72743e34e 100644 --- a/core/http/endpoints/localai/welcome.go +++ b/core/http/endpoints/localai/welcome.go @@ -16,13 +16,9 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig, modelConfigs := cl.GetAllModelsConfigs() galleryConfigs := map[string]*gallery.ModelConfig{} - backends, _ := gallery.AvailableBackends(appConfig.BackendGalleries, appConfig.SystemState) - - installedBackends := gallery.GalleryElements[*gallery.GalleryBackend]{} - for _, b := range backends { - if b.Installed { - installedBackends = append(installedBackends, b) - } + installedBackends, err := gallery.ListSystemBackends(appConfig.SystemState) + if err != nil { + return err } for _, m := range modelConfigs { diff --git a/scripts/build-llama-cpp-darwin.sh b/scripts/build/llama-cpp-darwin.sh similarity index 100% rename from scripts/build-llama-cpp-darwin.sh rename to scripts/build/llama-cpp-darwin.sh diff --git a/scripts/build/python-darwin.sh 
b/scripts/build/python-darwin.sh
new file mode 100644
index 000000000000..6166a2630322
--- /dev/null
+++ b/scripts/build/python-darwin.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+set -ex
+
+IMAGE_NAME="${IMAGE_NAME:-localai/llama-cpp-darwin}"
+mkdir -p backend-images
+make -C backend/python/${BACKEND}
+
+cp -rfv backend/python/common backend/python/${BACKEND}/
+
+PLATFORMARCH="${PLATFORMARCH:-darwin/arm64}"
+
+./local-ai util create-oci-image \
+    backend/python/${BACKEND}/. \
+    --output ./backend-images/${BACKEND}.tar \
+    --image-name $IMAGE_NAME \
+    --platform $PLATFORMARCH
+
+make -C backend/python/${BACKEND} clean
+
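Usage sketch (derived only from the Makefile targets and scripts added above, not from any extra documentation): on an Apple Silicon host the MLX backend can be built and installed locally with the new targets, which is the same flow the mlx-darwin CI job drives through backend_build_darwin.yml before pushing the resulting tar with crane:

# Build local-ai, package the MLX backend as an OCI tar, and install it into the local backend store
make backends/mlx

# Equivalent manual steps (what the target expands to):
BACKEND=mlx BUILD_TYPE=mps make build-darwin-python-backend   # wraps scripts/build/python-darwin.sh
./local-ai backends install "ocifile://$(pwd)/backend-images/mlx.tar"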