
Commit f38b1b0

lianhao authored and eero-t committed
Adapt to latest vllm changes
- Remove --enforce-eager on HPU to improve performance
- Refactor to the upstream docker entrypoint changes

Fixes issue #631.

Signed-off-by: Lianhao Lu <[email protected]>
1 parent 9408720 commit f38b1b0
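With the upstream image now starting the OpenAI API server itself, extraCmdArgs only needs to carry extra engine flags instead of the whole /bin/bash launch command. A minimal sketch of the resulting values, assuming the chart's existing vllm.extraCmdArgs key (model and flag values below are illustrative excerpts from this commit, not a complete file):

  # Illustrative sketch only: the image entrypoint launches
  # vllm.entrypoints.openai.api_server, so the chart appends flags rather than
  # supplying the full command, and --enforce-eager is dropped on HPU.
  vllm:
    LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
    extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128"]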

File tree

4 files changed (+4, -6 lines)


helm-charts/common/agent/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ tgi:
 vllm:
   enabled: false
   LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["/bin/bash", "-c", "python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model mistralai/Mistral-7B-Instruct-v0.3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral"]
+  extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"]

 replicaCount: 1
 llm_endpoint_url: ""

helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ vllm:
     tag: "latest"
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
   OMPI_MCA_btl_vader_single_copy_mechanism: none
-  extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+  extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
   resources:
     limits:
       habana.ai/gaudi: 1

helm-charts/common/vllm/gaudi-values.yaml

Lines changed: 1 addition & 3 deletions
@@ -13,9 +13,7 @@ image:

 # VLLM_CPU_KVCACHE_SPACE: "40"
 OMPI_MCA_btl_vader_single_copy_mechanism: none
-extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-# Workaround for current HPU image with start command /bin/bash
-# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
 resources:
   limits:
     habana.ai/gaudi: 1

helm-charts/common/vllm/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ resources: {}
   # cpu: 100m
   # memory: 128Mi

-extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]
+extraCmdArgs: []

 livenessProbe:
   httpGet:
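With the chart default now an empty list, eager mode can still be enabled for a specific deployment where it is wanted (e.g. for debugging); a hypothetical override in a custom values file, reusing the same extraCmdArgs mechanism:

  # Hypothetical override, not part of this commit: re-add the flags only for
  # deployments that actually want eager execution.
  extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]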
