Skip to content

Commit 8a4cc7d

Browse files
committed
WIP: add vLLM support for ChatQnA
For now vLLM replaces just TGI, but since it appears to support embedding as well, it may also be able to replace TEI-embed/-rerank. Signed-off-by: Eero Tamminen <[email protected]>
1 parent 29a0e29 commit 8a4cc7d

File tree

9 files changed

+106
-5
lines changed

9 files changed

+106
-5
lines changed

helm-charts/chatqna/Chart.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ dependencies:
1818
- name: tgi
1919
version: 1.0.0
2020
repository: "file://../common/tgi"
21+
condition: tgi.enabled
22+
- name: vllm
23+
version: 1.0.0
24+
repository: "file://../common/vllm"
25+
condition: vllm.enabled
2126
- name: tei
2227
version: 1.0.0
2328
repository: "file://../common/tei"

helm-charts/chatqna/README.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi
1111
- [teirerank](../common/teirerank/README.md)
1212
- [llm-uservice](../common/llm-uservice/README.md)
1313
- [tgi](../common/tgi/README.md)
14+
- [vllm](../common/vllm/README.md)
1415

1516
## Installing the Chart
1617

@@ -26,13 +27,15 @@ export MODELNAME="Intel/neural-chat-7b-v3-3"
2627
# If you would like to use the traditional UI, please change the image as well as the containerport within the values
2728
# append these at the end of the command "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173"
2829
helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
29-
# To use Gaudi device
30-
#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
30+
# To use Gaudi device with TGI
31+
#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-tgi-values.yaml
32+
# To use Gaudi device with vLLM
33+
#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml
3134
# To use Nvidia GPU
3235
#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
33-
# To include guardrail component in chatqna on Xeon
36+
# To include guardrail component in chatqna on Xeon with TGI
3437
#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
35-
# To include guardrail component in chatqna on Gaudi
38+
# To include guardrail component in chatqna on Gaudi with TGI
3639
#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
3740
```
3841

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
gaudi-tgi-values.yaml

helm-charts/chatqna/ci-gaudi-values.yaml

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
gaudi-vllm-values.yaml
File renamed without changes.
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Accelerate inferencing in heaviest components to improve performance
5+
# by overriding their subchart values
6+
7+
tgi:
8+
enabled: false
9+
10+
vllm:
11+
enabled: true
12+
image:
13+
repository: opea/vllm-gaudi
14+
tag: "latest"
15+
resources:
16+
limits:
17+
habana.ai/gaudi: 1
18+
startupProbe:
19+
initialDelaySeconds: 5
20+
periodSeconds: 5
21+
timeoutSeconds: 1
22+
failureThreshold: 120
23+
readinessProbe:
24+
initialDelaySeconds: 5
25+
periodSeconds: 5
26+
timeoutSeconds: 1
27+
livenessProbe:
28+
initialDelaySeconds: 5
29+
periodSeconds: 5
30+
timeoutSeconds: 1
31+
32+
# TODO: these are taken from GenAIExamples HPU manifest as-is
33+
# vLLM chart needs to adopt / apply relevant ones
34+
HABANA_LOGS: "/tmp/habana_logs"
35+
NUMBA_CACHE_DIR: "/tmp"
36+
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
37+
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
38+
HF_HOME: "/tmp/.cache/huggingface"
39+
GPU_MEMORY_UTILIZATION: "0.5"
40+
DTYPE: "auto"
41+
TENSOR_PARALLEL_SIZE: "1"
42+
BLOCK_SIZE: "128"
43+
MAX_NUM_SEQS: "256"
44+
MAX_SEQ_LEN_TO_CAPTURE: "2048"
45+
46+
47+
# Reranking: second largest bottleneck when reranking is in use
48+
# (i.e. query context docs have been uploaded with data-prep)
49+
#
50+
# TODO: could vLLM be used also for reranking / embedding?
51+
teirerank:
52+
accelDevice: "gaudi"
53+
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
54+
MAX_WARMUP_SEQUENCE_LENGTH: "512"
55+
image:
56+
repository: ghcr.io/huggingface/tei-gaudi
57+
tag: 1.5.0
58+
resources:
59+
limits:
60+
habana.ai/gaudi: 1
61+
securityContext:
62+
readOnlyRootFilesystem: false
63+
livenessProbe:
64+
timeoutSeconds: 1
65+
readinessProbe:
66+
timeoutSeconds: 1

helm-charts/chatqna/templates/deployment.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,19 @@ spec:
3434
- name: {{ .Release.Name }}
3535
env:
3636
- name: LLM_SERVER_HOST_IP
37+
{{- if .Values.vllm.enabled }}
38+
value: {{ .Release.Name }}-vllm
39+
{{- else }}
3740
value: {{ .Release.Name }}-tgi
41+
{{- end }}
3842
- name: LLM_SERVER_PORT
3943
value: "80"
4044
- name: LLM_MODEL
45+
{{- if .Values.vllm.enabled }}
46+
value: {{ .Values.vllm.LLM_MODEL_ID | quote }}
47+
{{- else }}
4148
value: {{ .Values.tgi.LLM_MODEL_ID | quote }}
49+
{{- end }}
4250
- name: RERANK_SERVER_HOST_IP
4351
value: {{ .Release.Name }}-teirerank
4452
- name: RERANK_SERVER_PORT

helm-charts/chatqna/values.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,25 @@ autoscaling:
4646

4747
# Override values in specific subcharts
4848
tgi:
49+
enabled: true
4950
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
51+
vllm:
52+
enabled: false
53+
# TODO: manifest in GenAIExamples uses "meta-llama/Meta-Llama-3-8B-Instruct" instead?
54+
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
55+
# TODO: these are non-redundant/non-broken options used by Agent component,
56+
# but I think their values should be handled inside vLLM component, with
57+
# deployment applying numbers set in configMap, based on values YAML file
58+
# variables.
59+
extraCmdArgs: [
60+
"--enforce-eager",
61+
"--tensor-parallel-size", "1",
62+
"--dtype", "auto",
63+
"--block-size", "128",
64+
"--max-num-seqs", "256",
65+
"--max-seq-len-to-capture", "2048",
66+
"--gpu-memory-utilization", "0.5"
67+
]
5068

5169
# disable guardrails-usvc by default
5270
# See guardrails-values.yaml for guardrail related options

0 commit comments

Comments
 (0)