
Commit c5741dc

Add HPA scaling support for ChatQnA / vLLM

Signed-off-by: Eero Tamminen <[email protected]>
1 parent 10af11a, commit c5741dc

9 files changed: +110 -7 lines

helm-charts/chatqna/gaudi-vllm-values.yaml

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ tgi:
 
 vllm:
   enabled: true
+  accelDevice: "gaudi"
   image:
     repository: opea/vllm-gaudi
     tag: "latest"

helm-charts/chatqna/hpa-values.yaml

Lines changed: 5 additions & 1 deletion

@@ -4,7 +4,7 @@
 # Enable HorizontalPodAutoscaler (HPA)
 #
 # That will overwrite named PrometheusAdapter configMap with ChatQnA specific
-# custom metric queries for embedding, reranking, tgi services.
+# custom metric queries for embedding, reranking, and LLM services.
 #
 # Default upstream configMap is in:
 # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
@@ -15,6 +15,10 @@ autoscaling:
 # Override values in specific subcharts
 
 # Enabling "autoscaling" for any of the subcharts requires enabling it also above!
+vllm:
+  autoscaling:
+    maxReplicas: 4
+    enabled: true
 tgi:
   autoscaling:
     maxReplicas: 4
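
A rough sketch of how these overrides would be layered on a ChatQnA deployment; the release name, chart paths, and the explicit global.monitoring flag are illustrative assumptions, not part of this commit (the vLLM README change below notes that monitoring is required for HPA):

# assumes a local checkout with the chatqna chart in ./chatqna
helm install chatqna ./chatqna \
  --set global.monitoring=true \
  -f ./chatqna/gaudi-vllm-values.yaml \
  -f ./chatqna/hpa-values.yaml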

helm-charts/chatqna/templates/custom-metrics-configmap.yaml

Lines changed: 18 additions & 5 deletions

@@ -13,10 +13,27 @@ metadata:
 data:
   config.yaml: |
     rules:
-    {{- if .Values.tgi.autoscaling.enabled }}
+    {{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
     # check metric with:
     # kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
     #
+    - seriesQuery: '{__name__="vllm:time_per_output_token_seconds_sum",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
+      # Average output token latency from vLLM histograms, over 1 min
+      # (interval should be at least 4x serviceMonitor query interval,
+      # 0.001 divider add is to make sure there's always a valid value)
+      metricsQuery: 'rate(vllm:time_per_output_token_seconds_sum{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^vllm:time_per_output_token_seconds_sum
+        as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_token_latency"
+      resources:
+        # HPA needs both namespace + suitable object resource for its query paths:
+        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
+        # (pod is not suitable object type for matching as each instance has different name)
+        overrides:
+          namespace: {resource: "namespace"}
+          service: {resource: "service"}
+    {{- end }}
+    {{- if and .Values.tgi.enabled .Values.tgi.autoscaling.enabled }}
     {{- if .Values.tgi.accelDevice }}
     - seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
       # TGI instances queue_size sum
@@ -27,16 +44,12 @@ data:
     {{- else }}
     - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
       # Average request latency from TGI histograms, over 1 min
-      # (0.001 divider add is to make sure there's always a valid value)
       metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
       name:
         matches: ^tgi_request_inference_duration_sum
         as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
     {{- end }}
       resources:
-        # HPA needs both namespace + suitable object resource for its query paths:
-        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
-        # (pod is not suitable object type for matching as each instance has different name)
         overrides:
           namespace: {resource: "namespace"}
           service: {resource: "service"}
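
After PrometheusAdapter restarts with the overwritten ConfigMap, the new rule can be checked with the command from the in-file comment above. What `<metric>` renders to depends on the release; for example, if "vllm.fullname" resolves to "chatqna-vllm", the metric name would presumably be chatqna_vllm_token_latency:

# verify the custom metric is served (the concrete metric name here is an assumption based on the release name)
kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/chatqna_vllm_token_latency | jq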

helm-charts/common/vllm/README.md

Lines changed: 2 additions & 1 deletion

@@ -51,4 +51,5 @@ curl http://localhost:2080/v1/completions \
 | global.modelUseHostPath | string | `""` | Cached models directory, vllm will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
 | image.repository | string | `"opea/vllm"` | |
 | image.tag | string | `"latest"` | |
-| global.monitoring | bool | `false` | Enable usage metrics for the service. See [monitoring instructions](../../monitoring.md) before enabling! |
+| autoscaling.enabled | bool | `false` | Enable HPA autoscaling for the service deployment based on metrics it provides. See [HPA instructions](../../HPA.md) before enabling! |
+| global.monitoring | bool | `false` | Enable usage metrics for the service. Required for HPA. See [monitoring instructions](../../monitoring.md) before enabling! |

helm-charts/common/vllm/gaudi-values.yaml

Lines changed: 2 additions & 0 deletions

@@ -5,6 +5,8 @@
 # This is a YAML-formatted file.
 # Declare variables to be passed into your templates.
 
+accelDevice: "gaudi"
+
 image:
   repository: opea/vllm-gaudi
   tag: "latest"

helm-charts/common/vllm/templates/_helpers.tpl

Lines changed: 7 additions & 0 deletions

@@ -30,6 +30,13 @@ Create chart name and version as used by the chart label.
 {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
 {{- end }}
 
+{{/*
+Convert chart name to a string suitable as metric prefix
+*/}}
+{{- define "vllm.metricPrefix" -}}
+{{- include "vllm.fullname" . | replace "-" "_" | regexFind "[a-zA-Z_:][a-zA-Z0-9_:]*" }}
+{{- end }}
+
 {{/*
 Common labels
 */}}
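
To see what this helper and the ConfigMap rule above render to without deploying anything, the parent chart can be templated locally; the release name and chart paths here are only placeholders:

# build subchart dependencies first if they are not vendored yet
helm dependency update ./chatqna
# render the chart with the vLLM + HPA values and show the generated token-latency metric names
helm template chatqna ./chatqna -f ./chatqna/gaudi-vllm-values.yaml -f ./chatqna/hpa-values.yaml | grep token_latency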

helm-charts/common/vllm/templates/deployment.yaml

Lines changed: 7 additions & 0 deletions

@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "vllm.labels" . | nindent 4 }}
 spec:
+  {{- if ne (int .Values.replicaCount) 1 }}
+  # remove if replica count should not be reset on pod update (e.g. with HPA)
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "vllm.selectorLabels" . | nindent 6 }}
@@ -125,3 +128,7 @@ spec:
         matchLabels:
           {{- include "vllm.selectorLabels" . | nindent 14 }}
   {{- end }}
+  {{- if not .Values.accelDevice }}
+  # extra time to finish processing buffered requests on CPU before pod is forcibly terminated
+  terminationGracePeriodSeconds: 120
+  {{- end }}

Lines changed: 57 additions & 0 deletions

@@ -0,0 +1,57 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "vllm.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "vllm.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "vllm.fullname" . }}
+      target:
+        # Metric is sum from all pods. "AverageValue" divides value returned from
+        # the custom metrics API by the number of Pods before comparing to the target:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
+        type: AverageValue
+        {{- if .Values.accelDevice }}
+        averageValue: 0.1
+        {{- else }}
+        # allow larger latencies with unaccelerated service
+        averageValue: 1.0
+        {{- end }}
+      metric:
+        name: {{ include "vllm.metricPrefix" . }}_token_latency
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 90
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      # Slow linear rampup in case additional CPU pods go to same node
+      # (i.e. interfere with each other)
+      - type: Pods
+        value: 1
+        periodSeconds: 90
+      #- type: Percent
+      #  value: 25
+      #  periodSeconds: 90
+{{- end }}
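
Once this template is enabled, the resulting autoscaler can be observed with plain kubectl; the object name below assumes "vllm.fullname" resolves to chatqna-vllm and will differ with other release names:

# list autoscalers, showing current metric value vs. target
kubectl get hpa
# show scaling events and the token-latency metric the HPA is acting on
kubectl describe hpa chatqna-vllm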

helm-charts/common/vllm/values.yaml

Lines changed: 11 additions & 0 deletions

@@ -7,6 +7,17 @@
 
 replicaCount: 1
 
+# Enabling HPA will:
+# - Ignore above replica count, as it will be controlled by HPA
+# - Add example HPA scaling rules with custom metrics thresholds
+# - Require custom metrics ConfigMap available in the main application chart
+autoscaling:
+  maxReplicas: 4
+  enabled: false
+
+# empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
+accelDevice: ""
+
 port: 2080
 shmSize: 1Gi
 image:
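
For the standalone vllm chart these defaults can be overridden at install time. A sketch only; the chart path and release name are assumptions, and as the values comment notes, the custom-metrics ConfigMap (and PrometheusAdapter setup) still have to come from the main application chart:

helm install vllm ./common/vllm \
  --set global.monitoring=true \
  --set autoscaling.enabled=true \
  --set autoscaling.maxReplicas=4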
