Skip to content

Commit ae94a96

Browse files
committed
[wiztelemetry-monitoring-helper] add HAMi rules
Signed-off-by: frezes <[email protected]>
1 parent 004b2c6 commit ae94a96

File tree

6 files changed

+179
-9
lines changed

6 files changed

+179
-9
lines changed

charts/wiztelemetry-monitoring-helper/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.10.3
18+
version: 0.11.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.2.0"
24+
appVersion: "1.2.1"

charts/wiztelemetry-monitoring-helper/hack/sync_prometheus_rules.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ def new_representer(dumper, data):
142142
'wiztelemetry-kube-scheduler.rules': ' .Values.defaultRules.rules.wiztelemetry.scheduler',
143143
'wiztelemetry-ascend-npu.rules': ' .Values.defaultRules.rules.gpuDevice.ascendNPU',
144144
'wiztelemetry-cambricon-mlu.rules': ' .Values.defaultRules.rules.gpuDevice.cambriconMLU',
145-
'wiztelemetry-nvidia-gpu.rules': ' .Values.defaultRules.rules.gpuDevice.nvidiaGPU'
145+
'wiztelemetry-nvidia-gpu.rules': ' .Values.defaultRules.rules.gpuDevice.nvidiaGPU',
146+
'wiztelemetry-hami.rules': ' .Values.defaultRules.rules.gpuDevice.hami'
146147
}
147148

148149
alert_condition_map = {
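[File path not captured in this extract: newly added Helm template that renders the 'wiztelemetry-hami.rules' PrometheusRule]

Lines changed: 167 additions & 0 deletions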
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
{{- /*
2+
Generated from 'wiztelemetry-hami.rules' group from file://../../../ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml
3+
Do not change in-place! In order to change this file first read following link:
4+
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
5+
*/ -}}
6+
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
7+
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.gpuDevice.hami }}
8+
apiVersion: monitoring.coreos.com/v1
9+
kind: PrometheusRule
10+
metadata:
11+
name: {{ printf "%s-%s" (include "wiztelemetry-monitoring-helper.fullname" .) "wiztelemetry-hami.rules" | trunc 63 | trimSuffix "-" }}
12+
namespace: {{ template "wiztelemetry-monitoring-helper.namespace" . }}
13+
labels:
14+
app: {{ template "wiztelemetry-monitoring-helper.name" . }}
15+
{{ include "wiztelemetry-monitoring-helper.labels" . | indent 4 }}
16+
{{- if .Values.defaultRules.labels }}
17+
{{ toYaml .Values.defaultRules.labels | indent 4 }}
18+
{{- end }}
19+
{{- if .Values.defaultRules.annotations }}
20+
annotations:
21+
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
22+
{{- end }}
23+
spec:
24+
groups:
25+
- name: wiztelemetry-hami.rules
26+
rules:
27+
- expr: |-
28+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod, container) (
29+
label_replace(
30+
label_replace(
31+
label_replace(Device_utilization_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
32+
"pod",
33+
"$1",
34+
"podname",
35+
"(.*)"
36+
),
37+
"container",
38+
"$1",
39+
"ctrname",
40+
"(.*)"
41+
)
42+
)
43+
record: node_namespace_pod_container:container_gpu_utilization
44+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
45+
labels:
46+
{{- with .Values.defaultRules.additionalRuleLabels }}
47+
{{- toYaml . | nindent 8 }}
48+
{{- end }}
49+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
50+
{{- toYaml . | nindent 8 }}
51+
{{- end }}
52+
{{- end }}
53+
- expr: |-
54+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod, container) (
55+
label_replace(
56+
label_replace(
57+
label_replace(Device_memory_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
58+
"pod",
59+
"$1",
60+
"podname",
61+
"(.*)"
62+
),
63+
"container",
64+
"$1",
65+
"ctrname",
66+
"(.*)"
67+
)
68+
)
69+
record: node_namespace_pod_container:container_gpu_memory_usage
70+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
71+
labels:
72+
{{- with .Values.defaultRules.additionalRuleLabels }}
73+
{{- toYaml . | nindent 8 }}
74+
{{- end }}
75+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
76+
{{- toYaml . | nindent 8 }}
77+
{{- end }}
78+
{{- end }}
79+
- expr: |-
80+
label_replace(
81+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
82+
"device_num",
83+
"$1",
84+
"deviceidx",
85+
"(.*)"
86+
)
87+
record: node:vgpu_device:vgpu_allocated_utilization
88+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
89+
labels:
90+
{{- with .Values.defaultRules.additionalRuleLabels }}
91+
{{- toYaml . | nindent 8 }}
92+
{{- end }}
93+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
94+
{{- toYaml . | nindent 8 }}
95+
{{- end }}
96+
{{- end }}
97+
- expr: |-
98+
label_replace(
99+
label_replace(GPUDeviceCoreAllocated / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
100+
"device_num",
101+
"$1",
102+
"deviceidx",
103+
"(.*)"
104+
)
105+
record: node:vgpu_device:vgpu_core_allocated_utilization
106+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
107+
labels:
108+
{{- with .Values.defaultRules.additionalRuleLabels }}
109+
{{- toYaml . | nindent 8 }}
110+
{{- end }}
111+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
112+
{{- toYaml . | nindent 8 }}
113+
{{- end }}
114+
{{- end }}
115+
- expr: |-
116+
label_replace(
117+
label_replace(
118+
sum without (devicecores) (GPUDeviceMemoryAllocated) / GPUDeviceMemoryLimit,
119+
"node",
120+
"$1",
121+
"nodeid",
122+
"(.*)"
123+
),
124+
"device_num",
125+
"$1",
126+
"deviceidx",
127+
"(.*)"
128+
)
129+
record: node:vgpu_device:vgpu_memory_allocated_utilization
130+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
131+
labels:
132+
{{- with .Values.defaultRules.additionalRuleLabels }}
133+
{{- toYaml . | nindent 8 }}
134+
{{- end }}
135+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
136+
{{- toYaml . | nindent 8 }}
137+
{{- end }}
138+
{{- end }}
139+
- expr: |-
140+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
141+
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
142+
)
143+
record: node:node_gpu_allocated_num:sum
144+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
145+
labels:
146+
{{- with .Values.defaultRules.additionalRuleLabels }}
147+
{{- toYaml . | nindent 8 }}
148+
{{- end }}
149+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
150+
{{- toYaml . | nindent 8 }}
151+
{{- end }}
152+
{{- end }}
153+
- expr: |-
154+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
155+
kube_node_status_allocatable{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
156+
)
157+
record: node:node_gpu_num:sum
158+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
159+
labels:
160+
{{- with .Values.defaultRules.additionalRuleLabels }}
161+
{{- toYaml . | nindent 8 }}
162+
{{- end }}
163+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
164+
{{- toYaml . | nindent 8 }}
165+
{{- end }}
166+
{{- end }}
167+
{{- end }}

charts/wiztelemetry-monitoring-helper/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ defaultRules:
6666
ascendNPU: true
6767
cambriconMLU: true
6868
nvidiaGPU: true
69+
hami: true
6970

7071
additionalRuleGroupLabels:
7172
wiztelemetry:
@@ -79,6 +80,7 @@ defaultRules:
7980
ascendNPU: {}
8081
cambriconMLU: {}
8182
nvidiaGPU: {}
83+
hami: {}
8284

8385
## Additional labels for PrometheusRule alerts
8486
additionalRuleLabels: {}

ks-prometheus/components/wiztelemetry-mixin/rules/gpu.libsonnet

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@
541541
record: 'node:vgpu_device:vgpu_allocated_utilization',
542542
expr: |||
543543
label_replace(
544-
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit * 100, "node", "$1", "nodeid", "(.*)"),
544+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
545545
"device_num",
546546
"$1",
547547
"deviceidx",
@@ -583,15 +583,15 @@
583583
record: 'node:node_gpu_allocated_num:sum',
584584
expr: |||
585585
sum by (%(clusterLabel)s, node) (
586-
kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu"}
586+
kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
587587
)
588588
||| % $._config,
589589
},
590590
{
591591
record: 'node:node_gpu_num:sum',
592592
expr: |||
593593
sum by(%(clusterLabel)s, node) (
594-
kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu"}
594+
kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
595595
)
596596
||| % $._config,
597597
},

ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,7 +1002,7 @@ spec:
10021002
record: node_namespace_pod_container:container_gpu_memory_usage
10031003
- expr: |
10041004
label_replace(
1005-
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit * 100, "node", "$1", "nodeid", "(.*)"),
1005+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
10061006
"device_num",
10071007
"$1",
10081008
"deviceidx",
@@ -1035,12 +1035,12 @@ spec:
10351035
record: node:vgpu_device:vgpu_memory_allocated_utilization
10361036
- expr: |
10371037
sum by (cluster, node) (
1038-
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"nvidia_com_vgpu"}
1038+
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
10391039
)
10401040
record: node:node_gpu_allocated_num:sum
10411041
- expr: |
10421042
sum by(cluster, node) (
1043-
kube_node_status_allocatable{job="kube-state-metrics",resource=~"nvidia_com_vgpu"}
1043+
kube_node_status_allocatable{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
10441044
)
10451045
record: node:node_gpu_num:sum
10461046
- name: wiztelemetry-kubelet.rules

0 commit comments

Comments
 (0)