Skip to content

Commit ba551fb

Browse files
authored
Merge pull request #118 from frezes/feat/hamiRules
[wiztelemetry-monitoring-helper] add HAMi rules
2 parents b827cf7 + ae94a96 commit ba551fb

File tree

6 files changed: +355 -3 lines changed

charts/wiztelemetry-monitoring-helper/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.10.3
18+
version: 0.11.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.2.0"
24+
appVersion: "1.2.1"

charts/wiztelemetry-monitoring-helper/hack/sync_prometheus_rules.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ def new_representer(dumper, data):
142142
'wiztelemetry-kube-scheduler.rules': ' .Values.defaultRules.rules.wiztelemetry.scheduler',
143143
'wiztelemetry-ascend-npu.rules': ' .Values.defaultRules.rules.gpuDevice.ascendNPU',
144144
'wiztelemetry-cambricon-mlu.rules': ' .Values.defaultRules.rules.gpuDevice.cambriconMLU',
145-
'wiztelemetry-nvidia-gpu.rules': ' .Values.defaultRules.rules.gpuDevice.nvidiaGPU'
145+
'wiztelemetry-nvidia-gpu.rules': ' .Values.defaultRules.rules.gpuDevice.nvidiaGPU',
146+
'wiztelemetry-hami.rules': ' .Values.defaultRules.rules.gpuDevice.hami'
146147
}
147148

148149
alert_condition_map = {
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
{{- /*
2+
Generated from 'wiztelemetry-hami.rules' group from file://../../../ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml
3+
Do not change in-place! In order to change this file, first read the following link:
4+
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
5+
*/ -}}
6+
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
7+
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.gpuDevice.hami }}
8+
apiVersion: monitoring.coreos.com/v1
9+
kind: PrometheusRule
10+
metadata:
11+
name: {{ printf "%s-%s" (include "wiztelemetry-monitoring-helper.fullname" .) "wiztelemetry-hami.rules" | trunc 63 | trimSuffix "-" }}
12+
namespace: {{ template "wiztelemetry-monitoring-helper.namespace" . }}
13+
labels:
14+
app: {{ template "wiztelemetry-monitoring-helper.name" . }}
15+
{{ include "wiztelemetry-monitoring-helper.labels" . | indent 4 }}
16+
{{- if .Values.defaultRules.labels }}
17+
{{ toYaml .Values.defaultRules.labels | indent 4 }}
18+
{{- end }}
19+
{{- if .Values.defaultRules.annotations }}
20+
annotations:
21+
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
22+
{{- end }}
23+
spec:
24+
groups:
25+
- name: wiztelemetry-hami.rules
26+
rules:
27+
- expr: |-
28+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod, container) (
29+
label_replace(
30+
label_replace(
31+
label_replace(Device_utilization_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
32+
"pod",
33+
"$1",
34+
"podname",
35+
"(.*)"
36+
),
37+
"container",
38+
"$1",
39+
"ctrname",
40+
"(.*)"
41+
)
42+
)
43+
record: node_namespace_pod_container:container_gpu_utilization
44+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
45+
labels:
46+
{{- with .Values.defaultRules.additionalRuleLabels }}
47+
{{- toYaml . | nindent 8 }}
48+
{{- end }}
49+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
50+
{{- toYaml . | nindent 8 }}
51+
{{- end }}
52+
{{- end }}
53+
- expr: |-
54+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod, container) (
55+
label_replace(
56+
label_replace(
57+
label_replace(Device_memory_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
58+
"pod",
59+
"$1",
60+
"podname",
61+
"(.*)"
62+
),
63+
"container",
64+
"$1",
65+
"ctrname",
66+
"(.*)"
67+
)
68+
)
69+
record: node_namespace_pod_container:container_gpu_memory_usage
70+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
71+
labels:
72+
{{- with .Values.defaultRules.additionalRuleLabels }}
73+
{{- toYaml . | nindent 8 }}
74+
{{- end }}
75+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
76+
{{- toYaml . | nindent 8 }}
77+
{{- end }}
78+
{{- end }}
79+
- expr: |-
80+
label_replace(
81+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
82+
"device_num",
83+
"$1",
84+
"deviceidx",
85+
"(.*)"
86+
)
87+
record: node:vgpu_device:vgpu_allocated_utilization
88+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
89+
labels:
90+
{{- with .Values.defaultRules.additionalRuleLabels }}
91+
{{- toYaml . | nindent 8 }}
92+
{{- end }}
93+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
94+
{{- toYaml . | nindent 8 }}
95+
{{- end }}
96+
{{- end }}
97+
- expr: |-
98+
label_replace(
99+
label_replace(GPUDeviceCoreAllocated / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
100+
"device_num",
101+
"$1",
102+
"deviceidx",
103+
"(.*)"
104+
)
105+
record: node:vgpu_device:vgpu_core_allocated_utilization
106+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
107+
labels:
108+
{{- with .Values.defaultRules.additionalRuleLabels }}
109+
{{- toYaml . | nindent 8 }}
110+
{{- end }}
111+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
112+
{{- toYaml . | nindent 8 }}
113+
{{- end }}
114+
{{- end }}
115+
- expr: |-
116+
label_replace(
117+
label_replace(
118+
sum without (devicecores) (GPUDeviceMemoryAllocated) / GPUDeviceMemoryLimit,
119+
"node",
120+
"$1",
121+
"nodeid",
122+
"(.*)"
123+
),
124+
"device_num",
125+
"$1",
126+
"deviceidx",
127+
"(.*)"
128+
)
129+
record: node:vgpu_device:vgpu_memory_allocated_utilization
130+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
131+
labels:
132+
{{- with .Values.defaultRules.additionalRuleLabels }}
133+
{{- toYaml . | nindent 8 }}
134+
{{- end }}
135+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
136+
{{- toYaml . | nindent 8 }}
137+
{{- end }}
138+
{{- end }}
139+
- expr: |-
140+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
141+
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
142+
)
143+
record: node:node_gpu_allocated_num:sum
144+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
145+
labels:
146+
{{- with .Values.defaultRules.additionalRuleLabels }}
147+
{{- toYaml . | nindent 8 }}
148+
{{- end }}
149+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
150+
{{- toYaml . | nindent 8 }}
151+
{{- end }}
152+
{{- end }}
153+
- expr: |-
154+
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
155+
kube_node_status_allocatable{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
156+
)
157+
record: node:node_gpu_num:sum
158+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
159+
labels:
160+
{{- with .Values.defaultRules.additionalRuleLabels }}
161+
{{- toYaml . | nindent 8 }}
162+
{{- end }}
163+
{{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
164+
{{- toYaml . | nindent 8 }}
165+
{{- end }}
166+
{{- end }}
167+
{{- end }}

charts/wiztelemetry-monitoring-helper/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ defaultRules:
6666
ascendNPU: true
6767
cambriconMLU: true
6868
nvidiaGPU: true
69+
hami: true
6970

7071
additionalRuleGroupLabels:
7172
wiztelemetry:
@@ -79,6 +80,7 @@ defaultRules:
7980
ascendNPU: {}
8081
cambriconMLU: {}
8182
nvidiaGPU: {}
83+
hami: {}
8284

8385
## Additional labels for PrometheusRule alerts
8486
additionalRuleLabels: {}

ks-prometheus/components/wiztelemetry-mixin/rules/gpu.libsonnet

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,109 @@
494494
},
495495
],
496496
},
497+
{
498+
name: "wiztelemetry-hami.rules",
499+
rules: [
500+
{
501+
record: 'node_namespace_pod_container:container_gpu_utilization',
502+
expr: |||
503+
sum by (%(clusterLabel)s, node, namespace, pod, container) (
504+
label_replace(
505+
label_replace(
506+
label_replace(Device_utilization_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
507+
"pod",
508+
"$1",
509+
"podname",
510+
"(.*)"
511+
),
512+
"container",
513+
"$1",
514+
"ctrname",
515+
"(.*)"
516+
)
517+
)
518+
||| % $._config,
519+
},
520+
{
521+
record: 'node_namespace_pod_container:container_gpu_memory_usage',
522+
expr: |||
523+
sum by (%(clusterLabel)s, node, namespace, pod, container) (
524+
label_replace(
525+
label_replace(
526+
label_replace(Device_memory_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
527+
"pod",
528+
"$1",
529+
"podname",
530+
"(.*)"
531+
),
532+
"container",
533+
"$1",
534+
"ctrname",
535+
"(.*)"
536+
)
537+
)
538+
||| % $._config,
539+
},
540+
{
541+
record: 'node:vgpu_device:vgpu_allocated_utilization',
542+
expr: |||
543+
label_replace(
544+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
545+
"device_num",
546+
"$1",
547+
"deviceidx",
548+
"(.*)"
549+
)
550+
||| % $._config,
551+
},
552+
{
553+
record: 'node:vgpu_device:vgpu_core_allocated_utilization',
554+
expr: |||
555+
label_replace(
556+
label_replace(GPUDeviceCoreAllocated / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
557+
"device_num",
558+
"$1",
559+
"deviceidx",
560+
"(.*)"
561+
)
562+
||| % $._config,
563+
},
564+
{
565+
record: 'node:vgpu_device:vgpu_memory_allocated_utilization',
566+
expr: |||
567+
label_replace(
568+
label_replace(
569+
sum without (devicecores) (GPUDeviceMemoryAllocated) / GPUDeviceMemoryLimit,
570+
"node",
571+
"$1",
572+
"nodeid",
573+
"(.*)"
574+
),
575+
"device_num",
576+
"$1",
577+
"deviceidx",
578+
"(.*)"
579+
)
580+
||| % $._config,
581+
},
582+
{
583+
record: 'node:node_gpu_allocated_num:sum',
584+
expr: |||
585+
sum by (%(clusterLabel)s, node) (
586+
kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
587+
)
588+
||| % $._config,
589+
},
590+
{
591+
record: 'node:node_gpu_num:sum',
592+
expr: |||
593+
sum by(%(clusterLabel)s, node) (
594+
kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
595+
)
596+
||| % $._config,
597+
},
598+
],
599+
},
497600
],
498601
},
499602
}

0 commit comments

Comments
 (0)