{{- /*
Generated from 'wiztelemetry-hami.rules' group from file://../../../ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml
Do not change in-place! In order to change this file, first read the following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack

Renders a PrometheusRule containing the 'wiztelemetry-hami.rules' recording-rule group.
The manifest is emitted only when .Values.defaultRules.create and
.Values.defaultRules.rules.gpuDevice.hami are both set and the target Kubernetes
version (optionally overridden by .Values.kubeTargetVersionOverride) is within
>=1.14.0-0 and <9.9.9-9.
Every rule receives the optional labels from .Values.defaultRules.additionalRuleLabels
and .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.gpuDevice.hami }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "wiztelemetry-monitoring-helper.fullname" .) "wiztelemetry-hami.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "wiztelemetry-monitoring-helper.namespace" . }}
  labels:
    app: {{ template "wiztelemetry-monitoring-helper.name" . }}
{{ include "wiztelemetry-monitoring-helper.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: wiztelemetry-hami.rules
    rules:
    # Sum of Device_utilization_desc_of_container per container, after copying the
    # podnamespace/podname/ctrname labels into namespace/pod/container.
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod, container) (
          label_replace(
            label_replace(
              label_replace(Device_utilization_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
              "pod",
              "$1",
              "podname",
              "(.*)"
            ),
            "container",
            "$1",
            "ctrname",
            "(.*)"
          )
        )
      record: node_namespace_pod_container:container_gpu_utilization
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    # Sum of Device_memory_desc_of_container per container, using the same
    # podnamespace/podname/ctrname -> namespace/pod/container label mapping.
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod, container) (
          label_replace(
            label_replace(
              label_replace(Device_memory_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
              "pod",
              "$1",
              "podname",
              "(.*)"
            ),
            "container",
            "$1",
            "ctrname",
            "(.*)"
          )
        )
      record: node_namespace_pod_container:container_gpu_memory_usage
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    # GPUDeviceSharedNum / GPUDeviceCoreLimit, with nodeid copied to node and
    # deviceidx copied to device_num.
    # NOTE(review): a "shared number" divided by a "core limit" is an unusual ratio
    # for an allocation utilization - confirm the intended operands upstream.
    - expr: |-
        label_replace(
          label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
          "device_num",
          "$1",
          "deviceidx",
          "(.*)"
        )
      record: node:vgpu_device:vgpu_allocated_utilization
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    # GPUDeviceCoreAllocated / GPUDeviceCoreLimit, with nodeid copied to node and
    # deviceidx copied to device_num.
    - expr: |-
        label_replace(
          label_replace(GPUDeviceCoreAllocated / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
          "device_num",
          "$1",
          "deviceidx",
          "(.*)"
        )
      record: node:vgpu_device:vgpu_core_allocated_utilization
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    # GPUDeviceMemoryAllocated (summed with the devicecores label removed) divided by
    # GPUDeviceMemoryLimit, with nodeid copied to node and deviceidx to device_num.
    - expr: |-
        label_replace(
          label_replace(
            sum without (devicecores) (GPUDeviceMemoryAllocated) / GPUDeviceMemoryLimit,
            "node",
            "$1",
            "nodeid",
            "(.*)"
          ),
          "device_num",
          "$1",
          "deviceidx",
          "(.*)"
        )
      record: node:vgpu_device:vgpu_memory_allocated_utilization
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    # Per cluster/node sum of kube_pod_container_resource_requests for the
    # nvidia_com_vgpu and qingcloud_nvidia_com_vgpu resources.
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
          kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
        )
      record: node:node_gpu_allocated_num:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    # Per cluster/node sum of kube_node_status_allocatable for the
    # nvidia_com_vgpu and qingcloud_nvidia_com_vgpu resources.
    - expr: |-
        sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
          kube_node_status_allocatable{job="kube-state-metrics",resource=~"nvidia_com_vgpu|qingcloud_nvidia_com_vgpu"}
        )
      record: node:node_gpu_num:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.hami }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
{{- end }}