Skip to content

Commit fd56a4c

Browse files
yansun1996 authored and sajmera-pensando committed
[Feature] Add pod UID and framework name in test runner event (#1004)
Signed-off-by: yansun1996 <[email protected]>
1 parent 68bdc54 commit fd56a4c

File tree

9 files changed

+55
-5
lines changed

9 files changed

+55
-5
lines changed

docs/releasenotes.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22

33
## GPU Operator v1.4.1 Release Notes
44

5-
The AMD GPU Operator v1.4.1 release extends platform support to OpenShift v4.19
5+
The AMD GPU Operator v1.4.1 release extends platform support to OpenShift v4.20
66

77
### Release Highlights
88
- **Device-Metrics-Exporter enhancements**
99
- **Enhanced Pod and Service Annotations**
1010
- **Pod Annotations**, **Service Annotations**: Custom annotations can now be applied to exporter pods via the DeviceConfig CRD.
11+
- **Test Runner enhancements**
12+
- **Enhanced Test Result Events**
13+
- Test runner Kubernetes events now include additional information: pod UID and test framework name (e.g., RVS, AGFHC) as event labels, providing more comprehensive test run information for improved tracking and diagnostics.
1114

1215
## GPU Operator v1.4.0 Release Notes
1316

docs/test/auto-unhealthy-device-test.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ Test runner generated event can be retrieved by filtering the source component:
9090
"labels": {
9191
"testrunner.amd.com/category": "gpu_health_check",
9292
"testrunner.amd.com/gpu.id.0": "35824",
93+
"testrunner.amd.com/framework": "RVS",
9394
"testrunner.amd.com/gpu.kfd.35824": "0",
9495
"testrunner.amd.com/hostname": "leto",
9596
"testrunner.amd.com/recipe": "gst_single",
@@ -191,6 +192,7 @@ in the above example ```35824``` is the GPU's KFD ID reported by amd-smi (in roc
191192
* ```testrunner.amd.com/recipe``` is the test recipe name.
192193
* ```testrunner.amd.com/hostname``` is the name of the host where the test happened.
193194
* ```testrunner.amd.com/gpu.id.X``` shows which GPU was involved; ```X``` is the GPU index number, and the value is the corresponding GPU KFD ID.
195+
* ```testrunner.amd.com/framework``` indicates the test framework used to execute the test.
194196
* ```testrunner.amd.com/gpu.kfd.Y``` shows which GPU was involved; ```Y``` is the GPU KFD ID, and the value is the corresponding GPU index number.
195197
* ```reason``` gives the overall result of the whole test run; it can be ```TestPassed```, ```TestFailed``` or ```TestTimedOut```.
196198
* ```source``` shows where the event came from, including the component name ```amd-test-runner``` and the worker node's host name.

docs/test/manual-test.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ spec:
138138
valueFrom:
139139
fieldRef:
140140
fieldPath: metadata.namespace
141+
- name: POD_UID # Use downward API to pass pod UID to test runner container
142+
valueFrom:
143+
fieldRef:
144+
fieldPath: metadata.uid
141145
- name: NODE_NAME # Use downward API to pass host name to test runner container
142146
valueFrom:
143147
fieldRef:
@@ -268,6 +272,10 @@ spec:
268272
valueFrom:
269273
fieldRef:
270274
fieldPath: metadata.namespace
275+
- name: POD_UID # Use downward API to pass pod UID to test runner container
276+
valueFrom:
277+
fieldRef:
278+
fieldPath: metadata.uid
271279
- name: NODE_NAME # Use downward API to pass host name to test runner container
272280
valueFrom:
273281
fieldRef:
@@ -433,6 +441,10 @@ spec:
433441
valueFrom:
434442
fieldRef:
435443
fieldPath: metadata.namespace
444+
- name: POD_UID
445+
valueFrom:
446+
fieldRef:
447+
fieldPath: metadata.uid
436448
- name: NODE_NAME
437449
valueFrom:
438450
fieldRef:
@@ -623,6 +635,10 @@ spec:
623635
valueFrom:
624636
fieldRef:
625637
fieldPath: metadata.namespace
638+
- name: POD_UID # Use downward API to pass pod UID to test runner container
639+
valueFrom:
640+
fieldRef:
641+
fieldPath: metadata.uid
626642
- name: NODE_NAME # Use downward API to pass host name to test runner container
627643
valueFrom:
628644
fieldRef:
@@ -783,6 +799,10 @@ spec:
783799
valueFrom:
784800
fieldRef:
785801
fieldPath: metadata.namespace
802+
- name: POD_UID # Use downward API to pass pod UID to test runner container
803+
valueFrom:
804+
fieldRef:
805+
fieldPath: metadata.uid
786806
- name: NODE_NAME # Use downward API to pass host name to test runner container
787807
valueFrom:
788808
fieldRef:

docs/test/pre-start-job-test.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,10 @@ spec:
137137
valueFrom:
138138
fieldRef:
139139
fieldPath: metadata.namespace
140+
- name: POD_UID # Use downward API to pass pod UID to test runner container
141+
valueFrom:
142+
fieldRef:
143+
fieldPath: metadata.uid
140144
- name: NODE_NAME # Use downward API to pass host name to test runner container
141145
valueFrom:
142146
fieldRef:

example/testrunner/manual_test_job.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ spec:
181181
valueFrom:
182182
fieldRef:
183183
fieldPath: metadata.namespace
184+
- name: POD_UID # Use downward API to pass pod UID to test runner container
185+
valueFrom:
186+
fieldRef:
187+
fieldPath: metadata.uid
184188
- name: NODE_NAME # Use downward API to pass host name to test runner container
185189
valueFrom:
186190
fieldRef:

example/testrunner/pre_start_job_check.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,10 @@ spec:
169169
valueFrom:
170170
fieldRef:
171171
fieldPath: metadata.namespace
172+
- name: POD_UID # Use downward API to pass pod UID to test runner container
173+
valueFrom:
174+
fieldRef:
175+
fieldPath: metadata.uid
172176
- name: NODE_NAME # Use downward API to pass host name to test runner container
173177
valueFrom:
174178
fieldRef:

example/testrunner/schedule_test_cronjob.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ spec:
176176
valueFrom:
177177
fieldRef:
178178
fieldPath: metadata.namespace
179+
- name: POD_UID
180+
valueFrom:
181+
fieldRef:
182+
fieldPath: metadata.uid
179183
- name: NODE_NAME
180184
valueFrom:
181185
fieldRef:

internal/testrunner/testrunner.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,14 @@ func (nl *testRunner) SetTestRunnerAsDesired(ds *appsv1.DaemonSet, devConfig *am
233233
},
234234
},
235235
},
236+
{
237+
Name: "POD_UID",
238+
ValueFrom: &v1.EnvVarSource{
239+
FieldRef: &v1.ObjectFieldSelector{
240+
FieldPath: "metadata.uid",
241+
},
242+
},
243+
},
236244
{
237245
Name: "NODE_NAME",
238246
ValueFrom: &v1.EnvVarSource{

tests/e2e/testrunner_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -334,10 +334,11 @@ func (s *E2ESuite) verifyTestResultEvts(node, recipe string, devCfg *v1alpha1.De
334334
// verify that the test run event got generated
335335
logger.Print("Verifying test result event(s)")
336336
testEventLabel := map[string]string{
337-
"testrunner.amd.com/category": "gpu_health_check",
338-
"testrunner.amd.com/trigger": "auto_unhealthy_gpu_watch",
339-
"testrunner.amd.com/recipe": recipe,
340-
"testrunner.amd.com/hostname": node,
337+
"testrunner.amd.com/category": "gpu_health_check",
338+
"testrunner.amd.com/trigger": "auto_unhealthy_gpu_watch",
339+
"testrunner.amd.com/recipe": recipe,
340+
"testrunner.amd.com/hostname": node,
341+
"testrunner.amd.com/framework": s.framework,
341342
}
342343
assert.Eventually(c, func() bool {
343344
evts, err := s.clientSet.CoreV1().Events(devCfg.Namespace).List(context.TODO(), metav1.ListOptions{

0 commit comments

Comments
 (0)