Skip to content

Commit 03f7ce4

Browse files
committed
GPU Operator Integration with Remediation Workflows using Argo Workflows
1 parent 3726540 commit 03f7ce4

File tree

268 files changed

+94544
-938
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

268 files changed

+94544
-938
lines changed

Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ ifdef SKIP_INSTALL_DEFAULT_CR
7070
SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
7171
endif
7272

73+
ifdef SKIP_REMEDIATION_CONTROLLER
74+
SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
75+
endif
76+
7377
#################################
7478
# OpenShift OLM Bundle variables
7579
# BUNDLE_IMG defines the image:tag used for the bundle.
@@ -325,6 +329,9 @@ helm-k8s: helmify manifests kustomize clean-helm-k8s gen-kmm-charts-k8s ## Build
325329
# Patching k8s helm chart kmm subchart
326330
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
327331
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
332+
mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
333+
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
334+
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
328335
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint; cd ..;
329336
mkdir $(shell pwd)/helm-charts-k8s/crds
330337
echo "moving crd yaml files to crds folder"
@@ -575,7 +582,7 @@ helm-uninstall-openshift:
575582
helm uninstall amd-gpu-operator -n kube-amd-gpu
576583

577584
helm-install-k8s:
578-
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
585+
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
579586

580587
helm-uninstall-k8s:
581588
echo "Deleting all device configs before uninstalling operator..."

api/v1alpha1/deviceconfig_types.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,28 @@ type DeviceConfigSpec struct {
7575
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Selector",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:selector"}
7676
// +optional
7777
Selector map[string]string `json:"selector,omitempty"`
78+
79+
// remediation workflow
80+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="RemediationWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow"}
81+
// +optional
82+
RemediationWorkflow RemediationWorkflowSpec `json:"remediationWorkflow,omitempty"`
83+
}
84+
85+
// RemediationWorkflowSpec defines workflows to run based on node conditions
86+
type RemediationWorkflowSpec struct {
87+
// enable remediation workflows. disabled by default
88+
// enable if operator should automatically handle remediation of node in case of gpu issues
89+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
90+
Enable *bool `json:"enable,omitempty"`
91+
92+
// Name of the ConfigMap that holds condition-to-workflow mappings.
93+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
94+
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
95+
96+
// Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours
97+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
98+
// +kubebuilder:default:=24
99+
TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
78100
}
79101

80102
type RegistryTLS struct {

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 26 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ metadata:
3232
capabilities: Seamless Upgrades
3333
categories: AI/Machine Learning,Monitoring
3434
containerImage: docker.io/rocm/gpu-operator:v1.2.0
35-
createdAt: "2025-08-06T05:54:02Z"
35+
createdAt: "2025-08-12T02:38:34Z"
3636
description: |-
3737
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
3838
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -603,6 +603,28 @@ spec:
603603
path: metricsExporter.upgradePolicy.upgradeStrategy
604604
x-descriptors:
605605
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
606+
- description: remediation workflow
607+
displayName: RemediationWorkflow
608+
path: remediationWorkflow
609+
x-descriptors:
610+
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
611+
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
612+
displayName: ConditionalWorkflows
613+
path: remediationWorkflow.conditionalWorkflows
614+
x-descriptors:
615+
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
616+
- description: enable remediation workflows. disabled by default enable if operator
617+
should automatically handle remediation of node incase of gpu issues
618+
displayName: Enable
619+
path: remediationWorkflow.enable
620+
x-descriptors:
621+
- urn:alm:descriptor:com.amd.deviceconfigs:enable
622+
- description: Time to live for argo workflow object and its pods for a failed
623+
workflow in hours. By default, it is set to 24 hours
624+
displayName: TtlForFailedWorkflows
625+
path: remediationWorkflow.ttlForFailedWorkflows
626+
x-descriptors:
627+
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
606628
- description: Selector describes on which nodes the GPU Operator should enable
607629
the GPU device.
608630
displayName: Selector

bundle/manifests/amd.com_deviceconfigs.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,6 +1219,36 @@ spec:
12191219
type: string
12201220
type: object
12211221
type: object
1222+
remediationWorkflow:
1223+
description: remediation workflow
1224+
properties:
1225+
conditionalWorkflows:
1226+
description: Name of the ConfigMap that holds condition-to-workflow
1227+
mappings.
1228+
properties:
1229+
name:
1230+
default: ""
1231+
description: |-
1232+
Name of the referent.
1233+
This field is effectively required, but due to backwards compatibility is
1234+
allowed to be empty. Instances of this type with an empty value here are
1235+
almost certainly wrong.
1236+
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
1237+
type: string
1238+
type: object
1239+
x-kubernetes-map-type: atomic
1240+
enable:
1241+
description: |-
1242+
enable remediation workflows. disabled by default
1243+
enable if operator should automatically handle remediation of node incase of gpu issues
1244+
type: boolean
1245+
ttlForFailedWorkflows:
1246+
default: 24
1247+
description: Time to live for argo workflow object and its pods
1248+
for a failed workflow in hours. By default, it is set to 24
1249+
hours
1250+
type: integer
1251+
type: object
12221252
selector:
12231253
additionalProperties:
12241254
type: string

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ package main
3535
import (
3636
"flag"
3737

38+
workflowv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
3839
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
3940
"k8s.io/apimachinery/pkg/runtime"
4041
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -75,6 +76,7 @@ func init() {
7576
utilruntime.Must(kmmv1beta1.AddToScheme(scheme))
7677
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
7778
utilruntime.Must(monitoringv1.AddToScheme(scheme))
79+
utilruntime.Must(workflowv1alpha1.AddToScheme(scheme))
7880
//+kubebuilder:scaffold:scheme
7981
}
8082

config/crd/bases/amd.com_deviceconfigs.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,36 @@ spec:
12151215
type: string
12161216
type: object
12171217
type: object
1218+
remediationWorkflow:
1219+
description: remediation workflow
1220+
properties:
1221+
conditionalWorkflows:
1222+
description: Name of the ConfigMap that holds condition-to-workflow
1223+
mappings.
1224+
properties:
1225+
name:
1226+
default: ""
1227+
description: |-
1228+
Name of the referent.
1229+
This field is effectively required, but due to backwards compatibility is
1230+
allowed to be empty. Instances of this type with an empty value here are
1231+
almost certainly wrong.
1232+
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
1233+
type: string
1234+
type: object
1235+
x-kubernetes-map-type: atomic
1236+
enable:
1237+
description: |-
1238+
enable remediation workflows. disabled by default
1239+
enable if operator should automatically handle remediation of node incase of gpu issues
1240+
type: boolean
1241+
ttlForFailedWorkflows:
1242+
default: 24
1243+
description: Time to live for argo workflow object and its pods
1244+
for a failed workflow in hours. By default, it is set to 24
1245+
hours
1246+
type: integer
1247+
type: object
12181248
selector:
12191249
additionalProperties:
12201250
type: string

config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,28 @@ spec:
574574
path: metricsExporter.upgradePolicy.upgradeStrategy
575575
x-descriptors:
576576
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
577+
- description: remediation workflow
578+
displayName: RemediationWorkflow
579+
path: remediationWorkflow
580+
x-descriptors:
581+
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
582+
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
583+
displayName: ConditionalWorkflows
584+
path: remediationWorkflow.conditionalWorkflows
585+
x-descriptors:
586+
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
587+
- description: enable remediation workflows. disabled by default enable if operator
588+
should automatically handle remediation of node incase of gpu issues
589+
displayName: Enable
590+
path: remediationWorkflow.enable
591+
x-descriptors:
592+
- urn:alm:descriptor:com.amd.deviceconfigs:enable
593+
- description: Time to live for argo workflow object and its pods for a failed
594+
workflow in hours. By default, it is set to 24 hours
595+
displayName: TtlForFailedWorkflows
596+
path: remediationWorkflow.ttlForFailedWorkflows
597+
x-descriptors:
598+
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
577599
- description: Selector describes on which nodes the GPU Operator should enable
578600
the GPU device.
579601
displayName: Selector

0 commit comments

Comments
 (0)