Skip to content

Commit f4fce60

Browse files
committed
GPU Operator Integration with Remediation Workflows using Argo Workflows
1 parent 6b7d758 commit f4fce60

File tree

268 files changed

+94546
-940
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

268 files changed

+94546
-940
lines changed

Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ ifdef SKIP_INSTALL_DEFAULT_CR
7070
SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
7171
endif
7272

73+
ifdef SKIP_REMEDIATION_CONTROLLER
74+
SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
75+
endif
76+
7377
#################################
7478
# OpenShift OLM Bundle variables
7579
# BUNDLE_IMG defines the image:tag used for the bundle.
@@ -325,6 +329,9 @@ helm-k8s: helmify manifests kustomize clean-helm-k8s gen-kmm-charts-k8s ## Build
325329
# Patching k8s helm chart kmm subchart
326330
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
327331
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
332+
mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
333+
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
334+
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
328335
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint; cd ..;
329336
mkdir $(shell pwd)/helm-charts-k8s/crds
330337
echo "moving crd yaml files to crds folder"
@@ -590,7 +597,7 @@ helm-uninstall-openshift:
590597
helm uninstall amd-gpu-operator -n kube-amd-gpu
591598

592599
helm-install-k8s:
593-
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
600+
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
594601

595602
helm-uninstall-k8s:
596603
echo "Deleting all device configs before uninstalling operator..."

api/v1alpha1/deviceconfig_types.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,28 @@ type DeviceConfigSpec struct {
7575
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Selector",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:selector"}
7676
// +optional
7777
Selector map[string]string `json:"selector,omitempty"`
78+
79+
// remediation workflow
80+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="RemediationWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow"}
81+
// +optional
82+
RemediationWorkflow RemediationWorkflowSpec `json:"remediationWorkflow,omitempty"`
83+
}
84+
85+
// RemediationWorkflowSpec defines workflows to run based on node conditions
86+
type RemediationWorkflowSpec struct {
87+
// enable remediation workflows. disabled by default
88+
// enable if operator should automatically handle remediation of node in case of gpu issues
89+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
90+
Enable *bool `json:"enable,omitempty"`
91+
92+
// Name of the ConfigMap that holds condition-to-workflow mappings.
93+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
94+
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
95+
96+
// Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours
97+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
98+
// +kubebuilder:default:=24
99+
TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
78100
}
79101

80102
type RegistryTLS struct {

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 26 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ metadata:
3232
capabilities: Seamless Upgrades
3333
categories: AI/Machine Learning,Monitoring
3434
containerImage: docker.io/rocm/gpu-operator:v1.2.0
35-
createdAt: "2025-08-13T23:39:41Z"
35+
createdAt: "2025-08-14T12:20:47Z"
3636
description: |-
3737
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
3838
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -645,6 +645,28 @@ spec:
645645
path: metricsExporter.upgradePolicy.upgradeStrategy
646646
x-descriptors:
647647
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
648+
- description: remediation workflow
649+
displayName: RemediationWorkflow
650+
path: remediationWorkflow
651+
x-descriptors:
652+
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
653+
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
654+
displayName: ConditionalWorkflows
655+
path: remediationWorkflow.conditionalWorkflows
656+
x-descriptors:
657+
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
658+
- description: enable remediation workflows. disabled by default enable if operator
659+
should automatically handle remediation of node in case of gpu issues
660+
displayName: Enable
661+
path: remediationWorkflow.enable
662+
x-descriptors:
663+
- urn:alm:descriptor:com.amd.deviceconfigs:enable
664+
- description: Time to live for argo workflow object and its pods for a failed
665+
workflow in hours. By default, it is set to 24 hours
666+
displayName: TtlForFailedWorkflows
667+
path: remediationWorkflow.ttlForFailedWorkflows
668+
x-descriptors:
669+
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
648670
- description: Selector describes on which nodes the GPU Operator should enable
649671
the GPU device.
650672
displayName: Selector

bundle/manifests/amd.com_deviceconfigs.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,6 +1269,36 @@ spec:
12691269
type: string
12701270
type: object
12711271
type: object
1272+
remediationWorkflow:
1273+
description: remediation workflow
1274+
properties:
1275+
conditionalWorkflows:
1276+
description: Name of the ConfigMap that holds condition-to-workflow
1277+
mappings.
1278+
properties:
1279+
name:
1280+
default: ""
1281+
description: |-
1282+
Name of the referent.
1283+
This field is effectively required, but due to backwards compatibility is
1284+
allowed to be empty. Instances of this type with an empty value here are
1285+
almost certainly wrong.
1286+
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
1287+
type: string
1288+
type: object
1289+
x-kubernetes-map-type: atomic
1290+
enable:
1291+
description: |-
1292+
enable remediation workflows. disabled by default
1293+
enable if operator should automatically handle remediation of node in case of gpu issues
1294+
type: boolean
1295+
ttlForFailedWorkflows:
1296+
default: 24
1297+
description: Time to live for argo workflow object and its pods
1298+
for a failed workflow in hours. By default, it is set to 24
1299+
hours
1300+
type: integer
1301+
type: object
12721302
selector:
12731303
additionalProperties:
12741304
type: string

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ package main
3535
import (
3636
"flag"
3737

38+
workflowv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
3839
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
3940
"k8s.io/apimachinery/pkg/runtime"
4041
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -76,6 +77,7 @@ func init() {
7677
utilruntime.Must(kmmv1beta1.AddToScheme(scheme))
7778
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
7879
utilruntime.Must(monitoringv1.AddToScheme(scheme))
80+
utilruntime.Must(workflowv1alpha1.AddToScheme(scheme))
7981
//+kubebuilder:scaffold:scheme
8082
}
8183

config/crd/bases/amd.com_deviceconfigs.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,36 @@ spec:
12651265
type: string
12661266
type: object
12671267
type: object
1268+
remediationWorkflow:
1269+
description: remediation workflow
1270+
properties:
1271+
conditionalWorkflows:
1272+
description: Name of the ConfigMap that holds condition-to-workflow
1273+
mappings.
1274+
properties:
1275+
name:
1276+
default: ""
1277+
description: |-
1278+
Name of the referent.
1279+
This field is effectively required, but due to backwards compatibility is
1280+
allowed to be empty. Instances of this type with an empty value here are
1281+
almost certainly wrong.
1282+
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
1283+
type: string
1284+
type: object
1285+
x-kubernetes-map-type: atomic
1286+
enable:
1287+
description: |-
1288+
enable remediation workflows. disabled by default
1289+
enable if operator should automatically handle remediation of node in case of gpu issues
1290+
type: boolean
1291+
ttlForFailedWorkflows:
1292+
default: 24
1293+
description: Time to live for argo workflow object and its pods
1294+
for a failed workflow in hours. By default, it is set to 24
1295+
hours
1296+
type: integer
1297+
type: object
12681298
selector:
12691299
additionalProperties:
12701300
type: string

config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,28 @@ spec:
616616
path: metricsExporter.upgradePolicy.upgradeStrategy
617617
x-descriptors:
618618
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
619+
- description: remediation workflow
620+
displayName: RemediationWorkflow
621+
path: remediationWorkflow
622+
x-descriptors:
623+
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
624+
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
625+
displayName: ConditionalWorkflows
626+
path: remediationWorkflow.conditionalWorkflows
627+
x-descriptors:
628+
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
629+
- description: enable remediation workflows. disabled by default enable if operator
630+
should automatically handle remediation of node in case of gpu issues
631+
displayName: Enable
632+
path: remediationWorkflow.enable
633+
x-descriptors:
634+
- urn:alm:descriptor:com.amd.deviceconfigs:enable
635+
- description: Time to live for argo workflow object and its pods for a failed
636+
workflow in hours. By default, it is set to 24 hours
637+
displayName: TtlForFailedWorkflows
638+
path: remediationWorkflow.ttlForFailedWorkflows
639+
x-descriptors:
640+
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
619641
- description: Selector describes on which nodes the GPU Operator should enable
620642
the GPU device.
621643
displayName: Selector

0 commit comments

Comments
 (0)