Skip to content

Commit 878efce

Browse files
committed
GPU Operator Integration with Remediation Workflows using Argo Workflows
1 parent 55f57e4 commit 878efce

File tree

268 files changed

+94544
-938
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

268 files changed

+94544
-938
lines changed

Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ ifdef SKIP_INSTALL_DEFAULT_CR
7070
SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
7171
endif
7272

73+
ifdef SKIP_REMEDIATION_CONTROLLER
74+
SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
75+
endif
76+
7377
#################################
7478
# OpenShift OLM Bundle variables
7579
# BUNDLE_IMG defines the image:tag used for the bundle.
@@ -324,6 +328,9 @@ helm-k8s: helmify manifests kustomize clean-helm-k8s gen-kmm-charts-k8s ## Build
324328
# Patching k8s helm chart kmm subchart
325329
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
326330
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
331+
mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
332+
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
333+
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
327334
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint; cd ..;
328335
mkdir $(shell pwd)/helm-charts-k8s/crds
329336
echo "moving crd yaml files to crds folder"
@@ -574,7 +581,7 @@ helm-uninstall-openshift:
574581
helm uninstall amd-gpu-operator -n kube-amd-gpu
575582

576583
helm-install-k8s:
577-
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
584+
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
578585

579586
helm-uninstall-k8s:
580587
echo "Deleting all device configs before uninstalling operator..."

api/v1alpha1/deviceconfig_types.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,28 @@ type DeviceConfigSpec struct {
7575
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Selector",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:selector"}
7676
// +optional
7777
Selector map[string]string `json:"selector,omitempty"`
78+
79+
// remediation workflow
80+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="RemediationWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow"}
81+
// +optional
82+
RemediationWorkflow RemediationWorkflowSpec `json:"remediationWorkflow,omitempty"`
83+
}
84+
85+
// RemediationWorkflowSpec defines workflows to run based on node conditions
86+
type RemediationWorkflowSpec struct {
87+
// enable remediation workflows. disabled by default
88+
// enable if operator should automatically handle remediation of node in case of gpu issues
89+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
90+
Enable *bool `json:"enable,omitempty"`
91+
92+
// Name of the ConfigMap that holds condition-to-workflow mappings.
93+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
94+
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
95+
96+
// Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours
97+
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
98+
// +kubebuilder:default:=24
99+
TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
78100
}
79101

80102
type RegistryTLS struct {

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 26 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ metadata:
3232
capabilities: Seamless Upgrades
3333
categories: AI/Machine Learning,Monitoring
3434
containerImage: docker.io/rocm/gpu-operator:v1.2.0
35-
createdAt: "2025-06-12T00:51:00Z"
35+
createdAt: "2025-07-17T08:55:25Z"
3636
description: |-
3737
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
3838
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -598,6 +598,28 @@ spec:
598598
path: metricsExporter.upgradePolicy.upgradeStrategy
599599
x-descriptors:
600600
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
601+
- description: remediation workflow
602+
displayName: RemediationWorkflow
603+
path: remediationWorkflow
604+
x-descriptors:
605+
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
606+
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
607+
displayName: ConditionalWorkflows
608+
path: remediationWorkflow.conditionalWorkflows
609+
x-descriptors:
610+
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
611+
- description: enable remediation workflows. disabled by default enable if operator
612+
should automatically handle remediation of node incase of gpu issues
613+
displayName: Enable
614+
path: remediationWorkflow.enable
615+
x-descriptors:
616+
- urn:alm:descriptor:com.amd.deviceconfigs:enable
617+
- description: Time to live for argo workflow object and its pods for a failed
618+
workflow in hours. By default, it is set to 24 hours
619+
displayName: TtlForFailedWorkflows
620+
path: remediationWorkflow.ttlForFailedWorkflows
621+
x-descriptors:
622+
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
601623
- description: Selector describes on which nodes the GPU Operator should enable
602624
the GPU device.
603625
displayName: Selector

bundle/manifests/amd.com_deviceconfigs.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,36 @@ spec:
11801180
type: string
11811181
type: object
11821182
type: object
1183+
remediationWorkflow:
1184+
description: remediation workflow
1185+
properties:
1186+
conditionalWorkflows:
1187+
description: Name of the ConfigMap that holds condition-to-workflow
1188+
mappings.
1189+
properties:
1190+
name:
1191+
default: ""
1192+
description: |-
1193+
Name of the referent.
1194+
This field is effectively required, but due to backwards compatibility is
1195+
allowed to be empty. Instances of this type with an empty value here are
1196+
almost certainly wrong.
1197+
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
1198+
type: string
1199+
type: object
1200+
x-kubernetes-map-type: atomic
1201+
enable:
1202+
description: |-
1203+
enable remediation workflows. disabled by default
1204+
enable if operator should automatically handle remediation of node incase of gpu issues
1205+
type: boolean
1206+
ttlForFailedWorkflows:
1207+
default: 24
1208+
description: Time to live for argo workflow object and its pods
1209+
for a failed workflow in hours. By default, it is set to 24
1210+
hours
1211+
type: integer
1212+
type: object
11831213
selector:
11841214
additionalProperties:
11851215
type: string

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ package main
3535
import (
3636
"flag"
3737

38+
workflowv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
3839
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
3940
"k8s.io/apimachinery/pkg/runtime"
4041
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -75,6 +76,7 @@ func init() {
7576
utilruntime.Must(kmmv1beta1.AddToScheme(scheme))
7677
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
7778
utilruntime.Must(monitoringv1.AddToScheme(scheme))
79+
utilruntime.Must(workflowv1alpha1.AddToScheme(scheme))
7880
//+kubebuilder:scaffold:scheme
7981
}
8082

config/crd/bases/amd.com_deviceconfigs.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,36 @@ spec:
11761176
type: string
11771177
type: object
11781178
type: object
1179+
remediationWorkflow:
1180+
description: remediation workflow
1181+
properties:
1182+
conditionalWorkflows:
1183+
description: Name of the ConfigMap that holds condition-to-workflow
1184+
mappings.
1185+
properties:
1186+
name:
1187+
default: ""
1188+
description: |-
1189+
Name of the referent.
1190+
This field is effectively required, but due to backwards compatibility is
1191+
allowed to be empty. Instances of this type with an empty value here are
1192+
almost certainly wrong.
1193+
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
1194+
type: string
1195+
type: object
1196+
x-kubernetes-map-type: atomic
1197+
enable:
1198+
description: |-
1199+
enable remediation workflows. disabled by default
1200+
enable if operator should automatically handle remediation of node incase of gpu issues
1201+
type: boolean
1202+
ttlForFailedWorkflows:
1203+
default: 24
1204+
description: Time to live for argo workflow object and its pods
1205+
for a failed workflow in hours. By default, it is set to 24
1206+
hours
1207+
type: integer
1208+
type: object
11791209
selector:
11801210
additionalProperties:
11811211
type: string

config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,28 @@ spec:
569569
path: metricsExporter.upgradePolicy.upgradeStrategy
570570
x-descriptors:
571571
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
572+
- description: remediation workflow
573+
displayName: RemediationWorkflow
574+
path: remediationWorkflow
575+
x-descriptors:
576+
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
577+
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
578+
displayName: ConditionalWorkflows
579+
path: remediationWorkflow.conditionalWorkflows
580+
x-descriptors:
581+
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
582+
- description: enable remediation workflows. disabled by default enable if operator
583+
should automatically handle remediation of node incase of gpu issues
584+
displayName: Enable
585+
path: remediationWorkflow.enable
586+
x-descriptors:
587+
- urn:alm:descriptor:com.amd.deviceconfigs:enable
588+
- description: Time to live for argo workflow object and its pods for a failed
589+
workflow in hours. By default, it is set to 24 hours
590+
displayName: TtlForFailedWorkflows
591+
path: remediationWorkflow.ttlForFailedWorkflows
592+
x-descriptors:
593+
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
572594
- description: Selector describes on which nodes the GPU Operator should enable
573595
the GPU device.
574596
displayName: Selector

0 commit comments

Comments
 (0)