ROCm · sriram-30 · Aug 1, 2025 · Jul 17, 2025 · Aug 12, 2025 · May 2, 2025
diff --git a/Makefile b/Makefile
@@ -70,6 +70,10 @@ ifdef SKIP_INSTALL_DEFAULT_CR
 	SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
 endif
 
+# Setting SKIP_REMEDIATION_CONTROLLER to any non-empty value makes the
+# helm-install-k8s target pass "--set remediation.enabled=false" to helm install.
+ifdef SKIP_REMEDIATION_CONTROLLER
+	SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
+endif
+
 #################################
 # OpenShift OLM Bundle varaiables
 # BUNDLE_IMG defines the image:tag used for the bundle.
@@ -229,7 +233,7 @@ fmt: ## Run go fmt against code.
 vet: ## Run go vet against code.
 	go vet ./...
 
-UNIT_TEST ?= ./internal/controllers ./internal/kmmmodule ./internal
+UNIT_TEST ?= ./internal ./internal/controllers ./internal/kmmmodule
 
 .PHONY: unit-test
 unit-test: vet ## Run the unit tests.
@@ -325,6 +329,9 @@ helm-k8s: helmify manifests kustomize clean-helm-k8s gen-kmm-charts-k8s ## Build
 	# Patching k8s helm chart kmm subchart
 	cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
 	cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
+	mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
+	cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
+	cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
 	cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint; cd ..;
 	mkdir $(shell pwd)/helm-charts-k8s/crds
 	echo "moving crd yaml files to crds folder"
@@ -481,6 +488,21 @@ rm -f $(1); \
 fi
 endef
 
+# remove-wrong-version-tool runs "$1 $2" and removes the binary when its output
+# does not contain $3, matched as a fixed string (dots are not regex wildcards)
+# 1 - Path to the binary
+# 2 - Version argument (e.g., --version)
+# 3 - Expected version string (e.g., v0.17.0)
+define remove-wrong-version-tool
+@if [ -f $(1) ]; then \
+version_output=`$(1) $(2) 2>/dev/null || echo "not found"`; \
+echo "$$version_output" | grep -qF -- $(3) || { \
+echo "Incorrect version ($$version_output), removing $(1)"; \
+rm -f $(1); \
+}; \
+fi
+endef
+
 OPERATOR_SDK = $(shell pwd)/bin/operator-sdk
 OPERATOR_SDK_VERSION=v1.32.0
 .PHONY: operator-sdk
@@ -575,7 +597,7 @@ helm-uninstall-openshift:
 	helm uninstall amd-gpu-operator -n kube-amd-gpu
 
 helm-install-k8s:
-	helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
+	helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
 
 helm-uninstall-k8s:
 	echo "Deleting all device configs before uninstalling operator..."

diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
@@ -75,6 +75,28 @@ type DeviceConfigSpec struct {
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Selector",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:selector"}
 	// +optional
 	Selector map[string]string `json:"selector,omitempty"`
+
+	// remediation workflow
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="RemediationWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow"}
+	// +optional
+	RemediationWorkflow RemediationWorkflowSpec `json:"remediationWorkflow,omitempty"`
+}
+
+// RemediationWorkflowSpec defines workflows to run based on node conditions
+type RemediationWorkflowSpec struct {
+	// enable remediation workflows. disabled by default.
+	// enable if the operator should automatically handle remediation of a node in case of gpu issues
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
+	Enable *bool `json:"enable,omitempty"`
+
+	// Name of the ConfigMap that holds condition-to-workflow mappings.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
+	ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
+
+	// Time to live, in hours, for the argo workflow object and its pods of a failed workflow. By default, it is set to 24 hours
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
+	// +kubebuilder:default:=24
+	TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
 }
 
 type RegistryTLS struct {
@@ -88,13 +110,38 @@ type RegistryTLS struct {
 	InsecureSkipTLSVerify *bool `json:"insecureSkipTLSVerify,omitempty"`
 }
 
+// VFIOConfigSpec contains the configs for binding PCI devices to the vfio-pci kernel module
+type VFIOConfigSpec struct {
+	// list of PCI device IDs to load into vfio-pci driver. default is the list of AMD GPU PF/VF PCI device IDs based on driver type vf-passthrough/pf-passthrough.
+	DeviceIDs []string `json:"deviceIDs,omitempty"`
+}
+
 type DriverSpec struct {
 	// enable driver install. default value is true.
 	// disable is for skipping driver install/uninstall for dryrun or using in-tree amdgpu kernel module
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
 	// +kubebuilder:default=true
 	Enable *bool `json:"enable,omitempty"`
 
+	// specify the type of driver (container/vf-passthrough/pf-passthrough) to install on the worker node. default value is container.
+	// container: normal amdgpu-dkms driver for Bare Metal GPU nodes or guest VM.
+	// vf-passthrough: MxGPU GIM driver on the host machine to generate VF, then mount VF to vfio-pci
+	// pf-passthrough: directly mount PF device to vfio-pci
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DriverType",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:driverType"}
+	// +kubebuilder:validation:Enum=container;vf-passthrough;pf-passthrough
+	// +kubebuilder:default=container
+	DriverType string `json:"driverType,omitempty"`
+
+	// vfio config
+	// specify the specific configs for binding PCI devices to vfio-pci kernel module, applies for driver type vf-passthrough and pf-passthrough
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="VFIOConfig",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:vfioConfig"}
+	// +optional
+	VFIOConfig VFIOConfigSpec `json:"vfioConfig,omitempty"`
+
+	// advanced arguments, parameters and more configs to manage the driver
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="KernelModuleConfig",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:kernelModuleConfig"}
+	// +optional
+	KernelModuleConfig KernelModuleConfigSpec `json:"kernelModuleConfig,omitempty"`
+
 	// blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes.
 	// Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
 	// Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
@@ -156,6 +203,22 @@ type DriverSpec struct {
 	Tolerations []v1.Toleration `json:"tolerations,omitempty"`
 }
 
+// KernelModuleConfigSpec contains the advanced configs to manage the driver kernel module
+type KernelModuleConfigSpec struct {
+	// LoadArgs are the arguments used when modprobe is executed to load the kernel module. The command will be `modprobe ${Args} module_name`.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="LoadArg",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:loadArg"}
+	// +optional
+	LoadArgs []string `json:"loadArgs,omitempty"`
+	// UnloadArgs are the arguments used when modprobe is executed to unload the kernel module. The command will be `modprobe -r ${Args} module_name`.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="UnloadArg",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:unloadArg"}
+	// +optional
+	UnloadArgs []string `json:"unloadArgs,omitempty"`
+	// Parameters are used for modprobe commands. The command will be `modprobe ${Args} module_name ${Parameters}`.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Parameters",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:parameters"}
+	// +optional
+	Parameters []string `json:"parameters,omitempty"`
+}
+
 // UpgradeState captures the state of the upgrade process on a node
 // +enum
 type UpgradeState string
@@ -757,7 +820,7 @@ type DeviceConfigStatus struct {
 //+kubebuilder:subresource:status
 
 // DeviceConfig describes how to enable AMD GPU device
-// +operator-sdk:csv:customresourcedefinitions:displayName="DeviceConfig",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps}, {services,v1,core}}
+// +operator-sdk:csv:customresourcedefinitions:displayName="DeviceConfig",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
 type DeviceConfig struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.2.0
-    createdAt: "2025-08-09T01:44:36Z"
+    createdAt: "2025-08-14T12:20:47Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -67,6 +67,9 @@ spec:
       - kind: Daemonset
         name: apps
         version: v1
+      - kind: Pod
+        name: core
+        version: v1
       - kind: services
         name: core
         version: v1
@@ -265,6 +268,15 @@ spec:
         path: driver.blacklist
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers
+      - description: 'specify the type of driver (container/vf-passthrough/pf-passthrough)
+          to install on the worker node. default value is container. container: normal
+          amdgpu-dkms driver for Bare Metal GPU nodes or guest VM. vf-passthrough:
+          MxGPU GIM driver on the host machine to generate VF, then mount VF to vfio-pci
+          pf-passthrough: directly mount PF device to vfio-pci'
+        displayName: DriverType
+        path: driver.driverType
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:driverType
       - description: enable driver install. default value is true. disable is for
           skipping driver install/uninstall for dryrun or using in-tree amdgpu kernel
           module
@@ -342,6 +354,30 @@ spec:
         path: driver.imageSign.keySecret
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:imageSignKeySecret
+      - description: advanced arguments, parameters and more configs to manage the
+          driver
+        displayName: KernelModuleConfig
+        path: driver.kernelModuleConfig
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:kernelModuleConfig
+      - description: LoadArg are the arguments when modprobe is executed to load the
+          kernel module. The command will be `modprobe ${Args} module_name`.
+        displayName: LoadArg
+        path: driver.kernelModuleConfig.loadArgs
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:loadArg
+      - description: Parameters is being used for modprobe commands. The command will
+          be `modprobe ${Args} module_name ${Parameters}`.
+        displayName: Parameters
+        path: driver.kernelModuleConfig.parameters
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:parameters
+      - description: UnloadArg are the arguments when modprobe is executed to unload
+          the kernel module. The command will be `modprobe -r ${Args} module_name`.
+        displayName: UnloadArg
+        path: driver.kernelModuleConfig.unloadArgs
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:unloadArg
       - description: tolerations for kmm module object
         displayName: Tolerations
         path: driver.tolerations
@@ -399,6 +435,12 @@ spec:
         path: driver.version
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:version
+      - description: vfio config specify the specific configs for binding PCI devices
+          to vfio-pci kernel module, applies for driver type vf-passthrough and pf-passthrough
+        displayName: VFIOConfig
+        path: driver.vfioConfig
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:vfioConfig
       - description: metrics exporter
         displayName: MetricsExporter
         path: metricsExporter
@@ -603,6 +645,28 @@ spec:
         path: metricsExporter.upgradePolicy.upgradeStrategy
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
+      - description: remediation workflow
+        displayName: RemediationWorkflow
+        path: remediationWorkflow
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
+      - description: Name of the ConfigMap that holds condition-to-workflow mappings.
+        displayName: ConditionalWorkflows
+        path: remediationWorkflow.conditionalWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
+      - description: enable remediation workflows. disabled by default enable if operator
+          should automatically handle remediation of node in case of gpu issues
+        displayName: Enable
+        path: remediationWorkflow.enable
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: Time to live for argo workflow object and its pods for a failed
+          workflow in hours. By default, it is set to 24 hours
+        displayName: TtlForFailedWorkflows
+        path: remediationWorkflow.ttlForFailedWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
       - description: Selector describes on which nodes the GPU Operator should enable
           the GPU device.
         displayName: Selector