Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
87cfd48
[GPUOP] Adding docs for DCM Systemd integration (#851) (#852)
ci-penbot-01 Aug 1, 2025
d68303f
[Helm] Add Radeon Pro W7800 48GB PCI device ID into default NFD rule
yansun1996 Jul 17, 2025
3e7b591
[DOC] Fix invalid YAML format of amd-smi example pod
yansun1996 Aug 12, 2025
a00655f
Automate GIM driver installation on vgpu Host
yansun1996 May 2, 2025
0a3341b
Use opensource GIM driver for automation (#631)
yansun1996 May 2, 2025
c6ef4a3
Add workerMgr module to enable vfio post process after installing GIM…
yansun1996 May 7, 2025
2d39576
Misc optimization on handling VFIO device mount
yansun1996 May 8, 2025
9d6fa15
Add graceperiod for worker pod
yansun1996 May 8, 2025
6d61d06
Add basic GIM driver deployment and vfio-pci mount e2e test
yansun1996 May 9, 2025
ede4af7
[Feature] Add support for PF-Passthrough use case (#701)
yansun1996 May 27, 2025
f9ab531
[Feature] Allow users to configure modprobe arguments and parameters …
yansun1996 May 28, 2025
c125df4
[e2e] Add new CRD fields into helm-e2e
yansun1996 May 29, 2025
99f750f
[Feature] Configurable vfio binding config
yansun1996 May 21, 2025
4a4a674
[DOC] Add docs for KubeVirt integration
yansun1996 Jun 2, 2025
71b69ce
[Feature] Add KubeVirt related CRD fields in Helm default CR
yansun1996 Jun 3, 2025
d31241a
[Fix] Fix DeviceConfig operand counter out-of-sync with Daemonset
yansun1996 Jun 5, 2025
8737845
[DOC] Provide more details on KubeVirt host configuration
yansun1996 Jun 3, 2025
6173ed3
[Helm] Add MI350X and MI355X PF and VF device ID into NFD rule (#777)
yansun1996 Jun 17, 2025
a29309a
[Build] Optimize makefile to auto remove outdated binary tools
yansun1996 May 21, 2025
6b7d758
[Build] resolve conflicts for opensourcing kubevirt related commits
yansun1996 Aug 13, 2025
f4fce60
GPU Operator Integration with Remediation Workflows using Argo Workflows
sriram-30 Jul 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
26 changes: 24 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ ifdef SKIP_INSTALL_DEFAULT_CR
SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
endif

# When SKIP_REMEDIATION_CONTROLLER is set to a non-empty value, build the Helm
# override that turns the remediation component off (remediation.enabled=false).
# The resulting SKIP_REMEDIATION_CONTROLLER_CMD is appended to the helm install command line.
ifdef SKIP_REMEDIATION_CONTROLLER
SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
endif

#################################
# OpenShift OLM Bundle variables
# BUNDLE_IMG defines the image:tag used for the bundle.
Expand Down Expand Up @@ -229,7 +233,7 @@ fmt: ## Run go fmt against code.
vet: ## Run go vet against code.
go vet ./...

UNIT_TEST ?= ./internal/controllers ./internal/kmmmodule ./internal
UNIT_TEST ?= ./internal ./internal/controllers ./internal/kmmmodule

.PHONY: unit-test
unit-test: vet ## Run the unit tests.
Expand Down Expand Up @@ -325,6 +329,9 @@ helm-k8s: helmify manifests kustomize clean-helm-k8s gen-kmm-charts-k8s ## Build
# Patching k8s helm chart kmm subchart
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint; cd ..;
mkdir $(shell pwd)/helm-charts-k8s/crds
echo "moving crd yaml files to crds folder"
Expand Down Expand Up @@ -481,6 +488,21 @@ rm -f $(1); \
fi
endef

# remove-wrong-version-tool deletes a tool binary whose reported version does
# not contain the expected version string, so that a stale binary is not kept.
# If the binary does not exist, nothing happens.
# 1 - Path to the binary
# 2 - Version argument passed to the binary (e.g., --version)
# 3 - Expected version string (e.g., v0.17.0)
# Notes:
#   - The expected version is matched as a fixed string (grep -F), so the dots
#     in a version such as v0.17.0 are literal and do not match arbitrary characters.
#   - The binary path and the expected version are quoted so that paths
#     containing spaces and versions starting with '-' are handled correctly.
#   - If the binary fails to run, its output is replaced with "not found",
#     which does not match the expected version and triggers removal.
define remove-wrong-version-tool
@if [ -f "$(1)" ]; then \
version_output=`"$(1)" $(2) 2>/dev/null || echo "not found"`; \
echo "$$version_output" | grep -qF -- "$(3)" || { \
echo "Incorrect version ($$version_output), removing $(1)"; \
rm -f "$(1)"; \
}; \
fi
endef

OPERATOR_SDK = $(shell pwd)/bin/operator-sdk
OPERATOR_SDK_VERSION=v1.32.0
.PHONY: operator-sdk
Expand Down Expand Up @@ -575,7 +597,7 @@ helm-uninstall-openshift:
helm uninstall amd-gpu-operator -n kube-amd-gpu

helm-install-k8s:
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}
helm install -f helm-charts-k8s/values.yaml amd-gpu-operator ${GPU_OPERATOR_CHART} -n kube-amd-gpu --create-namespace ${SKIP_NFD_CMD} ${SKIP_KMM_CMD} ${SKIP_REMEDIATION_CONTROLLER_CMD} ${HELM_OC_CMD} ${SIM_ENABLE_CMD} ${SKIP_INSTALL_DEFAULT_CR_CMD}

helm-uninstall-k8s:
echo "Deleting all device configs before uninstalling operator..."
Expand Down
65 changes: 64 additions & 1 deletion api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,28 @@ type DeviceConfigSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Selector",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:selector"}
// +optional
Selector map[string]string `json:"selector,omitempty"`

// remediation workflow
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="RemediationWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow"}
// +optional
RemediationWorkflow RemediationWorkflowSpec `json:"remediationWorkflow,omitempty"`
}

// RemediationWorkflowSpec defines workflows to run based on node conditions
type RemediationWorkflowSpec struct {
// enable remediation workflows. disabled by default.
// enable if the operator should automatically handle remediation of a node in case of GPU issues
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
Enable *bool `json:"enable,omitempty"`

// Name of the ConfigMap that holds condition-to-workflow mappings.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`

// Time to live, in hours, for the argo workflow object and its pods when a workflow has failed. By default, it is set to 24 hours
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
// +kubebuilder:default:=24
TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
}

type RegistryTLS struct {
Expand All @@ -88,13 +110,38 @@ type RegistryTLS struct {
InsecureSkipTLSVerify *bool `json:"insecureSkipTLSVerify,omitempty"`
}

// VFIOConfigSpec contains the configs for binding PCI devices to the vfio-pci driver
type VFIOConfigSpec struct {
// list of PCI device IDs to load into vfio-pci driver. default is the list of AMD GPU PF/VF PCI device IDs based on driver type vf-passthrough/pf-passthrough.
DeviceIDs []string `json:"deviceIDs,omitempty"`
}

type DriverSpec struct {
// enable driver install. default value is true.
// disable is for skipping driver install/uninstall for dryrun or using in-tree amdgpu kernel module
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"}
// +kubebuilder:default=true
Enable *bool `json:"enable,omitempty"`

// specify the type of driver (container/vf-passthrough/pf-passthrough) to install on the worker node. default value is container.
// container: normal amdgpu-dkms driver for Bare Metal GPU nodes or guest VM.
// vf-passthrough: MxGPU GIM driver on the host machine to generate VF, then mount VF to vfio-pci
// pf-passthrough: directly mount PF device to vfio-pci
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DriverType",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:driverType"}
// +kubebuilder:validation:Enum=container;vf-passthrough;pf-passthrough
// +kubebuilder:default=container
DriverType string `json:"driverType,omitempty"`

// vfio config
// specify the specific configs for binding PCI devices to vfio-pci kernel module, applies for driver type vf-passthrough and pf-passthrough
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="VFIOConfig",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:vfioConfig"}
// +optional
VFIOConfig VFIOConfigSpec `json:"vfioConfig,omitempty"`

// advanced arguments, parameters and more configs to manage the driver
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="KernelModuleConfig",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:kernelModuleConfig"}
// +optional
KernelModuleConfig KernelModuleConfigSpec `json:"kernelModuleConfig,omitempty"`

// blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
// Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
// Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
Expand Down Expand Up @@ -156,6 +203,22 @@ type DriverSpec struct {
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
}

// KernelModuleConfigSpec contains the advanced configs to manage the driver kernel module
type KernelModuleConfigSpec struct {
// LoadArgs are the arguments used when modprobe is executed to load the kernel module. The command will be `modprobe ${Args} module_name`.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="LoadArg",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:loadArg"}
// +optional
LoadArgs []string `json:"loadArgs,omitempty"`
// UnloadArgs are the arguments used when modprobe is executed to unload the kernel module. The command will be `modprobe -r ${Args} module_name`.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="UnloadArg",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:unloadArg"}
// +optional
UnloadArgs []string `json:"unloadArgs,omitempty"`
// Parameters are the kernel module parameters used for modprobe commands. The command will be `modprobe ${Args} module_name ${Parameters}`.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Parameters",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:parameters"}
// +optional
Parameters []string `json:"parameters,omitempty"`
}

// UpgradeState captures the state of the upgrade process on a node
// +enum
type UpgradeState string
Expand Down Expand Up @@ -757,7 +820,7 @@ type DeviceConfigStatus struct {
//+kubebuilder:subresource:status

// DeviceConfig describes how to enable AMD GPU device
// +operator-sdk:csv:customresourcedefinitions:displayName="DeviceConfig",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps}, {services,v1,core}}
// +operator-sdk:csv:customresourcedefinitions:displayName="DeviceConfig",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
type DeviceConfig struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Expand Down
78 changes: 78 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

66 changes: 65 additions & 1 deletion bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.2.0
createdAt: "2025-08-09T01:44:36Z"
createdAt: "2025-08-14T12:20:47Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -67,6 +67,9 @@ spec:
- kind: Daemonset
name: apps
version: v1
- kind: Pod
name: core
version: v1
- kind: services
name: core
version: v1
Expand Down Expand Up @@ -265,6 +268,15 @@ spec:
path: driver.blacklist
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers
- description: 'specify the type of driver (container/vf-passthrough/pf-passthrough)
to install on the worker node. default value is container. container: normal
amdgpu-dkms driver for Bare Metal GPU nodes or guest VM. vf-passthrough:
MxGPU GIM driver on the host machine to generate VF, then mount VF to vfio-pci
pf-passthrough: directly mount PF device to vfio-pci'
displayName: DriverType
path: driver.driverType
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:driverType
- description: enable driver install. default value is true. disable is for
skipping driver install/uninstall for dryrun or using in-tree amdgpu kernel
module
Expand Down Expand Up @@ -342,6 +354,30 @@ spec:
path: driver.imageSign.keySecret
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:imageSignKeySecret
- description: advanced arguments, parameters and more configs to manage tne
driver
displayName: KernelModuleConfig
path: driver.kernelModuleConfig
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:kernelModuleConfig
- description: LoadArg are the arguments when modprobe is executed to load the
kernel module. The command will be `modprobe ${Args} module_name`.
displayName: LoadArg
path: driver.kernelModuleConfig.loadArgs
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:loadArg
- description: Parameters is being used for modprobe commands. The command will
be `modprobe ${Args} module_name ${Parameters}`.
displayName: Parameters
path: driver.kernelModuleConfig.parameters
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:parameters
- description: UnloadArg are the arguments when modprobe is executed to unload
the kernel module. The command will be `modprobe -r ${Args} module_name`.
displayName: UnloadArg
path: driver.kernelModuleConfig.unloadArgs
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:unloadArg
- description: tolerations for kmm module object
displayName: Tolerations
path: driver.tolerations
Expand Down Expand Up @@ -399,6 +435,12 @@ spec:
path: driver.version
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:version
- description: vfio config specify the specific configs for binding PCI devices
to vfio-pci kernel module, applies for driver type vf-passthrough and pf-passthrough
displayName: VFIOConfig
path: driver.vfioConfig
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:vfioConfig
- description: metrics exporter
displayName: MetricsExporter
path: metricsExporter
Expand Down Expand Up @@ -603,6 +645,28 @@ spec:
path: metricsExporter.upgradePolicy.upgradeStrategy
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:upgradeStrategy
- description: remediation workflow
displayName: RemediationWorkflow
path: remediationWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
displayName: ConditionalWorkflows
path: remediationWorkflow.conditionalWorkflows
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
- description: enable remediation workflows. disabled by default enable if operator
should automatically handle remediation of node incase of gpu issues
displayName: Enable
path: remediationWorkflow.enable
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:enable
- description: Time to live for argo workflow object and its pods for a failed
workflow in hours. By default, it is set to 24 hours
displayName: TtlForFailedWorkflows
path: remediationWorkflow.ttlForFailedWorkflows
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows
- description: Selector describes on which nodes the GPU Operator should enable
the GPU device.
displayName: Selector
Expand Down
Loading
Loading