Add ROCm support to plugin

thomas-huber · bwbarrett · commit 1660ccbad269 · 2025-10-08T16:34:36.000Z
This work is an attempt to rebase #461 on the latest master branch. I've tested functionality with the CXI provider on Slingshot 11. Signed-off-by: Thomas Huber <thomas.huber@amd.com> Signed-off-by: Ryan Hankins <ryan.hankins@hpe.com>
diff --git a/configure.ac b/configure.ac
@@ -122,21 +122,53 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
                         [AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
                         [want_cuda=no])
                   have_device_interface=neuron])
-CHECK_PKG_CUDA([have_device_interface=cuda])
+
+# Select CUDA unless ROCm was requested; an unset, empty or "no" --with-rocm keeps CUDA eligible.
+CHECK_PKG_CUDA([
+  AS_IF([test "${have_device_interface}" = "no" -a "${with_rocm:-no}" = "no"], [
+    have_device_interface=cuda
+  ])
+])
+
+
+CHECK_PKG_ROCM([
+  AS_IF([test "${have_device_interface}" = "no"], [
+    have_device_interface=rocm
+  ])
+])
+
+
 AS_IF([test "${have_device_interface}" = "no"],
-      [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
-DEVICE_INTERFACE="${have_device_interface}"
-AC_SUBST([DEVICE_INTERFACE])
+      [AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm, or Neuron runtime.])])
 
 CHECK_PKG_LTTNG()
 CHECK_PKG_NVTX()
 
+do_cuda=0
+do_rocm=0
+AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])
+AS_IF([test "$have_device_interface" = "rocm"], [do_rocm=1; enable_tests="no"])
+
 CHECK_PKG_HWLOC([],
 		[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
+AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
+AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])
 
 CHECK_PKG_VALGRIND()
 CHECK_VAR_REDZONE()
 
+AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
+AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
+
+AM_CONDITIONAL([ENABLE_NEURON], [test "x${have_device_interface}" = "xneuron"])
+AS_IF([test ${do_rocm} = 1],
+  [AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs])])
+
+AS_CASE([$have_device_interface],
+  [neuron], [DEVICE_INTERFACE=neuron],
+  [*],      [DEVICE_INTERFACE=cuda])
+AC_SUBST([DEVICE_INTERFACE])
+
 NCCL_OFI_PLATFORM="none"
 AS_IF([test "${NCCL_OFI_PLATFORM}" = "none"], [AX_CHECK_PLATFORM_AWS()])
 
diff --git a/include/nccl_ofi_config_bottom.h b/include/nccl_ofi_config_bottom.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
  */
 
 #ifndef NCCL_OFI_CONFIG_BOTTOM_H
@@ -62,4 +63,11 @@ static inline int memfd_create(const char *name, unsigned int flags)
 }
 #endif /* ifndef HAVE_MEMFD_CREATE */
 
+/* Use HAVE_GPU to guard GPU-specific code instead of checking both individually */
+#if HAVE_CUDA || HAVE_ROCM
+#define HAVE_GPU 1
+#else
+#define HAVE_GPU 0
+#endif
+
 #endif
diff --git a/include/nccl_ofi_cuda.h b/include/nccl_ofi_cuda.h
@@ -6,7 +6,7 @@
 #ifndef NCCL_OFI_CUDA_H_
 #define NCCL_OFI_CUDA_H_
 
-int nccl_net_ofi_cuda_init(void);
+int nccl_net_ofi_gpu_init(void);
 
 /*
  * @brief	Gets the CUDA device associated with the buffer
@@ -27,7 +27,7 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
  * @return	0 on success
  *		-1 on error
  */
-int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void);
+int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);
 
 /*
  * @brief	wraps cudaGetDevice()
@@ -100,4 +100,4 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
  */
 bool nccl_net_ofi_cuda_have_gdr_support_attr(void);
 
-#endif  // End NCCL_OFI_H_
+#endif  // End NCCL_OFI_CUDA_H_
diff --git a/include/nccl_ofi_rocm.h b/include/nccl_ofi_rocm.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
+ */
+
+#ifndef NCCL_OFI_ROCM_H_
+#define NCCL_OFI_ROCM_H_
+
+/* Generic GPU init (ROCm variant); callers treat a non-zero return as failure. */
+int nccl_net_ofi_gpu_init(void);
+
+/*
+ * @brief	Gets the device associated with the buffer
+ *
+ * @param	data
+ *		Pointer to GPU buffer.
+ * @param	dev_id
+ *		Presumably receives the GPU device ID -- TODO confirm.
+ *
+ * @return	Presumably 0 on success and -EINVAL on error; confirm against
+ *		the implementation before relying on either value.
+ */
+int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
+
+/*
+ * @brief	GPU GDR flush hook; the caller warns when the result is non-zero.
+ * @return	0 on success
+ *		-1 on error (NOTE(review): wording shared with the CUDA header; confirm)
+ */
+int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);
+
+/*
+ * @brief	Presumably returns the number of GPU devices -- TODO confirm.
+ * @return	NOTE(review): contract not visible in this header; verify it
+ *		against the implementation before relying on error values.
+ */
+int nccl_net_ofi_cuda_get_num_devices(void);
+
+/*
+ * @brief	Presumably returns the index of the active GPU device -- TODO confirm.
+ * @return	NOTE(review): contract not visible in this header; verify it
+ *		against the implementation before relying on error values.
+ */
+int nccl_net_ofi_cuda_get_active_device_idx(void);
+
+bool nccl_net_ofi_cuda_have_dma_buf_attr(void);     /* presumably: DMA-BUF attribute supported -- TODO confirm */
+bool nccl_net_ofi_cuda_have_gdr_support_attr(void); /* presumably: GDR attribute supported -- TODO confirm */
+
+#endif /* NCCL_OFI_ROCM_H_ */
diff --git a/m4/check_pkg_rocm.m4 b/m4/check_pkg_rocm.m4
@@ -0,0 +1,64 @@
+# -*- autoconf -*-
+#
+# Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
+# Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
+#
+#
+# See LICENSE.txt for license information
+#
+
+# CHECK_PKG_ROCM([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Probes for the ROCm HIP runtime: libamdhip64 (hipMemAllocHost) and the
+# hip/hip_runtime_api.h header.  ACTION-IF-FOUND runs only when --with-rocm
+# was passed and both probes succeed; ACTION-IF-NOT-FOUND runs otherwise.
+# CPPFLAGS, LDFLAGS and LIBS are restored whenever a probe fails.
+AC_DEFUN([CHECK_PKG_ROCM], [
+  check_pkg_found="yes"
+  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
+  check_pkg_LDFLAGS_save="${LDFLAGS}"
+  check_pkg_LIBS_save="${LIBS}"
+
+  AC_ARG_WITH([rocm],
+             [AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
+
+  dnl Add search paths only for an explicit PATH; an unset value or a bare
+  dnl --with-rocm (value "yes") falls back to the default search paths.
+  AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
+        [],
+        [test "${with_rocm}" = "no"],
+        [check_pkg_found=no],
+        [AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
+        CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
+        LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
+
+  dnl The header probe is compiled with __HIP_PLATFORM_AMD__ defined ahead of the include.
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
+
+  AS_IF([test "${check_pkg_found}" = "yes"],
+        [check_pkg_define="yes"],
+        [check_pkg_define="no"
+         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
+         LDFLAGS="${check_pkg_LDFLAGS_save}"
+         LIBS="${check_pkg_LIBS_save}"
+        ])
+
+  dnl ROCm is opt-in: without --with-rocm the not-found action always runs.
+  AS_IF([test -n "${with_rocm}"],
+       [AS_IF([test "${check_pkg_define}" = "yes"],
+              [$1], [$2] )
+       ], [$2]
+   )
+   
+  AS_UNSET([check_pkg_found])
+  AS_UNSET([check_pkg_define])
+  AS_UNSET([check_pkg_libdir])
+  AS_UNSET([check_pkg_CPPFLAGS_save])
+  AS_UNSET([check_pkg_LDFLAGS_save])
+  AS_UNSET([check_pkg_LIBS_save])
+])
diff --git a/src/Makefile.am b/src/Makefile.am
@@ -7,7 +7,7 @@
 AM_CPPFLAGS = -I$(abs_top_srcdir)/include
 AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party
 AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party/nccl/$(DEVICE_INTERFACE)/include
-AM_CPPFLAGS += $(CUDA_CPPFLAGS)
+AM_CPPFLAGS += $(CUDA_CPPFLAGS) $(ROCM_CPPFLAGS)
 AM_CPPFLAGS += -DXML_DIR=\"${pkgdatadir}/xml\"
 
 sources = \
@@ -41,15 +41,21 @@ sources += platform-aws.cpp
 endif
 
 if ENABLE_NEURON
-  sources += nccl_ofi_interface_neuron.cpp
-else
-  sources += \
-	nccl_ofi_cuda.cpp \
-	nccl_ofi_interface_nvidia.cpp
+sources += nccl_ofi_interface_neuron.cpp
+endif
+
+if HAVE_ROCM
+sources += nccl_ofi_rocm.cpp nccl_ofi_interface_nvidia.cpp
+endif
+
+if HAVE_CUDA
+sources += nccl_ofi_cuda.cpp nccl_ofi_interface_nvidia.cpp
+endif
 
-  # add the tuner sources into the library
+# add the tuner sources into the library
+if !ENABLE_NEURON
 if WANT_PLATFORM_AWS
-    sources +=  \
+  sources +=  \
 	tuner/nccl_ofi_regions.cpp \
 	tuner/nccl_ofi_tuner.cpp \
 	tuner/nccl_ofi_model.cpp
@@ -61,9 +67,8 @@ endif
 # us writing dlopen() handlers for simple unit tests.
 noinst_LTLIBRARIES = libinternal_plugin.la
 libinternal_plugin_la_SOURCES = $(sources)
-libinternal_plugin_la_LDFLAGS = -static  $(CUDA_LDFLAGS)
-libinternal_plugin_la_LIBADD = $(CUDA_LIBS)
-
+libinternal_plugin_la_LDFLAGS = $(CUDA_LDFLAGS) $(ROCM_LDFLAGS)
+libinternal_plugin_la_LIBADD  = $(CUDA_LIBS) $(ROCM_LIBS)
 
 if ENABLE_NEURON
   lib_LTLIBRARIES = libnccom-net.la
@@ -74,9 +79,9 @@ if ENABLE_NEURON
 else
   lib_LTLIBRARIES = libnccl-net-ofi.la
   libnccl_net_ofi_la_SOURCES =
-  libnccl_net_ofi_la_LIBADD = libinternal_plugin.la
+  libnccl_net_ofi_la_LIBADD = libinternal_plugin.la $(CUDA_LIBS) $(ROCM_LIBS)
   libnccl_net_ofi_la_LIBTOOLFLAGS = --tag=CXX
-  libnccl_net_ofi_la_LDFLAGS = -module -avoid-version
+  libnccl_net_ofi_la_LDFLAGS = -module -avoid-version -Wl,--no-undefined
 
 # We always install libnccl-net-ofi.so.  To use the default shared library,
 # either NCCL_NET_PLUGIN=libnccl-net-ofi.so or NCCL_NET_PLUGIN=ofi must be set.
@@ -102,7 +107,7 @@ if ENABLE_NCCL_NET_LIBRARY
   libnccl_net_la_SOURCES =
   libnccl_net_la_LIBADD = libinternal_plugin.la
   libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
-  libnccl_net_la_LDFLAGS = -module -avoid-version
+  libnccl_net_la_LDFLAGS = -module -avoid-version -Wl,--no-undefined
 endif
 if WANT_PLATFORM_AWS
 # NCCL standardized on the libnccl-tuner-<interface> format after we released a
@@ -120,4 +125,14 @@ if WANT_PLATFORM_AWS
   libnccl_tuner_ofi_la_LDFLAGS = -module -avoid-version
 endif
 
+if HAVE_ROCM
+install-exec-hook::
+	@$(MKDIR_P) $(DESTDIR)$(libdir)
+	@$(RM) -f $(DESTDIR)$(libdir)/librccl-net.so
+	@$(LN_S) libnccl-net-ofi.so $(DESTDIR)$(libdir)/librccl-net.so
+
+uninstall-local::
+	@$(RM) -f $(DESTDIR)$(libdir)/librccl-net.so
+endif
+
 endif
diff --git a/src/nccl_ofi_api.cpp b/src/nccl_ofi_api.cpp
@@ -535,7 +535,7 @@ ncclResult_t nccl_net_ofi_regMrDmaBuf_v6(void* comm, void* data, size_t size,
 	/* Validate type of buffer */
 	bool valid_buffer_type = false;
 	if (type == NCCL_PTR_HOST) valid_buffer_type = true;
-#if HAVE_CUDA
+#if HAVE_GPU
 	if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
 #endif
 #if HAVE_NEURON
diff --git a/src/nccl_ofi_cuda.cpp b/src/nccl_ofi_cuda.cpp
@@ -66,7 +66,7 @@ DECLARE_CUDA_FUNCTION(cuMemAlloc, 3020);
 DECLARE_CUDA_FUNCTION(cuMemFree, 3020);
 DECLARE_CUDA_FUNCTION(cuMemcpy, 4000);
 
-int nccl_net_ofi_cuda_init(void)
+int nccl_net_ofi_gpu_init(void)
 {
 	int driverVersion = -1;
 	int runtimeVersion = -1;
@@ -110,7 +110,7 @@ int nccl_net_ofi_cuda_init(void)
 	return 0;
 }
 
-int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
+int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void)
 {
 #if HAVE_CUDA_GDRFLUSH_SUPPORT
 	static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
diff --git a/src/nccl_ofi_net.cpp b/src/nccl_ofi_net.cpp
@@ -22,6 +22,8 @@
 #include "nccl_ofi_tracepoint.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_sendrecv.h"
 #include "nccl_ofi_rdma.h"
@@ -166,8 +168,8 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
 	 */
 	mr_cache_alignment = std::min(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);
 
-#if HAVE_CUDA
-	ret = nccl_net_ofi_cuda_init();
+#if HAVE_GPU
+	ret = nccl_net_ofi_gpu_init();
 	if (ret != 0) {
 		NCCL_OFI_WARN("CUDA initialization failed.");
 		goto exit;
diff --git a/src/nccl_ofi_ofiutils.cpp b/src/nccl_ofi_ofiutils.cpp
@@ -20,6 +20,9 @@
 #include "nccl_ofi_math.h"
 #include "nccl_ofi_ofiutils.h"
 #include "nccl_ofi_platform.h"
+#if HAVE_ROCM
+#include "nccl_ofi_rocm.h"
+#endif
 
 #define EFA_PROVIDER_NAME "efa"
 #define IS_EFA_PROVIDER(NAME) (strcmp((NAME), EFA_PROVIDER_NAME)==0)
@@ -383,6 +386,11 @@ ofi_ep_result nccl_ofi_ofiutils_ep_create(struct fi_info *info, ofi_domain_ptr &
 		 * to supported.
 		 */
 		support_gdr = GDR_SUPPORTED;
+#elif HAVE_ROCM
+		/*
+		 * ROCm does not require FI_OPT_CUDA_API_PERMITTED.
+		 */
+		support_gdr = GDR_SUPPORTED;
 #else
 		NCCL_OFI_WARN("Using Libfabric 1.18 API with GPUDirect RDMA support, and FI_OPT_CUDA_API_PERMITTED is not declared.");
 		ret = -EOPNOTSUPP;
diff --git a/src/nccl_ofi_rdma.cpp b/src/nccl_ofi_rdma.cpp
@@ -19,6 +19,8 @@
 #include "nccl_ofi_log.h"
 #if HAVE_CUDA
 #include "nccl_ofi_cuda.h"
+#elif HAVE_ROCM
+#include "nccl_ofi_rocm.h"
 #endif
 #include "nccl_ofi_environ.h"
 #include "nccl_ofi_ep_addr_list.h"
@@ -325,9 +327,9 @@ static int set_mr_req_attr(uint64_t mr_key,
 	case NCCL_PTR_HOST:
 		mr_attr->iface = FI_HMEM_SYSTEM;
 		break;
-#if HAVE_CUDA
+#if HAVE_GPU
 	case NCCL_PTR_CUDA:
-		mr_attr->iface = FI_HMEM_CUDA;
+		mr_attr->iface = HAVE_CUDA ? FI_HMEM_CUDA : FI_HMEM_ROCR;
 
 		/* Get CUDA device ID */
 		ret = nccl_net_ofi_get_cuda_device_for_addr(
@@ -3946,11 +3948,11 @@ static int flush(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
 	if (ofi_nccl_gdr_flush_disable() || support_gdr == GDR_UNSUPPORTED)
 		goto exit;
 
-#if HAVE_CUDA
+#if HAVE_GPU
 	if (cuda_flush) {
-		ret = nccl_net_ofi_cuda_flush_gpudirect_rdma_writes();
+		ret = nccl_net_ofi_gpu_flush_gpudirect_rdma_writes();
 		if (ret != 0) {
-			NCCL_OFI_WARN("Error performing CUDA GDR flush");
+			NCCL_OFI_WARN("Error performing GPU GDR flush");
 		}
 		goto exit;
 	}
diff --git a/src/nccl_ofi_rocm.cpp b/src/nccl_ofi_rocm.cpp
diff --git a/src/nccl_ofi_sendrecv.cpp b/src/nccl_ofi_sendrecv.cpp
diff --git a/src/nccl_ofi_topo.cpp b/src/nccl_ofi_topo.cpp
diff --git a/src/platform-aws.cpp b/src/platform-aws.cpp

Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@ DECLARE_CUDA_FUNCTION(cuMemAlloc, 3020);`
`66`	`66`	`DECLARE_CUDA_FUNCTION(cuMemFree, 3020);`
`67`	`67`	`DECLARE_CUDA_FUNCTION(cuMemcpy, 4000);`
`68`	`68`
`69`		`-int nccl_net_ofi_cuda_init(void)`
	`69`	`+int nccl_net_ofi_gpu_init(void)`
`70`	`70`	`{`
`71`	`71`	`int driverVersion = -1;`
`72`	`72`	`int runtimeVersion = -1;`
`@@ -110,7 +110,7 @@ int nccl_net_ofi_cuda_init(void)`
`110`	`110`	`return 0;`
`111`	`111`	`}`
`112`	`112`
`113`		`-int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)`
	`113`	`+int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void)`
`114`	`114`	`{`
`115`	`115`	`#if HAVE_CUDA_GDRFLUSH_SUPPORT`
`116`	`116`	`static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");`