Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 36 additions & 4 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -122,21 +122,53 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
[want_cuda=no])
have_device_interface=neuron])
CHECK_PKG_CUDA([have_device_interface=cuda])


CHECK_PKG_CUDA([
AS_IF([test "${have_device_interface}" = "no" -a -z "$with_rocm"], [
have_device_interface=cuda
])
])


CHECK_PKG_ROCM([
AS_IF([test "${have_device_interface}" = "no"], [
have_device_interface=rocm
])
])


AS_IF([test "${have_device_interface}" = "no"],
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
DEVICE_INTERFACE="${have_device_interface}"
AC_SUBST([DEVICE_INTERFACE])
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm, or Neuron runtime.])])

CHECK_PKG_LTTNG()
CHECK_PKG_NVTX()

do_cuda=0
do_rocm=0
AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])
AS_IF([test "$have_device_interface" = "rocm"], [do_rocm=1; enable_tests="no"])

CHECK_PKG_HWLOC([],
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])

CHECK_PKG_VALGRIND()
CHECK_VAR_REDZONE()

AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])

AM_CONDITIONAL([ENABLE_NEURON], [test "x${have_device_interface}" = "xneuron"])
AS_IF([test ${do_rocm} = 1],
[AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs])])

AS_CASE([$have_device_interface],
[neuron], [DEVICE_INTERFACE=neuron],
[*], [DEVICE_INTERFACE=cuda])
AC_SUBST([DEVICE_INTERFACE])

NCCL_OFI_PLATFORM="none"
AS_IF([test "${NCCL_OFI_PLATFORM}" = "none"], [AX_CHECK_PLATFORM_AWS()])

Expand Down
8 changes: 8 additions & 0 deletions include/nccl_ofi_config_bottom.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
*/

#ifndef NCCL_OFI_CONFIG_BOTTOM_H
Expand Down Expand Up @@ -62,4 +63,11 @@ static inline int memfd_create(const char *name, unsigned int flags)
}
#endif /* ifndef HAVE_MEMFD_CREATE */

/* Use HAVE_GPU to guard GPU-specific code instead of checking both individually */
#if HAVE_CUDA || HAVE_ROCM
#define HAVE_GPU 1
#else
#define HAVE_GPU 0
#endif

#endif
6 changes: 3 additions & 3 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#ifndef NCCL_OFI_CUDA_H_
#define NCCL_OFI_CUDA_H_

int nccl_net_ofi_cuda_init(void);
int nccl_net_ofi_gpu_init(void);

/*
* @brief Gets the CUDA device associated with the buffer
Expand All @@ -27,7 +27,7 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
* @return 0 on success
* -1 on error
*/
int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void);
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);

/*
* @brief wraps cudaGetDevice()
Expand Down Expand Up @@ -100,4 +100,4 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
*/
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

#endif // End NCCL_OFI_H_
#endif // End NCCL_OFI_CUDA_H_
50 changes: 50 additions & 0 deletions include/nccl_ofi_rocm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
 * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
 */

#ifndef NCCL_OFI_ROCM_H_
#define NCCL_OFI_ROCM_H_

/*
 * @brief Initialize the GPU runtime (ROCm/HIP variant of the generic
 *        nccl_net_ofi_gpu_init() entry point also declared by the CUDA header).
 * @return 0 on success, non-zero on error
 */
int nccl_net_ofi_gpu_init(void);

/*
 * @brief Gets the GPU device associated with the buffer
 *
 * @param data
 *        Pointer to GPU buffer.
 * @param dev_id
 *        Output: device ID owning the buffer.
 *
 * @return 0 on success
 *         -EINVAL on error
 * NOTE(review): the original comment carried two conflicting return
 * conventions (dev-id-or--1 vs 0-or--EINVAL); kept the latter — confirm
 * against the ROCm implementation.
 */
int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);

/*
 * @brief Flush outstanding GPUDirect RDMA writes (HIP counterpart of the
 *        CUDA cudaFlushGPUDirectRDMAWrites() wrapper).
 * @return 0 on success
 *         -1 on error
 */
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);

/*
 * @brief Returns the number of visible GPU devices.
 *        (Original comment said "wraps cudaGetDevice()", which reports the
 *        active device rather than a count — presumably a copy/paste slip
 *        from nccl_ofi_cuda.h; confirm against the implementation.)
 * @return device count on success, negative value on error
 */
int nccl_net_ofi_cuda_get_num_devices(void);

/*
 * @brief Returns the index of the currently active GPU device.
 *        (Original comment described a DMA-BUF attribute query with a bool
 *        result, which does not match this int-returning declaration —
 *        confirm semantics against the implementation.)
 * @return active device index on success, negative value on error
 */
int nccl_net_ofi_cuda_get_active_device_idx(void);

/* Attribute queries mirroring the CUDA interface in nccl_ofi_cuda.h
 * (DMA-BUF support and GPUDirect RDMA support, respectively). */
bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

#endif /* NCCL_OFI_ROCM_H_ */
53 changes: 53 additions & 0 deletions m4/check_pkg_rocm.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- autoconf -*-
#
# Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
# Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
#
#
# See LICENSE.txt for license information
#

AC_DEFUN([CHECK_PKG_ROCM], [
check_pkg_found="yes"
check_pkg_CPPFLAGS_save="${CPPFLAGS}"
check_pkg_LDFLAGS_save="${LDFLAGS}"
check_pkg_LIBS_save="${LIBS}"
AC_ARG_WITH([rocm],
[AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
dnl Branch on the --with-rocm argument:
dnl   unset / --with-rocm / --with-rocm=yes -> probe default paths
dnl   --with-rocm=no                        -> disabled
dnl   --with-rocm=PATH                      -> add PATH to search flags
dnl Fix: the original tested "${with-rocm}" (shell ${var-default} typo, which
dnl expands to the literal "rocm" when $with is unset) and "{with_rocm}"
dnl (missing $), so the unset/yes branch could never be selected correctly.
AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
[],
[test "${with_rocm}" = "no"],
[check_pkg_found=no],
[AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
AS_IF([test "${check_pkg_found}" = "yes"],
[AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
AS_IF([test "${check_pkg_found}" = "yes"],
[AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
dnl On failure, restore the caller's flags so a failed probe leaves no residue.
AS_IF([test "${check_pkg_found}" = "yes"],
[check_pkg_define="yes"],
[check_pkg_define="no"
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
LDFLAGS="${check_pkg_LDFLAGS_save}"
LIBS="${check_pkg_LIBS_save}"
])
dnl ROCm is only enabled when explicitly requested via --with-rocm;
dnl $1 = action-if-found, $2 = action-if-not-found.
AS_IF([test -n "${with_rocm}"],
[AS_IF([test "${check_pkg_define}" = "yes"],
[$1], [$2] )
], [$2]
)
AS_UNSET([check_pkg_found])
AS_UNSET([check_pkg_define])
AS_UNSET([check_pkg_CPPFLAGS_save])
AS_UNSET([check_pkg_LDFLAGS_save])
AS_UNSET([check_pkg_LIBS_save])
])
94 changes: 66 additions & 28 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
AM_CPPFLAGS = -I$(abs_top_srcdir)/include
AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party
AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party/nccl/$(DEVICE_INTERFACE)/include
AM_CPPFLAGS += $(CUDA_CPPFLAGS)
AM_CPPFLAGS += $(CUDA_CPPFLAGS) $(ROCM_CPPFLAGS)
AM_CPPFLAGS += -DXML_DIR=\"${pkgdatadir}/xml\"

sources = \
Expand Down Expand Up @@ -41,15 +41,21 @@ sources += platform-aws.cpp
endif

if ENABLE_NEURON
sources += nccl_ofi_interface_neuron.cpp
else
sources += \
nccl_ofi_cuda.cpp \
nccl_ofi_interface_nvidia.cpp
sources += nccl_ofi_interface_neuron.cpp
endif

if HAVE_ROCM
sources += nccl_ofi_rocm.cpp nccl_ofi_interface_nvidia.cpp
endif

if HAVE_CUDA
sources += nccl_ofi_cuda.cpp nccl_ofi_interface_nvidia.cpp
endif

# add the tuner sources into the library
# add the tuner sources into the library
if !ENABLE_NEURON
if WANT_PLATFORM_AWS
sources += \
sources += \
tuner/nccl_ofi_regions.cpp \
tuner/nccl_ofi_tuner.cpp \
tuner/nccl_ofi_model.cpp
Expand All @@ -61,30 +67,69 @@ endif
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_plugin.la
libinternal_plugin_la_SOURCES = $(sources)
libinternal_plugin_la_LDFLAGS = -static $(CUDA_LDFLAGS)
libinternal_plugin_la_LIBADD = $(CUDA_LIBS)
libinternal_plugin_la_LDFLAGS = -static $(CUDA_LDFLAGS) $(ROCM_LDFLAGS)
libinternal_plugin_la_LIBADD = $(CUDA_LIBS) $(ROCM_LIBS)

lib_LTLIBRARIES =

if ENABLE_NEURON
lib_LTLIBRARIES = libnccom-net.la
lib_LTLIBRARIES += libnccom-net.la
libnccom_net_la_SOURCES =
libnccom_net_la_LIBADD = libinternal_plugin.la
libnccom_net_la_LIBTOOLFLAGS = --tag=CXX
libnccom_net_la_LDFLAGS = -module -avoid-version
else
lib_LTLIBRARIES = libnccl-net-ofi.la
endif

# For both NCCL and RCCL, we always install lib{n,r}ccl-net-ofi.so. To use the
# default shared library, either NCCL_NET_PLUGIN=lib{n,r}ccl-net-ofi.so or
# NCCL_NET_PLUGIN=ofi must be set.
#
# To enable the OFI plugin by default, a lib{n,r}ccl-net.so library is created.
# If NCCL_NET_PLUGIN is not set, NCCL will attempt to dlopen lib{n,r}ccl-net.so,
# with dlopen() searching the default search path. This behavior is optional,
# as some situations (like the NGC containers) may have multiple network
# plugins.

if HAVE_ROCM

if ENABLE_NCCL_NET_LIBRARY
lib_LTLIBRARIES += librccl-net.la
librccl_net_la_SOURCES =
librccl_net_la_LIBADD = libinternal_plugin.la
librccl_net_la_LIBTOOLFLAGS = --tag=CXX
librccl_net_la_LDFLAGS = -module -avoid-version
endif

lib_LTLIBRARIES += librccl-net-ofi.la
librccl_net_ofi_la_SOURCES =
librccl_net_ofi_la_LIBADD = libinternal_plugin.la
librccl_net_ofi_la_LIBTOOLFLAGS = --tag=CXX
librccl_net_ofi_la_LDFLAGS = -module -avoid-version

endif


if HAVE_CUDA

if ENABLE_NCCL_NET_LIBRARY
lib_LTLIBRARIES += libnccl-net.la
libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_plugin.la
libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
libnccl_net_la_LDFLAGS = -module -avoid-version
endif

lib_LTLIBRARIES += libnccl-net-ofi.la
libnccl_net_ofi_la_SOURCES =
libnccl_net_ofi_la_LIBADD = libinternal_plugin.la
libnccl_net_ofi_la_LIBTOOLFLAGS = --tag=CXX
libnccl_net_ofi_la_LDFLAGS = -module -avoid-version

# We always install libnccl-net-ofi.so. To use the default shared library,
# either NCCL_NET_PLUGIN=libnccl-net-ofi.so or NCCL_NET_PLUGIN=ofi must be set.
#
# To enable the OFI plugin by default, a libnccl-net.so library is created. If
# NCCL_NET_PLUGIN is not set, NCCL will attempt to dlopen libnccl-net.so, with
# dlopen() searching the default search path. This behavior is optional, as
# some situations (like the NGC containers) may have multiple network plugins.

if WANT_PLATFORM_AWS
# The tuner is normally built into the net plugin shared library, supporting
# recent NCCL versions by default. For historical reasons, we build a separate
# tuner library on Nvidia platforms.
#
# Recent versions of NCCL include a tuner interface for algorithm/protocol
# selection. The tuner code lives in the net plugin, but a libnccl-ofi-tuner.so
Expand All @@ -97,14 +142,7 @@ else
#
# By bundling the tuner in the net plugin, we cause the tuner to be used by
# default on NCCL 2.21 or later.
if ENABLE_NCCL_NET_LIBRARY
lib_LTLIBRARIES += libnccl-net.la
libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_plugin.la
libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
libnccl_net_la_LDFLAGS = -module -avoid-version
endif
if WANT_PLATFORM_AWS
#
# NCCL standardized on the libnccl-tuner-<interface> format after we released a
# plugin with the tuner named libnccl-ofi-tuner.so. Create separate libraries
# for each name.
Expand Down
2 changes: 1 addition & 1 deletion src/nccl_ofi_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ ncclResult_t nccl_net_ofi_regMrDmaBuf_v6(void* comm, void* data, size_t size,
/* Validate type of buffer */
bool valid_buffer_type = false;
if (type == NCCL_PTR_HOST) valid_buffer_type = true;
#if HAVE_CUDA
#if HAVE_GPU
if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
#endif
#if HAVE_NEURON
Expand Down
4 changes: 2 additions & 2 deletions src/nccl_ofi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ DECLARE_CUDA_FUNCTION(cuMemAlloc, 3020);
DECLARE_CUDA_FUNCTION(cuMemFree, 3020);
DECLARE_CUDA_FUNCTION(cuMemcpy, 4000);

int nccl_net_ofi_cuda_init(void)
int nccl_net_ofi_gpu_init(void)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not a huge fan of renaming some, but not all, of the functions. We should figure out a plan and keep one or the other pattern.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Leaving everything as "cuda" would surely eliminate having to have tons of shims everywhere, but I also see the benefit in having a generic "gpu" layer. Do you have any preference?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'd just make them all nccl_net_ofi_gpu_*() for now, in both the cuda and rocm interfaces. This is a place we should clean up later, but not now.

{
int driverVersion = -1;
int runtimeVersion = -1;
Expand Down Expand Up @@ -110,7 +110,7 @@ int nccl_net_ofi_cuda_init(void)
return 0;
}

int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void)
{
#if HAVE_CUDA_GDRFLUSH_SUPPORT
static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
Expand Down
6 changes: 4 additions & 2 deletions src/nccl_ofi_net.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "nccl_ofi_tracepoint.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#elif HAVE_ROCM
#include "nccl_ofi_rocm.h"
#endif
#include "nccl_ofi_sendrecv.h"
#include "nccl_ofi_rdma.h"
Expand Down Expand Up @@ -166,8 +168,8 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
*/
mr_cache_alignment = std::min(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);

#if HAVE_CUDA
ret = nccl_net_ofi_cuda_init();
#if HAVE_GPU
ret = nccl_net_ofi_gpu_init();
if (ret != 0) {
NCCL_OFI_WARN("CUDA initialization failed.");
goto exit;
Expand Down
Loading