Skip to content

Commit 1660ccb

Browse files
thomas-huberbwbarrett
authored andcommitted
Add ROCm support to plugin
This work is an attempt to rebase #461 on the latest master branch. I've tested functionality with the CXI provider on Slingshot 11. Signed-off-by: Thomas Huber <[email protected]> Signed-off-by: Ryan Hankins <[email protected]>
1 parent 4decd69 commit 1660ccb

15 files changed

+321
-38
lines changed

configure.ac

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,21 +122,53 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
122122
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
123123
[want_cuda=no])
124124
have_device_interface=neuron])
125-
CHECK_PKG_CUDA([have_device_interface=cuda])
125+
126+
127+
CHECK_PKG_CUDA([
128+
AS_IF([test "${have_device_interface}" = "no" -a -z "$with_rocm"], [
129+
have_device_interface=cuda
130+
])
131+
])
132+
133+
134+
CHECK_PKG_ROCM([
135+
AS_IF([test "${have_device_interface}" = "no"], [
136+
have_device_interface=rocm
137+
])
138+
])
139+
140+
126141
AS_IF([test "${have_device_interface}" = "no"],
127-
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
128-
DEVICE_INTERFACE="${have_device_interface}"
129-
AC_SUBST([DEVICE_INTERFACE])
142+
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm, or Neuron runtime.])])
130143

131144
CHECK_PKG_LTTNG()
132145
CHECK_PKG_NVTX()
133146

147+
do_cuda=0
148+
do_rocm=0
149+
AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])
150+
AS_IF([test "$have_device_interface" = "rocm"], [do_rocm=1; enable_tests="no"])
151+
134152
CHECK_PKG_HWLOC([],
135153
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
154+
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
155+
AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])
136156

137157
CHECK_PKG_VALGRIND()
138158
CHECK_VAR_REDZONE()
139159

160+
AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
161+
AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
162+
163+
AM_CONDITIONAL([ENABLE_NEURON], [test "x${have_device_interface}" = "xneuron"])
164+
AS_IF([test ${do_rocm} = 1],
165+
[AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs])])
166+
167+
AS_CASE([$have_device_interface],
168+
[neuron], [DEVICE_INTERFACE=neuron],
169+
[*], [DEVICE_INTERFACE=cuda])
170+
AC_SUBST([DEVICE_INTERFACE])
171+
140172
NCCL_OFI_PLATFORM="none"
141173
AS_IF([test "${NCCL_OFI_PLATFORM}" = "none"], [AX_CHECK_PLATFORM_AWS()])
142174

include/nccl_ofi_config_bottom.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Copyright (c) 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
3+
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
34
*/
45

56
#ifndef NCCL_OFI_CONFIG_BOTTOM_H
@@ -62,4 +63,11 @@ static inline int memfd_create(const char *name, unsigned int flags)
6263
}
6364
#endif /* ifndef HAVE_MEMFD_CREATE */
6465

66+
/* Use HAVE_GPU to guard GPU-specific code instead of checking both individually */
67+
#if HAVE_CUDA || HAVE_ROCM
68+
#define HAVE_GPU 1
69+
#else
70+
#define HAVE_GPU 0
71+
#endif
72+
6573
#endif

include/nccl_ofi_cuda.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#ifndef NCCL_OFI_CUDA_H_
77
#define NCCL_OFI_CUDA_H_
88

9-
int nccl_net_ofi_cuda_init(void);
9+
int nccl_net_ofi_gpu_init(void);
1010

1111
/*
1212
* @brief Gets the CUDA device associated with the buffer
@@ -27,7 +27,7 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
2727
* @return 0 on success
2828
* -1 on error
2929
*/
30-
int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void);
30+
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);
3131

3232
/*
3333
* @brief wraps cudaGetDevice()
@@ -100,4 +100,4 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
100100
*/
101101
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);
102102

103-
#endif // End NCCL_OFI_H_
103+
#endif // End NCCL_OFI_CUDA_H_

include/nccl_ofi_rocm.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
3+
* Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
4+
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
5+
*/
6+
7+
#ifndef NCCL_OFI_ROCM_H_
8+
#define NCCL_OFI_ROCM_H_
9+
10+
/* Generic GPU init (ROCm variant) */
11+
int nccl_net_ofi_gpu_init(void);
12+
13+
/*
14+
* @brief Gets the device associated with the buffer
15+
*
16+
* @param data
17+
* Pointer to GPU buffer.
18+
*
19+
* @return Valid GPU device ID on success
20+
* -1 on error
21+
* @return 0 on success
22+
* -EINVAL on error
23+
*/
24+
int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
25+
26+
/*
27+
* @brief wraps cudaFlushGPUDirectRDMAWrites() with default args.
28+
* @return 0 on success
29+
* -1 on error
30+
*/
31+
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);
32+
33+
/*
34+
* @brief wraps cudaGetDevice()
35+
* @return 0 on success
36+
* -1 on error
37+
*/
38+
int nccl_net_ofi_cuda_get_num_devices(void);
39+
40+
/*
41+
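* @brief query CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED (NOTE(review): this comment appears to document nccl_net_ofi_cuda_have_dma_buf_attr(), which returns bool, rather than the int-returning declaration that immediately follows it - confirm and move)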
42+
* @return true if attr is fetched successfully and true.
43+
* false otherwise.
44+
*/
45+
int nccl_net_ofi_cuda_get_active_device_idx(void);
46+
47+
bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
48+
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);
49+
50+
#endif /* NCCL_OFI_ROCM_H_ */

m4/check_pkg_rocm.m4

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -*- autoconf -*-
2+
#
3+
# Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
4+
# Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
5+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
6+
#
7+
#
8+
# See LICENSE.txt for license information
9+
#
10+
11+
AC_DEFUN([CHECK_PKG_ROCM], [
12+
check_pkg_found="yes"
13+
check_pkg_CPPFLAGS_save="${CPPFLAGS}"
14+
check_pkg_LDFLAGS_save="${LDFLAGS}"
15+
check_pkg_LIBS_save="${LIBS}"
16+
17+
AC_ARG_WITH([rocm],
18+
[AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
19+
20+
# No ROCm path given (option unset or bare --with-rocm): use default search paths.
AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
21+
[],
22+
[test "${with_rocm}" = "no"],
23+
[check_pkg_found=no],
24+
[AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
25+
CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
26+
LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
27+
28+
AS_IF([test "${check_pkg_found}" = "yes"],
29+
[AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
30+
31+
AS_IF([test "${check_pkg_found}" = "yes"],
32+
[AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
33+
34+
AS_IF([test "${check_pkg_found}" = "yes"],
35+
[check_pkg_define="yes"],
36+
[check_pkg_define="no"
37+
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
38+
LDFLAGS="${check_pkg_LDFLAGS_save}"
39+
LIBS="${check_pkg_LIBS_save}"
40+
])
41+
42+
AS_IF([test -n "${with_rocm}"],
43+
[AS_IF([test "${check_pkg_define}" = "yes"],
44+
[$1], [$2] )
45+
], [$2]
46+
)
47+
48+
AS_UNSET([check_pkg_found])
49+
AS_UNSET([check_pkg_define])
50+
AS_UNSET([check_pkg_CPPFLAGS_save])
51+
AS_UNSET([check_pkg_LDFLAGS_save])
52+
AS_UNSET([check_pkg_LIBS_save])
53+
])

src/Makefile.am

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
AM_CPPFLAGS = -I$(abs_top_srcdir)/include
88
AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party
99
AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party/nccl/$(DEVICE_INTERFACE)/include
10-
AM_CPPFLAGS += $(CUDA_CPPFLAGS)
10+
AM_CPPFLAGS += $(CUDA_CPPFLAGS) $(ROCM_CPPFLAGS)
1111
AM_CPPFLAGS += -DXML_DIR=\"${pkgdatadir}/xml\"
1212

1313
sources = \
@@ -41,15 +41,21 @@ sources += platform-aws.cpp
4141
endif
4242

4343
if ENABLE_NEURON
44-
sources += nccl_ofi_interface_neuron.cpp
45-
else
46-
sources += \
47-
nccl_ofi_cuda.cpp \
48-
nccl_ofi_interface_nvidia.cpp
44+
sources += nccl_ofi_interface_neuron.cpp
45+
endif
46+
47+
if HAVE_ROCM
48+
sources += nccl_ofi_rocm.cpp nccl_ofi_interface_nvidia.cpp
49+
endif
50+
51+
if HAVE_CUDA
52+
sources += nccl_ofi_cuda.cpp nccl_ofi_interface_nvidia.cpp
53+
endif
4954

50-
# add the tuner sources into the library
55+
# add the tuner sources into the library
56+
if !ENABLE_NEURON
5157
if WANT_PLATFORM_AWS
52-
sources += \
58+
sources += \
5359
tuner/nccl_ofi_regions.cpp \
5460
tuner/nccl_ofi_tuner.cpp \
5561
tuner/nccl_ofi_model.cpp
@@ -61,9 +67,8 @@ endif
6167
# us writing dlopen() handlers for simple unit tests.
6268
noinst_LTLIBRARIES = libinternal_plugin.la
6369
libinternal_plugin_la_SOURCES = $(sources)
64-
libinternal_plugin_la_LDFLAGS = -static $(CUDA_LDFLAGS)
65-
libinternal_plugin_la_LIBADD = $(CUDA_LIBS)
66-
70+
libinternal_plugin_la_LDFLAGS = $(CUDA_LDFLAGS) $(ROCM_LDFLAGS)
71+
libinternal_plugin_la_LIBADD = $(CUDA_LIBS) $(ROCM_LIBS)
6772

6873
if ENABLE_NEURON
6974
lib_LTLIBRARIES = libnccom-net.la
@@ -74,9 +79,9 @@ if ENABLE_NEURON
7479
else
7580
lib_LTLIBRARIES = libnccl-net-ofi.la
7681
libnccl_net_ofi_la_SOURCES =
77-
libnccl_net_ofi_la_LIBADD = libinternal_plugin.la
82+
libnccl_net_ofi_la_LIBADD = libinternal_plugin.la $(CUDA_LIBS) $(ROCM_LIBS)
7883
libnccl_net_ofi_la_LIBTOOLFLAGS = --tag=CXX
79-
libnccl_net_ofi_la_LDFLAGS = -module -avoid-version
84+
libnccl_net_ofi_la_LDFLAGS = -module -avoid-version -Wl,--no-undefined
8085

8186
# We always install libnccl-net-ofi.so. To use the default shared library,
8287
# either NCCL_NET_PLUGIN=libnccl-net-ofi.so or NCCL_NET_PLUGIN=ofi must be set.
@@ -102,7 +107,7 @@ if ENABLE_NCCL_NET_LIBRARY
102107
libnccl_net_la_SOURCES =
103108
libnccl_net_la_LIBADD = libinternal_plugin.la
104109
libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
105-
libnccl_net_la_LDFLAGS = -module -avoid-version
110+
libnccl_net_la_LDFLAGS = -module -avoid-version -Wl,--no-undefined
106111
endif
107112
if WANT_PLATFORM_AWS
108113
# NCCL standardized on the libnccl-tuner-<interface> format after we released a
@@ -120,4 +125,14 @@ if WANT_PLATFORM_AWS
120125
libnccl_tuner_ofi_la_LDFLAGS = -module -avoid-version
121126
endif
122127

128+
if HAVE_ROCM
129+
install-exec-hook::
130+
@$(MKDIR_P) $(DESTDIR)$(libdir)
131+
@$(RM) -f $(DESTDIR)$(libdir)/librccl-net.so
132+
@$(LN_S) libnccl-net-ofi.so $(DESTDIR)$(libdir)/librccl-net.so
133+
134+
uninstall-local::
135+
@$(RM) -f $(DESTDIR)$(libdir)/librccl-net.so
136+
endif
137+
123138
endif

src/nccl_ofi_api.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ ncclResult_t nccl_net_ofi_regMrDmaBuf_v6(void* comm, void* data, size_t size,
535535
/* Validate type of buffer */
536536
bool valid_buffer_type = false;
537537
if (type == NCCL_PTR_HOST) valid_buffer_type = true;
538-
#if HAVE_CUDA
538+
#if HAVE_GPU
539539
if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
540540
#endif
541541
#if HAVE_NEURON

src/nccl_ofi_cuda.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ DECLARE_CUDA_FUNCTION(cuMemAlloc, 3020);
6666
DECLARE_CUDA_FUNCTION(cuMemFree, 3020);
6767
DECLARE_CUDA_FUNCTION(cuMemcpy, 4000);
6868

69-
int nccl_net_ofi_cuda_init(void)
69+
int nccl_net_ofi_gpu_init(void)
7070
{
7171
int driverVersion = -1;
7272
int runtimeVersion = -1;
@@ -110,7 +110,7 @@ int nccl_net_ofi_cuda_init(void)
110110
return 0;
111111
}
112112

113-
int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
113+
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void)
114114
{
115115
#if HAVE_CUDA_GDRFLUSH_SUPPORT
116116
static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");

src/nccl_ofi_net.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include "nccl_ofi_tracepoint.h"
2323
#if HAVE_CUDA
2424
#include "nccl_ofi_cuda.h"
25+
#elif HAVE_ROCM
26+
#include "nccl_ofi_rocm.h"
2527
#endif
2628
#include "nccl_ofi_sendrecv.h"
2729
#include "nccl_ofi_rdma.h"
@@ -166,8 +168,8 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
166168
*/
167169
mr_cache_alignment = std::min(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);
168170

169-
#if HAVE_CUDA
170-
ret = nccl_net_ofi_cuda_init();
171+
#if HAVE_GPU
172+
ret = nccl_net_ofi_gpu_init();
171173
if (ret != 0) {
172174
NCCL_OFI_WARN("CUDA initialization failed.");
173175
goto exit;

src/nccl_ofi_ofiutils.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
#include "nccl_ofi_math.h"
2121
#include "nccl_ofi_ofiutils.h"
2222
#include "nccl_ofi_platform.h"
23+
#if HAVE_ROCM
24+
#include "nccl_ofi_rocm.h"
25+
#endif
2326

2427
#define EFA_PROVIDER_NAME "efa"
2528
#define IS_EFA_PROVIDER(NAME) (strcmp((NAME), EFA_PROVIDER_NAME)==0)
@@ -383,6 +386,11 @@ ofi_ep_result nccl_ofi_ofiutils_ep_create(struct fi_info *info, ofi_domain_ptr &
383386
* to supported.
384387
*/
385388
support_gdr = GDR_SUPPORTED;
389+
#elif HAVE_ROCM
390+
/*
391+
* ROCm does not require FI_OPT_CUDA_API_PERMITTED.
392+
*/
393+
support_gdr = GDR_SUPPORTED;
386394
#else
387395
NCCL_OFI_WARN("Using Libfabric 1.18 API with GPUDirect RDMA support, and FI_OPT_CUDA_API_PERMITTED is not declared.");
388396
ret = -EOPNOTSUPP;

0 commit comments

Comments
 (0)