Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 36 additions & 4 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -122,21 +122,53 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
[want_cuda=no])
have_device_interface=neuron])
CHECK_PKG_CUDA([have_device_interface=cuda])


CHECK_PKG_CUDA([
AS_IF([test "${have_device_interface}" = "no" -a -z "$with_rocm"], [
have_device_interface=cuda
])
])


CHECK_PKG_ROCM([
AS_IF([test "${have_device_interface}" = "no"], [
have_device_interface=rocm
])
])


AS_IF([test "${have_device_interface}" = "no"],
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
DEVICE_INTERFACE="${have_device_interface}"
AC_SUBST([DEVICE_INTERFACE])
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm, or Neuron runtime.])])

CHECK_PKG_LTTNG()
CHECK_PKG_NVTX()

do_cuda=0
do_rocm=0
AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])
AS_IF([test "$have_device_interface" = "rocm"], [do_rocm=1; enable_tests="no"])

CHECK_PKG_HWLOC([],
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])

CHECK_PKG_VALGRIND()
CHECK_VAR_REDZONE()

AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])

AM_CONDITIONAL([ENABLE_NEURON], [test "x${have_device_interface}" = "xneuron"])
AS_IF([test ${do_rocm} = 1],
[AC_DEFINE_UNQUOTED([__HIP_PLATFORM_AMD__], [1], [Select AMD/ROCm HIP APIs])])

AS_CASE([$have_device_interface],
[neuron], [DEVICE_INTERFACE=neuron],
[*], [DEVICE_INTERFACE=cuda])
AC_SUBST([DEVICE_INTERFACE])

NCCL_OFI_PLATFORM="none"
AS_IF([test "${NCCL_OFI_PLATFORM}" = "none"], [AX_CHECK_PLATFORM_AWS()])

Expand Down
8 changes: 8 additions & 0 deletions include/nccl_ofi_config_bottom.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
*/

#ifndef NCCL_OFI_CONFIG_BOTTOM_H
Expand Down Expand Up @@ -62,4 +63,11 @@ static inline int memfd_create(const char *name, unsigned int flags)
}
#endif /* ifndef HAVE_MEMFD_CREATE */

/* Use HAVE_GPU to guard GPU-specific code instead of checking both individually */
#if HAVE_CUDA || HAVE_ROCM
#define HAVE_GPU 1
#else
#define HAVE_GPU 0
#endif

#endif
6 changes: 3 additions & 3 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#ifndef NCCL_OFI_CUDA_H_
#define NCCL_OFI_CUDA_H_

int nccl_net_ofi_cuda_init(void);
int nccl_net_ofi_gpu_init(void);

/*
* @brief Gets the CUDA device associated with the buffer
Expand All @@ -27,7 +27,7 @@ int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);
* @return 0 on success
* -1 on error
*/
int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void);
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);

/*
* @brief wraps cudaGetDevice()
Expand Down Expand Up @@ -100,4 +100,4 @@ bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
*/
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

#endif // End NCCL_OFI_H_
#endif // End NCCL_OFI_CUDA_H_
50 changes: 50 additions & 0 deletions include/nccl_ofi_rocm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
 * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
 */

#ifndef NCCL_OFI_ROCM_H_
#define NCCL_OFI_ROCM_H_

/*
 * @brief Initialize the GPU runtime (ROCm/HIP variant of the generic
 *        nccl_net_ofi_gpu_init() entry point also declared by the CUDA header).
 * @return 0 on success, non-zero on error
 */
int nccl_net_ofi_gpu_init(void);

/*
 * @brief Gets the GPU device associated with the buffer
 *
 * @param data
 *        Pointer to GPU buffer.
 * @param dev_id
 *        Output: device ID owning the buffer.
 *
 * @return 0 on success
 *         -EINVAL on error
 * NOTE(review): the original comment carried two conflicting return
 * conventions (dev-id-or--1 vs 0-or--EINVAL); kept the latter — confirm
 * against the ROCm implementation.
 */
int nccl_net_ofi_get_cuda_device_for_addr(void *data, int *dev_id);

/*
 * @brief Flush outstanding GPUDirect RDMA writes (HIP counterpart of the
 *        CUDA cudaFlushGPUDirectRDMAWrites() wrapper).
 * @return 0 on success
 *         -1 on error
 */
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void);

/*
 * @brief Returns the number of visible GPU devices.
 *        (Original comment said "wraps cudaGetDevice()", which reports the
 *        active device rather than a count — presumably a copy/paste slip
 *        from nccl_ofi_cuda.h; confirm against the implementation.)
 * @return device count on success, negative value on error
 */
int nccl_net_ofi_cuda_get_num_devices(void);

/*
 * @brief Returns the index of the currently active GPU device.
 *        (Original comment described a DMA-BUF attribute query with a bool
 *        result, which does not match this int-returning declaration —
 *        confirm semantics against the implementation.)
 * @return active device index on success, negative value on error
 */
int nccl_net_ofi_cuda_get_active_device_idx(void);

/* Attribute queries mirroring the CUDA interface in nccl_ofi_cuda.h
 * (DMA-BUF support and GPUDirect RDMA support, respectively). */
bool nccl_net_ofi_cuda_have_dma_buf_attr(void);
bool nccl_net_ofi_cuda_have_gdr_support_attr(void);

#endif /* NCCL_OFI_ROCM_H_ */
53 changes: 53 additions & 0 deletions m4/check_pkg_rocm.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- autoconf -*-
#
# Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
# Copyright (c) 2025, Hewlett Packard Enterprise Development LP.
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
#
#
# See LICENSE.txt for license information
#

AC_DEFUN([CHECK_PKG_ROCM], [
check_pkg_found="yes"
check_pkg_CPPFLAGS_save="${CPPFLAGS}"
check_pkg_LDFLAGS_save="${LDFLAGS}"
check_pkg_LIBS_save="${LIBS}"
AC_ARG_WITH([rocm],
[AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])
dnl Branch on the --with-rocm argument:
dnl   unset / --with-rocm / --with-rocm=yes -> probe default paths
dnl   --with-rocm=no                        -> disabled
dnl   --with-rocm=PATH                      -> add PATH to search flags
dnl Fix: the original tested "${with-rocm}" (shell ${var-default} typo, which
dnl expands to the literal "rocm" when $with is unset) and "{with_rocm}"
dnl (missing $), so the unset/yes branch could never be selected correctly.
AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
[],
[test "${with_rocm}" = "no"],
[check_pkg_found=no],
[AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])
AS_IF([test "${check_pkg_found}" = "yes"],
[AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
AS_IF([test "${check_pkg_found}" = "yes"],
[AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])
dnl On failure, restore the caller's flags so a failed probe leaves no residue.
AS_IF([test "${check_pkg_found}" = "yes"],
[check_pkg_define="yes"],
[check_pkg_define="no"
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
LDFLAGS="${check_pkg_LDFLAGS_save}"
LIBS="${check_pkg_LIBS_save}"
])
dnl ROCm is only enabled when explicitly requested via --with-rocm;
dnl $1 = action-if-found, $2 = action-if-not-found.
AS_IF([test -n "${with_rocm}"],
[AS_IF([test "${check_pkg_define}" = "yes"],
[$1], [$2] )
], [$2]
)
AS_UNSET([check_pkg_found])
AS_UNSET([check_pkg_define])
AS_UNSET([check_pkg_CPPFLAGS_save])
AS_UNSET([check_pkg_LDFLAGS_save])
AS_UNSET([check_pkg_LIBS_save])
])
94 changes: 66 additions & 28 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
AM_CPPFLAGS = -I$(abs_top_srcdir)/include
AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party
AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party/nccl/$(DEVICE_INTERFACE)/include
AM_CPPFLAGS += $(CUDA_CPPFLAGS)
AM_CPPFLAGS += $(CUDA_CPPFLAGS) $(ROCM_CPPFLAGS)
AM_CPPFLAGS += -DXML_DIR=\"${pkgdatadir}/xml\"

sources = \
Expand Down Expand Up @@ -41,15 +41,21 @@ sources += platform-aws.cpp
endif

if ENABLE_NEURON
sources += nccl_ofi_interface_neuron.cpp
else
sources += \
nccl_ofi_cuda.cpp \
nccl_ofi_interface_nvidia.cpp
sources += nccl_ofi_interface_neuron.cpp
endif

if HAVE_ROCM
sources += nccl_ofi_rocm.cpp nccl_ofi_interface_nvidia.cpp
endif

if HAVE_CUDA
sources += nccl_ofi_cuda.cpp nccl_ofi_interface_nvidia.cpp
endif

# add the tuner sources into the library
# add the tuner sources into the library
if !ENABLE_NEURON
if WANT_PLATFORM_AWS
sources += \
sources += \
tuner/nccl_ofi_regions.cpp \
tuner/nccl_ofi_tuner.cpp \
tuner/nccl_ofi_model.cpp
Expand All @@ -61,30 +67,69 @@ endif
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_plugin.la
libinternal_plugin_la_SOURCES = $(sources)
libinternal_plugin_la_LDFLAGS = -static $(CUDA_LDFLAGS)
libinternal_plugin_la_LIBADD = $(CUDA_LIBS)
libinternal_plugin_la_LDFLAGS = -static $(CUDA_LDFLAGS) $(ROCM_LDFLAGS)
libinternal_plugin_la_LIBADD = $(CUDA_LIBS) $(ROCM_LIBS)

lib_LTLIBRARIES =

if ENABLE_NEURON
lib_LTLIBRARIES = libnccom-net.la
lib_LTLIBRARIES += libnccom-net.la
libnccom_net_la_SOURCES =
libnccom_net_la_LIBADD = libinternal_plugin.la
libnccom_net_la_LIBTOOLFLAGS = --tag=CXX
libnccom_net_la_LDFLAGS = -module -avoid-version
else
lib_LTLIBRARIES = libnccl-net-ofi.la
endif

# For both NCCL and RCCL, we always install lib{n,r}ccl-net-ofi.so. To use the
# default shared library, either NCCL_NET_PLUGIN=lib{n,r}ccl-net-ofi.so or
# NCCL_NET_PLUGIN=ofi must be set.
#
# To enable the OFI plugin by default, a lib{n,r}ccl-net.so library is created.
# If NCCL_NET_PLUGIN is not set, NCCL will attempt to dlopen lib{n,r}ccl-net.so,
# with dlopen() searching the default search path. This behavior is optional,
# as some situations (like the NGC containers) may have multiple network
# plugins.

if HAVE_ROCM

if ENABLE_NCCL_NET_LIBRARY
lib_LTLIBRARIES += librccl-net.la
librccl_net_la_SOURCES =
librccl_net_la_LIBADD = libinternal_plugin.la
librccl_net_la_LIBTOOLFLAGS = --tag=CXX
librccl_net_la_LDFLAGS = -module -avoid-version
endif

lib_LTLIBRARIES += librccl-net-ofi.la
librccl_net_ofi_la_SOURCES =
librccl_net_ofi_la_LIBADD = libinternal_plugin.la
librccl_net_ofi_la_LIBTOOLFLAGS = --tag=CXX
librccl_net_ofi_la_LDFLAGS = -module -avoid-version

endif


if HAVE_CUDA

if ENABLE_NCCL_NET_LIBRARY
lib_LTLIBRARIES += libnccl-net.la
libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_plugin.la
libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
libnccl_net_la_LDFLAGS = -module -avoid-version
endif

lib_LTLIBRARIES += libnccl-net-ofi.la
libnccl_net_ofi_la_SOURCES =
libnccl_net_ofi_la_LIBADD = libinternal_plugin.la
libnccl_net_ofi_la_LIBTOOLFLAGS = --tag=CXX
libnccl_net_ofi_la_LDFLAGS = -module -avoid-version

# We always install libnccl-net-ofi.so. To use the default shared library,
# either NCCL_NET_PLUGIN=libnccl-net-ofi.so or NCCL_NET_PLUGIN=ofi must be set.
#
# To enable the OFI plugin by default, a libnccl-net.so library is created. If
# NCCL_NET_PLUGIN is not set, NCCL will attempt to dlopen libnccl-net.so, with
# dlopen() searching the default search path. This behavior is optional, as
# some situations (like the NGC containers) may have multiple network plugins.

if WANT_PLATFORM_AWS
# The tuner is normally built into the net plugin shared library, supporting
# recent NCCL versions by default. For historical reasons, we build a separate
# tuner library on Nvidia platforms.
#
# Recent versions of NCCL include a tuner interface for algorithm/protocol
# selection. The tuner code lives in the net plugin, but a libnccl-ofi-tuner.so
Expand All @@ -97,14 +142,7 @@ else
#
# By bundling the tuner in the net plugin, we cause the tuner to be used by
# default on NCCL 2.21 or later.
if ENABLE_NCCL_NET_LIBRARY
lib_LTLIBRARIES += libnccl-net.la
libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_plugin.la
libnccl_net_la_LIBTOOLFLAGS = --tag=CXX
libnccl_net_la_LDFLAGS = -module -avoid-version
endif
if WANT_PLATFORM_AWS
#
# NCCL standardized on the libnccl-tuner-<interface> format after we released a
# plugin with the tuner named libnccl-ofi-tuner.so. Create separate libraries
# for each name.
Expand Down
2 changes: 1 addition & 1 deletion src/nccl_ofi_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ ncclResult_t nccl_net_ofi_regMrDmaBuf_v6(void* comm, void* data, size_t size,
/* Validate type of buffer */
bool valid_buffer_type = false;
if (type == NCCL_PTR_HOST) valid_buffer_type = true;
#if HAVE_CUDA
#if HAVE_GPU
if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
#endif
#if HAVE_NEURON
Expand Down
4 changes: 2 additions & 2 deletions src/nccl_ofi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ DECLARE_CUDA_FUNCTION(cuMemAlloc, 3020);
DECLARE_CUDA_FUNCTION(cuMemFree, 3020);
DECLARE_CUDA_FUNCTION(cuMemcpy, 4000);

int nccl_net_ofi_cuda_init(void)
int nccl_net_ofi_gpu_init(void)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not a huge fan of renaming some, but not all, of the functions. We should figure out a plan and keep one or the other pattern.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Leaving everything as "cuda" would surely eliminate having to have tons of shims everywhere, but I also see the benefit in having a generic "gpu" layer. Do you have any preference?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'd just make them all nccl_net_ofi_gpu_*() for now, in both the cuda and rocm interfaces. This is a place we should clean up later, but not now.

{
int driverVersion = -1;
int runtimeVersion = -1;
Expand Down Expand Up @@ -110,7 +110,7 @@ int nccl_net_ofi_cuda_init(void)
return 0;
}

int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
int nccl_net_ofi_gpu_flush_gpudirect_rdma_writes(void)
{
#if HAVE_CUDA_GDRFLUSH_SUPPORT
static_assert(CUDA_VERSION >= 11030, "Requires cudart>=11.3");
Expand Down
6 changes: 4 additions & 2 deletions src/nccl_ofi_net.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "nccl_ofi_tracepoint.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#elif HAVE_ROCM
#include "nccl_ofi_rocm.h"
#endif
#include "nccl_ofi_sendrecv.h"
#include "nccl_ofi_rdma.h"
Expand Down Expand Up @@ -166,8 +168,8 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
*/
mr_cache_alignment = std::min(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);

#if HAVE_CUDA
ret = nccl_net_ofi_cuda_init();
#if HAVE_GPU
ret = nccl_net_ofi_gpu_init();
if (ret != 0) {
NCCL_OFI_WARN("CUDA initialization failed.");
goto exit;
Expand Down
Loading