Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion fabtests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ bin_PROGRAMS = \
multinode/fi_multinode_coll \
component/sock_test \
regression/sighandler_test \
common/check_hmem
common/check_hmem \
common/check_cuda_dmabuf

if HAVE_ZE_DEVEL
if HAVE_VERBS_DEVEL
Expand Down Expand Up @@ -619,6 +620,14 @@ common_check_hmem_LDADD = libfabtests.la
# Per-target compiler flags for common/check_hmem.  The variable prefix must
# be the canonical program name (common_check_hmem) for automake to apply it.
common_check_hmem_CFLAGS = \
	$(AM_CFLAGS)

# common/check_cuda_dmabuf: built from a single source file and linked
# against libfabtests.la with the default fabtests compiler flags.
common_check_cuda_dmabuf_SOURCES = common/check_cuda_dmabuf.c
common_check_cuda_dmabuf_LDADD = libfabtests.la
common_check_cuda_dmabuf_CFLAGS = $(AM_CFLAGS)

real_man_pages = \
man/man7/fabtests.7

Expand Down
67 changes: 67 additions & 0 deletions fabtests/common/check_cuda_dmabuf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2024, Amazon.com, Inc. All rights reserved.
*
* This software is available to you under the BSD license
* below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
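* This test exits with the cuda_memory_support_e value detected by
* ft_cuda_init() (dma-buf and/or P2P support, based on aws-ofi-nccl
* logic), or with CUDA_MEMORY_SUPPORT__UNKNOWN if ft_cuda_init() fails.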
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <string.h>
#include <shared.h>
#include <hmem.h>

/*
 * Thin wrapper around ft_cuda_memory_support().
 *
 * Returns whatever cuda_memory_support_e value that call reports; no
 * additional checks are performed here.
 */
static cuda_memory_support_e dmabuf_viable_and_supported(void)
{
	return ft_cuda_memory_support();
}

/*
 * Entry point: bring up the CUDA side of fabtests, query the detected
 * memory transport support and report it as the process exit status.
 *
 * argc and argv are accepted but not examined.
 *
 * Returns the cuda_memory_support_e value obtained from
 * dmabuf_viable_and_supported(), or CUDA_MEMORY_SUPPORT__UNKNOWN when
 * ft_cuda_init() fails.
 */
int main(int argc, char **argv)
{
	cuda_memory_support_e support;
	int rc;

	/* Give ft_cuda_init() a sane default CUDA device: device 0 */
	opts = INIT_OPTS;
	opts.device = 0;

	/* Only the CUDA side is initialized; ft_init_fabric() is avoided */
	rc = ft_cuda_init();
	if (rc != FI_SUCCESS) {
		FT_ERR("ft_cuda_init failed: %d", rc);
		return CUDA_MEMORY_SUPPORT__UNKNOWN;
	}

	support = dmabuf_viable_and_supported();
	FT_INFO("dmabuf: ft_cuda_memory_support() -> %d", support);

	ft_cuda_cleanup();

	/* The enum value itself is the exit status */
	return support;
}
178 changes: 151 additions & 27 deletions fabtests/common/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ struct cuda_ops {
CUresult (*cuDeviceGetAttribute)(int* pi,
CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetName)(char* name, int len, CUdevice dev);
CUresult (*cuDriverGetVersion)(int* driverVersion);
CUresult (*cuMemGetAddressRange)( CUdeviceptr* pbase,
size_t* psize, CUdeviceptr dptr);
};
Expand All @@ -73,6 +75,8 @@ static struct cuda_ops cuda_ops;
/* Library handle assigned from dlopen("libcudart.so") in ft_cuda_init() */
static void *cudart_handle;
/* Library handle passed to dlsym() when resolving the cuda_ops entry points */
static void *cuda_handle;
/* true when device 0 reported CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED == 1 */
static bool dmabuf_supported;
/*
 * true when device 0 reported CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED == 1
 * and its compute capability major version is below 10
 */
static bool p2p_supported;
/*
 * Result of ft_cuda_detect_memory_support(), handed out by
 * ft_cuda_memory_support(); stays NOT_INITIALIZED until detection has run
 */
static cuda_memory_support_e cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;

/**
* Since function names can get redefined in cuda.h/cuda_runtime.h files,
Expand Down Expand Up @@ -113,43 +117,134 @@ static int ft_cuda_pointer_set_attribute(void *buf)
}

/**
* @brief detect dmabuf support in the current platform
* This checks the dmabuf support in the current platform
* by querying the property of cuda device 0
* @brief Detect CUDA memory transport support (dma-buf vs P2P)
*
* @return FI_SUCCESS if dmabuf support check is successful
* -FI_EIO upon CUDA API error
* This routine queries device 0 for:
* - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED
* - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
* - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
*
* Logic is derived from CUDA 13.0 release notes and Blackwell compatibility guide:
* - https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
* - https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
* - https://developer.nvidia.com/blog/cuda-toolkit-12-8-delivers-nvidia-blackwell-support/
*
* NVIDIA deprecated GPUDirect RDMA (nv-p2p APIs) starting with Blackwell
* (compute capability >= 10). Applications must migrate to dma-buf.
*
* Truth table for effective support (device memory only):
*
* DMA_BUF GPU_DIRECT_RDMA Result
* ------- ---------------- ------------------------
* 0 0 UNKNOWN
* 0 1 P2P_ONLY
* 1 0 DMA_BUF_ONLY
* 1 1 DMA_P2P_BOTH
*
* Note:
* - CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED is orthogonal and
* indicates whether cudaHostAlloc() memory can be exported as dma-buf.
* - On compute capability >= 10 (Blackwell), we force GPU_DIRECT_RDMA=0
* regardless of attribute value to align with the deprecation notice.
*
* @return FI_SUCCESS on success
* -FI_EIO on CUDA API error
* Sets global flags dmabuf_supported, p2p_supported, and cuda_memory_support.
*/
static int ft_cuda_hmem_detect_dmabuf_support(void)
static int ft_cuda_detect_memory_support(void)
{
dmabuf_supported = false;
#if HAVE_CUDA_DMABUF
CUresult cuda_ret;
CUdevice dev;
int is_supported = 0;

cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet");
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&is_supported,
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute");
return -FI_EIO;
}
CUresult cuda_ret;
CUdevice dev;
int cc_major = 0, cc_minor = 0;
int dma_buf_attr = 0;
int p2p_attr = 0;
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;


FT_INFO("ft_cuda_detect_memory_support() called");

cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet");
cuda_memory_support = CUDA_MEMORY_SUPPORT__UNKNOWN;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MAJOR)");
cuda_memory_support = CUDA_MEMORY_SUPPORT__UNKNOWN;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MINOR)");
cuda_memory_support = CUDA_MEMORY_SUPPORT__UNKNOWN;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&dma_buf_attr,
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(DMA_BUF_SUPPORTED)");
cuda_memory_support = CUDA_MEMORY_SUPPORT__UNKNOWN;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&p2p_attr,
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(GPU_DIRECT_RDMA_SUPPORTED)");
cuda_memory_support = CUDA_MEMORY_SUPPORT__UNKNOWN;
return -FI_EIO;
}

dmabuf_supported = (dma_buf_attr == 1);

if (cc_major >= 10) {
// Blackwell or newer: nv-p2p deprecated
FT_INFO("Compute capability %d.%d: forcing p2p_supported=false due to Blackwell deprecation", cc_major, cc_minor);
p2p_supported = false;
} else {
p2p_supported = (p2p_attr == 1);
}

FT_INFO("Compute capability %d.%d", cc_major, cc_minor);
FT_INFO("dmabuf_supported=%s", dmabuf_supported ? "true" : "false");
FT_INFO("GPU_DIRECT_RDMA_SUPPORTED raw=%d -> p2p_supported=%s",
p2p_attr, p2p_supported ? "true" : "false");

// Final truth table
if (!p2p_supported && !dmabuf_supported) {
cuda_memory_support = CUDA_MEMORY_SUPPORT__UNKNOWN;
} else if (p2p_supported && dmabuf_supported) {
cuda_memory_support = CUDA_MEMORY_SUPPORT__DMA_P2P_BOTH;
} else if (dmabuf_supported) {
cuda_memory_support = CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY;
} else {
cuda_memory_support = CUDA_MEMORY_SUPPORT__P2P_ONLY;
}

FT_INFO("cuda_memory_support=%d", cuda_memory_support);
return FI_SUCCESS;

dmabuf_supported = (is_supported == 1);
#else
FT_INFO("HAVE_CUDA_DMABUF not enabled, returning CUDA_MEMORY_SUPPORT__NOT_INITIALIZED");
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;
return FI_SUCCESS;
#endif
return FI_SUCCESS;
}


int ft_cuda_init(void)
{
cudaError_t cuda_ret;
int ret;
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;

cudart_handle = dlopen("libcudart.so", RTLD_NOW);
if (!cudart_handle) {
Expand Down Expand Up @@ -261,6 +356,20 @@ int ft_cuda_init(void)
goto err_dlclose_cuda;
}

cuda_ops.cuDeviceGetName = dlsym(cuda_handle,
STRINGIFY(cuDeviceGetName));
if (!cuda_ops.cuDeviceGetName) {
FT_ERR("Failed to find cuDeviceGetName\n");
goto err_dlclose_cuda;
}

cuda_ops.cuDriverGetVersion = dlsym(cuda_handle,
STRINGIFY(cuDriverGetVersion));
if (!cuda_ops.cuDriverGetVersion) {
FT_ERR("Failed to find cuDriverGetVersion\n");
goto err_dlclose_cuda;
}

cuda_ops.cuMemGetAddressRange = dlsym(cuda_handle,
STRINGIFY(cuMemGetAddressRange));
if (!cuda_ops.cuMemGetAddressRange) {
Expand All @@ -274,9 +383,11 @@ int ft_cuda_init(void)
goto err_dlclose_cuda;
}

ret = ft_cuda_hmem_detect_dmabuf_support();
if (ret != FI_SUCCESS)
ret = ft_cuda_detect_memory_support();
if (ret != FI_SUCCESS) {
goto err_dlclose_cuda;
}


return FI_SUCCESS;

Expand Down Expand Up @@ -495,6 +606,14 @@ int ft_cuda_put_dmabuf_fd(int fd)
#endif /* HAVE_CUDA_DMABUF */
}

/**
 * @brief Report the CUDA memory transport support cached in this file
 *
 * The value is produced by ft_cuda_detect_memory_support(), which
 * ft_cuda_init() runs. It is still CUDA_MEMORY_SUPPORT__NOT_INITIALIZED when
 * ft_cuda_init() has not run, or when the build has HAVE_CUDA_DMABUF
 * disabled (detection then leaves the value untouched); an informational
 * message is logged in that case.
 *
 * @return the current cuda_memory_support value
 */
cuda_memory_support_e ft_cuda_memory_support(void)
{
	if (cuda_memory_support == CUDA_MEMORY_SUPPORT__NOT_INITIALIZED) {
		FT_INFO("CUDA memory support not detected: ft_cuda_init() not called yet or HAVE_CUDA_DMABUF not enabled");
	}
	return cuda_memory_support;
}

#else

int ft_cuda_init(void)
Expand Down Expand Up @@ -554,4 +673,9 @@ int ft_cuda_put_dmabuf_fd(int fd)
{
return -FI_ENOSYS;
}

/*
 * Stub for the #else branch of HAVE_CUDA_RUNTIME_H: nothing is queried,
 * so the answer is always CUDA_MEMORY_SUPPORT__UNKNOWN.
 */
cuda_memory_support_e ft_cuda_memory_support(void)
{
	const cuda_memory_support_e no_cuda = CUDA_MEMORY_SUPPORT__UNKNOWN;

	return no_cuda;
}
#endif /* HAVE_CUDA_RUNTIME_H */
9 changes: 9 additions & 0 deletions fabtests/include/hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

/*
 * CUDA device-memory transport support as reported by
 * ft_cuda_memory_support().
 *
 * The numeric values are part of the contract (check_cuda_dmabuf uses the
 * value as its exit status), so they are spelled out explicitly.
 */
typedef enum {
	/* detection has not produced a result */
	CUDA_MEMORY_SUPPORT__NOT_INITIALIZED	= -1,
	/* neither transport reported, or detection failed */
	CUDA_MEMORY_SUPPORT__UNKNOWN		= 0,
	/* dma-buf supported, P2P not */
	CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY	= 1,
	/* P2P supported, dma-buf not */
	CUDA_MEMORY_SUPPORT__P2P_ONLY		= 2,
	/* both supported: numerically DMA_BUF_ONLY | P2P_ONLY */
	CUDA_MEMORY_SUPPORT__DMA_P2P_BOTH	= CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY |
						  CUDA_MEMORY_SUPPORT__P2P_ONLY,
} cuda_memory_support_e;

#if HAVE_ZE
#include <level_zero/ze_api.h>
extern struct libze_ops {
Expand Down Expand Up @@ -185,6 +193,7 @@ int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src,
int ft_cuda_get_dmabuf_fd(void *buf, size_t len,
int *fd, uint64_t *offset);
int ft_cuda_put_dmabuf_fd(int fd);
cuda_memory_support_e ft_cuda_memory_support(void);

int ft_rocr_init(void);
int ft_rocr_cleanup(void);
Expand Down
5 changes: 3 additions & 2 deletions fabtests/include/shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,9 @@ static inline int ft_use_size(int index, int enable_flags)
/*
 * Print one log line to stderr in the form
 *   [level] <sec>.<usec> fabtests:<file>:<line>: <message>
 *
 * level - string tag printed between the brackets
 * fmt   - printf-style format string literal (it is concatenated with the
 *         prefix and a trailing newline), followed by its arguments
 *
 * The timestamp comes from ft_gettime_ns() and is printed as whole seconds
 * plus microseconds.  It is converted to long long and printed with %lld so
 * the conversion specifiers match the argument types on every ABI.
 * errno is saved before and restored after the call, so logging does not
 * disturb the caller's errno.
 */
#define FT_LOG(level, fmt, ...) \
	do { \
		int saved_errno = errno; \
		long long ft_log_ns_ = (long long) ft_gettime_ns(); \
		fprintf(stderr, "[%s] %lld.%06lld fabtests:%s:%d: " fmt "\n", \
			level, ft_log_ns_ / 1000000000LL, \
			(ft_log_ns_ % 1000000000LL) / 1000LL, \
			__FILE__, __LINE__, ##__VA_ARGS__); \
		errno = saved_errno; \
	} while (0)

Expand Down
Loading