Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion fabtests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ bin_PROGRAMS = \
multinode/fi_multinode_coll \
component/sock_test \
regression/sighandler_test \
common/check_hmem
common/check_hmem \
common/check_cuda_dmabuf

if HAVE_ZE_DEVEL
if HAVE_VERBS_DEVEL
Expand Down Expand Up @@ -619,6 +620,14 @@ common_check_hmem_LDADD = libfabtests.la
# Per-target compiler flags for common/check_hmem. The variable prefix must be
# the canonicalized program name (common_check_hmem) for automake to apply it.
common_check_hmem_CFLAGS = \
	$(AM_CFLAGS)

# common/check_cuda_dmabuf: single-source helper program built from
# common/check_cuda_dmabuf.c.
common_check_cuda_dmabuf_SOURCES = \
common/check_cuda_dmabuf.c

# Link against the shared fabtests convenience library.
common_check_cuda_dmabuf_LDADD = libfabtests.la

# Per-target compiler flags; currently just the project-wide AM_CFLAGS.
common_check_cuda_dmabuf_CFLAGS = \
$(AM_CFLAGS)

real_man_pages = \
man/man7/fabtests.7

Expand Down
33 changes: 33 additions & 0 deletions fabtests/common/check_cuda_dmabuf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <string.h>
#include <shared.h>
#include <hmem.h>

int main(int argc, char **argv)
{
int ret;

/* Make sure default CUDA device is sane for ft_cuda_init() */
opts = INIT_OPTS;
opts.device = 0; /* cuda device 0 */

/* Initialize CUDA side only; avoid ft_init_fabric() */
ret = ft_cuda_init();
if (ret != FI_SUCCESS) {
FT_ERR("ft_cuda_init failed: %d", ret);
return FT_CUDA_NOT_SUPPORTED;
}

enum ft_cuda_memory_support cuda_memory_support = ft_cuda_memory_support();
FT_INFO("dmabuf: ft_cuda_memory_support() -> %d", cuda_memory_support);

ft_cuda_cleanup();
return cuda_memory_support;
}
193 changes: 166 additions & 27 deletions fabtests/common/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ struct cuda_ops {
CUresult (*cuDeviceGetAttribute)(int* pi,
CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetName)(char* name, int len, CUdevice dev);
CUresult (*cuDriverGetVersion)(int* driverVersion);
CUresult (*cuMemGetAddressRange)( CUdeviceptr* pbase,
size_t* psize, CUdeviceptr dptr);
};
Expand All @@ -73,6 +75,24 @@ static struct cuda_ops cuda_ops;
static void *cudart_handle;
static void *cuda_handle;
static bool dmabuf_supported;
static bool gdr_supported;
static enum ft_cuda_memory_support cuda_memory_support = FT_CUDA_NOT_INITIALIZED;
/**
 * @brief Map an ft_cuda_memory_support value to a printable name
 *
 * @param support value to translate
 * @return pointer to a static string; "INVALID" for any value that is not
 *         one of the enumerators handled below
 */
static const char *get_cuda_memory_support_str(enum ft_cuda_memory_support support)
{
	const char *label = "INVALID";

	switch (support) {
	case FT_CUDA_NOT_INITIALIZED:
		label = "NOT_INITIALIZED";
		break;
	case FT_CUDA_NOT_SUPPORTED:
		label = "NOT_SUPPORTED";
		break;
	case FT_CUDA_DMA_BUF_ONLY:
		label = "DMA_BUF_ONLY";
		break;
	case FT_CUDA_GDR_ONLY:
		label = "GDR_ONLY";
		break;
	case FT_CUDA_DMABUF_GDR_BOTH:
		label = "DMABUF_GDR_BOTH";
		break;
	default:
		break;
	}

	return label;
}

/**
* Since function names can get redefined in cuda.h/cuda_runtime.h files,
Expand Down Expand Up @@ -113,43 +133,133 @@ static int ft_cuda_pointer_set_attribute(void *buf)
}

/**
* @brief detect dmabuf support in the current platform
* This checks the dmabuf support in the current platform
* by querying the property of cuda device 0
* @brief Detect CUDA memory transport support (dma-buf vs P2P)
*
* This routine queries device 0 for:
* - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED
* - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
* - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
*
* Logic is derived from CUDA 13.0 release notes and Blackwell compatibility guide:
* - https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
* - https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
* - https://developer.nvidia.com/blog/cuda-toolkit-12-8-delivers-nvidia-blackwell-support/
*
* NVIDIA deprecated GPUDirect RDMA (nv-p2p APIs) starting with Blackwell
* (compute capability >= 10). Applications must migrate to dma-buf.
*
* @return FI_SUCCESS if dmabuf support check is successful
* -FI_EIO upon CUDA API error
* Truth table for effective support (device memory only):
*
* DMA_BUF GPU_DIRECT_RDMA Result
* ------- ---------------- ------------------------
* 0 0 NOT_SUPPORTED
* 0 1 GDR_ONLY
* 1 0 DMA_BUF_ONLY
* 1 1 DMABUF_GDR_BOTH
*
* Note:
* - CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED is orthogonal and
* indicates whether cudaHostAlloc() memory can be exported as dma-buf.
* - On compute capability >= 10 (Blackwell), we force GPU_DIRECT_RDMA=0
* regardless of attribute value to align with the deprecation notice.
*
* @return FI_SUCCESS on success
* -FI_EIO on CUDA API error
* Sets global flags dmabuf_supported, gdr_supported, and cuda_memory_support.
*/
static int ft_cuda_hmem_detect_dmabuf_support(void)
static int ft_cuda_detect_memory_support(void)
{
dmabuf_supported = false;
#if HAVE_CUDA_DMABUF
CUresult cuda_ret;
CUdevice dev;
int is_supported = 0;

cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet");
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&is_supported,
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute");
return -FI_EIO;
}
CUresult cuda_ret;
CUdevice dev;
int cc_major = 0, cc_minor = 0;
int dma_buf_attr = 0;
int gdr_attr = 0;
cuda_memory_support = FT_CUDA_NOT_INITIALIZED;


FT_INFO("ft_cuda_detect_memory_support() called");

cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet");
cuda_memory_support = FT_CUDA_NOT_SUPPORTED;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MAJOR)");
cuda_memory_support = FT_CUDA_NOT_SUPPORTED;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MINOR)");
cuda_memory_support = FT_CUDA_NOT_SUPPORTED;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&dma_buf_attr,
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(DMA_BUF_SUPPORTED)");
cuda_memory_support = FT_CUDA_NOT_SUPPORTED;
return -FI_EIO;
}

cuda_ret = cuda_ops.cuDeviceGetAttribute(&gdr_attr,
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
if (cuda_ret != CUDA_SUCCESS) {
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(GPU_DIRECT_RDMA_SUPPORTED)");
cuda_memory_support = FT_CUDA_NOT_SUPPORTED;
return -FI_EIO;
}

dmabuf_supported = (dma_buf_attr == 1);

if (cc_major >= 10) {
// Blackwell or newer: nv-p2p deprecated
FT_INFO("Compute capability %d.%d: forcing gdr_supported=false due to Blackwell deprecation", cc_major, cc_minor);
gdr_supported = false;
} else {
gdr_supported = (gdr_attr == 1);
}

FT_INFO("Compute capability %d.%d", cc_major, cc_minor);
FT_INFO("dmabuf_supported=%s", dmabuf_supported ? "true" : "false");
FT_INFO("GPU_DIRECT_RDMA_SUPPORTED raw=%d -> gdr_supported=%s",
gdr_attr, gdr_supported ? "true" : "false");

// Final truth table
if (!gdr_supported && !dmabuf_supported)
cuda_memory_support = FT_CUDA_NOT_SUPPORTED;
else if (gdr_supported && dmabuf_supported)
cuda_memory_support = FT_CUDA_DMABUF_GDR_BOTH;
else if (dmabuf_supported)
cuda_memory_support = FT_CUDA_DMA_BUF_ONLY;
else
cuda_memory_support = FT_CUDA_GDR_ONLY;

FT_INFO("cuda_memory_support=%s", get_cuda_memory_support_str(cuda_memory_support));
return FI_SUCCESS;

dmabuf_supported = (is_supported == 1);
#else
FT_INFO("HAVE_CUDA_DMABUF not enabled, returning FT_CUDA_NOT_INITIALIZED");
cuda_memory_support = FT_CUDA_NOT_INITIALIZED;
return FI_SUCCESS;
#endif
return FI_SUCCESS;
}


int ft_cuda_init(void)
{
cudaError_t cuda_ret;
int ret;
cuda_memory_support = FT_CUDA_NOT_INITIALIZED;

cudart_handle = dlopen("libcudart.so", RTLD_NOW);
if (!cudart_handle) {
Expand Down Expand Up @@ -261,6 +371,20 @@ int ft_cuda_init(void)
goto err_dlclose_cuda;
}

cuda_ops.cuDeviceGetName = dlsym(cuda_handle,
STRINGIFY(cuDeviceGetName));
if (!cuda_ops.cuDeviceGetName) {
FT_ERR("Failed to find cuDeviceGetName\n");
goto err_dlclose_cuda;
}

cuda_ops.cuDriverGetVersion = dlsym(cuda_handle,
STRINGIFY(cuDriverGetVersion));
if (!cuda_ops.cuDriverGetVersion) {
FT_ERR("Failed to find cuDriverGetVersion\n");
goto err_dlclose_cuda;
}

cuda_ops.cuMemGetAddressRange = dlsym(cuda_handle,
STRINGIFY(cuMemGetAddressRange));
if (!cuda_ops.cuMemGetAddressRange) {
Expand All @@ -274,9 +398,11 @@ int ft_cuda_init(void)
goto err_dlclose_cuda;
}

ret = ft_cuda_hmem_detect_dmabuf_support();
if (ret != FI_SUCCESS)
ret = ft_cuda_detect_memory_support();
if (ret != FI_SUCCESS) {
goto err_dlclose_cuda;
}


return FI_SUCCESS;

Expand Down Expand Up @@ -495,6 +621,14 @@ int ft_cuda_put_dmabuf_fd(int fd)
#endif /* HAVE_CUDA_DMABUF */
}

/**
 * @brief Return the cached CUDA memory transport support level
 *
 * The value is the file-scope cuda_memory_support variable; this function
 * performs no detection of its own.
 *
 * @return the cached enum ft_cuda_memory_support value.
 *         FT_CUDA_NOT_INITIALIZED means no detection result is available:
 *         either ft_cuda_init() has not run yet, or the build has
 *         HAVE_CUDA_DMABUF disabled. An informational message is logged
 *         in that case.
 */
enum ft_cuda_memory_support ft_cuda_memory_support(void)
{
	if (cuda_memory_support == FT_CUDA_NOT_INITIALIZED)
		FT_INFO("CUDA memory support not detected: ft_cuda_init() not called yet or HAVE_CUDA_DMABUF not enabled");

	return cuda_memory_support;
}

#else

int ft_cuda_init(void)
Expand Down Expand Up @@ -554,4 +688,9 @@ int ft_cuda_put_dmabuf_fd(int fd)
{
return -FI_ENOSYS;
}

/*
 * Fallback definition in the #else branch that is closed by
 * "#endif HAVE_CUDA_RUNTIME_H": no detection is performed and the
 * result is always FT_CUDA_NOT_SUPPORTED.
 */
enum ft_cuda_memory_support ft_cuda_memory_support(void)
{
return FT_CUDA_NOT_SUPPORTED;
}
#endif /* HAVE_CUDA_RUNTIME_H */
9 changes: 9 additions & 0 deletions fabtests/include/hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

/*
 * CUDA device-memory transport support as reported by
 * ft_cuda_memory_support(). The numeric values are explicit because
 * common/check_cuda_dmabuf returns this value from main().
 */
enum ft_cuda_memory_support {
/* No detection result is available */
FT_CUDA_NOT_INITIALIZED = -1,
/* Neither dma-buf nor GPUDirect RDMA is usable */
FT_CUDA_NOT_SUPPORTED = 0,
/* dma-buf is usable, GPUDirect RDMA is not */
FT_CUDA_DMA_BUF_ONLY = 1,
/* GPUDirect RDMA is usable, dma-buf is not */
FT_CUDA_GDR_ONLY = 2,
/* Both dma-buf and GPUDirect RDMA are usable */
FT_CUDA_DMABUF_GDR_BOTH = 3,
};

#if HAVE_ZE
#include <level_zero/ze_api.h>
extern struct libze_ops {
Expand Down Expand Up @@ -185,6 +193,7 @@ int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src,
int ft_cuda_get_dmabuf_fd(void *buf, size_t len,
int *fd, uint64_t *offset);
int ft_cuda_put_dmabuf_fd(int fd);
enum ft_cuda_memory_support ft_cuda_memory_support(void);

int ft_rocr_init(void);
int ft_rocr_cleanup(void);
Expand Down
Loading