Skip to content

Commit 394485f

Browse files
committed
test: Adding cuda dmabuf validation logic
Problem: - Users were submitting fabtests without the --do-dmabuf-reg-for-hmem flag Solution: - Added hmem cuda logic changes based on nvidia manual [https://docs.nvidia.com/cuda/gpudirect-rdma/ here] - Added check_cuda_dmabuf to init cuda internals and get dmabuf support information - Added conftest to validate these checks if a user specifies cuda command in fabtests Testing: - Validated tests with --do-dmabuf-reg-for-hmem flag and without it on p6-gb200 cluster - All tests skipped with cuda command - make -j install && python3 install/bin/runfabtests.py --expression "cuda" -vvv -p /home/nmazzill/libfabric/fabtests/install/bin/ --junit-xml ft_`git branch --show-current`_`git rev-parse HEAD`_all_junit.xml --nworkers 16 -b efa 10.0.123.149 10.0.121.190 | tee ft_`git branch --show-current`_`git rev-parse HEAD`_all_stdout - All tests passed with this cuda command - make -j install && python3 install/bin/runfabtests.py --expression "cuda" --do-dmabuf-reg-for-hmem -vvv -p /home/nmazzill/libfabric/fabtests/install/bin/ --junit-xml ft_`git branch --show-current`_`git rev-parse HEAD`_all_junit.xml --nworkers 16 -b efa 10.0.123.149 10.0.121.190 | tee ft_`git branch --show-current`_`git rev-parse HEAD`_all_stdout Sim Issue: - N/A Signed-off-by: Nick Mazzilli <[email protected]>
1 parent b3abf8a commit 394485f

File tree

6 files changed

+342
-29
lines changed

6 files changed

+342
-29
lines changed

fabtests/Makefile.am

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ bin_PROGRAMS = \
6969
multinode/fi_multinode_coll \
7070
component/sock_test \
7171
regression/sighandler_test \
72-
common/check_hmem
72+
common/check_hmem \
73+
common/check_cuda_dmabuf
7374

7475
if HAVE_ZE_DEVEL
7576
if HAVE_VERBS_DEVEL
@@ -619,6 +620,14 @@ common_check_hmem_LDADD = libfabtests.la
619620
# Per-target compiler flags for common/check_hmem. Automake derives the
# variable prefix from the program name case-sensitively, so the prefix
# must be spelled exactly "common_check_hmem".
common_check_hmem_CFLAGS = \
	$(AM_CFLAGS)
621622

623+
# common/check_cuda_dmabuf: built from a single source file, linked against
# the shared fabtests helper library, compiled with the default flags.
common_check_cuda_dmabuf_SOURCES = common/check_cuda_dmabuf.c
common_check_cuda_dmabuf_LDADD = libfabtests.la
common_check_cuda_dmabuf_CFLAGS = $(AM_CFLAGS)
630+
622631
real_man_pages = \
623632
man/man7/fabtests.7
624633

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
2+
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <getopt.h>
6+
#include <string.h>
7+
#include <shared.h>
8+
#include <hmem.h>
9+
10+
/*
 * Report the CUDA memory-support level.
 *
 * Thin wrapper: forwards whatever ft_cuda_memory_support() returns.
 */
static cuda_memory_support_e dmabuf_viable_and_supported(void)
{
	return ft_cuda_memory_support();
}
16+
17+
int main(int argc, char **argv)
18+
{
19+
int ret;
20+
21+
/* Make sure default CUDA device is sane for ft_cuda_init() */
22+
opts = INIT_OPTS;
23+
opts.device = 0; /* cuda device 0 */
24+
25+
/* Initialize CUDA side only; avoid ft_init_fabric() */
26+
ret = ft_cuda_init();
27+
if (ret != FI_SUCCESS) {
28+
FT_ERR("ft_cuda_init failed: %d", ret);
29+
return CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
30+
}
31+
32+
cuda_memory_support_e cuda_memory_support = dmabuf_viable_and_supported();
33+
FT_INFO("dmabuf: ft_cuda_memory_support() -> %d", cuda_memory_support);
34+
35+
ft_cuda_cleanup();
36+
return cuda_memory_support;
37+
}

fabtests/common/hmem_cuda.c

Lines changed: 167 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ struct cuda_ops {
6565
CUresult (*cuDeviceGetAttribute)(int* pi,
6666
CUdevice_attribute attrib, CUdevice dev);
6767
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
68+
CUresult (*cuDeviceGetName)(char* name, int len, CUdevice dev);
69+
CUresult (*cuDriverGetVersion)(int* driverVersion);
6870
CUresult (*cuMemGetAddressRange)( CUdeviceptr* pbase,
6971
size_t* psize, CUdeviceptr dptr);
7072
};
@@ -73,6 +75,24 @@ static struct cuda_ops cuda_ops;
7375
static void *cudart_handle;
7476
static void *cuda_handle;
7577
static bool dmabuf_supported;
78+
static bool gdr_supported;
79+
static cuda_memory_support_e cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;
80+
/**
 * Translate a cuda_memory_support_e value into a short label for log output.
 *
 * @param support  value to describe
 * @return pointer to a string literal naming the value, or "INVALID" when
 *         the value matches none of the enumerators
 */
static const char* get_cuda_memory_support_str(cuda_memory_support_e support)
{
	if (support == CUDA_MEMORY_SUPPORT__NOT_INITIALIZED)
		return "NOT_INITIALIZED";
	if (support == CUDA_MEMORY_SUPPORT__NOT_SUPPORTED)
		return "NOT_SUPPORTED";
	if (support == CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY)
		return "DMA_BUF_ONLY";
	if (support == CUDA_MEMORY_SUPPORT__GDR_ONLY)
		return "GDR_ONLY";
	if (support == CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH)
		return "DMABUF_GDR_BOTH";

	return "INVALID";
}
7696

7797
/**
7898
* Since function names can get redefined in cuda.h/cuda_runtime.h files,
@@ -113,43 +133,134 @@ static int ft_cuda_pointer_set_attribute(void *buf)
113133
}
114134

115135
/**
116-
* @brief detect dmabuf support in the current platform
117-
* This checks the dmabuf support in the current platform
118-
* by querying the property of cuda device 0
136+
* @brief Detect CUDA memory transport support (dma-buf vs P2P)
137+
*
138+
* This routine queries device 0 for:
139+
* - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED
140+
* - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
141+
* - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
142+
*
143+
* Logic is derived from CUDA 13.0 release notes and Blackwell compatibility guide:
144+
* - https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
145+
* - https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
146+
* - https://developer.nvidia.com/blog/cuda-toolkit-12-8-delivers-nvidia-blackwell-support/
147+
*
148+
* NVIDIA deprecated GPUDirect RDMA (nv-p2p APIs) starting with Blackwell
149+
* (compute capability >= 10). Applications must migrate to dma-buf.
119150
*
120-
* @return FI_SUCCESS if dmabuf support check is successful
121-
* -FI_EIO upon CUDA API error
151+
* Truth table for effective support (device memory only):
152+
*
153+
* DMA_BUF GPU_DIRECT_RDMA Result
154+
* ------- ---------------- ------------------------
155+
* 0 0 NOT_SUPPORTED
156+
* 0 1 GDR_ONLY
157+
* 1 0 DMA_BUF_ONLY
158+
* 1 1 DMABUF_GDR_BOTH
159+
*
160+
* Note:
161+
* - CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED is orthogonal and
162+
* indicates whether cudaHostAlloc() memory can be exported as dma-buf.
163+
* - On compute capability >= 10 (Blackwell), we force GPU_DIRECT_RDMA=0
164+
* regardless of attribute value to align with the deprecation notice.
165+
*
166+
* @return FI_SUCCESS on success
167+
* -FI_EIO on CUDA API error
168+
* Sets global flags dmabuf_supported, gdr_supported, and cuda_memory_support.
122169
*/
123-
static int ft_cuda_hmem_detect_dmabuf_support(void)
170+
static int ft_cuda_detect_memory_support(void)
124171
{
125-
dmabuf_supported = false;
126172
#if HAVE_CUDA_DMABUF
127-
CUresult cuda_ret;
128-
CUdevice dev;
129-
int is_supported = 0;
130-
131-
cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
132-
if (cuda_ret != CUDA_SUCCESS) {
133-
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet");
134-
return -FI_EIO;
135-
}
136-
137-
cuda_ret = cuda_ops.cuDeviceGetAttribute(&is_supported,
138-
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
139-
if (cuda_ret != CUDA_SUCCESS) {
140-
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute");
141-
return -FI_EIO;
142-
}
173+
CUresult cuda_ret;
174+
CUdevice dev;
175+
int cc_major = 0, cc_minor = 0;
176+
int dma_buf_attr = 0;
177+
int gdr_attr = 0;
178+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;
179+
180+
181+
FT_INFO("ft_cuda_detect_memory_support() called");
182+
183+
cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
184+
if (cuda_ret != CUDA_SUCCESS) {
185+
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet");
186+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
187+
return -FI_EIO;
188+
}
189+
190+
cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_major,
191+
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
192+
if (cuda_ret != CUDA_SUCCESS) {
193+
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MAJOR)");
194+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
195+
return -FI_EIO;
196+
}
197+
198+
cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_minor,
199+
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
200+
if (cuda_ret != CUDA_SUCCESS) {
201+
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MINOR)");
202+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
203+
return -FI_EIO;
204+
}
205+
206+
cuda_ret = cuda_ops.cuDeviceGetAttribute(&dma_buf_attr,
207+
CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
208+
if (cuda_ret != CUDA_SUCCESS) {
209+
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(DMA_BUF_SUPPORTED)");
210+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
211+
return -FI_EIO;
212+
}
213+
214+
cuda_ret = cuda_ops.cuDeviceGetAttribute(&gdr_attr,
215+
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev);
216+
if (cuda_ret != CUDA_SUCCESS) {
217+
ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(GPU_DIRECT_RDMA_SUPPORTED)");
218+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
219+
return -FI_EIO;
220+
}
221+
222+
dmabuf_supported = (dma_buf_attr == 1);
223+
224+
if (cc_major >= 10) {
225+
// Blackwell or newer: nv-p2p deprecated
226+
FT_INFO("Compute capability %d.%d: forcing gdr_supported=false due to Blackwell deprecation", cc_major, cc_minor);
227+
gdr_supported = false;
228+
} else {
229+
gdr_supported = (gdr_attr == 1);
230+
}
231+
232+
FT_INFO("Compute capability %d.%d", cc_major, cc_minor);
233+
FT_INFO("dmabuf_supported=%s", dmabuf_supported ? "true" : "false");
234+
FT_INFO("GPU_DIRECT_RDMA_SUPPORTED raw=%d -> gdr_supported=%s",
235+
gdr_attr, gdr_supported ? "true" : "false");
236+
237+
// Final truth table
238+
if (!gdr_supported && !dmabuf_supported) {
239+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED;
240+
} else if (gdr_supported && dmabuf_supported) {
241+
cuda_memory_support = CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH;
242+
} else if (dmabuf_supported) {
243+
cuda_memory_support = CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY;
244+
} else {
245+
cuda_memory_support = CUDA_MEMORY_SUPPORT__GDR_ONLY;
246+
}
247+
248+
FT_INFO("cuda_memory_support=%s", get_cuda_memory_support_str(cuda_memory_support));
249+
return FI_SUCCESS;
143250

144-
dmabuf_supported = (is_supported == 1);
251+
#else
252+
FT_INFO("HAVE_CUDA_DMABUF not enabled, returning CUDA_MEMORY_SUPPORT__NOT_INITIALIZED");
253+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;
254+
return FI_SUCCESS;
145255
#endif
146-
return FI_SUCCESS;
147256
}
148257

258+
149259
int ft_cuda_init(void)
150260
{
151261
cudaError_t cuda_ret;
152262
int ret;
263+
cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED;
153264

154265
cudart_handle = dlopen("libcudart.so", RTLD_NOW);
155266
if (!cudart_handle) {
@@ -261,6 +372,20 @@ int ft_cuda_init(void)
261372
goto err_dlclose_cuda;
262373
}
263374

375+
cuda_ops.cuDeviceGetName = dlsym(cuda_handle,
376+
STRINGIFY(cuDeviceGetName));
377+
if (!cuda_ops.cuDeviceGetName) {
378+
FT_ERR("Failed to find cuDeviceGetName\n");
379+
goto err_dlclose_cuda;
380+
}
381+
382+
cuda_ops.cuDriverGetVersion = dlsym(cuda_handle,
383+
STRINGIFY(cuDriverGetVersion));
384+
if (!cuda_ops.cuDriverGetVersion) {
385+
FT_ERR("Failed to find cuDriverGetVersion\n");
386+
goto err_dlclose_cuda;
387+
}
388+
264389
cuda_ops.cuMemGetAddressRange = dlsym(cuda_handle,
265390
STRINGIFY(cuMemGetAddressRange));
266391
if (!cuda_ops.cuMemGetAddressRange) {
@@ -274,9 +399,11 @@ int ft_cuda_init(void)
274399
goto err_dlclose_cuda;
275400
}
276401

277-
ret = ft_cuda_hmem_detect_dmabuf_support();
278-
if (ret != FI_SUCCESS)
402+
ret = ft_cuda_detect_memory_support();
403+
if (ret != FI_SUCCESS) {
279404
goto err_dlclose_cuda;
405+
}
406+
280407

281408
return FI_SUCCESS;
282409

@@ -495,6 +622,14 @@ int ft_cuda_put_dmabuf_fd(int fd)
495622
#endif /* HAVE_CUDA_DMABUF */
496623
}
497624

625+
cuda_memory_support_e ft_cuda_memory_support(void)
626+
{
627+
if (cuda_memory_support == CUDA_MEMORY_SUPPORT__NOT_INITIALIZED) {
628+
FT_INFO("ft_cuda_memory_support() not called yet!");
629+
}
630+
return cuda_memory_support;
631+
}
632+
498633
#else
499634

500635
int ft_cuda_init(void)
@@ -554,4 +689,9 @@ int ft_cuda_put_dmabuf_fd(int fd)
554689
{
555690
return -FI_ENOSYS;
556691
}
692+
693+
cuda_memory_support_e ft_cuda_memory_support(void)
694+
{
695+
return CUDA_MEMORY_SUPPORT__UNKNOWN;
696+
}
557697
#endif /* HAVE_CUDA_RUNTIME_H */

fabtests/include/hmem.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,14 @@
3535
#include <rdma/fi_domain.h>
3636
#include <rdma/fi_errno.h>
3737

38+
/*
 * Result of the CUDA memory-support detection, i.e. which of dma-buf and
 * GPUDirect RDMA (GDR) were found usable. The numeric values are part of
 * the interface: check_cuda_dmabuf returns them as its exit status.
 */
typedef enum {
	/* detection has not produced a result */
	CUDA_MEMORY_SUPPORT__NOT_INITIALIZED = -1,
	/* neither dma-buf nor GDR is available */
	CUDA_MEMORY_SUPPORT__NOT_SUPPORTED = 0,
	/* dma-buf is available, GDR is not */
	CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY = 1,
	/* GDR is available, dma-buf is not */
	CUDA_MEMORY_SUPPORT__GDR_ONLY = 2,
	/* both dma-buf and GDR are available */
	CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH = 3,
} cuda_memory_support_e;
45+
3846
#if HAVE_ZE
3947
#include <level_zero/ze_api.h>
4048
extern struct libze_ops {
@@ -185,6 +193,7 @@ int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src,
185193
int ft_cuda_get_dmabuf_fd(void *buf, size_t len,
186194
int *fd, uint64_t *offset);
187195
int ft_cuda_put_dmabuf_fd(int fd);
196+
cuda_memory_support_e ft_cuda_memory_support(void);
188197

189198
int ft_rocr_init(void);
190199
int ft_rocr_cleanup(void);

0 commit comments

Comments
 (0)