@@ -65,6 +65,8 @@ struct cuda_ops {
65
65
CUresult (* cuDeviceGetAttribute )(int * pi ,
66
66
CUdevice_attribute attrib , CUdevice dev );
67
67
CUresult (* cuDeviceGet )(CUdevice * device , int ordinal );
68
+ CUresult (* cuDeviceGetName )(char * name , int len , CUdevice dev );
69
+ CUresult (* cuDriverGetVersion )(int * driverVersion );
68
70
CUresult (* cuMemGetAddressRange )( CUdeviceptr * pbase ,
69
71
size_t * psize , CUdeviceptr dptr );
70
72
};
@@ -73,6 +75,24 @@ static struct cuda_ops cuda_ops;
73
75
static void * cudart_handle ;
74
76
static void * cuda_handle ;
75
77
static bool dmabuf_supported ;
78
+ static bool gdr_supported ;
79
+ static cuda_memory_support_e cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
80
+ static const char * get_cuda_memory_support_str (cuda_memory_support_e support ) {
81
+ switch (support ) {
82
+ case CUDA_MEMORY_SUPPORT__NOT_INITIALIZED :
83
+ return "NOT_INITIALIZED" ;
84
+ case CUDA_MEMORY_SUPPORT__NOT_SUPPORTED :
85
+ return "NOT_SUPPORTED" ;
86
+ case CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY :
87
+ return "DMA_BUF_ONLY" ;
88
+ case CUDA_MEMORY_SUPPORT__GDR_ONLY :
89
+ return "GDR_ONLY" ;
90
+ case CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH :
91
+ return "DMABUF_GDR_BOTH" ;
92
+ default :
93
+ return "INVALID" ;
94
+ }
95
+ }
76
96
77
97
/**
78
98
* Since function names can get redefined in cuda.h/cuda_runtime.h files,
@@ -113,43 +133,134 @@ static int ft_cuda_pointer_set_attribute(void *buf)
113
133
}
114
134
115
135
/**
116
- * @brief detect dmabuf support in the current platform
117
- * This checks the dmabuf support in the current platform
118
- * by querying the property of cuda device 0
136
+ * @brief Detect CUDA memory transport support (dma-buf vs P2P)
137
+ *
138
+ * This routine queries device 0 for:
139
+ * - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED
140
+ * - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
141
+ * - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
142
+ *
143
+ * Logic is derived from CUDA 13.0 release notes and Blackwell compatibility guide:
144
+ * - https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
145
+ * - https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
146
+ * - https://developer.nvidia.com/blog/cuda-toolkit-12-8-delivers-nvidia-blackwell-support/
147
+ *
148
+ * NVIDIA deprecated GPUDirect RDMA (nv-p2p APIs) starting with Blackwell
149
+ * (compute capability >= 10). Applications must migrate to dma-buf.
119
150
*
120
- * @return FI_SUCCESS if dmabuf support check is successful
121
- * -FI_EIO upon CUDA API error
151
+ * Truth table for effective support (device memory only):
152
+ *
153
+ * DMA_BUF GPU_DIRECT_RDMA Result
154
+ * ------- ---------------- ------------------------
155
+ * 0 0 NOT_SUPPORTED
156
+ * 0 1 GDR_ONLY
157
+ * 1 0 DMA_BUF_ONLY
158
+ * 1 1 DMABUF_GDR_BOTH
159
+ *
160
+ * Note:
161
+ * - CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED is orthogonal and
162
+ * indicates whether cudaHostAlloc() memory can be exported as dma-buf.
163
+ * - On compute capability >= 10 (Blackwell), we force GPU_DIRECT_RDMA=0
164
+ * regardless of attribute value to align with the deprecation notice.
165
+ *
166
+ * @return FI_SUCCESS on success
167
+ * -FI_EIO on CUDA API error
168
+ * Sets global flags dmabuf_supported, gdr_supported, and cuda_memory_support.
122
169
*/
123
- static int ft_cuda_hmem_detect_dmabuf_support (void )
170
+ static int ft_cuda_detect_memory_support (void )
124
171
{
125
- dmabuf_supported = false;
126
172
#if HAVE_CUDA_DMABUF
127
- CUresult cuda_ret ;
128
- CUdevice dev ;
129
- int is_supported = 0 ;
130
-
131
- cuda_ret = cuda_ops .cuDeviceGet (& dev , 0 );
132
- if (cuda_ret != CUDA_SUCCESS ) {
133
- ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGet" );
134
- return - FI_EIO ;
135
- }
136
-
137
- cuda_ret = cuda_ops .cuDeviceGetAttribute (& is_supported ,
138
- CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED , dev );
139
- if (cuda_ret != CUDA_SUCCESS ) {
140
- ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute" );
141
- return - FI_EIO ;
142
- }
173
+ CUresult cuda_ret ;
174
+ CUdevice dev ;
175
+ int cc_major = 0 , cc_minor = 0 ;
176
+ int dma_buf_attr = 0 ;
177
+ int gdr_attr = 0 ;
178
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
179
+
180
+
181
+ FT_INFO ("ft_cuda_detect_memory_support() called" );
182
+
183
+ cuda_ret = cuda_ops .cuDeviceGet (& dev , 0 );
184
+ if (cuda_ret != CUDA_SUCCESS ) {
185
+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGet" );
186
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
187
+ return - FI_EIO ;
188
+ }
189
+
190
+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& cc_major ,
191
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR , dev );
192
+ if (cuda_ret != CUDA_SUCCESS ) {
193
+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(CC_MAJOR)" );
194
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
195
+ return - FI_EIO ;
196
+ }
197
+
198
+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& cc_minor ,
199
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR , dev );
200
+ if (cuda_ret != CUDA_SUCCESS ) {
201
+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(CC_MINOR)" );
202
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
203
+ return - FI_EIO ;
204
+ }
205
+
206
+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& dma_buf_attr ,
207
+ CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED , dev );
208
+ if (cuda_ret != CUDA_SUCCESS ) {
209
+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(DMA_BUF_SUPPORTED)" );
210
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
211
+ return - FI_EIO ;
212
+ }
213
+
214
+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& gdr_attr ,
215
+ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED , dev );
216
+ if (cuda_ret != CUDA_SUCCESS ) {
217
+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(GPU_DIRECT_RDMA_SUPPORTED)" );
218
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
219
+ return - FI_EIO ;
220
+ }
221
+
222
+ dmabuf_supported = (dma_buf_attr == 1 );
223
+
224
+ if (cc_major >= 10 ) {
225
+ // Blackwell or newer: nv-p2p deprecated
226
+ FT_INFO ("Compute capability %d.%d: forcing gdr_supported=false due to Blackwell deprecation" , cc_major , cc_minor );
227
+ gdr_supported = false;
228
+ } else {
229
+ gdr_supported = (gdr_attr == 1 );
230
+ }
231
+
232
+ FT_INFO ("Compute capability %d.%d" , cc_major , cc_minor );
233
+ FT_INFO ("dmabuf_supported=%s" , dmabuf_supported ? "true" : "false" );
234
+ FT_INFO ("GPU_DIRECT_RDMA_SUPPORTED raw=%d -> gdr_supported=%s" ,
235
+ gdr_attr , gdr_supported ? "true" : "false" );
236
+
237
+ // Final truth table
238
+ if (!gdr_supported && !dmabuf_supported ) {
239
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
240
+ } else if (gdr_supported && dmabuf_supported ) {
241
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH ;
242
+ } else if (dmabuf_supported ) {
243
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY ;
244
+ } else {
245
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__GDR_ONLY ;
246
+ }
247
+
248
+ FT_INFO ("cuda_memory_support=%s" , get_cuda_memory_support_str (cuda_memory_support ));
249
+ return FI_SUCCESS ;
143
250
144
- dmabuf_supported = (is_supported == 1 );
251
+ #else
252
+ FT_INFO ("HAVE_CUDA_DMABUF not enabled, returning CUDA_MEMORY_SUPPORT__NOT_INITIALIZED" );
253
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
254
+ return FI_SUCCESS ;
145
255
#endif
146
- return FI_SUCCESS ;
147
256
}
148
257
258
+
149
259
int ft_cuda_init (void )
150
260
{
151
261
cudaError_t cuda_ret ;
152
262
int ret ;
263
+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
153
264
154
265
cudart_handle = dlopen ("libcudart.so" , RTLD_NOW );
155
266
if (!cudart_handle ) {
@@ -261,6 +372,20 @@ int ft_cuda_init(void)
261
372
goto err_dlclose_cuda ;
262
373
}
263
374
375
+ cuda_ops .cuDeviceGetName = dlsym (cuda_handle ,
376
+ STRINGIFY (cuDeviceGetName ));
377
+ if (!cuda_ops .cuDeviceGetName ) {
378
+ FT_ERR ("Failed to find cuDeviceGetName\n" );
379
+ goto err_dlclose_cuda ;
380
+ }
381
+
382
+ cuda_ops .cuDriverGetVersion = dlsym (cuda_handle ,
383
+ STRINGIFY (cuDriverGetVersion ));
384
+ if (!cuda_ops .cuDriverGetVersion ) {
385
+ FT_ERR ("Failed to find cuDriverGetVersion\n" );
386
+ goto err_dlclose_cuda ;
387
+ }
388
+
264
389
cuda_ops .cuMemGetAddressRange = dlsym (cuda_handle ,
265
390
STRINGIFY (cuMemGetAddressRange ));
266
391
if (!cuda_ops .cuMemGetAddressRange ) {
@@ -274,9 +399,11 @@ int ft_cuda_init(void)
274
399
goto err_dlclose_cuda ;
275
400
}
276
401
277
- ret = ft_cuda_hmem_detect_dmabuf_support ();
278
- if (ret != FI_SUCCESS )
402
+ ret = ft_cuda_detect_memory_support ();
403
+ if (ret != FI_SUCCESS ) {
279
404
goto err_dlclose_cuda ;
405
+ }
406
+
280
407
281
408
return FI_SUCCESS ;
282
409
@@ -495,6 +622,14 @@ int ft_cuda_put_dmabuf_fd(int fd)
495
622
#endif /* HAVE_CUDA_DMABUF */
496
623
}
497
624
625
+ cuda_memory_support_e ft_cuda_memory_support (void )
626
+ {
627
+ if (cuda_memory_support == CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ) {
628
+ FT_INFO ("ft_cuda_memory_support() not called yet!" );
629
+ }
630
+ return cuda_memory_support ;
631
+ }
632
+
498
633
#else
499
634
500
635
int ft_cuda_init (void )
@@ -554,4 +689,9 @@ int ft_cuda_put_dmabuf_fd(int fd)
554
689
{
555
690
return - FI_ENOSYS ;
556
691
}
692
+
693
+ cuda_memory_support_e ft_cuda_memory_support (void )
694
+ {
695
+ return CUDA_MEMORY_SUPPORT__UNKNOWN ;
696
+ }
557
697
#endif /* HAVE_CUDA_RUNTIME_H */
0 commit comments