@@ -65,6 +65,8 @@ struct cuda_ops {
6565 CUresult (* cuDeviceGetAttribute )(int * pi ,
6666 CUdevice_attribute attrib , CUdevice dev );
6767 CUresult (* cuDeviceGet )(CUdevice * device , int ordinal );
68+ CUresult (* cuDeviceGetName )(char * name , int len , CUdevice dev );
69+ CUresult (* cuDriverGetVersion )(int * driverVersion );
6870 CUresult (* cuMemGetAddressRange )( CUdeviceptr * pbase ,
6971 size_t * psize , CUdeviceptr dptr );
7072};
@@ -73,6 +75,24 @@ static struct cuda_ops cuda_ops;
/* Handle returned by dlopen("libcudart.so") in ft_cuda_init(). */
7375static void * cudart_handle ;
/* Library handle from which the CUDA driver API entry points are resolved via dlsym(). */
7476static void * cuda_handle ;
/* True when device 0 reports CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED == 1. */
7577static bool dmabuf_supported ;
/*
 * True when device 0 reports CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED == 1
 * and its compute capability major version is below 10.
 */
78+ static bool gdr_supported ;
/*
 * Combined dma-buf / GPUDirect RDMA classification computed by
 * ft_cuda_detect_memory_support() and returned by ft_cuda_memory_support().
 */
79+ static cuda_memory_support_e cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
80+ static const char * get_cuda_memory_support_str (cuda_memory_support_e support ) {
81+ switch (support ) {
82+ case CUDA_MEMORY_SUPPORT__NOT_INITIALIZED :
83+ return "NOT_INITIALIZED" ;
84+ case CUDA_MEMORY_SUPPORT__NOT_SUPPORTED :
85+ return "NOT_SUPPORTED" ;
86+ case CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY :
87+ return "DMA_BUF_ONLY" ;
88+ case CUDA_MEMORY_SUPPORT__GDR_ONLY :
89+ return "GDR_ONLY" ;
90+ case CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH :
91+ return "DMABUF_GDR_BOTH" ;
92+ default :
93+ return "INVALID" ;
94+ }
95+ }
7696
7797/**
7898 * Since function names can get redefined in cuda.h/cuda_runtime.h files,
@@ -113,43 +133,134 @@ static int ft_cuda_pointer_set_attribute(void *buf)
113133}
114134
115135/**
116- * @brief detect dmabuf support in the current platform
117- * This checks the dmabuf support in the current platform
118- * by querying the property of cuda device 0
136+ * @brief Detect CUDA memory transport support (dma-buf vs P2P)
137+ *
138+ * This routine queries device 0 for:
139+ * - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED
140+ * - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
141+ * - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
142+ *
143+ * Logic is derived from CUDA 13.0 release notes and Blackwell compatibility guide:
144+ * - https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
145+ * - https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
146+ * - https://developer.nvidia.com/blog/cuda-toolkit-12-8-delivers-nvidia-blackwell-support/
147+ *
148+ * NVIDIA deprecated GPUDirect RDMA (nv-p2p APIs) starting with Blackwell
149+ * (compute capability >= 10). Applications must migrate to dma-buf.
119150 *
120- * @return FI_SUCCESS if dmabuf support check is successful
121- * -FI_EIO upon CUDA API error
151+ * Truth table for effective support (device memory only):
152+ *
153+ * DMA_BUF GPU_DIRECT_RDMA Result
154+ * ------- ---------------- ------------------------
155+ * 0 0 NOT_SUPPORTED
156+ * 0 1 GDR_ONLY
157+ * 1 0 DMA_BUF_ONLY
158+ * 1 1 DMABUF_GDR_BOTH
159+ *
160+ * Note:
161+ * - CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED is orthogonal and
162+ * indicates whether cudaHostAlloc() memory can be exported as dma-buf.
163+ * - On compute capability >= 10 (Blackwell), we force GPU_DIRECT_RDMA=0
164+ * regardless of attribute value to align with the deprecation notice.
165+ *
166+ * @return FI_SUCCESS on success
167+ * -FI_EIO on CUDA API error
168+ * Sets global flags dmabuf_supported, gdr_supported, and cuda_memory_support.
122169 */
123- static int ft_cuda_hmem_detect_dmabuf_support (void )
170+ static int ft_cuda_detect_memory_support (void )
124171{
125- dmabuf_supported = false;
126172#if HAVE_CUDA_DMABUF
127- CUresult cuda_ret ;
128- CUdevice dev ;
129- int is_supported = 0 ;
130-
131- cuda_ret = cuda_ops .cuDeviceGet (& dev , 0 );
132- if (cuda_ret != CUDA_SUCCESS ) {
133- ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGet" );
134- return - FI_EIO ;
135- }
136-
137- cuda_ret = cuda_ops .cuDeviceGetAttribute (& is_supported ,
138- CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED , dev );
139- if (cuda_ret != CUDA_SUCCESS ) {
140- ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute" );
141- return - FI_EIO ;
142- }
173+ CUresult cuda_ret ;
174+ CUdevice dev ;
175+ int cc_major = 0 , cc_minor = 0 ;
176+ int dma_buf_attr = 0 ;
177+ int gdr_attr = 0 ;
178+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
179+
180+
181+ FT_INFO ("ft_cuda_detect_memory_support() called" );
182+
183+ cuda_ret = cuda_ops .cuDeviceGet (& dev , 0 );
184+ if (cuda_ret != CUDA_SUCCESS ) {
185+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGet" );
186+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
187+ return - FI_EIO ;
188+ }
189+
190+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& cc_major ,
191+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR , dev );
192+ if (cuda_ret != CUDA_SUCCESS ) {
193+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(CC_MAJOR)" );
194+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
195+ return - FI_EIO ;
196+ }
197+
198+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& cc_minor ,
199+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR , dev );
200+ if (cuda_ret != CUDA_SUCCESS ) {
201+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(CC_MINOR)" );
202+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
203+ return - FI_EIO ;
204+ }
205+
206+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& dma_buf_attr ,
207+ CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED , dev );
208+ if (cuda_ret != CUDA_SUCCESS ) {
209+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(DMA_BUF_SUPPORTED)" );
210+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
211+ return - FI_EIO ;
212+ }
213+
214+ cuda_ret = cuda_ops .cuDeviceGetAttribute (& gdr_attr ,
215+ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED , dev );
216+ if (cuda_ret != CUDA_SUCCESS ) {
217+ ft_cuda_driver_api_print_error (cuda_ret , "cuDeviceGetAttribute(GPU_DIRECT_RDMA_SUPPORTED)" );
218+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
219+ return - FI_EIO ;
220+ }
221+
222+ dmabuf_supported = (dma_buf_attr == 1 );
223+
224+ if (cc_major >= 10 ) {
225+ // Blackwell or newer: nv-p2p deprecated
226+ FT_INFO ("Compute capability %d.%d: forcing gdr_supported=false due to Blackwell deprecation" , cc_major , cc_minor );
227+ gdr_supported = false;
228+ } else {
229+ gdr_supported = (gdr_attr == 1 );
230+ }
231+
232+ FT_INFO ("Compute capability %d.%d" , cc_major , cc_minor );
233+ FT_INFO ("dmabuf_supported=%s" , dmabuf_supported ? "true" : "false" );
234+ FT_INFO ("GPU_DIRECT_RDMA_SUPPORTED raw=%d -> gdr_supported=%s" ,
235+ gdr_attr , gdr_supported ? "true" : "false" );
236+
237+ // Final truth table
238+ if (!gdr_supported && !dmabuf_supported ) {
239+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
240+ } else if (gdr_supported && dmabuf_supported ) {
241+ cuda_memory_support = CUDA_MEMORY_SUPPORT__DMABUF_GDR_BOTH ;
242+ } else if (dmabuf_supported ) {
243+ cuda_memory_support = CUDA_MEMORY_SUPPORT__DMA_BUF_ONLY ;
244+ } else {
245+ cuda_memory_support = CUDA_MEMORY_SUPPORT__GDR_ONLY ;
246+ }
247+
248+ FT_INFO ("cuda_memory_support=%s" , get_cuda_memory_support_str (cuda_memory_support ));
249+ return FI_SUCCESS ;
143250
144- dmabuf_supported = (is_supported == 1 );
251+ #else
252+ FT_INFO ("HAVE_CUDA_DMABUF not enabled, returning CUDA_MEMORY_SUPPORT__NOT_INITIALIZED" );
253+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
254+ return FI_SUCCESS ;
145255#endif
146- return FI_SUCCESS ;
147256}
148257
258+
149259int ft_cuda_init (void )
150260{
151261 cudaError_t cuda_ret ;
152262 int ret ;
263+ cuda_memory_support = CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ;
153264
154265 cudart_handle = dlopen ("libcudart.so" , RTLD_NOW );
155266 if (!cudart_handle ) {
@@ -261,6 +372,20 @@ int ft_cuda_init(void)
261372 goto err_dlclose_cuda ;
262373 }
263374
375+ cuda_ops .cuDeviceGetName = dlsym (cuda_handle ,
376+ STRINGIFY (cuDeviceGetName ));
377+ if (!cuda_ops .cuDeviceGetName ) {
378+ FT_ERR ("Failed to find cuDeviceGetName\n" );
379+ goto err_dlclose_cuda ;
380+ }
381+
382+ cuda_ops .cuDriverGetVersion = dlsym (cuda_handle ,
383+ STRINGIFY (cuDriverGetVersion ));
384+ if (!cuda_ops .cuDriverGetVersion ) {
385+ FT_ERR ("Failed to find cuDriverGetVersion\n" );
386+ goto err_dlclose_cuda ;
387+ }
388+
264389 cuda_ops .cuMemGetAddressRange = dlsym (cuda_handle ,
265390 STRINGIFY (cuMemGetAddressRange ));
266391 if (!cuda_ops .cuMemGetAddressRange ) {
@@ -274,9 +399,11 @@ int ft_cuda_init(void)
274399 goto err_dlclose_cuda ;
275400 }
276401
277- ret = ft_cuda_hmem_detect_dmabuf_support ();
278- if (ret != FI_SUCCESS )
402+ ret = ft_cuda_detect_memory_support ();
403+ if (ret != FI_SUCCESS ) {
279404 goto err_dlclose_cuda ;
405+ }
406+
280407
281408 return FI_SUCCESS ;
282409
@@ -495,6 +622,14 @@ int ft_cuda_put_dmabuf_fd(int fd)
495622#endif /* HAVE_CUDA_DMABUF */
496623}
497624
625+ cuda_memory_support_e ft_cuda_memory_support (void )
626+ {
627+ if (cuda_memory_support == CUDA_MEMORY_SUPPORT__NOT_INITIALIZED ) {
628+ FT_INFO ("ft_cuda_memory_support() not called yet!" );
629+ }
630+ return cuda_memory_support ;
631+ }
632+
498633#else
499634
500635int ft_cuda_init (void )
@@ -554,4 +689,9 @@ int ft_cuda_put_dmabuf_fd(int fd)
554689{
555690 return - FI_ENOSYS ;
556691}
692+
693+ cuda_memory_support_e ft_cuda_memory_support (void )
694+ {
695+ return CUDA_MEMORY_SUPPORT__NOT_SUPPORTED ;
696+ }
557697#endif /* HAVE_CUDA_RUNTIME_H */
0 commit comments