From cf55f56c03361e625c8fee84a874fa2487e25ef3 Mon Sep 17 00:00:00 2001
From: Nick Mazzilli
Date: Tue, 7 Oct 2025 12:32:34 -0700
Subject: [PATCH 1/2] fabtests/common: cuda dmabuf validation logic

feat: Adding cuda dmabuf validation logic

Problem:
- Users were submitting fabtests without the --do-dmabuf-reg-for-hmem flag

Solution:
- Added hmem cuda logic changes based on the NVIDIA GPUDirect RDMA manual (https://docs.nvidia.com/cuda/gpudirect-rdma/)
- Added check_cuda_dmabuf to initialize cuda internals and report dmabuf support information

Testing:
- Validated tests with the --do-dmabuf-reg-for-hmem flag and without it on a p6-gb200 cluster
- All tests skipped with this cuda command
- make -j install && python3 install/bin/runfabtests.py --expression "cuda" -vvv -p /home/nmazzill/libfabric/fabtests/install/bin/ --junit-xml ft_`git branch --show-current`_`git rev-parse HEAD`_all_junit.xml --nworkers 16 -b efa 10.0.123.149 10.0.121.190 | tee ft_`git branch --show-current`_`git rev-parse HEAD`_all_stdout
- All tests passed with this cuda command
- make -j install && python3 install/bin/runfabtests.py --expression "cuda" --do-dmabuf-reg-for-hmem -vvv -p /home/nmazzill/libfabric/fabtests/install/bin/ --junit-xml ft_`git branch --show-current`_`git rev-parse HEAD`_all_junit.xml --nworkers 16 -b efa 10.0.123.149 10.0.121.190 | tee ft_`git branch --show-current`_`git rev-parse HEAD`_all_stdout

Sim Issue:
- N/A

Signed-off-by: Nick Mazzilli
---
 fabtests/Makefile.am                |  11 +-
 fabtests/common/check_cuda_dmabuf.c |  33 +++++
 fabtests/common/hmem_cuda.c         | 193 ++++++++++++++++++++++++----
 fabtests/include/hmem.h             |   9 ++
 4 files changed, 218 insertions(+), 28 deletions(-)
 create mode 100644 fabtests/common/check_cuda_dmabuf.c

diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am
index c9876a2edb6..7ddff7cb22e 100644
--- a/fabtests/Makefile.am
+++ b/fabtests/Makefile.am
@@ -69,7 +69,8 @@ bin_PROGRAMS = \
 	multinode/fi_multinode_coll \
 	component/sock_test \
 	regression/sighandler_test \
-	common/check_hmem
+	common/check_hmem \
+	common/check_cuda_dmabuf
 
 if HAVE_ZE_DEVEL
 if HAVE_VERBS_DEVEL
@@ -619,6 +620,14 @@ common_check_hmem_LDADD = libfabtests.la
 common_checK_hmem_CFLAGS = \
 	$(AM_CFLAGS)
 
+common_check_cuda_dmabuf_SOURCES = \
+	common/check_cuda_dmabuf.c
+
+common_check_cuda_dmabuf_LDADD = libfabtests.la
+
+common_check_cuda_dmabuf_CFLAGS = \
+	$(AM_CFLAGS)
+
 real_man_pages = \
 	man/man7/fabtests.7
 
diff --git a/fabtests/common/check_cuda_dmabuf.c b/fabtests/common/check_cuda_dmabuf.c
new file mode 100644
index 00000000000..5e6867b15d0
--- /dev/null
+++ b/fabtests/common/check_cuda_dmabuf.c
@@ -0,0 +1,33 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+ * SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
+*/ + +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + int ret; + + /* Make sure default CUDA device is sane for ft_cuda_init() */ + opts = INIT_OPTS; + opts.device = 0; /* cuda device 0 */ + + /* Initialize CUDA side only; avoid ft_init_fabric() */ + ret = ft_cuda_init(); + if (ret != FI_SUCCESS) { + FT_ERR("ft_cuda_init failed: %d", ret); + return FT_CUDA_NOT_SUPPORTED; + } + + enum ft_cuda_memory_support cuda_memory_support = ft_cuda_memory_support(); + FT_INFO("dmabuf: ft_cuda_memory_support() -> %d", cuda_memory_support); + + ft_cuda_cleanup(); + return cuda_memory_support; +} \ No newline at end of file diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c index f056a30f91a..3a96cacd77a 100644 --- a/fabtests/common/hmem_cuda.c +++ b/fabtests/common/hmem_cuda.c @@ -65,6 +65,8 @@ struct cuda_ops { CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); CUresult (*cuDeviceGet)(CUdevice* device, int ordinal); + CUresult (*cuDeviceGetName)(char* name, int len, CUdevice dev); + CUresult (*cuDriverGetVersion)(int* driverVersion); CUresult (*cuMemGetAddressRange)( CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); }; @@ -73,6 +75,24 @@ static struct cuda_ops cuda_ops; static void *cudart_handle; static void *cuda_handle; static bool dmabuf_supported; +static bool gdr_supported; +static enum ft_cuda_memory_support cuda_memory_support = FT_CUDA_NOT_INITIALIZED; +static const char* get_cuda_memory_support_str(enum ft_cuda_memory_support support) { + switch (support) { + case FT_CUDA_NOT_INITIALIZED: + return "NOT_INITIALIZED"; + case FT_CUDA_NOT_SUPPORTED: + return "NOT_SUPPORTED"; + case FT_CUDA_DMA_BUF_ONLY: + return "DMA_BUF_ONLY"; + case FT_CUDA_GDR_ONLY: + return "GDR_ONLY"; + case FT_CUDA_DMABUF_GDR_BOTH: + return "DMABUF_GDR_BOTH"; + default: + return "INVALID"; + } +} /** * Since function names can get redefined in cuda.h/cuda_runtime.h files, @@ -113,43 +133,133 @@ static int ft_cuda_pointer_set_attribute(void *buf) } /** - * @brief detect dmabuf support in the current platform - * This checks the dmabuf support in the current platform - * by querying the property of cuda device 0 + * @brief Detect CUDA memory transport support (dma-buf vs P2P) + * + * This routine queries device 0 for: + * - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED + * - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED + * - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR + * + * Logic is derived from CUDA 13.0 release notes and Blackwell compatibility guide: + * - https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html + * - https://docs.nvidia.com/cuda/blackwell-compatibility-guide/ + * - https://developer.nvidia.com/blog/cuda-toolkit-12-8-delivers-nvidia-blackwell-support/ + * + * NVIDIA deprecated GPUDirect RDMA (nv-p2p APIs) starting with Blackwell + * (compute capability >= 10). Applications must migrate to dma-buf. * - * @return FI_SUCCESS if dmabuf support check is successful - * -FI_EIO upon CUDA API error + * Truth table for effective support (device memory only): + * + * DMA_BUF GPU_DIRECT_RDMA Result + * ------- ---------------- ------------------------ + * 0 0 NOT_SUPPORTED + * 0 1 GDR_ONLY + * 1 0 DMA_BUF_ONLY + * 1 1 DMABUF_GDR_BOTH + * + * Note: + * - CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED is orthogonal and + * indicates whether cudaHostAlloc() memory can be exported as dma-buf. 
+ * - On compute capability >= 10 (Blackwell), we force GPU_DIRECT_RDMA=0 + * regardless of attribute value to align with the deprecation notice. + * + * @return FI_SUCCESS on success + * -FI_EIO on CUDA API error + * Sets global flags dmabuf_supported, gdr_supported, and cuda_memory_support. */ -static int ft_cuda_hmem_detect_dmabuf_support(void) +static int ft_cuda_detect_memory_support(void) { - dmabuf_supported = false; #if HAVE_CUDA_DMABUF - CUresult cuda_ret; - CUdevice dev; - int is_supported = 0; - - cuda_ret = cuda_ops.cuDeviceGet(&dev, 0); - if (cuda_ret != CUDA_SUCCESS) { - ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet"); - return -FI_EIO; - } - - cuda_ret = cuda_ops.cuDeviceGetAttribute(&is_supported, - CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev); - if (cuda_ret != CUDA_SUCCESS) { - ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute"); - return -FI_EIO; - } + CUresult cuda_ret; + CUdevice dev; + int cc_major = 0, cc_minor = 0; + int dma_buf_attr = 0; + int gdr_attr = 0; + cuda_memory_support = FT_CUDA_NOT_INITIALIZED; + + + FT_INFO("ft_cuda_detect_memory_support() called"); + + cuda_ret = cuda_ops.cuDeviceGet(&dev, 0); + if (cuda_ret != CUDA_SUCCESS) { + ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGet"); + cuda_memory_support = FT_CUDA_NOT_SUPPORTED; + return -FI_EIO; + } + + cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); + if (cuda_ret != CUDA_SUCCESS) { + ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MAJOR)"); + cuda_memory_support = FT_CUDA_NOT_SUPPORTED; + return -FI_EIO; + } + + cuda_ret = cuda_ops.cuDeviceGetAttribute(&cc_minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); + if (cuda_ret != CUDA_SUCCESS) { + ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(CC_MINOR)"); + cuda_memory_support = FT_CUDA_NOT_SUPPORTED; + return -FI_EIO; + } + + cuda_ret = cuda_ops.cuDeviceGetAttribute(&dma_buf_attr, + CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev); + if (cuda_ret != CUDA_SUCCESS) { + ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(DMA_BUF_SUPPORTED)"); + cuda_memory_support = FT_CUDA_NOT_SUPPORTED; + return -FI_EIO; + } + + cuda_ret = cuda_ops.cuDeviceGetAttribute(&gdr_attr, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, dev); + if (cuda_ret != CUDA_SUCCESS) { + ft_cuda_driver_api_print_error(cuda_ret, "cuDeviceGetAttribute(GPU_DIRECT_RDMA_SUPPORTED)"); + cuda_memory_support = FT_CUDA_NOT_SUPPORTED; + return -FI_EIO; + } + + dmabuf_supported = (dma_buf_attr == 1); + + if (cc_major >= 10) { + // Blackwell or newer: nv-p2p deprecated + FT_INFO("Compute capability %d.%d: forcing gdr_supported=false due to Blackwell deprecation", cc_major, cc_minor); + gdr_supported = false; + } else { + gdr_supported = (gdr_attr == 1); + } + + FT_INFO("Compute capability %d.%d", cc_major, cc_minor); + FT_INFO("dmabuf_supported=%s", dmabuf_supported ? "true" : "false"); + FT_INFO("GPU_DIRECT_RDMA_SUPPORTED raw=%d -> gdr_supported=%s", + gdr_attr, gdr_supported ? 
"true" : "false"); + + // Final truth table + if (!gdr_supported && !dmabuf_supported) + cuda_memory_support = FT_CUDA_NOT_SUPPORTED; + else if (gdr_supported && dmabuf_supported) + cuda_memory_support = FT_CUDA_DMABUF_GDR_BOTH; + else if (dmabuf_supported) + cuda_memory_support = FT_CUDA_DMA_BUF_ONLY; + else + cuda_memory_support = FT_CUDA_GDR_ONLY; + + FT_INFO("cuda_memory_support=%s", get_cuda_memory_support_str(cuda_memory_support)); + return FI_SUCCESS; - dmabuf_supported = (is_supported == 1); +#else + FT_INFO("HAVE_CUDA_DMABUF not enabled, returning FT_CUDA_NOT_INITIALIZED"); + cuda_memory_support = FT_CUDA_NOT_INITIALIZED; + return FI_SUCCESS; #endif - return FI_SUCCESS; } + int ft_cuda_init(void) { cudaError_t cuda_ret; int ret; + cuda_memory_support = FT_CUDA_NOT_INITIALIZED; cudart_handle = dlopen("libcudart.so", RTLD_NOW); if (!cudart_handle) { @@ -261,6 +371,20 @@ int ft_cuda_init(void) goto err_dlclose_cuda; } + cuda_ops.cuDeviceGetName = dlsym(cuda_handle, + STRINGIFY(cuDeviceGetName)); + if (!cuda_ops.cuDeviceGetName) { + FT_ERR("Failed to find cuDeviceGetName\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cuDriverGetVersion = dlsym(cuda_handle, + STRINGIFY(cuDriverGetVersion)); + if (!cuda_ops.cuDriverGetVersion) { + FT_ERR("Failed to find cuDriverGetVersion\n"); + goto err_dlclose_cuda; + } + cuda_ops.cuMemGetAddressRange = dlsym(cuda_handle, STRINGIFY(cuMemGetAddressRange)); if (!cuda_ops.cuMemGetAddressRange) { @@ -274,9 +398,11 @@ int ft_cuda_init(void) goto err_dlclose_cuda; } - ret = ft_cuda_hmem_detect_dmabuf_support(); - if (ret != FI_SUCCESS) + ret = ft_cuda_detect_memory_support(); + if (ret != FI_SUCCESS) { goto err_dlclose_cuda; + } + return FI_SUCCESS; @@ -495,6 +621,14 @@ int ft_cuda_put_dmabuf_fd(int fd) #endif /* HAVE_CUDA_DMABUF */ } +enum ft_cuda_memory_support ft_cuda_memory_support(void) +{ + if (cuda_memory_support == FT_CUDA_NOT_INITIALIZED) { + FT_INFO("ft_cuda_memory_support() not called yet!"); + } + return cuda_memory_support; +} + #else int ft_cuda_init(void) @@ -554,4 +688,9 @@ int ft_cuda_put_dmabuf_fd(int fd) { return -FI_ENOSYS; } + +enum ft_cuda_memory_support ft_cuda_memory_support(void) +{ + return FT_CUDA_NOT_SUPPORTED; +} #endif /* HAVE_CUDA_RUNTIME_H */ diff --git a/fabtests/include/hmem.h b/fabtests/include/hmem.h index 2a8498f7c2f..db837e1d5ad 100644 --- a/fabtests/include/hmem.h +++ b/fabtests/include/hmem.h @@ -35,6 +35,14 @@ #include #include +enum ft_cuda_memory_support { + FT_CUDA_NOT_INITIALIZED = -1, + FT_CUDA_NOT_SUPPORTED = 0, + FT_CUDA_DMA_BUF_ONLY = 1, + FT_CUDA_GDR_ONLY = 2, + FT_CUDA_DMABUF_GDR_BOTH = 3, +}; + #if HAVE_ZE #include extern struct libze_ops { @@ -185,6 +193,7 @@ int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src, int ft_cuda_get_dmabuf_fd(void *buf, size_t len, int *fd, uint64_t *offset); int ft_cuda_put_dmabuf_fd(int fd); +enum ft_cuda_memory_support ft_cuda_memory_support(void); int ft_rocr_init(void); int ft_rocr_cleanup(void); From f664b171bd0315206d1f027ce07aecc5bace6945 Mon Sep 17 00:00:00 2001 From: Nick Mazzilli Date: Tue, 7 Oct 2025 12:33:29 -0700 Subject: [PATCH 2/2] fabtests/efa: cuda dmabuf validation logic test: Adding cuda dmabuf validation logic Problem: - Users was submitting fabtests without the --do-dmabuf-reg-for-hmem flag Solution: - Added conftest to validate these checks if a user specifies cuda command in fabtests Testing: - Validated tests with --do-dmabuf-reg-for-hmem flag and without it on p6-gb200 cluster - All tests skipped with cuda command - make -j install 
&& python3 install/bin/runfabtests.py --expression "cuda" -vvv -p /home/nmazzill/libfabric/fabtests/install/bin/ --junit-xml ft_`git branch --show-current`_`git rev-parse HEAD`_all_junit.xml --nworkers 16 -b efa 10.0.123.149 10.0.121.190 | tee ft_`git branch --show-current`_`git rev-parse HEAD`_all_stdout - All tests passed with this cuda command - make -j install && python3 install/bin/runfabtests.py --expression "cuda" --do-dmabuf-reg-for-hmem -vvv -p /home/nmazzill/libfabric/fabtests/install/bin/ --junit-xml ft_`git branch --show-current`_`git rev-parse HEAD`_all_junit.xml --nworkers 16 -b efa 10.0.123.149 10.0.121.190 | tee ft_`git branch --show-current`_`git rev-parse HEAD`_all_stdout Sim Issue: - N/A Signed-off-by: Nick Mazzilli --- fabtests/pytest/efa/conftest.py | 58 ++++++++++++++++++++++++++++- fabtests/pytest/efa/efa_common.py | 61 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/fabtests/pytest/efa/conftest.py b/fabtests/pytest/efa/conftest.py index e980a258f6b..37e018a99de 100644 --- a/fabtests/pytest/efa/conftest.py +++ b/fabtests/pytest/efa/conftest.py @@ -1,5 +1,11 @@ import pytest -from efa_common import has_rdma, support_cq_interrupts +import time +from efa_common import ( + has_rdma, + support_cq_interrupts, + CudaMemorySupport, + get_cuda_memory_support, +) # The memory types for bi-directional tests. memory_type_list_bi_dir = [ @@ -99,6 +105,56 @@ def rma_fabric(cmdline_args, fabric): pytest.skip("FI_RMA is not supported. Skip rma tests on efa-direct.") return fabric + +def cuda_memory_type_validation(cmdline_args): + """ + Validate CUDA memory type configuration against hardware capabilities at session startup. + + Args: + cmdline_args: Command line arguments containing dmabuf configuration. 
+ + Returns: + None + + Notes: + - Skips tests if user specified non-dmabuf but hardware only supports DMA_BUF_ONLY + - Only validates if CUDA tests are being run + """ + # Check if CUDA tests are being run via expression + print("Running cuda_memory_type_validation() validation checks!") + + cuda_support: CudaMemorySupport = get_cuda_memory_support( + cmdline_args=cmdline_args, + ip=cmdline_args.server_id + ) + + if cuda_support == CudaMemorySupport.NOT_INITIALIZED: + pytest.fail("CUDA memory support never initialized") + + do_dmabuf = cmdline_args.do_dmabuf_reg_for_hmem + if (do_dmabuf is None and + cuda_support == CudaMemorySupport.DMA_BUF_ONLY): + error = "User specified CUDA without dmabuf but hardware only supports DMA_BUF_ONLY" + print(f"CUDA validation failed: {error}") + pytest.skip(error) + + print(f"Correctly defined dma buf mode {do_dmabuf} and return {cuda_support}!") + + return + + +@pytest.fixture(scope="function", autouse=True) +def cuda_validation_fixture(request, cmdline_args): + """Auto-run CUDA validation if CUDA tests are present.""" + # Check if the current test has cuda_memory mark + has_cuda_mark = any(mark.name == 'cuda_memory' for mark in request.node.iter_markers()) + + if has_cuda_mark: + cuda_memory_type_validation(cmdline_args) + else: + print("No CUDA memory mark, skipping validation") + + @pytest.hookimpl(hookwrapper=True) def pytest_collection_modifyitems(session, config, items): # Called after collection has been performed, may filter or re-order the items in-place diff --git a/fabtests/pytest/efa/efa_common.py b/fabtests/pytest/efa/efa_common.py index 35e33c78cc8..016aa1bd895 100644 --- a/fabtests/pytest/efa/efa_common.py +++ b/fabtests/pytest/efa/efa_common.py @@ -1,9 +1,70 @@ import os import subprocess import functools +import pytest +from enum import IntEnum from common import SshConnectionError, is_ssh_connection_error, has_ssh_connection_err_msg, ClientServerTest from retrying import retry + +class CudaMemorySupport(IntEnum): + NOT_INITIALIZED = -1 + NOT_SUPPORTED = 0 + DMA_BUF_ONLY = 1 + GDR_ONLY = 2 + DMABUF_GDR_BOTH = 3 + + def __str__(self): + return self.name + +@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) +def get_cuda_memory_support(cmdline_args, ip): + """ + Execute check_dmabuf binary to determine CUDA memory support capabilities. + + Args: + cmdline_args: Command line arguments containing binpath, timeout, provider, and environments. + ip: IP address or hostname of the target machine. + + Returns: + CudaMemorySupport: Enum value indicating hardware CUDA memory support type. 
+
+    Notes:
+        - Executes the check_cuda_dmabuf binary remotely via SSH with timeout
+        - Maps the return code directly to CudaMemorySupport enum values
+        - Returns NOT_SUPPORTED for negative return codes indicating errors
+        - Retries on SSH connection errors up to 3 times
+    """
+    binpath = cmdline_args.binpath or ""
+    cmd = "timeout " + str(cmdline_args.timeout) \
+          + " " + os.path.join(binpath, "check_cuda_dmabuf") \
+          + " -p " + cmdline_args.provider
+    if cmdline_args.environments:
+        cmd = cmdline_args.environments + " " + cmd
+
+    proc = subprocess.run("ssh {} {}".format(ip, cmd),
+                          stdout=subprocess.PIPE,
+                          stderr=subprocess.STDOUT,
+                          shell=True,
+                          universal_newlines=True)
+
+    if has_ssh_connection_err_msg(proc.stdout):
+        raise SshConnectionError()
+
+    if proc.returncode < 0:
+        return CudaMemorySupport.NOT_SUPPORTED
+
+    print(f"The ssh return is {proc}")
+    rc = proc.returncode
+    if rc not in (CudaMemorySupport.NOT_SUPPORTED,
+                  CudaMemorySupport.DMA_BUF_ONLY,
+                  CudaMemorySupport.GDR_ONLY,
+                  CudaMemorySupport.DMABUF_GDR_BOTH):
+        print(f"[warn] check_cuda_dmabuf returned unexpected code {rc}, treating as NOT_INITIALIZED")
+        return CudaMemorySupport.NOT_INITIALIZED
+
+    return CudaMemorySupport(rc)
+
 def efa_run_client_server_test(cmdline_args, executable, iteration_type,
                                completion_semantic, memory_type, message_size,
                                warmup_iteration_type=None, timeout=None,