diff --git a/clang/include/clang/DPCT/DPCTOptions.inc b/clang/include/clang/DPCT/DPCTOptions.inc index afa05039ad58..37deb377c8cc 100644 --- a/clang/include/clang/DPCT/DPCTOptions.inc +++ b/clang/include/clang/DPCT/DPCTOptions.inc @@ -813,8 +813,8 @@ DPCT_ENUM_OPTION( false), DPCT_OPTION_ENUM_VALUE( "virtual_mem", int(ExperimentalFeatures::Exp_VirtualMemory), - "Experimental extension that allows for mapping of an address range onto " - "multiple allocations of physical memory.", + "Experimental extension that allows for mapping of an address " + "range onto multiple allocations of physical memory.", false), DPCT_OPTION_ENUM_VALUE( "in_order_queue_events", @@ -838,7 +838,13 @@ DPCT_ENUM_OPTION( "level_zero", int(ExperimentalFeatures::Exp_LevelZero), "Experimental migration feature that enables the use of Level Zero " "APIs to migrate target code, like CUDA Inter-Process " - "Communication (IPC) APIs.\n", false), + "Communication (IPC) APIs.\n", + false), + DPCT_OPTION_ENUM_VALUE("async_alloc", + int(ExperimentalFeatures::Exp_AsyncAlloc), + "Experimental extension that allows use of SYCL " + "async allocation APIs.\n", + false), DPCT_OPTION_ENUM_VALUE( "all", int(ExperimentalFeatures::Exp_All), "Enable all experimental extensions listed in this option.\n", diff --git a/clang/lib/DPCT/AnalysisInfo.h b/clang/lib/DPCT/AnalysisInfo.h index 06958a3785ec..507d2df800c9 100644 --- a/clang/lib/DPCT/AnalysisInfo.h +++ b/clang/lib/DPCT/AnalysisInfo.h @@ -1352,6 +1352,9 @@ class DpctGlobalInfo { static bool useExtLevelZero() { return getUsingExperimental(); } + static bool useExtAsyncAlloc() { + return getUsingExperimental(); + } static bool useExtPrefetch() { return getUsingExperimental(); } diff --git a/clang/lib/DPCT/CommandOption/ValidateArguments.h b/clang/lib/DPCT/CommandOption/ValidateArguments.h index d7ac5f463211..f6b04b582c74 100644 --- a/clang/lib/DPCT/CommandOption/ValidateArguments.h +++ b/clang/lib/DPCT/CommandOption/ValidateArguments.h @@ -103,6 +103,7 @@ enum class ExperimentalFeatures : unsigned int { Exp_NonStandardSYCLBuiltins, Exp_Prefetch, Exp_LevelZero, + Exp_AsyncAlloc, Exp_All }; enum class HelperFuncPreference : unsigned int { NoQueueDevice = 0 }; diff --git a/clang/lib/DPCT/RuleInfra/CallExprRewriterCommon.h b/clang/lib/DPCT/RuleInfra/CallExprRewriterCommon.h index f7a2c8e545fe..628c7e9b5dcb 100644 --- a/clang/lib/DPCT/RuleInfra/CallExprRewriterCommon.h +++ b/clang/lib/DPCT/RuleInfra/CallExprRewriterCommon.h @@ -1827,6 +1827,10 @@ inline auto UseExtLevelZero = [](const CallExpr *C) -> bool { return DpctGlobalInfo::useExtLevelZero(); }; +inline auto UseExtAsyncAlloc = [](const CallExpr *C) -> bool { + return DpctGlobalInfo::useExtAsyncAlloc(); +}; + inline auto UseExtGraph = [](const CallExpr *C) -> bool { return DpctGlobalInfo::useExtGraph(); }; diff --git a/clang/lib/DPCT/RulesInclude/HeaderTypes.inc b/clang/lib/DPCT/RulesInclude/HeaderTypes.inc index 21e7a643c47e..2279c9b1416f 100644 --- a/clang/lib/DPCT/RulesInclude/HeaderTypes.inc +++ b/clang/lib/DPCT/RulesInclude/HeaderTypes.inc @@ -52,6 +52,8 @@ STD_HEADER(DL, "") #endif STD_HEADER(SHMEM, "") STD_HEADER(SHMEMX, "") +STD_HEADER(AsyncAlloc, + "") ONEDPL_HEADER(Algorithm, "") ONEDPL_HEADER(Execution, "") diff --git a/clang/lib/DPCT/RulesLang/APINamesMemory.inc b/clang/lib/DPCT/RulesLang/APINamesMemory.inc index 3d959fb3fd23..538e222e1760 100644 --- a/clang/lib/DPCT/RulesLang/APINamesMemory.inc +++ b/clang/lib/DPCT/RulesLang/APINamesMemory.inc @@ -993,6 +993,47 @@ CONDITIONAL_FACTORY_ENTRY( "memcpy", false), MEM_ARG(0), MEM_ARG(1), ARG(2), ARG(3)))))) +CONDITIONAL_FACTORY_ENTRY( + checkIsUSM(), + CONDITIONAL_FACTORY_ENTRY( + CheckArgCount(3), + CONDITIONAL_FACTORY_ENTRY( + UseExtAsyncAlloc, + ASSIGNABLE_FACTORY(HEADER_INSERT_FACTORY( + HeaderType::HT_AsyncAlloc, + ASSIGN_FACTORY_ENTRY( + "cudaMallocAsync", DEREF(0), + CALL(MapNames::getClNamespace() + + "ext::oneapi::experimental::async_malloc", + DEREF(2), + ARG(MapNames::getClNamespace() + "usm::alloc::device"), + ARG(1))))), + UNSUPPORT_FACTORY_ENTRY( + "cudaMallocAsync", Diagnostics::TRY_EXPERIMENTAL_FEATURE, + ARG("cudaMallocAsync"), + ARG("--use-experimental-features=async_alloc"))), + UNSUPPORT_FACTORY_ENTRY("cudaMallocAsync", + Diagnostics::API_NOT_MIGRATED, + ARG("cudaMallocAsync"))), + UNSUPPORT_FACTORY_ENTRY("cudaMallocAsync", Diagnostics::API_NOT_MIGRATED, + ARG("cudaMallocAsync"))) +CONDITIONAL_FACTORY_ENTRY( + checkIsUSM(), + CONDITIONAL_FACTORY_ENTRY( + UseExtAsyncAlloc, + ASSIGNABLE_FACTORY(HEADER_INSERT_FACTORY( + HeaderType::HT_AsyncAlloc, + CALL_FACTORY_ENTRY("cudaFreeAsync", + CALL(MapNames::getClNamespace() + + "ext::oneapi::experimental::async_free", + DEREF(1), ARG(0))))), + UNSUPPORT_FACTORY_ENTRY( + "cudaFreeAsync", Diagnostics::TRY_EXPERIMENTAL_FEATURE, + ARG("cudaFreeAsync"), + ARG("--use-experimental-features=async_alloc"))), + UNSUPPORT_FACTORY_ENTRY("cudaFreeAsync", Diagnostics::API_NOT_MIGRATED, + ARG("cudaFreeAsync"))) + #define CUDA_FREE(NAME) \ CONDITIONAL_FACTORY_ENTRY( \ hasManagedAttr(0), \ diff --git a/clang/lib/DPCT/RulesLang/RulesLang.cpp b/clang/lib/DPCT/RulesLang/RulesLang.cpp index e4cefbbc1c85..f4c76ded4906 100644 --- a/clang/lib/DPCT/RulesLang/RulesLang.cpp +++ b/clang/lib/DPCT/RulesLang/RulesLang.cpp @@ -5806,7 +5806,7 @@ void MemoryMigrationRule::mallocMigration( } else if (Name == "cudaHostAlloc" || Name == "cudaMallocHost" || Name == "cuMemHostAlloc" || Name == "cuMemAllocHost_v2" || Name == "cuMemAllocPitch_v2" || Name == "cudaMallocPitch" || - Name == "cudaMallocMipmappedArray") { + Name == "cudaMallocMipmappedArray" || Name == "cudaMallocAsync") { ExprAnalysis EA(C); emplaceTransformation(EA.getReplacement()); EA.applyAllSubExprRepl(); @@ -6837,7 +6837,7 @@ void MemoryMigrationRule::registerMatcher(MatchFinder &MF) { "cuMemsetD8_v2", "cuMemsetD8Async", "cudaMallocMipmappedArray", "cudaGetMipmappedArrayLevel", "cudaFreeMipmappedArray", "cudaMemcpyPeer", "cudaMemcpyPeerAsync", "cuMemcpyPeer", - "cuMemcpyPeerAsync"); + "cuMemcpyPeerAsync", "cudaMallocAsync", "cudaFreeAsync"); }; MF.addMatcher(callExpr(allOf(callee(functionDecl(memoryAPI())), parentStmt())) @@ -6927,7 +6927,8 @@ void MemoryMigrationRule::runRule(const MatchFinder::MatchResult &Result) { Name.compare("cudaMallocMipmappedArray") && Name.compare("cudaGetMipmappedArrayLevel") && Name.compare("cudaFreeMipmappedArray") && Name.compare("cudaMemcpy") && - Name.compare("cudaFree") && Name.compare("cublasFree")) { + Name.compare("cudaFree") && Name.compare("cublasFree") && + Name.compare("cudaMallocAsync") && Name.compare("cudaFreeAsync")) { requestFeature(HelperFeatureEnum::device_ext); insertAroundStmt(C, MapNames::getCheckErrorMacroName() + "(", ")"); } else if (IsAssigned && !Name.compare("cudaMemAdvise") && @@ -6990,6 +6991,7 @@ MemoryMigrationRule::MemoryMigrationRule() { const CallExpr *, const UnresolvedLookupExpr *, bool)>> Dispatcher{ {"cudaMalloc", &MemoryMigrationRule::mallocMigration}, + {"cudaMallocAsync", &MemoryMigrationRule::mallocMigration}, {"cuMemAlloc_v2", &MemoryMigrationRule::mallocMigration}, {"cudaHostAlloc", &MemoryMigrationRule::mallocMigration}, {"cudaMallocHost", &MemoryMigrationRule::mallocMigration}, @@ -7052,6 +7054,7 @@ MemoryMigrationRule::MemoryMigrationRule() { {"cuMemcpyDtoA_v2", &MemoryMigrationRule::arrayMigration}, {"cuMemcpyAtoA_v2", &MemoryMigrationRule::arrayMigration}, {"cudaFree", &MemoryMigrationRule::freeMigration}, + {"cudaFreeAsync", &MemoryMigrationRule::freeMigration}, {"cuMemFree_v2", &MemoryMigrationRule::freeMigration}, {"cudaFreeArray", &MemoryMigrationRule::freeMigration}, {"cudaFreeMipmappedArray", &MemoryMigrationRule::freeMigration}, diff --git a/clang/lib/DPCT/SrcAPI/APINames.inc b/clang/lib/DPCT/SrcAPI/APINames.inc index 8f17aaa8d9a2..043aa4321e3f 100644 --- a/clang/lib/DPCT/SrcAPI/APINames.inc +++ b/clang/lib/DPCT/SrcAPI/APINames.inc @@ -237,8 +237,8 @@ ENTRY(cudaMemcpyToArray, cudaMemcpyToArray, true, NO_FLAG, P0, "Successful") ENTRY(cudaMemcpyToArrayAsync, cudaMemcpyToArrayAsync, true, NO_FLAG, P0, "Successful") // stream ordered memory allocator functions of runtime API -ENTRY(cudaFreeAsync, cudaFreeAsync, false, NO_FLAG, P7, "comment") -ENTRY(cudaMallocAsync, cudaMallocAsync, false, NO_FLAG, P7, "comment") +ENTRY(cudaFreeAsync, cudaFreeAsync, true, NO_FLAG, P7, "comment") +ENTRY(cudaMallocAsync, cudaMallocAsync, true, NO_FLAG, P7, "partial") ENTRY(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync, false, NO_FLAG, P7, "comment") ENTRY(cudaMemPoolCreate, cudaMemPoolCreate, false, NO_FLAG, P4, "comment") ENTRY(cudaMemPoolDestroy, cudaMemPoolDestroy, false, NO_FLAG, P7, "comment") diff --git a/clang/test/dpct/async_alloc.cu b/clang/test/dpct/async_alloc.cu new file mode 100644 index 000000000000..9a80e8db5ba3 --- /dev/null +++ b/clang/test/dpct/async_alloc.cu @@ -0,0 +1,23 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2, cuda-11.0, cuda-11.1 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2, v11.0, v11.1 +// RUN: dpct --format-range=none --out-root %T/async_alloc %s --cuda-include-path="%cuda-path/include" --use-experimental-features=async_alloc +// RUN: FileCheck --match-full-lines --input-file %T/async_alloc/async_alloc.dp.cpp %s +// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/async_alloc/async_alloc.dp.cpp -o %T/async_alloc/async_alloc.dp.o %} + +// CHECK: #include + +void foo_1(float *f, cudaStream_t hStream) { + // CHECK: cudaMemPool_t memPool; + // CHECK-NEXT: /* + // CHECK-NEXT: DPCT1007:{{[0-9]+}}: Migration of cudaMallocAsync is not supported. + // CHECK-NEXT: */ + // CHECK-NEXT: cudaMallocAsync(&f, 1024, memPool, hStream); + // CHECK: f = sycl::ext::oneapi::experimental::async_malloc(*hStream, sycl::usm::alloc::device, 1024); + // CHECK-NEXT: sycl::ext::oneapi::experimental::async_free(*hStream, f); +#ifndef NO_BUILD_TEST + cudaMemPool_t memPool; + cudaMallocAsync(&f, 1024, memPool, hStream); +#endif + cudaMallocAsync(&f, 1024, hStream); + cudaFreeAsync(f, hStream); +} diff --git a/clang/test/dpct/async_alloc_no_ext.cu b/clang/test/dpct/async_alloc_no_ext.cu new file mode 100644 index 000000000000..947324e60c6a --- /dev/null +++ b/clang/test/dpct/async_alloc_no_ext.cu @@ -0,0 +1,27 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2, cuda-11.0, cuda-11.1 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2, v11.0, v11.1 +// RUN: dpct --format-range=none --out-root %T/async_alloc_no_ext %s --cuda-include-path="%cuda-path/include" +// RUN: FileCheck --match-full-lines --input-file %T/async_alloc_no_ext/async_alloc_no_ext.dp.cpp %s +// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/async_alloc_no_ext/async_alloc_no_ext.dp.cpp -o %T/async_alloc_no_ext/async_alloc_no_ext.dp.o %} + +void foo_1(float *f, cudaStream_t hStream) { + // CHECK: cudaMemPool_t memPool; + // CHECK-NEXT: /* + // CHECK-NEXT: DPCT1007:{{[0-9]+}}: Migration of cudaMallocAsync is not supported. + // CHECK-NEXT: */ + // CHECK-NEXT: cudaMallocAsync(&f, 1024, memPool, hStream); + // CHECK-NEXT: /* + // CHECK-NEXT: DPCT1119:{{[0-9]+}}: Migration of cudaMallocAsync is not supported, please try to remigrate with option: --use-experimental-features=async_alloc. + // CHECK-NEXT: */ + // CHECK-NEXT: cudaMallocAsync(&f, 1024, hStream); + // CHECK-NEXT: /* + // CHECK-NEXT: DPCT1119:{{[0-9]+}}: Migration of cudaFreeAsync is not supported, please try to remigrate with option: --use-experimental-features=async_alloc. + // CHECK-NEXT: */ + // CHECK-NEXT: cudaFreeAsync(f, hStream); +#ifndef NO_BUILD_TEST + cudaMemPool_t memPool; + cudaMallocAsync(&f, 1024, memPool, hStream); + cudaMallocAsync(&f, 1024, hStream); + cudaFreeAsync(f, hStream); +#endif +} diff --git a/clang/test/dpct/help_option_check/lin/help_advanced.txt b/clang/test/dpct/help_option_check/lin/help_advanced.txt index eec0dd19163c..5e699c861c3f 100644 --- a/clang/test/dpct/help_option_check/lin/help_advanced.txt +++ b/clang/test/dpct/help_option_check/lin/help_advanced.txt @@ -65,6 +65,7 @@ Advanced DPCT options =non-stdandard-sycl-builtins - Experimental extension that allows use of non standard SYCL builtin functions. =prefetch - Experimental extension that allows use of SYCL prefetch APIs. =level_zero - Experimental migration feature that enables the use of Level Zero APIs to migrate target code, like CUDA Inter-Process Communication (IPC) APIs. + =async_alloc - Experimental extension that allows use of SYCL async allocation APIs. =all - Enable all experimental extensions listed in this option. ... Paths of input source files. These paths are looked up in the compilation database. diff --git a/clang/test/dpct/help_option_check/lin/help_all.txt b/clang/test/dpct/help_option_check/lin/help_all.txt index 0a759cf82505..0b34c34bd810 100644 --- a/clang/test/dpct/help_option_check/lin/help_all.txt +++ b/clang/test/dpct/help_option_check/lin/help_all.txt @@ -172,6 +172,7 @@ All DPCT options =non-stdandard-sycl-builtins - Experimental extension that allows use of non standard SYCL builtin functions. =prefetch - Experimental extension that allows use of SYCL prefetch APIs. =level_zero - Experimental migration feature that enables the use of Level Zero APIs to migrate target code, like CUDA Inter-Process Communication (IPC) APIs. + =async_alloc - Experimental extension that allows use of SYCL async allocation APIs. =all - Enable all experimental extensions listed in this option. --use-explicit-namespace= - Define the namespaces to use explicitly in generated code. The is a comma separated list. Default: dpct/syclcompat, sycl. diff --git a/clang/test/dpct/help_option_check/win/help_advanced.txt b/clang/test/dpct/help_option_check/win/help_advanced.txt index 965b8f959536..6543d07d1711 100644 --- a/clang/test/dpct/help_option_check/win/help_advanced.txt +++ b/clang/test/dpct/help_option_check/win/help_advanced.txt @@ -65,6 +65,7 @@ Advanced DPCT options =non-stdandard-sycl-builtins - Experimental extension that allows use of non standard SYCL builtin functions. =prefetch - Experimental extension that allows use of SYCL prefetch APIs. =level_zero - Experimental migration feature that enables the use of Level Zero APIs to migrate target code, like CUDA Inter-Process Communication (IPC) APIs. + =async_alloc - Experimental extension that allows use of SYCL async allocation APIs. =all - Enable all experimental extensions listed in this option. ... Paths of input source files. These paths are looked up in the compilation database. diff --git a/clang/test/dpct/help_option_check/win/help_all.txt b/clang/test/dpct/help_option_check/win/help_all.txt index 7887c4a5a5dc..a988d0d04f4a 100644 --- a/clang/test/dpct/help_option_check/win/help_all.txt +++ b/clang/test/dpct/help_option_check/win/help_all.txt @@ -171,6 +171,7 @@ All DPCT options =non-stdandard-sycl-builtins - Experimental extension that allows use of non standard SYCL builtin functions. =prefetch - Experimental extension that allows use of SYCL prefetch APIs. =level_zero - Experimental migration feature that enables the use of Level Zero APIs to migrate target code, like CUDA Inter-Process Communication (IPC) APIs. + =async_alloc - Experimental extension that allows use of SYCL async allocation APIs. =all - Enable all experimental extensions listed in this option. --use-explicit-namespace= - Define the namespaces to use explicitly in generated code. The is a comma separated list. Default: dpct/syclcompat, sycl.