From c632bbadd6ba83f25f953569011ff47c69f51e38 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Fri, 17 Oct 2025 03:40:54 +0000 Subject: [PATCH 01/39] Add experimental builder infrastructure for composable_kernel - Add experimental/builder directory with README documentation. - Create initial test infrastructure with CMakeLists.txt and placeholder test. - Update root CMakeLists.txt to support CK_EXPERIMENTAL_BUILDER option. - Update .gitignore to not treat `experimental/builder` as a CMake build directory. This establishes the directory structure for a high-level builder pattern that will provide a semantically-clear interface for constructing CK operations, with initial focus on convolution kernels for MIOpen integration. --- .gitignore | 8 +++-- CMakeLists.txt | 5 +++ experimental/builder/README.md | 34 +++++++++++++++++++ experimental/builder/test/CMakeLists.txt | 20 +++++++++++ .../builder/test/test_conv_builder.cpp | 9 +++++ 5 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 experimental/builder/README.md create mode 100644 experimental/builder/test/CMakeLists.txt create mode 100644 experimental/builder/test/test_conv_builder.cpp diff --git a/.gitignore b/.gitignore index e4dd8f7513..bcc5888b7f 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,7 @@ tags # Editors .vscode -# build-in-source directory +# build-in-source directory (see exceptions below) build* # emacs temporary/backup files @@ -58,7 +58,7 @@ _doxygen/ docs/doxygen/html docs/doxygen/xml -# JetBrains IDE +# JetBrains IDE (see build* exceptions below) .idea/ cmake-build*/ build*/ @@ -71,3 +71,7 @@ __pycache__/ .cache/ +# Exceptions to build* patterns above +# The experimental/builder directory should be tracked despite matching build* +!experimental/builder +!experimental/builder/** diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d3a83c34..d260d8c00e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,7 @@ include(CTest) option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON) option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF) +option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF) option(BUILD_MHA_LIB "Build the static library for flash attention" OFF) # Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8" @@ -692,6 +693,10 @@ if (NOT MIOPEN_REQ_LIBS_ONLY) add_subdirectory(profiler) endif() +if (CK_EXPERIMENTAL_BUILDER) + add_subdirectory(experimental/builder/test) +endif() + if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) add_subdirectory(codegen) endif() diff --git a/experimental/builder/README.md b/experimental/builder/README.md new file mode 100644 index 0000000000..d8b8757dc2 --- /dev/null +++ b/experimental/builder/README.md @@ -0,0 +1,34 @@ +# Builder + +This directory contains the experimental builder feature for composable_kernel. + +* Status: In development (October - November 2025) + +## Overview + +The builder provides a high-level, semantically-clear interface for constructing composable kernel operations, with an initial focus on convolution kernels for MIOpen. It leverages modern C++20 features (such as POD structs as non-type template parameters, concepts, and designated initializers) to simplify kernel instantiation and improve developer experience. + +This project is a prototype for a more general builder pattern for all of composable_kernel (CK) and CKTile, but is currently limited to formalizing the interface between MIOpen and CK. + +## Directory Structure + +- `include/ck_tile/builder/` + Core builder headers and public API. +- `test/` + Unit tests and example usage of the builder pattern. +- `CMakeLists.txt` + CMake configuration for building the experimental builder and its tests. + +## CMake Configuration + +To enable the experimental builder, configure your build with: + +```sh +cmake -DCK_EXPERIMENTAL_BUILDER=ON -DCMAKE_CXX_STANDARD=20 ... +``` +## Building and testing + +During development, build and test from the CK build directory with +```sh +ninja test_conv_builder && bin/test_conv_builder +``` diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt new file mode 100644 index 0000000000..b3647ade66 --- /dev/null +++ b/experimental/builder/test/CMakeLists.txt @@ -0,0 +1,20 @@ +set(CMAKE_CXX_STANDARD 20) + +include(gtest) + +# Helper function to create a gtest executable with common properties +function(add_ck_builder_test test_name) + add_executable(${test_name} ${ARGN}) + target_include_directories(${test_name} PRIVATE + "${PROJECT_SOURCE_DIR}/experimental/builder/include" + "${PROJECT_SOURCE_DIR}/include" + ) + target_compile_options(${test_name} PRIVATE + -Wno-global-constructors + -Wno-c++20-compat + ) + target_link_libraries(${test_name} PRIVATE GTest::gtest GTest::gtest_main GTest::gmock) +endfunction() + +add_ck_builder_test(test_conv_builder + test_conv_builder.cpp) diff --git a/experimental/builder/test/test_conv_builder.cpp b/experimental/builder/test/test_conv_builder.cpp new file mode 100644 index 0000000000..816c249ebd --- /dev/null +++ b/experimental/builder/test/test_conv_builder.cpp @@ -0,0 +1,9 @@ +#include + +class ConvBuilderTest : public ::testing::Test { +}; + +TEST_F(ConvBuilderTest, PlaceholderTest) { + // TODO: Implement actual test + EXPECT_TRUE(true); +} From db2561147e68a3d8552fe4c0891058049a436274 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Fri, 17 Oct 2025 15:21:28 +0000 Subject: [PATCH 02/39] Fix clang formatting. --- experimental/builder/test/test_conv_builder.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/experimental/builder/test/test_conv_builder.cpp b/experimental/builder/test/test_conv_builder.cpp index 816c249ebd..4ec189daa4 100644 --- a/experimental/builder/test/test_conv_builder.cpp +++ b/experimental/builder/test/test_conv_builder.cpp @@ -1,9 +1,11 @@ #include -class ConvBuilderTest : public ::testing::Test { +class ConvBuilderTest : public ::testing::Test +{ }; -TEST_F(ConvBuilderTest, PlaceholderTest) { +TEST_F(ConvBuilderTest, PlaceholderTest) +{ // TODO: Implement actual test EXPECT_TRUE(true); } From 121884d028a459ba48c2625742a31f2d69f2c273 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Fri, 17 Oct 2025 21:21:30 +0000 Subject: [PATCH 03/39] Fix CMake build infrastructure for experimental builder - Add experimental/builder CMakeLists.txt with proper subdirectory structure - Add placeholder include/ck_tile/builder CMakeLists.txt for header installation - Fix gtest.cmake to use include_guard to prevent multiple inclusions - Update root CMakeLists.txt to include full builder directory instead of just tests --- CMakeLists.txt | 2 +- cmake/gtest.cmake | 1 + experimental/builder/CMakeLists.txt | 3 +++ experimental/builder/include/ck_tile/builder/CMakeLists.txt | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 experimental/builder/CMakeLists.txt create mode 100644 experimental/builder/include/ck_tile/builder/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index d260d8c00e..310e2a6576 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -694,7 +694,7 @@ if (NOT MIOPEN_REQ_LIBS_ONLY) endif() if (CK_EXPERIMENTAL_BUILDER) - add_subdirectory(experimental/builder/test) + add_subdirectory(experimental/builder) endif() if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) diff --git a/cmake/gtest.cmake b/cmake/gtest.cmake index 41e2fa2cc0..9336d47e71 100644 --- a/cmake/gtest.cmake +++ b/cmake/gtest.cmake @@ -1,3 +1,4 @@ +include_guard(GLOBAL) include(FetchContent) set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against") diff --git a/experimental/builder/CMakeLists.txt b/experimental/builder/CMakeLists.txt new file mode 100644 index 0000000000..103acbad55 --- /dev/null +++ b/experimental/builder/CMakeLists.txt @@ -0,0 +1,3 @@ +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/experimental/builder/include/ck_tile/builder/CMakeLists.txt b/experimental/builder/include/ck_tile/builder/CMakeLists.txt new file mode 100644 index 0000000000..f20b5d54ec --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/CMakeLists.txt @@ -0,0 +1 @@ +# Empty placeholder until we add library code. From 79f057bfc99f090681738e409059284bb8516af6 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Mon, 20 Oct 2025 04:56:36 -0700 Subject: [PATCH 04/39] Scope C++20 settingto the test code Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- experimental/builder/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index b3647ade66..754e08d915 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -1,10 +1,10 @@ -set(CMAKE_CXX_STANDARD 20) include(gtest) # Helper function to create a gtest executable with common properties function(add_ck_builder_test test_name) add_executable(${test_name} ${ARGN}) + target_compile_features(${test_name} PRIVATE cxx_std_20) target_include_directories(${test_name} PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/include" "${PROJECT_SOURCE_DIR}/include" From 63a9d9f4bf2bd6fb456abbc06866e5a04188074a Mon Sep 17 00:00:00 2001 From: John Shumway Date: Mon, 20 Oct 2025 04:57:30 -0700 Subject: [PATCH 05/39] Remove redundant GTest::gtest linkage Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- experimental/builder/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 754e08d915..5890aa8dcd 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -13,7 +13,7 @@ function(add_ck_builder_test test_name) -Wno-global-constructors -Wno-c++20-compat ) - target_link_libraries(${test_name} PRIVATE GTest::gtest GTest::gtest_main GTest::gmock) + target_link_libraries(${test_name} PRIVATE GTest::gtest_main GTest::gmock) endfunction() add_ck_builder_test(test_conv_builder From dd7a6ed3ede68b6a9b7ef28261c17e788e484a95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 20 Oct 2025 15:11:58 +0000 Subject: [PATCH 06/39] Introduce basic types, and convolution algorithm concepts and limits. --- experimental/builder/README.md | 16 ++- .../builder/conv_algorithm_concepts.hpp | 119 ++++++++++++++++++ .../ck_tile/builder/conv_algorithm_limits.hpp | 20 +++ .../builder/include/ck_tile/builder/types.hpp | 54 ++++++++ 4 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp create mode 100644 experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp create mode 100644 experimental/builder/include/ck_tile/builder/types.hpp diff --git a/experimental/builder/README.md b/experimental/builder/README.md index d8b8757dc2..74d103bf02 100644 --- a/experimental/builder/README.md +++ b/experimental/builder/README.md @@ -23,9 +23,21 @@ This project is a prototype for a more general builder pattern for all of compos To enable the experimental builder, configure your build with: -```sh -cmake -DCK_EXPERIMENTAL_BUILDER=ON -DCMAKE_CXX_STANDARD=20 ... + +```bash +cmake \ + -D CMAKE_PREFIX_PATH=/opt/rocm \ + -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ + -D CMAKE_BUILD_TYPE=Release \ + -D GPU_TARGETS="gfx942;gfx950" \ + -D CK_EXPERIMENTAL_BUILDER=ON \ + -D CMAKE_CXX_STANDARD=20 \ + -G Ninja \ + .. ``` + ## Building and testing During development, build and test from the CK build directory with diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp new file mode 100644 index 0000000000..3a6c04eb1f --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include "types.hpp" + +namespace ck_tile::builder +{ + +// Concept for thread block dimensions for a GEMM problem. +template +concept ThreadBlockDescriptor = requires(T t) { + { t.block_size } -> std::convertible_to; + { t.tile_size.m } -> std::convertible_to; + { t.tile_size.n } -> std::convertible_to; + { t.tile_size.k } -> std::convertible_to; +}; + +// Concept to check if struct specifies thread block info. +template +concept SpecifiesThreadBlock = requires { + { T::thread_block } -> ThreadBlockDescriptor; +}; + +// Concept for parameters that describe a gridwise GEMM problem. +template +concept GridwiseGemmDescriptor = requires(T t) { + { t.ak1 } -> std::convertible_to; + { t.bk1 } -> std::convertible_to; + { t.m_per_xdl } -> std::convertible_to; + { t.n_per_xdl } -> std::convertible_to; + { t.m_xdl_per_wave } -> std::convertible_to; + { t.n_xdl_per_wave } -> std::convertible_to; +}; + +// Concept to check if a struct specifies gridwise GEMM info. +template +concept SpecifiesGridwiseGemm = requires { + { T::tuning_params } -> GridwiseGemmDescriptor; +}; + +// Concept for convolution input block transfer. +template +concept InputBlockTransferDescriptor = requires(T t) { + { t.k0 } -> std::convertible_to; + { t.m } -> std::convertible_to; + { t.k1 } -> std::convertible_to; +}; + +// Concept for output block transfer. +template +concept OutputBlockTransferDescriptor = requires(T t) { + { t.m_block } -> std::convertible_to; + { t.m_wave_per_xdl } -> std::convertible_to; + { t.n_block } -> std::convertible_to; + { t.n_wave_per_xdl } -> std::convertible_to; +}; + +// Concept to check if a struct specifies convolution input and output block transfer info. +template +concept SpecifiesBlockTransfer = requires(T t) { + { T::block_transfer.thread_cluster_dims_a } -> InputBlockTransferDescriptor; + { T::block_transfer.thread_cluster_dims_b } -> InputBlockTransferDescriptor; + { T::block_transfer.thread_cluster_dims_c } -> OutputBlockTransferDescriptor; +}; + +// Concept for the convolution input vector transfer. +template +concept InputVectorTransferDescriptor = requires(T t) { + { t.src_vector_dim } -> std::convertible_to; + { t.src_scalar_per_vector } -> std::convertible_to; + { t.dest_scalar_per_vector_k1 } -> std::convertible_to; + { t.add_extra } -> std::convertible_to; +}; + +// Concepts for the convolution output vector transfer. +template +concept OutputVectorTransferDescriptor = requires(T t) { + { t.m_xdl_per_wave_per_shuffle } -> std::convertible_to; + { t.n_xdl_per_wave_per_shuffle } -> std::convertible_to; + { t.scalar_per_vector } -> std::convertible_to; +}; + +// Concept to check if a struct specifies block vector transfer info. +template +concept SpecifiesBlockVectorTransfer = requires(T t) { + { T::block_transfer.vector_transfer_a } -> InputVectorTransferDescriptor; + { T::block_transfer.vector_transfer_b } -> InputVectorTransferDescriptor; + { T::block_transfer.vector_transfer_c } -> OutputVectorTransferDescriptor; +}; + +// Concept for the thread cluster access order +template +concept AccessOrderDescriptor = requires(T t) { + { t.order } -> std::convertible_to>; +}; + +// Concept to check if a struct specifies thread cluster access order info. +template +concept SpecifiesThreadClusterAccessOrder = requires(T t) { + { T::block_transfer.a_thread_cluster_access_order } -> AccessOrderDescriptor; + { T::block_transfer.b_thread_cluster_access_order } -> AccessOrderDescriptor; +}; + +// Concept to check if struct specifies block_gemm_pipeline_version. +template +concept SpecifiesGemmPipelineVersion = requires { + { T::pipeline_version } -> std::convertible_to; +}; + +// No requirements yet for a ConvAlogorithm concept. +template +concept ConvAlgorithmDescriptor = std::is_class_v; + +} // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp new file mode 100644 index 0000000000..13783e4ef3 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +namespace ck_tile::builder +{ + +// Limits for output vector transfer. +template +concept OutputVectorTransferLimits = requires { + requires Value.scalar_per_vector > 0 && + Value.m_xdl_per_wave_per_shuffle > 0 && + Value.n_xdl_per_wave_per_shuffle > 0 ; +}; + +} // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp new file mode 100644 index 0000000000..fd2da2100a --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck_tile::builder { + +enum class DataType +{ + FP64, + FP32, + FP16, + BF16, + S16, + S8, + S4, +}; + +// Memory layouts for convolution tensors, following PyTorch conventions. +enum class GroupConvLayout +{ + CHANNELS_LAST, // e.g., NHWGC + CHANNELS_FIRST // e.g., NGCHW +}; + +// Direction of the convolution operation. +enum class ConvDirection +{ + FORWARD, + BACKWARD_DATA, + BACKWARD_WEIGHT +}; + +// Fused element-wise operations. +enum class ElementwiseOperation +{ + BIAS, + BIAS_CLAMP, + BILINEAR, + CLAMP, + SCALE, + PASS_THROUGH +}; + +// Enums for the current block GEMM pipeline versions. +enum class BlockGemmPipelineVersion +{ + V1, + V3, + V4, + V5 +}; + +} // namespace ck_tile::builder From fc258eb047719a38de1d76ddba6565125602e218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 20 Oct 2025 15:17:00 +0000 Subject: [PATCH 07/39] Add convolution signature concepts. --- .../builder/conv_signature_concepts.hpp | 64 +++++++++++++++++++ .../builder/include/ck_tile/builder/types.hpp | 6 +- 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp new file mode 100644 index 0000000000..f666c0189c --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +// This file defines the compile-time "signature" for grouped convolution operations. +// A signature is a collection of properties that fully describe a convolution kernel's +// mathematical characteristics. It uses C++20 concepts and enums to specify these +// properties, enabling compile-time validation and specialization. +// +// The core components of a signature are: +// - Spatial dimensionality (1D, 2D, 3D) +// - Operational direction (Forward, Backward Data, Backward Weight) +// - Tensor memory layout (Channels First/Last) +// - Data type (FP32, FP16, BF16) +// - Fused element-wise operation (e.g., Bias, Clamp) +// +// The file also provides predicate concepts to query the properties of a given +// signature at compile time. +#pragma once + +#include +#include + +#include + +namespace ck_tile::builder +{ + +// Constrains convolution to 1D, 2D, or 3D spatial dimensions. +template +concept ConvSpatialDim = std::is_integral_v && (N == 1 || N == 2 || N == 3); + +// Constrains convolution data types to common floating-point types. +template +concept ConvDataType = (T == DataType::FP32) || (T == DataType::FP16) || (T == DataType::BF16) || (T == DataType::FP8) || (T == DataType::I8); + +// Concept for a type that defines a convolution's operational signature. +template +concept ConvSignatureDescriptor = requires(T t) { + { t.spatial_dim } -> std::convertible_to; + { t.direction } -> std::convertible_to; + { t.layout } -> std::convertible_to; + { t.data_type } -> std::convertible_to; +}; + +// Concept to validate a convolution signature's values. +template +concept ValidConvSignature = requires { + requires ConvSpatialDim; + requires ConvDataType; +}; + +// Predicate for forward convolution. +template +concept ConvDirectionIsForward = (Sig.direction == ConvDirection::FORWARD); + +// Predicate for backward data convolution. +template +concept ConvDirectionIsBackwardData = (Sig.direction == ConvDirection::BACKWARD_DATA); + +// Predicate for backward weight convolution. +template +concept ConvDirectionIsBackwardWeight = (Sig.direction == ConvDirection::BACKWARD_WEIGHT); + +} // namespace ck_tile::builder \ No newline at end of file diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index fd2da2100a..27bc2570d3 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -7,13 +7,11 @@ namespace ck_tile::builder { enum class DataType { - FP64, FP32, FP16, BF16, - S16, - S8, - S4, + FP8, + I8 }; // Memory layouts for convolution tensors, following PyTorch conventions. From 9c0fdff0e7090fc1d756ddedc843b0b35a169075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 20 Oct 2025 15:24:35 +0000 Subject: [PATCH 08/39] Add convolution factory. --- .../include/ck_tile/builder/builder_utils.hpp | 110 ++++ .../include/ck_tile/builder/conv_factory.hpp | 539 ++++++++++++++++++ .../include/ck_tile/builder/versions.hpp | 18 + 3 files changed, 667 insertions(+) create mode 100644 experimental/builder/include/ck_tile/builder/builder_utils.hpp create mode 100644 experimental/builder/include/ck_tile/builder/conv_factory.hpp create mode 100644 experimental/builder/include/ck_tile/builder/versions.hpp diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp new file mode 100644 index 0000000000..1b51453a4d --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include +#include + +namespace ck_tile::builder +{ + +// Convert a static array to a sequence +// Usage example: +// static constexpr std::vector arr {1, 2, 3}; +// using seq = to_sequence_v; // seq is ck::Sequence<1, 2, 3> +template +struct to_sequence_t +{ + private: + template + static auto get_sequence_type(std::index_sequence) -> ck::Sequence; + + // Helper method to handler the unusual .Size() method name in ck::Array. + static constexpr auto get_size(const auto& arr) + { + if constexpr(requires { arr.size(); }) + { + return arr.size(); + } + else + { + return arr.Size(); + } + } + + public: + using value = decltype(get_sequence_type(std::make_index_sequence{})); +}; + +template +using to_sequence_v = typename to_sequence_t, Arr>::value; + +// Wrapper function to make constexpr strings a structural type for NTTP. +template +struct StringLiteral { + char data[N]; + constexpr StringLiteral(const char (&str)[N]) { + for (size_t i = 0; i < N; ++i) data[i] = str[i]; + } + + constexpr bool operator==(const StringLiteral& other) const { + for (size_t i = 0; i < N; ++i) { + if (data[i] != other.data[i]) { + return false; + } + } + return true; + } +}; + +// This is a C++17 deduction guide. It allows the compiler to automatically +// deduce the template argument `N` for `StringLiteral` from a string literal +// constructor argument. For example, you can write `StringLiteral s{"foo"};` +// instead of `StringLiteral<4> s{"foo"};`. +template +StringLiteral(const char (&)[N]) -> StringLiteral; + +// Helper to provide a readable error for unsupported enum values. +// The compiler will print the name of this struct in the error message, so +// the name of the enum value will appear instead of just its integer value. +template +struct UnsupportedEnumValue +{ +}; + +// Helper functions to convert enums to strings +constexpr std::string_view ConvDirectionToString(ConvDirection dir) +{ + switch(dir) + { + case ConvDirection::FORWARD: return "Forward"; + case ConvDirection::BACKWARD_DATA: return "Backward Data"; + case ConvDirection::BACKWARD_WEIGHT: return "Backward Weight"; + default: return "Unknown"; + } +} + +constexpr std::string_view DataTypeToString(DataType dt) +{ + switch(dt) + { + case DataType::FP16: return "FP16"; + case DataType::FP32: return "FP32"; + case DataType::BF16: return "BF16"; + case DataType::FP8: return "FP8"; + case DataType::I8: return "I8"; + default: return "Unknown"; + } +} + +constexpr std::string_view LayoutToString(GroupConvLayout layout) +{ + switch(layout) + { + case GroupConvLayout::CHANNELS_FIRST: return "Channels-first (NCHW)"; + case GroupConvLayout::CHANNELS_LAST: return "Channels-last (NHWC)"; + default: return "Unknown"; + } +} + +} // namespace ck_tile::builder \ No newline at end of file diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp new file mode 100644 index 0000000000..e77daac33e --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +// A factory for instantiating CK convolution kernels. +// +// This file translates a semantic description of a convolution operation +// (`ConvSignatureDescriptor` and `ConvAlgorithmDescriptor`) into specific, +// low-level template arguments required by the underlying CK device-level +// kernel implementations. This abstraction also enables more complex build +// time logic and simplifies the kernel specification. +// +// Key Components: +// +// Template Metaprogram: +// - ConvFactory: The main factory, with specializations for different +// convolution directions. +// +// Template Metaprogram Helpers: +// - ConvTensorLayouts: Maps layout enums to CK layout types. +// - ConvTensorTypes: Maps data type enums to C++ types used by CK. +// - ConvPassThroughOps: Hard-coded pass-through element-wise operations. +// +// `constexpr` Helper Functions: +// - SetThreadBlockInfo: Determines thread block dimensions from the algorithm +// descriptor or provides defaults. +// - SetConvTuningInfo: Sets low-level tuning parameters. +// - Set*BlockTransfer: Configures tensor data movement parameters for +// tensors A, B, and C. +// - SetBlockGemmPipelineVersion: Selects the GEMM pipeline version. +// +// The primary entry point is the `ConvFactory` struct, which is specialized +// for forward and backward-data convolutions. + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace ck_tile::builder::factory_internal { + +// Type mappings from the builder GroupConvLayout enum class to the CK tensor data types. +template + requires(ConvSpatialDim) +struct ConvTensorLayouts +{ + // This will trigger if a specialization for the given layout is not found. + // We should always catch this in an earlier validation check. + static_assert(sizeof(Layout) == 0, + "Internal error. Unsupported layout for convolution factory."); +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NHWGC; + using BLayout = ck::tensor_layout::convolution::GKCYX; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NGKHW; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NGKHW; + using BLayout = ck::tensor_layout::convolution::GKYXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NGCHW; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NHWGC; + using BLayout = ck::tensor_layout::convolution::GKYXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NHWGK; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NDHWGC; + using BLayout = ck::tensor_layout::convolution::GKZYXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NDHWGK; +}; + +// Type mappings from builder convolution data type to CK tensor types. +template +struct ConvTensorTypes +{ + // This will trigger if a specialization for the given DataType is not found. + // We should always catch this in an earlier validation check. + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Internal error. Unsupported data type for convolution factory."); +}; + +template <> +struct ConvTensorTypes +{ + using ADataType = ck::half_t; + using BDataType = ck::half_t; + using CShuffleDataType = ck::half_t; + using DsDataTypes = ck::Tuple<>; + using AccDataType = float; + using EDataType = ck::half_t; +}; + +template <> +struct ConvTensorTypes +{ + using ADataType = ck::bhalf_t; + using BDataType = ck::bhalf_t; + using CShuffleDataType = ck::bhalf_t; + using DsDataTypes = ck::Tuple<>; + using AccDataType = float; + using EDataType = ck::bhalf_t; +}; + +template <> +struct ConvTensorTypes +{ + using ADataType = float; + using BDataType = float; + using CShuffleDataType = float; + using DsDataTypes = ck::Tuple<>; + using AccDataType = float; + using EDataType = float; +}; + +// Hard-coded pass-through ops. +// TODO: Generalize this for more fused operations. +struct ConvPassThroughOps +{ + using AElementwiseOp = ck::tensor_operation::element_wise::PassThrough; + using BElementwiseOp = ck::tensor_operation::element_wise::PassThrough; + using CDEElementwiseOp = ck::tensor_operation::element_wise::PassThrough; +}; + +// The algorithm specializations for the convolution and GEMM. +template + requires( + std::is_same_v || + std::is_same_v) +struct ConvSpec +{ + CONV_ENUM conv_spec; + ck::tensor_operation::device::GemmSpecialization gemm_spec; +}; + +// Deduction guide for ConvSpec to simplify brace initialization. +template +ConvSpec(CONV_ENUM, GEMM_ENUM) -> ConvSpec; + +// Block info for a convolution. +struct ConvBlock +{ + int block_size = 0; + MNK per_block; +}; + +template +constexpr ConvBlock SetThreadBlockInfo() +{ + using AlgorithmType = decltype(ALGORITHM); + if constexpr(SpecifiesThreadBlock) + { + constexpr auto& TB = ALGORITHM.thread_block; + return ConvBlock{ + .block_size = TB.block_size, + .per_block = {.m = TB.submatrix.m, .n = TB.submatrix.n, .k = TB.submatrix.k}}; + } + // Default values if thread block info isn't specified. + return ConvBlock{ + .block_size = 256, + .per_block = {.m = 256, .n = 256, .k = 32}, + }; +} + +// Convolution tuning parameters. +struct ConvTuning +{ + int ak1 = 0; + int bk1 = 0; + int m_per_xdl = 0; + int n_per_dxl = 0; + int m_xdl_per_wave = 0; + int n_xdl_per_wave = 0; +}; + +template +constexpr ConvTuning SetConvTuningInfo() +{ + using AlgorithmType = decltype(ALGORITHM); + if constexpr(ConvDirectionIsBackwardData) + { + // Default values for backward data if tuning info isn't specified. + return ConvTuning{ + .ak1 = 8, + .bk1 = 8, + .m_per_xdl = 16, + .n_per_dxl = 16, + .m_xdl_per_wave = 1, + .n_xdl_per_wave = 4, + }; + } + if constexpr(SpecifiesConvTuning) + { + constexpr auto& TP = ALGORITHM.tuning_params; + return ConvTuning{ + .ak1 = TP.ak1, + .bk1 = TP.bk1, + .m_per_xdl = 32, + .n_per_dxl = 32, + .m_xdl_per_wave = TP.m_xdl_per_wave, + .n_xdl_per_wave = TP.n_xdl_per_wave, + }; + } + // Default values. + return ConvTuning{ + .ak1 = 8, + .bk1 = 8, + .m_per_xdl = 32, + .n_per_dxl = 32, + .m_xdl_per_wave = 4, + .n_xdl_per_wave = 4, + }; +} + +// Block transfer paramters for A or B tensor. +struct BlockTransfer +{ + ck::Array thread_cluster_dims = {0, 0, 0}; // k0, m, k1 + ck::Array thread_cluster_order = {0, 0, 0}; + ck::Array src_access_order = {0, 0, 0}; + int src_vector_dim = 0; + int src_scaler_per_vector = 0; + int dest_scaler_per_vector_k1 = 0; + int add_extra = 0; +}; + +// Block transfer parameters for C tensor. +struct CBlockTransfer +{ + int m_xdl_per_wave_per_shuffle = 0; + int n_xdl_per_wave_per_shuffle = 0; + ck::Array thread_cluster_dims = {0, 0, 0, 8}; + int scaler_per_vector = 8; +}; + +template +constexpr BlockTransfer SetFwdConvABlockTransfer() +{ + using AlgorithmType = decltype(ALGORITHM); + BlockTransfer block_transfer{ + .thread_cluster_dims = {4, 64, 1}, + .thread_cluster_order = {1, 0, 2}, + .src_access_order = {1, 0, 2}, + .src_vector_dim = 2, + .src_scaler_per_vector = 8, + .dest_scaler_per_vector_k1 = 8, + .add_extra = 0, + }; + if constexpr(SpecifiesBlockATransfer) + { + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; + block_transfer.thread_cluster_dims = {TCL.k0, TCL.m, TCL.k1}; + } + // Default. + return block_transfer; +} + +template +constexpr BlockTransfer SetBwdDataConvABlockTransfer() +{ + return BlockTransfer{ + .thread_cluster_dims = {4, 16, 1}, + .thread_cluster_order = {1, 0, 2}, + .src_access_order = {1, 0, 2}, + .src_vector_dim = 2, + .src_scaler_per_vector = 8, + .dest_scaler_per_vector_k1 = 8, + .add_extra = 1, + }; +} + +template +constexpr BlockTransfer SetFwdConvBBlockTransfer() +{ + using AlgorithmType = decltype(ALGORITHM); + BlockTransfer block_transfer{ + .thread_cluster_dims = {4, 64, 1}, + .thread_cluster_order = {1, 0, 2}, + .src_access_order = {1, 0, 2}, + .src_vector_dim = 2, + .src_scaler_per_vector = 8, + .dest_scaler_per_vector_k1 = 8, + .add_extra = 0, + }; + if constexpr(SpecifiesBlockBTransfer) + { + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; + block_transfer.thread_cluster_dims = {TCL.k0, TCL.n, TCL.k1}; + } + return block_transfer; +} + +template +constexpr BlockTransfer SetBwdDataConvBBlockTransfer() +{ + // Different default values for backward data. + return BlockTransfer{ + .thread_cluster_dims = {4, 8, 1}, + .thread_cluster_order = {0, 2, 1}, + .src_access_order = {0, 2, 1}, + .src_vector_dim = 1, + .src_scaler_per_vector = 8, + .dest_scaler_per_vector_k1 = 8, + .add_extra = 1, + }; +} + +template +constexpr CBlockTransfer SetCBlockTransfer() +{ + using AlgorithmType = decltype(ALGORITHM); + if constexpr(ConvDirectionIsBackwardData) + { + // Different default values for backward data. + return CBlockTransfer{ + .m_xdl_per_wave_per_shuffle = 1, + .n_xdl_per_wave_per_shuffle = 1, + .thread_cluster_dims = {1, 16, 1, 4}, + .scaler_per_vector = 4, + }; + } + CBlockTransfer block_transfer{ + .m_xdl_per_wave_per_shuffle = 1, + .n_xdl_per_wave_per_shuffle = 1, + .thread_cluster_dims = {1, 32, 1, 8}, + .scaler_per_vector = 8, + }; + if constexpr(SpecifiesBlockCTransfer) + { + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_c; + block_transfer.thread_cluster_dims = { + TCL.m_block, + TCL.m_wave_per_xdl, + TCL.n_block, + TCL.n_wave_per_xdl, + }; + } + return block_transfer; +} + +template +constexpr ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() +{ + using AlgorithmType = decltype(ALGORITHM); + if constexpr(SpecifiesGemmPipelineVersion) + { + switch(ALGORITHM.pipeline_version) + { + case BlockGemmPipelineVersion::V1: return ck::BlockGemmPipelineVersion::v1; + case BlockGemmPipelineVersion::V3: return ck::BlockGemmPipelineVersion::v3; + case BlockGemmPipelineVersion::V4: return ck::BlockGemmPipelineVersion::v4; + case BlockGemmPipelineVersion::V5: return ck::BlockGemmPipelineVersion::v5; + } + } + // Default value. + return ck::BlockGemmPipelineVersion::v4; +} + +} // namespace ck_tile::builder::factory + +namespace ck_tile::builder { + +// Primary template for the convolution factory. +template +struct ConvFactory; + +// Factory specialization for an instance of a grouped forward convolution kernel. +template + requires ConvDirectionIsForward +struct ConvFactory +{ + static constexpr int SPATIAL_DIM = SIGNATURE.spatial_dim; + using Layouts = factory_internal::ConvTensorLayouts; + using Types = factory_internal::ConvTensorTypes; + using Ops = factory_internal::ConvPassThroughOps; + static constexpr factory_internal::ConvSpec SPECIALIZATION{ + .conv_spec = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, + .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, + }; + static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); + static constexpr auto TUNING = factory_internal::SetConvTuningInfo(); + static constexpr auto A_BLOCK_TRANSFER = factory_internal::SetFwdConvABlockTransfer(); + static constexpr auto B_BLOCK_TRANSFER = factory_internal::SetFwdConvBBlockTransfer(); + static constexpr auto C_BLOCK_TRANSFER = factory_internal::SetCBlockTransfer(); + static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; + static constexpr auto PIPELINE_VERSION = factory_internal::SetBlockGemmPipelineVersion(); + + // Preconditions + static_assert(ValidVectorTransferC); + + // The forward convolution kernel class instance. + using Instance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3< // + SPATIAL_DIM, + typename Layouts::ALayout, + typename Layouts::BLayout, + typename Layouts::DsLayout, + typename Layouts::ELayout, + typename Types::ADataType, + typename Types::BDataType, + typename Types::AccDataType, + typename Types::CShuffleDataType, + typename Types::DsDataTypes, + typename Types::EDataType, + typename Ops::AElementwiseOp, + typename Ops::BElementwiseOp, + typename Ops::CDEElementwiseOp, + SPECIALIZATION.conv_spec, + SPECIALIZATION.gemm_spec, + BLOCK.block_size, + BLOCK.per_block.m, + BLOCK.per_block.n, + BLOCK.per_block.k, + TUNING.ak1, + TUNING.bk1, + TUNING.m_per_xdl, + TUNING.n_per_dxl, + TUNING.m_xdl_per_wave, + TUNING.n_xdl_per_wave, + to_sequence_v, + to_sequence_v, + to_sequence_v, + A_BLOCK_TRANSFER.src_vector_dim, + A_BLOCK_TRANSFER.src_scaler_per_vector, + A_BLOCK_TRANSFER.dest_scaler_per_vector_k1, + A_BLOCK_TRANSFER.add_extra, + to_sequence_v, + to_sequence_v, + to_sequence_v, + B_BLOCK_TRANSFER.src_vector_dim, + B_BLOCK_TRANSFER.src_scaler_per_vector, + B_BLOCK_TRANSFER.dest_scaler_per_vector_k1, + B_BLOCK_TRANSFER.add_extra, + C_BLOCK_TRANSFER.m_xdl_per_wave_per_shuffle, + C_BLOCK_TRANSFER.n_xdl_per_wave_per_shuffle, + to_sequence_v, + C_BLOCK_TRANSFER.scaler_per_vector, + PIPELINE_SCHEDULER, + PIPELINE_VERSION>; +}; + +// Factory specialization for an instance of a grouped backward-data convolution kernel. +template + requires SupportedVersion && ConvDirectionIsBackwardData +struct ConvFactory +{ + static constexpr int SPATIAL_DIM = SIGNATURE.spatial_dim; + using Layouts = factory_internal::ConvTensorLayouts; + using Types = factory_internal::ConvTensorTypes; + using Ops = factory_internal::ConvPassThroughOps; + static constexpr factory_internal::ConvSpec SPECIALIZATION{ + .conv_spec = ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default, + .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, + }; + static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); + static constexpr auto TUNING = factory_internal::SetConvTuningInfo(); + static constexpr auto A_BLOCK_TRANSFER = factory_internal::SetBwdDataConvABlockTransfer(); + static constexpr auto B_BLOCK_TRANSFER = factory_internal::SetBwdDataConvBBlockTransfer(); + static constexpr auto C_BLOCK_TRANSFER = factory_internal::SetCBlockTransfer(); + static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; + static constexpr auto PIPELINE_VERSION = factory_internal::SetBlockGemmPipelineVersion(); + // The backward-data convolution kernel class instance. + using Instance = + ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< + SPATIAL_DIM, + typename Layouts::ALayout, + typename Layouts::BLayout, + typename Layouts::DsLayout, + typename Layouts::ELayout, + typename Types::ADataType, + typename Types::BDataType, + typename Types::AccDataType, + typename Types::CShuffleDataType, + typename Types::DsDataTypes, + typename Types::EDataType, + typename Ops::AElementwiseOp, + typename Ops::BElementwiseOp, + typename Ops::CDEElementwiseOp, + SPECIALIZATION.conv_spec, + true, // DoPadGemmM + true, // DoPadGemmN + 1, // NumGemmKPrefetchStage + BLOCK.block_size, + BLOCK.per_block.m, + BLOCK.per_block.n, + BLOCK.per_block.k, + TUNING.ak1, + TUNING.bk1, + TUNING.m_per_xdl, + TUNING.n_per_dxl, + TUNING.m_xdl_per_wave, + TUNING.n_xdl_per_wave, + to_sequence_v, + to_sequence_v, + to_sequence_v, + A_BLOCK_TRANSFER.src_vector_dim, + A_BLOCK_TRANSFER.src_scaler_per_vector, + A_BLOCK_TRANSFER.dest_scaler_per_vector_k1, + A_BLOCK_TRANSFER.add_extra, + to_sequence_v, + to_sequence_v, + to_sequence_v, + B_BLOCK_TRANSFER.src_vector_dim, + B_BLOCK_TRANSFER.src_scaler_per_vector, + B_BLOCK_TRANSFER.dest_scaler_per_vector_k1, + B_BLOCK_TRANSFER.add_extra, + C_BLOCK_TRANSFER.m_xdl_per_wave_per_shuffle, + C_BLOCK_TRANSFER.n_xdl_per_wave_per_shuffle, + to_sequence_v, + C_BLOCK_TRANSFER.scaler_per_vector>; +}; + +} // namespace ck_tile::builder \ No newline at end of file diff --git a/experimental/builder/include/ck_tile/builder/versions.hpp b/experimental/builder/include/ck_tile/builder/versions.hpp new file mode 100644 index 0000000000..89618b8913 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/versions.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include +#include + +#include + +namespace ck_tile::builder { + +static constexpr StringLiteral V0_0_0 = "0.0.0"; +static constexpr StringLiteral V0_1_0 = "0.1.0"; + +static constexpr StringLiteral LATEST_API_VERSION = V0_1_0; + +template +concept SupportedVersion = (V == V0_0_0) || (V == V0_1_0); + +} // namespace ck_tile::builder \ No newline at end of file From 25837b457bd710557c3df7bcbd63ae7313d7590b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 09:47:42 +0000 Subject: [PATCH 09/39] Finalize conv factory implementation for fwd convolutions. --- .../builder/conv_algorithm_concepts.hpp | 11 +- .../ck_tile/builder/conv_algorithm_limits.hpp | 8 + .../include/ck_tile/builder/conv_factory.hpp | 316 ++++++------------ 3 files changed, 112 insertions(+), 223 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 3a6c04eb1f..6621a3aa99 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -102,8 +102,15 @@ concept AccessOrderDescriptor = requires(T t) { // Concept to check if a struct specifies thread cluster access order info. template concept SpecifiesThreadClusterAccessOrder = requires(T t) { - { T::block_transfer.a_thread_cluster_access_order } -> AccessOrderDescriptor; - { T::block_transfer.b_thread_cluster_access_order } -> AccessOrderDescriptor; + { T::block_transfer.thread_cluster_access_order_a } -> AccessOrderDescriptor; + { T::block_transfer.thread_cluster_access_order_b } -> AccessOrderDescriptor; +}; + +// Concept to check if a struct specifies source access order info. +template +concept SpecifiesSourceAccessOrder = requires(T t) { + { T::block_transfer.src_access_order_a } -> AccessOrderDescriptor; + { T::block_transfer.src_access_order_b } -> AccessOrderDescriptor; }; // Concept to check if struct specifies block_gemm_pipeline_version. diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp index 13783e4ef3..c35d4f5f3a 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp @@ -9,6 +9,14 @@ namespace ck_tile::builder { +// Limits for input vector transfer. +template +concept InputVectorTransferLimits = requires { + requires Value.src_vector_dim > 0 && + Value.src_scalar_per_vector > 0 && + Value.dest_scalar_per_vector_k1 > 0; +}; + // Limits for output vector transfer. template concept OutputVectorTransferLimits = requires { diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index e77daac33e..4d7121ca31 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -63,21 +64,21 @@ struct ConvTensorLayouts -struct ConvTensorLayouts +struct ConvTensorLayouts { - using ALayout = ck::tensor_layout::convolution::NGKHW; + using ALayout = ck::tensor_layout::convolution::NHWGC; using BLayout = ck::tensor_layout::convolution::GKYXC; using DsLayout = ck::Tuple<>; - using ELayout = ck::tensor_layout::convolution::NGCHW; + using ELayout = ck::tensor_layout::convolution::NHWGK; }; template <> -struct ConvTensorLayouts +struct ConvTensorLayouts { - using ALayout = ck::tensor_layout::convolution::NHWGC; - using BLayout = ck::tensor_layout::convolution::GKYXC; + using ALayout = ck::tensor_layout::convolution::NDHWGC; + using BLayout = ck::tensor_layout::convolution::GKCZYX; using DsLayout = ck::Tuple<>; - using ELayout = ck::tensor_layout::convolution::NHWGK; + using ELayout = ck::tensor_layout::convolution::NGKDHW; }; template <> @@ -158,27 +159,26 @@ template ConvSpec(CONV_ENUM, GEMM_ENUM) -> ConvSpec; // Block info for a convolution. +struct MNK +{ + size_t m{}; + size_t n{}; + size_t k{}; +}; struct ConvBlock { int block_size = 0; - MNK per_block; + MNK per_block = {}; }; template constexpr ConvBlock SetThreadBlockInfo() { using AlgorithmType = decltype(ALGORITHM); - if constexpr(SpecifiesThreadBlock) - { - constexpr auto& TB = ALGORITHM.thread_block; - return ConvBlock{ - .block_size = TB.block_size, - .per_block = {.m = TB.submatrix.m, .n = TB.submatrix.n, .k = TB.submatrix.k}}; - } - // Default values if thread block info isn't specified. + constexpr auto& TB = ALGORITHM.thread_block; return ConvBlock{ - .block_size = 256, - .per_block = {.m = 256, .n = 256, .k = 32}, + .block_size = TB.block_size, + .per_block = {.m = TB.tile_size.m, .n = TB.tile_size.n, .k = TB.tile_size.k} }; } @@ -197,164 +197,91 @@ template ) - { - // Default values for backward data if tuning info isn't specified. - return ConvTuning{ - .ak1 = 8, - .bk1 = 8, - .m_per_xdl = 16, - .n_per_dxl = 16, - .m_xdl_per_wave = 1, - .n_xdl_per_wave = 4, - }; - } - if constexpr(SpecifiesConvTuning) - { - constexpr auto& TP = ALGORITHM.tuning_params; - return ConvTuning{ - .ak1 = TP.ak1, - .bk1 = TP.bk1, - .m_per_xdl = 32, - .n_per_dxl = 32, - .m_xdl_per_wave = TP.m_xdl_per_wave, - .n_xdl_per_wave = TP.n_xdl_per_wave, - }; - } - // Default values. + constexpr auto& TP = ALGORITHM.tuning_params; return ConvTuning{ - .ak1 = 8, - .bk1 = 8, - .m_per_xdl = 32, - .n_per_dxl = 32, - .m_xdl_per_wave = 4, - .n_xdl_per_wave = 4, + .ak1 = TP.ak1, + .bk1 = TP.bk1, + .m_per_xdl = TP.m_per_xdl, + .n_per_dxl = TP.n_per_dxl, + .m_xdl_per_wave = TP.m_xdl_per_wave, + .n_xdl_per_wave = TP.n_xdl_per_wave, }; } -// Block transfer paramters for A or B tensor. +// Block transfer parameters for A or B tensor. struct BlockTransfer { ck::Array thread_cluster_dims = {0, 0, 0}; // k0, m, k1 ck::Array thread_cluster_order = {0, 0, 0}; ck::Array src_access_order = {0, 0, 0}; int src_vector_dim = 0; - int src_scaler_per_vector = 0; - int dest_scaler_per_vector_k1 = 0; + int src_scalar_per_vector = 0; + int dest_scalar_per_vector_k1 = 0; int add_extra = 0; }; -// Block transfer parameters for C tensor. -struct CBlockTransfer -{ - int m_xdl_per_wave_per_shuffle = 0; - int n_xdl_per_wave_per_shuffle = 0; - ck::Array thread_cluster_dims = {0, 0, 0, 8}; - int scaler_per_vector = 8; -}; - template constexpr BlockTransfer SetFwdConvABlockTransfer() { - using AlgorithmType = decltype(ALGORITHM); + using AlgorithmType = decltype(ALGORITHM); + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; + constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_a; + constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_a; + constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_a; + BlockTransfer block_transfer{ - .thread_cluster_dims = {4, 64, 1}, - .thread_cluster_order = {1, 0, 2}, - .src_access_order = {1, 0, 2}, - .src_vector_dim = 2, - .src_scaler_per_vector = 8, - .dest_scaler_per_vector_k1 = 8, - .add_extra = 0, + .thread_cluster_dims = {TCL.k0, TCL.m, TCL.k1}, + .thread_cluster_order = TCO.order, + .src_access_order = SAO.order, + .src_vector_dim = VTD.src_vector_dim, + .src_scalar_per_vector = VTD.src_scalar_per_vector, + .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, + .add_extra = VTD.add_extra }; - if constexpr(SpecifiesBlockATransfer) - { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; - block_transfer.thread_cluster_dims = {TCL.k0, TCL.m, TCL.k1}; - } - // Default. return block_transfer; } -template -constexpr BlockTransfer SetBwdDataConvABlockTransfer() -{ - return BlockTransfer{ - .thread_cluster_dims = {4, 16, 1}, - .thread_cluster_order = {1, 0, 2}, - .src_access_order = {1, 0, 2}, - .src_vector_dim = 2, - .src_scaler_per_vector = 8, - .dest_scaler_per_vector_k1 = 8, - .add_extra = 1, - }; -} - template constexpr BlockTransfer SetFwdConvBBlockTransfer() { - using AlgorithmType = decltype(ALGORITHM); + using AlgorithmType = decltype(ALGORITHM); + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; + constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_b; + constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_b; + constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_b; + BlockTransfer block_transfer{ - .thread_cluster_dims = {4, 64, 1}, - .thread_cluster_order = {1, 0, 2}, - .src_access_order = {1, 0, 2}, - .src_vector_dim = 2, - .src_scaler_per_vector = 8, - .dest_scaler_per_vector_k1 = 8, - .add_extra = 0, + .thread_cluster_dims = {TCL.k0, TCL.m, TCL.k1}, + .thread_cluster_order = TCO.order, + .src_access_order = SAO.order, + .src_vector_dim = VTD.src_vector_dim, + .src_scalar_per_vector = VTD.src_scalar_per_vector, + .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, + .add_extra = VTD.add_extra }; - if constexpr(SpecifiesBlockBTransfer) - { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; - block_transfer.thread_cluster_dims = {TCL.k0, TCL.n, TCL.k1}; - } return block_transfer; } -template -constexpr BlockTransfer SetBwdDataConvBBlockTransfer() +// Block transfer parameters for C tensor. +struct CBlockTransfer { - // Different default values for backward data. - return BlockTransfer{ - .thread_cluster_dims = {4, 8, 1}, - .thread_cluster_order = {0, 2, 1}, - .src_access_order = {0, 2, 1}, - .src_vector_dim = 1, - .src_scaler_per_vector = 8, - .dest_scaler_per_vector_k1 = 8, - .add_extra = 1, - }; -} + int m_xdl_per_wave_per_shuffle = 0; + int n_xdl_per_wave_per_shuffle = 0; + ck::Array thread_cluster_dims = {0, 0, 0, 0}; + int scaler_per_vector = 0; +}; template constexpr CBlockTransfer SetCBlockTransfer() { using AlgorithmType = decltype(ALGORITHM); - if constexpr(ConvDirectionIsBackwardData) - { - // Different default values for backward data. - return CBlockTransfer{ - .m_xdl_per_wave_per_shuffle = 1, - .n_xdl_per_wave_per_shuffle = 1, - .thread_cluster_dims = {1, 16, 1, 4}, - .scaler_per_vector = 4, - }; - } - CBlockTransfer block_transfer{ - .m_xdl_per_wave_per_shuffle = 1, - .n_xdl_per_wave_per_shuffle = 1, - .thread_cluster_dims = {1, 32, 1, 8}, - .scaler_per_vector = 8, - }; - if constexpr(SpecifiesBlockCTransfer) - { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_c; + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_c; block_transfer.thread_cluster_dims = { TCL.m_block, TCL.m_wave_per_xdl, TCL.n_block, TCL.n_wave_per_xdl, }; - } return block_transfer; } @@ -362,21 +289,17 @@ template constexpr ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() { using AlgorithmType = decltype(ALGORITHM); - if constexpr(SpecifiesGemmPipelineVersion) + switch(ALGORITHM.pipeline_version) { - switch(ALGORITHM.pipeline_version) - { - case BlockGemmPipelineVersion::V1: return ck::BlockGemmPipelineVersion::v1; - case BlockGemmPipelineVersion::V3: return ck::BlockGemmPipelineVersion::v3; - case BlockGemmPipelineVersion::V4: return ck::BlockGemmPipelineVersion::v4; - case BlockGemmPipelineVersion::V5: return ck::BlockGemmPipelineVersion::v5; - } + case BlockGemmPipelineVersion::V1: return ck::BlockGemmPipelineVersion::v1; + case BlockGemmPipelineVersion::V3: return ck::BlockGemmPipelineVersion::v3; + case BlockGemmPipelineVersion::V4: return ck::BlockGemmPipelineVersion::v4; + case BlockGemmPipelineVersion::V5: return ck::BlockGemmPipelineVersion::v5; + default: return ck::BlockGemmPipelineVersion::v4; } - // Default value. - return ck::BlockGemmPipelineVersion::v4; } -} // namespace ck_tile::builder::factory +} // namespace ck_tile::builder::factory_internal namespace ck_tile::builder { @@ -397,6 +320,27 @@ struct ConvFactory using Layouts = factory_internal::ConvTensorLayouts; using Types = factory_internal::ConvTensorTypes; using Ops = factory_internal::ConvPassThroughOps; + using AlgorithmType = decltype(ALGORITHM); + + // Check preconditions for the algorithm description. + static_assert(SPATIAL_DIM == 2 || SPATIAL_DIM == 3, + "Only 2D and 3D convolutions are supported in this factory."); + static_assert(SpecifiesThreadBlock, + "The convolution algorithm descriptor must specify thread block info."); + static_assert(SpecifiesGridwiseGemm, + "The convolution algorithm descriptor must specify gridwise GEMM info."); + static_assert(SpecifiesBlockTransfer, + "The convolution algorithm descriptor must specify block transfer info."); + static_assert(SpecifiesBlockVectorTransfer, + "The convolution algorithm descriptor must specify block vector transfer info."); + static_assert(SpecifiesThreadClusterAccessOrder, + "The convolution algorithm descriptor must specify thread cluster access order info."); + static_assert(SpecifiesSourceAccessOrder, + "The convolution algorithm descriptor must specify source access order info."); + static_assert(SpecifiesGemmPipelineVersion, + "The convolution algorithm descriptor must specify block gemm pipeline version."); + + static constexpr factory_internal::ConvSpec SPECIALIZATION{ .conv_spec = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, @@ -409,8 +353,11 @@ struct ConvFactory static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; static constexpr auto PIPELINE_VERSION = factory_internal::SetBlockGemmPipelineVersion(); - // Preconditions - static_assert(ValidVectorTransferC); + // Check limits for the algorithm parameters. + // TODO: Add more limits checks as needed. + static_assert(InputVectorTransferLimits); + static_assert(InputVectorTransferLimits); + static_assert(OutputVectorTransferLimits); // The forward convolution kernel class instance. using Instance = @@ -463,77 +410,4 @@ struct ConvFactory PIPELINE_VERSION>; }; -// Factory specialization for an instance of a grouped backward-data convolution kernel. -template - requires SupportedVersion && ConvDirectionIsBackwardData -struct ConvFactory -{ - static constexpr int SPATIAL_DIM = SIGNATURE.spatial_dim; - using Layouts = factory_internal::ConvTensorLayouts; - using Types = factory_internal::ConvTensorTypes; - using Ops = factory_internal::ConvPassThroughOps; - static constexpr factory_internal::ConvSpec SPECIALIZATION{ - .conv_spec = ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default, - .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, - }; - static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); - static constexpr auto TUNING = factory_internal::SetConvTuningInfo(); - static constexpr auto A_BLOCK_TRANSFER = factory_internal::SetBwdDataConvABlockTransfer(); - static constexpr auto B_BLOCK_TRANSFER = factory_internal::SetBwdDataConvBBlockTransfer(); - static constexpr auto C_BLOCK_TRANSFER = factory_internal::SetCBlockTransfer(); - static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; - static constexpr auto PIPELINE_VERSION = factory_internal::SetBlockGemmPipelineVersion(); - // The backward-data convolution kernel class instance. - using Instance = - ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< - SPATIAL_DIM, - typename Layouts::ALayout, - typename Layouts::BLayout, - typename Layouts::DsLayout, - typename Layouts::ELayout, - typename Types::ADataType, - typename Types::BDataType, - typename Types::AccDataType, - typename Types::CShuffleDataType, - typename Types::DsDataTypes, - typename Types::EDataType, - typename Ops::AElementwiseOp, - typename Ops::BElementwiseOp, - typename Ops::CDEElementwiseOp, - SPECIALIZATION.conv_spec, - true, // DoPadGemmM - true, // DoPadGemmN - 1, // NumGemmKPrefetchStage - BLOCK.block_size, - BLOCK.per_block.m, - BLOCK.per_block.n, - BLOCK.per_block.k, - TUNING.ak1, - TUNING.bk1, - TUNING.m_per_xdl, - TUNING.n_per_dxl, - TUNING.m_xdl_per_wave, - TUNING.n_xdl_per_wave, - to_sequence_v, - to_sequence_v, - to_sequence_v, - A_BLOCK_TRANSFER.src_vector_dim, - A_BLOCK_TRANSFER.src_scaler_per_vector, - A_BLOCK_TRANSFER.dest_scaler_per_vector_k1, - A_BLOCK_TRANSFER.add_extra, - to_sequence_v, - to_sequence_v, - to_sequence_v, - B_BLOCK_TRANSFER.src_vector_dim, - B_BLOCK_TRANSFER.src_scaler_per_vector, - B_BLOCK_TRANSFER.dest_scaler_per_vector_k1, - B_BLOCK_TRANSFER.add_extra, - C_BLOCK_TRANSFER.m_xdl_per_wave_per_shuffle, - C_BLOCK_TRANSFER.n_xdl_per_wave_per_shuffle, - to_sequence_v, - C_BLOCK_TRANSFER.scaler_per_vector>; -}; - } // namespace ck_tile::builder \ No newline at end of file From 3aaf8b945a08a7ac87700bff5e2b79f8cf87c190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 10:15:05 +0000 Subject: [PATCH 10/39] Add type definitions for testing. --- .../builder/conv_algorithm_concepts.hpp | 68 +++++----- .../ck_tile/builder/conv_algorithm_limits.hpp | 11 ++ .../include/ck_tile/builder/conv_factory.hpp | 4 + .../test/impl/conv_algorithm_types.hpp | 118 ++++++++++++++++++ .../test/impl/conv_signature_types.hpp | 19 +++ 5 files changed, 190 insertions(+), 30 deletions(-) create mode 100644 experimental/builder/test/impl/conv_algorithm_types.hpp create mode 100644 experimental/builder/test/impl/conv_signature_types.hpp diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 6621a3aa99..023aa40e29 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -11,6 +11,10 @@ namespace ck_tile::builder { +/********************************************************************/ +/* Descriptors for individual elements of the algorithm description */ +/********************************************************************/ + // Concept for thread block dimensions for a GEMM problem. template concept ThreadBlockDescriptor = requires(T t) { @@ -20,12 +24,6 @@ concept ThreadBlockDescriptor = requires(T t) { { t.tile_size.k } -> std::convertible_to; }; -// Concept to check if struct specifies thread block info. -template -concept SpecifiesThreadBlock = requires { - { T::thread_block } -> ThreadBlockDescriptor; -}; - // Concept for parameters that describe a gridwise GEMM problem. template concept GridwiseGemmDescriptor = requires(T t) { @@ -37,12 +35,6 @@ concept GridwiseGemmDescriptor = requires(T t) { { t.n_xdl_per_wave } -> std::convertible_to; }; -// Concept to check if a struct specifies gridwise GEMM info. -template -concept SpecifiesGridwiseGemm = requires { - { T::tuning_params } -> GridwiseGemmDescriptor; -}; - // Concept for convolution input block transfer. template concept InputBlockTransferDescriptor = requires(T t) { @@ -60,14 +52,6 @@ concept OutputBlockTransferDescriptor = requires(T t) { { t.n_wave_per_xdl } -> std::convertible_to; }; -// Concept to check if a struct specifies convolution input and output block transfer info. -template -concept SpecifiesBlockTransfer = requires(T t) { - { T::block_transfer.thread_cluster_dims_a } -> InputBlockTransferDescriptor; - { T::block_transfer.thread_cluster_dims_b } -> InputBlockTransferDescriptor; - { T::block_transfer.thread_cluster_dims_c } -> OutputBlockTransferDescriptor; -}; - // Concept for the convolution input vector transfer. template concept InputVectorTransferDescriptor = requires(T t) { @@ -85,6 +69,40 @@ concept OutputVectorTransferDescriptor = requires(T t) { { t.scalar_per_vector } -> std::convertible_to; }; +// Concept for the thread cluster access order +template +concept AccessOrderDescriptor = requires(T t) { + { t.order } -> std::convertible_to>; +}; + +// No requirements yet for a ConvAlogorithm concept. +template +concept ConvAlgorithmDescriptor = std::is_class_v; + +/******************************************** */ +/* Requirements for the algorithm description */ +/******************************************** */ + +// Concept to check if struct specifies thread block info. +template +concept SpecifiesThreadBlock = requires { + { T::thread_block } -> ThreadBlockDescriptor; +}; + +// Concept to check if a struct specifies gridwise GEMM info. +template +concept SpecifiesGridwiseGemm = requires { + { T::tuning_params } -> GridwiseGemmDescriptor; +}; + +// Concept to check if a struct specifies convolution input and output block transfer info. +template +concept SpecifiesBlockTransfer = requires(T t) { + { T::block_transfer.thread_cluster_dims_a } -> InputBlockTransferDescriptor; + { T::block_transfer.thread_cluster_dims_b } -> InputBlockTransferDescriptor; + { T::block_transfer.thread_cluster_dims_c } -> OutputBlockTransferDescriptor; +}; + // Concept to check if a struct specifies block vector transfer info. template concept SpecifiesBlockVectorTransfer = requires(T t) { @@ -93,12 +111,6 @@ concept SpecifiesBlockVectorTransfer = requires(T t) { { T::block_transfer.vector_transfer_c } -> OutputVectorTransferDescriptor; }; -// Concept for the thread cluster access order -template -concept AccessOrderDescriptor = requires(T t) { - { t.order } -> std::convertible_to>; -}; - // Concept to check if a struct specifies thread cluster access order info. template concept SpecifiesThreadClusterAccessOrder = requires(T t) { @@ -119,8 +131,4 @@ concept SpecifiesGemmPipelineVersion = requires { { T::pipeline_version } -> std::convertible_to; }; -// No requirements yet for a ConvAlogorithm concept. -template -concept ConvAlgorithmDescriptor = std::is_class_v; - } // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp index c35d4f5f3a..79b304fc5d 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp @@ -25,4 +25,15 @@ concept OutputVectorTransferLimits = requires { Value.n_xdl_per_wave_per_shuffle > 0 ; }; +// Limits for access order. Must be a permutation of {0, 1, 2}. +template +concept AccessOrderLimits = requires { + requires ((Value.order[0] != Value.order[1]) && + (Value.order[0] != Value.order[2]) && + (Value.order[1] != Value.order[2]) && + (Value.order[0] >= 0 && Value.order[0] < 3) && + (Value.order[1] >= 0 && Value.order[1] < 3) && + (Value.order[2] >= 0 && Value.order[2] < 3)); +}; + } // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 4d7121ca31..b632a87d98 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -358,6 +358,10 @@ struct ConvFactory static_assert(InputVectorTransferLimits); static_assert(InputVectorTransferLimits); static_assert(OutputVectorTransferLimits); + static_assert(AccessOrderLimits); + static_assert(AccessOrderLimits); + static_assert(AccessOrderLimits); + static_assert(AccessOrderLimits); // The forward convolution kernel class instance. using Instance = diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp new file mode 100644 index 0000000000..fde91e6f85 --- /dev/null +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "" + +namespace ck_tile::builder::test +{ + +namespace ckb = ck_tile::builder; + +// Convenience struct for a tuple of m, n, and k values. +template +struct MNK +{ + T m{}; + T n{}; + T k{}; +}; + +// Specify thread block dimensions for a GEMM. +struct ThreadBlock +{ + // Thread block size. + int block_size; + // Size of the submatrix problem in a thread block. + MNK submatrix; +}; +static_assert(ckb::ThreadBlockDescriptor); + +// Describe some convolution tuning parameters. +struct ConvTuningParams +{ + // NOTE: ak1 and bk1 are difficult to verify in the kernel instantiation!!! + int ak1 = 0; + int bk1 = 0; + int m_per_xdl = 0; + int n_per_xdl = 0; + int m_xdl_per_wave = 0; + int n_xdl_per_wave = 0; +}; +static_assert(ckb::GridwiseGemmDescriptor); + +// Describe A block transfer thread cluster lengths. +struct InputBlockTransferLengths +{ + int k0; + int m; + int k1; +}; +static_assert(ckb::InputBlockTransferDescriptor); + +// Describe C block transfer thread cluster lengths. +struct OutputBlockTransferLengths +{ + int m_block; + int m_wave_per_xdl; + int n_block; + int n_wave_per_xdl; +}; +static_assert(OutputBlockTransferDescriptor); + +struct InputVectorTransfer +{ + size_t src_vector_dim; + size_t src_scaler_per_vector; + size_t dest_scaler_per_vector_k1; + bool add_extra; +}; +static_assert(InputVectorTransferDescriptor); + +struct OutputVectorTransfer +{ + size_t m_xdl_per_wave_per_shuffle; + size_t n_xdl_per_wave_per_shuffle; + size_t scalar_per_vector; +}; +static_assert(OutputVectorTransferDescriptor); + +struct AccessOrder +{ + std::array order; +}; +static_assert(AccessOrderDescriptor); + + +struct BlockTransfer +{ + InputBlockTransferLengths thread_cluster_dims_a; + InputBlockTransferLengths thread_cluster_dims_b; + OutputBlockTransferLengths thread_cluster_dims_c; + InputVectorTransfer vector_transfer_a; + InputVectorTransfer vector_transfer_b; + OutputVectorTransfer vector_transfer_c; + AccessOrder thread_cluster_access_order_a; + AccessOrder thread_cluster_access_order_b; + AccessOrder a_source_access_order; + AccessOrder b_source_access_order; +}; + +struct ConvAlgorithm +{ + ThreadBlock thread_block; + ConvTuningParams tuning_params; + BlockTransfer block_transfer; + BlockGemmPipelineVersion pipeline_version; +}; +static_assert(ckb::ConvAlgorithmDescriptor); +static_assert(ckb::SpecifiesThreadBlock); +static_assert(ckb::SpecifiesGridwiseGemm); +static_assert(ckb::SpecifiesGemmPipelineVersion); +static_assert(ckb::SpecifiesBlockTransfer); +static_assert(ckb::SpecifiesBlockVectorTransfer); +static_assert(ckb::SpecifiesThreadClusterAccessOrder); +static_assert(ckb::SpecifiesSourceAccessOrder); + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp new file mode 100644 index 0000000000..db735447af --- /dev/null +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "" + +namespace ck_tile::builder { + +struct ConvSignature +{ + int spatial_dim; + ConvDirection direction; + GroupConvLayout layout; + DataType data_type; +}; +static_assert(ConvSignatureDescriptor); + +} // namespace ck_tile::builder From 7b2a62222c4c820aae471f02925a3e68f84b447c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 11:07:59 +0000 Subject: [PATCH 11/39] Add placeholder test. --- experimental/builder/test/CMakeLists.txt | 3 +++ .../builder/test/impl/conv_algorithm_types.hpp | 12 ++++++------ .../builder/test/test_ckb_build_fwd_instances.cpp | 13 +++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 experimental/builder/test/test_ckb_build_fwd_instances.cpp diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 5890aa8dcd..06ea20e396 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -18,3 +18,6 @@ endfunction() add_ck_builder_test(test_conv_builder test_conv_builder.cpp) + +add_ck_builder_test(test_ckb_build_fwd_instances + test_ckb_build_fwd_instances.cpp) \ No newline at end of file diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index fde91e6f85..00661b60fe 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -3,7 +3,7 @@ #pragma once -#include "" +#include "ck_tile/builder/conv_algorithm_concepts.hpp" namespace ck_tile::builder::test { @@ -25,7 +25,7 @@ struct ThreadBlock // Thread block size. int block_size; // Size of the submatrix problem in a thread block. - MNK submatrix; + MNK tile_size; }; static_assert(ckb::ThreadBlockDescriptor); @@ -64,8 +64,8 @@ static_assert(OutputBlockTransferDescriptor); struct InputVectorTransfer { size_t src_vector_dim; - size_t src_scaler_per_vector; - size_t dest_scaler_per_vector_k1; + size_t src_scalar_per_vector; + size_t dest_scalar_per_vector_k1; bool add_extra; }; static_assert(InputVectorTransferDescriptor); @@ -95,8 +95,8 @@ struct BlockTransfer OutputVectorTransfer vector_transfer_c; AccessOrder thread_cluster_access_order_a; AccessOrder thread_cluster_access_order_b; - AccessOrder a_source_access_order; - AccessOrder b_source_access_order; + AccessOrder src_access_order_a; + AccessOrder src_access_order_b; }; struct ConvAlgorithm diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp new file mode 100644 index 0000000000..596f603d42 --- /dev/null +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -0,0 +1,13 @@ +#include + +#include "impl/conv_algorithm_types.hpp" + +class FwdConvBuilderTest : public ::testing::Test +{ +}; + +TEST_F(FwdConvBuilderTest, CreateInvoker) +{ + // TODO: Implement actual test + EXPECT_TRUE(true); +} From 11e71abd80912d939c96e0c725a94365e3b85e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 11:11:24 +0000 Subject: [PATCH 12/39] Add convolution builder definition. --- .../include/ck_tile/builder/conv_builder.hpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 experimental/builder/include/ck_tile/builder/conv_builder.hpp diff --git a/experimental/builder/include/ck_tile/builder/conv_builder.hpp b/experimental/builder/include/ck_tile/builder/conv_builder.hpp new file mode 100644 index 0000000000..82c0556e1c --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/conv_builder.hpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include +#include + +namespace ck_tile::builder { + +/** + * @brief Top-level builder for creating convolution kernel instances. + * + * This struct serves as the main entry point for generating a convolution kernel. + * It uses a factory pattern based on the provided signature, algorithm, and version + * to construct the appropriate kernel instance. + * + * @tparam SIGNATURE The convolution signature, which describes the mathematical functionality of + * the algorithm (e.g., data types, layouts, direction). + * @tparam ALGORITHM The specific convolution algorithm to be used for the implementation. + * @tparam VERSION The version of the builder implementation. + */ +template + requires SupportedVersion && ValidConvSignature +struct ConvBuilder +{ + static constexpr auto kVersion = VERSION; + using Factory = ConvFactory; + // Output: The kernel class. + using Instance = Factory::Instance; +}; + +} // namespace ck_tile::builder \ No newline at end of file From 7b894868e81bbf2431a5165ce5daad9586dca6c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 13:01:06 +0000 Subject: [PATCH 13/39] Fully functional fwd conv builder. --- .../include/ck_tile/builder/builder_utils.hpp | 4 +- .../builder/conv_algorithm_concepts.hpp | 4 +- .../include/ck_tile/builder/conv_builder.hpp | 2 +- .../include/ck_tile/builder/conv_factory.hpp | 115 +++++++++--------- .../builder/conv_signature_concepts.hpp | 2 +- .../include/ck_tile/builder/versions.hpp | 2 +- .../test/impl/conv_algorithm_types.hpp | 37 +++--- .../test/impl/conv_signature_types.hpp | 2 +- .../test/test_ckb_build_fwd_instances.cpp | 70 ++++++++++- 9 files changed, 153 insertions(+), 85 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp index 1b51453a4d..efa924303e 100644 --- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -3,7 +3,7 @@ #pragma once #include -#include +#include namespace ck_tile::builder { @@ -107,4 +107,4 @@ constexpr std::string_view LayoutToString(GroupConvLayout layout) } } -} // namespace ck_tile::builder \ No newline at end of file +} // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 023aa40e29..84eb741a0b 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -39,7 +39,7 @@ concept GridwiseGemmDescriptor = requires(T t) { template concept InputBlockTransferDescriptor = requires(T t) { { t.k0 } -> std::convertible_to; - { t.m } -> std::convertible_to; + { t.m_n } -> std::convertible_to; { t.k1 } -> std::convertible_to; }; @@ -72,7 +72,7 @@ concept OutputVectorTransferDescriptor = requires(T t) { // Concept for the thread cluster access order template concept AccessOrderDescriptor = requires(T t) { - { t.order } -> std::convertible_to>; + { t.order } -> std::convertible_to>; }; // No requirements yet for a ConvAlogorithm concept. diff --git a/experimental/builder/include/ck_tile/builder/conv_builder.hpp b/experimental/builder/include/ck_tile/builder/conv_builder.hpp index 82c0556e1c..3ea223c16e 100644 --- a/experimental/builder/include/ck_tile/builder/conv_builder.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_builder.hpp @@ -35,4 +35,4 @@ struct ConvBuilder using Instance = Factory::Instance; }; -} // namespace ck_tile::builder \ No newline at end of file +} // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index b632a87d98..1ad75f6b73 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -44,7 +44,7 @@ namespace ck_tile::builder::factory_internal { // Type mappings from the builder GroupConvLayout enum class to the CK tensor data types. -template +template requires(ConvSpatialDim) struct ConvTensorLayouts { @@ -145,9 +145,11 @@ struct ConvPassThroughOps // The algorithm specializations for the convolution and GEMM. template requires( - std::is_same_v || - std::is_same_v) + std::is_same_v + // || + // std::is_same_v + ) struct ConvSpec { CONV_ENUM conv_spec; @@ -167,14 +169,13 @@ struct MNK }; struct ConvBlock { - int block_size = 0; + size_t block_size = 0; MNK per_block = {}; }; template constexpr ConvBlock SetThreadBlockInfo() { - using AlgorithmType = decltype(ALGORITHM); constexpr auto& TB = ALGORITHM.thread_block; return ConvBlock{ .block_size = TB.block_size, @@ -185,24 +186,23 @@ constexpr ConvBlock SetThreadBlockInfo() // Convolution tuning parameters. struct ConvTuning { - int ak1 = 0; - int bk1 = 0; - int m_per_xdl = 0; - int n_per_dxl = 0; - int m_xdl_per_wave = 0; - int n_xdl_per_wave = 0; + size_t ak1 = 0; + size_t bk1 = 0; + size_t m_per_xdl = 0; + size_t n_per_xdl = 0; + size_t m_xdl_per_wave = 0; + size_t n_xdl_per_wave = 0; }; template constexpr ConvTuning SetConvTuningInfo() { - using AlgorithmType = decltype(ALGORITHM); constexpr auto& TP = ALGORITHM.tuning_params; return ConvTuning{ .ak1 = TP.ak1, .bk1 = TP.bk1, .m_per_xdl = TP.m_per_xdl, - .n_per_dxl = TP.n_per_dxl, + .n_per_xdl = TP.n_per_xdl, .m_xdl_per_wave = TP.m_xdl_per_wave, .n_xdl_per_wave = TP.n_xdl_per_wave, }; @@ -211,28 +211,27 @@ constexpr ConvTuning SetConvTuningInfo() // Block transfer parameters for A or B tensor. struct BlockTransfer { - ck::Array thread_cluster_dims = {0, 0, 0}; // k0, m, k1 - ck::Array thread_cluster_order = {0, 0, 0}; - ck::Array src_access_order = {0, 0, 0}; - int src_vector_dim = 0; - int src_scalar_per_vector = 0; - int dest_scalar_per_vector_k1 = 0; - int add_extra = 0; + ck::Array thread_cluster_dims = {0, 0, 0}; // k0, m, k1 + ck::Array thread_cluster_order = {0, 0, 0}; + ck::Array src_access_order = {0, 0, 0}; + size_t src_vector_dim = 0; + size_t src_scalar_per_vector = 0; + size_t dest_scalar_per_vector_k1 = 0; + size_t add_extra = 0; }; template constexpr BlockTransfer SetFwdConvABlockTransfer() { - using AlgorithmType = decltype(ALGORITHM); constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_a; constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_a; constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_a; BlockTransfer block_transfer{ - .thread_cluster_dims = {TCL.k0, TCL.m, TCL.k1}, - .thread_cluster_order = TCO.order, - .src_access_order = SAO.order, + .thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, + .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, + .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, .src_vector_dim = VTD.src_vector_dim, .src_scalar_per_vector = VTD.src_scalar_per_vector, .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, @@ -244,16 +243,15 @@ constexpr BlockTransfer SetFwdConvABlockTransfer() template constexpr BlockTransfer SetFwdConvBBlockTransfer() { - using AlgorithmType = decltype(ALGORITHM); constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_b; constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_b; constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_b; BlockTransfer block_transfer{ - .thread_cluster_dims = {TCL.k0, TCL.m, TCL.k1}, - .thread_cluster_order = TCO.order, - .src_access_order = SAO.order, + .thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, + .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, + .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, .src_vector_dim = VTD.src_vector_dim, .src_scalar_per_vector = VTD.src_scalar_per_vector, .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, @@ -265,30 +263,35 @@ constexpr BlockTransfer SetFwdConvBBlockTransfer() // Block transfer parameters for C tensor. struct CBlockTransfer { - int m_xdl_per_wave_per_shuffle = 0; - int n_xdl_per_wave_per_shuffle = 0; - ck::Array thread_cluster_dims = {0, 0, 0, 0}; - int scaler_per_vector = 0; + size_t m_xdl_per_wave_per_shuffle = 0; + size_t n_xdl_per_wave_per_shuffle = 0; + ck::Array thread_cluster_dims = {0, 0, 0, 0}; + size_t scalar_per_vector = 0; }; template constexpr CBlockTransfer SetCBlockTransfer() { - using AlgorithmType = decltype(ALGORITHM); constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_c; - block_transfer.thread_cluster_dims = { - TCL.m_block, - TCL.m_wave_per_xdl, - TCL.n_block, - TCL.n_wave_per_xdl, - }; + constexpr auto& VTC = ALGORITHM.block_transfer.vector_transfer_c; + CBlockTransfer block_transfer + { + .m_xdl_per_wave_per_shuffle = VTC.m_xdl_per_wave_per_shuffle, + .n_xdl_per_wave_per_shuffle = VTC.n_xdl_per_wave_per_shuffle, + .thread_cluster_dims = { + TCL.m_block, + TCL.m_wave_per_xdl, + TCL.n_block, + TCL.n_wave_per_xdl, + }, + .scalar_per_vector = VTC.scalar_per_vector + }; return block_transfer; } template constexpr ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() { - using AlgorithmType = decltype(ALGORITHM); switch(ALGORITHM.pipeline_version) { case BlockGemmPipelineVersion::V1: return ck::BlockGemmPipelineVersion::v1; @@ -316,7 +319,7 @@ template struct ConvFactory { - static constexpr int SPATIAL_DIM = SIGNATURE.spatial_dim; + static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim; using Layouts = factory_internal::ConvTensorLayouts; using Types = factory_internal::ConvTensorTypes; using Ops = factory_internal::ConvPassThroughOps; @@ -355,13 +358,13 @@ struct ConvFactory // Check limits for the algorithm parameters. // TODO: Add more limits checks as needed. - static_assert(InputVectorTransferLimits); - static_assert(InputVectorTransferLimits); - static_assert(OutputVectorTransferLimits); - static_assert(AccessOrderLimits); - static_assert(AccessOrderLimits); - static_assert(AccessOrderLimits); - static_assert(AccessOrderLimits); + // static_assert(InputVectorTransferLimits); + // static_assert(InputVectorTransferLimits); + // static_assert(OutputVectorTransferLimits); + // static_assert(AccessOrderLimits); + // static_assert(AccessOrderLimits); + // static_assert(AccessOrderLimits); + // static_assert(AccessOrderLimits); // The forward convolution kernel class instance. using Instance = @@ -389,29 +392,29 @@ struct ConvFactory TUNING.ak1, TUNING.bk1, TUNING.m_per_xdl, - TUNING.n_per_dxl, + TUNING.n_per_xdl, TUNING.m_xdl_per_wave, TUNING.n_xdl_per_wave, to_sequence_v, to_sequence_v, to_sequence_v, A_BLOCK_TRANSFER.src_vector_dim, - A_BLOCK_TRANSFER.src_scaler_per_vector, - A_BLOCK_TRANSFER.dest_scaler_per_vector_k1, + A_BLOCK_TRANSFER.src_scalar_per_vector, + A_BLOCK_TRANSFER.dest_scalar_per_vector_k1, A_BLOCK_TRANSFER.add_extra, to_sequence_v, to_sequence_v, to_sequence_v, B_BLOCK_TRANSFER.src_vector_dim, - B_BLOCK_TRANSFER.src_scaler_per_vector, - B_BLOCK_TRANSFER.dest_scaler_per_vector_k1, + B_BLOCK_TRANSFER.src_scalar_per_vector, + B_BLOCK_TRANSFER.dest_scalar_per_vector_k1, B_BLOCK_TRANSFER.add_extra, C_BLOCK_TRANSFER.m_xdl_per_wave_per_shuffle, C_BLOCK_TRANSFER.n_xdl_per_wave_per_shuffle, to_sequence_v, - C_BLOCK_TRANSFER.scaler_per_vector, + C_BLOCK_TRANSFER.scalar_per_vector, PIPELINE_SCHEDULER, PIPELINE_VERSION>; }; -} // namespace ck_tile::builder \ No newline at end of file +} // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index f666c0189c..714c47f6f0 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -61,4 +61,4 @@ concept ConvDirectionIsBackwardData = (Sig.direction == ConvDirection::BACKWARD_ template concept ConvDirectionIsBackwardWeight = (Sig.direction == ConvDirection::BACKWARD_WEIGHT); -} // namespace ck_tile::builder \ No newline at end of file +} // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/versions.hpp b/experimental/builder/include/ck_tile/builder/versions.hpp index 89618b8913..c7543f11ae 100644 --- a/experimental/builder/include/ck_tile/builder/versions.hpp +++ b/experimental/builder/include/ck_tile/builder/versions.hpp @@ -15,4 +15,4 @@ static constexpr StringLiteral LATEST_API_VERSION = V0_1_0; template concept SupportedVersion = (V == V0_0_0) || (V == V0_1_0); -} // namespace ck_tile::builder \ No newline at end of file +} // namespace ck_tile::builder diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index 00661b60fe..3f9ecc3754 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -23,9 +23,9 @@ struct MNK struct ThreadBlock { // Thread block size. - int block_size; + size_t block_size; // Size of the submatrix problem in a thread block. - MNK tile_size; + MNK tile_size; }; static_assert(ckb::ThreadBlockDescriptor); @@ -33,31 +33,31 @@ static_assert(ckb::ThreadBlockDescriptor); struct ConvTuningParams { // NOTE: ak1 and bk1 are difficult to verify in the kernel instantiation!!! - int ak1 = 0; - int bk1 = 0; - int m_per_xdl = 0; - int n_per_xdl = 0; - int m_xdl_per_wave = 0; - int n_xdl_per_wave = 0; + size_t ak1 = 0; + size_t bk1 = 0; + size_t m_per_xdl = 0; + size_t n_per_xdl = 0; + size_t m_xdl_per_wave = 0; + size_t n_xdl_per_wave = 0; }; static_assert(ckb::GridwiseGemmDescriptor); // Describe A block transfer thread cluster lengths. struct InputBlockTransferLengths { - int k0; - int m; - int k1; + size_t k0; + size_t m_n; + size_t k1; }; static_assert(ckb::InputBlockTransferDescriptor); // Describe C block transfer thread cluster lengths. struct OutputBlockTransferLengths { - int m_block; - int m_wave_per_xdl; - int n_block; - int n_wave_per_xdl; + size_t m_block; + size_t m_wave_per_xdl; + size_t n_block; + size_t n_wave_per_xdl; }; static_assert(OutputBlockTransferDescriptor); @@ -80,12 +80,11 @@ static_assert(OutputVectorTransferDescriptor); struct AccessOrder { - std::array order; + std::array order; }; static_assert(AccessOrderDescriptor); - -struct BlockTransfer +struct InputOutputBlockTransfer { InputBlockTransferLengths thread_cluster_dims_a; InputBlockTransferLengths thread_cluster_dims_b; @@ -103,7 +102,7 @@ struct ConvAlgorithm { ThreadBlock thread_block; ConvTuningParams tuning_params; - BlockTransfer block_transfer; + InputOutputBlockTransfer block_transfer; BlockGemmPipelineVersion pipeline_version; }; static_assert(ckb::ConvAlgorithmDescriptor); diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index db735447af..7ad47c6baa 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -3,7 +3,7 @@ #pragma once -#include "" +#include "ck_tile/builder/conv_signature_concepts.hpp" namespace ck_tile::builder { diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 596f603d42..e2cc0f0eef 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -1,6 +1,10 @@ #include #include "impl/conv_algorithm_types.hpp" +#include "impl/conv_signature_types.hpp" +#include "ck_tile/builder/conv_builder.hpp" +#include "ck_tile/builder/conv_algorithm_concepts.hpp" +#include "ck_tile/builder/conv_signature_concepts.hpp" class FwdConvBuilderTest : public ::testing::Test { @@ -8,6 +12,68 @@ class FwdConvBuilderTest : public ::testing::Test TEST_F(FwdConvBuilderTest, CreateInvoker) { - // TODO: Implement actual test - EXPECT_TRUE(true); + using namespace ck_tile::builder; + using namespace ck_tile::builder::test; + + constexpr ConvSignature FwdConvSignature + { + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout::CHANNELS_LAST, + .data_type = DataType::BF16 + }; + + constexpr ThreadBlock FwdThreadBlock + { + .block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32} + }; + + constexpr ConvTuningParams FwdTuningParams + { + .ak1 = 8, + .bk1 = 8, + .m_per_xdl=32, + .n_per_xdl = 32, + .m_xdl_per_wave = 4, + .n_xdl_per_wave = 4 + }; + + constexpr InputOutputBlockTransfer FwdBlockTransfer + { + .thread_cluster_dims_a = {.k0 = 4, .m_n = 64, .k1 = 1}, + .thread_cluster_dims_b = {.k0 = 4, .m_n = 64, .k1 = 1}, + .thread_cluster_dims_c = { + .m_block = 1, .m_wave_per_xdl = 32, .n_block = 1, .n_wave_per_xdl = 8}, + .vector_transfer_a = { + .src_vector_dim = 2, .src_scalar_per_vector = 2, .dest_scalar_per_vector_k1 = 8, .add_extra = false}, + .vector_transfer_b = { + .src_vector_dim = 2, .src_scalar_per_vector = 8, .dest_scalar_per_vector_k1 = 8, .add_extra = false}, + .vector_transfer_c = { + .m_xdl_per_wave_per_shuffle = 1, .n_xdl_per_wave_per_shuffle = 1, .scalar_per_vector = 8}, + .thread_cluster_access_order_a = {1, 0, 2}, + .thread_cluster_access_order_b = {1, 0, 2}, + .src_access_order_a = {1, 0, 2}, + .src_access_order_b = {1, 0, 2} + }; + + constexpr ConvAlgorithm FwdConvAlgorithm + { + .thread_block = FwdThreadBlock, + .tuning_params = FwdTuningParams, + .block_transfer = FwdBlockTransfer, + .pipeline_version = BlockGemmPipelineVersion::V4, + }; + + using Builder = ConvBuilder; + //const auto kernel_string = Builder::Instance::GetTypeString(); + //std::cout << "Generated kernel: " << kernel_string << std::endl; + + // The invoker is the entrypoint to launch the kernel. + // Creating the invoker triggers the validation of the builder configuration, + // that is, the combination of all builder parameters is checked at compile time. + auto invoker = Builder::Instance::MakeInvoker(); + + // TODO: Prepare actual data and launch the kernel. + (void)invoker; } From 16df5babda2b19149805376832547e0c22c7b127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 13:17:34 +0000 Subject: [PATCH 14/39] Test improvements. --- .../test/test_ckb_build_fwd_instances.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index e2cc0f0eef..392377b76a 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -10,7 +10,7 @@ class FwdConvBuilderTest : public ::testing::Test { }; -TEST_F(FwdConvBuilderTest, CreateInvoker) +TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance) { using namespace ck_tile::builder; using namespace ck_tile::builder::test; @@ -66,14 +66,15 @@ TEST_F(FwdConvBuilderTest, CreateInvoker) }; using Builder = ConvBuilder; - //const auto kernel_string = Builder::Instance::GetTypeString(); - //std::cout << "Generated kernel: " << kernel_string << std::endl; + + auto instance = Builder::Instance{}; + + const auto kernel_string = instance.GetTypeString(); + std::cout << "Generated kernel: " << kernel_string << std::endl; + EXPECT_GT(kernel_string.size(), 0); - // The invoker is the entrypoint to launch the kernel. - // Creating the invoker triggers the validation of the builder configuration, - // that is, the combination of all builder parameters is checked at compile time. - auto invoker = Builder::Instance::MakeInvoker(); + EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")); - // TODO: Prepare actual data and launch the kernel. - (void)invoker; + const auto invoker_ptr = instance.MakeInvokerPointer(); + EXPECT_NE(invoker_ptr, nullptr); } From c6a1fa4dfb90ae849dcbdaf417506962998f41e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 13:24:54 +0000 Subject: [PATCH 15/39] Clean-up include headers. --- .../include/ck_tile/builder/builder_utils.hpp | 5 +++-- .../ck_tile/builder/conv_algorithm_concepts.hpp | 3 ++- .../include/ck_tile/builder/conv_builder.hpp | 4 ++-- .../include/ck_tile/builder/conv_factory.hpp | 14 +++++++------- .../ck_tile/builder/conv_signature_concepts.hpp | 2 +- .../builder/include/ck_tile/builder/versions.hpp | 2 +- .../builder/test/test_ckb_build_fwd_instances.cpp | 2 -- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp index efa924303e..5f6b4fb013 100644 --- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -2,8 +2,9 @@ // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include -#include + +#include "ck/utility/sequence.hpp" +#include "ck_tile/builder/types.hpp" namespace ck_tile::builder { diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 84eb741a0b..1646d6c69d 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -6,7 +6,8 @@ #include #include #include -#include "types.hpp" + +#include "ck_tile/builder/types.hpp" namespace ck_tile::builder { diff --git a/experimental/builder/include/ck_tile/builder/conv_builder.hpp b/experimental/builder/include/ck_tile/builder/conv_builder.hpp index 3ea223c16e..2830d600ee 100644 --- a/experimental/builder/include/ck_tile/builder/conv_builder.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_builder.hpp @@ -6,8 +6,8 @@ #include #include -#include -#include +#include "ck_tile/builder/conv_factory.hpp" +#include "ck_tile/builder/versions.hpp" namespace ck_tile::builder { diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 1ad75f6b73..952004d38e 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -33,13 +33,13 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp" +#include "ck_tile/builder/conv_signature_concepts.hpp" +#include "ck_tile/builder/conv_algorithm_concepts.hpp" +#include "ck_tile/builder/conv_algorithm_limits.hpp" +#include "ck_tile/builder/builder_utils.hpp" +#include "ck_tile/builder/types.hpp" +#include "ck_tile/builder/versions.hpp" namespace ck_tile::builder::factory_internal { diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index 714c47f6f0..bd104948a1 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include "ck_tile/builder/types.hpp" namespace ck_tile::builder { diff --git a/experimental/builder/include/ck_tile/builder/versions.hpp b/experimental/builder/include/ck_tile/builder/versions.hpp index c7543f11ae..e8fb2fe4de 100644 --- a/experimental/builder/include/ck_tile/builder/versions.hpp +++ b/experimental/builder/include/ck_tile/builder/versions.hpp @@ -3,7 +3,7 @@ #include #include -#include +#include "ck_tile/builder/builder_utils.hpp" namespace ck_tile::builder { diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 392377b76a..647b5ff294 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -3,8 +3,6 @@ #include "impl/conv_algorithm_types.hpp" #include "impl/conv_signature_types.hpp" #include "ck_tile/builder/conv_builder.hpp" -#include "ck_tile/builder/conv_algorithm_concepts.hpp" -#include "ck_tile/builder/conv_signature_concepts.hpp" class FwdConvBuilderTest : public ::testing::Test { From 6cf8cc12dfcc94752b3c291d734b1de07fd952eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 13:30:50 +0000 Subject: [PATCH 16/39] Enable the limit checks for the convolution algorithm parameters. --- .../ck_tile/builder/conv_algorithm_limits.hpp | 12 +++--- .../include/ck_tile/builder/conv_factory.hpp | 41 ++++++++++--------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp index 79b304fc5d..6b3aac0b2f 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp @@ -28,12 +28,12 @@ concept OutputVectorTransferLimits = requires { // Limits for access order. Must be a permutation of {0, 1, 2}. template concept AccessOrderLimits = requires { - requires ((Value.order[0] != Value.order[1]) && - (Value.order[0] != Value.order[2]) && - (Value.order[1] != Value.order[2]) && - (Value.order[0] >= 0 && Value.order[0] < 3) && - (Value.order[1] >= 0 && Value.order[1] < 3) && - (Value.order[2] >= 0 && Value.order[2] < 3)); + requires ((Value[0] != Value[1]) && + (Value[0] != Value[2]) && + (Value[1] != Value[2]) && + (Value[0] >= 0 && Value[0] < 3) && + (Value[1] >= 0 && Value[1] < 3) && + (Value[2] >= 0 && Value[2] < 3)); }; } // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 952004d38e..187580e699 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -6,30 +6,33 @@ // This file translates a semantic description of a convolution operation // (`ConvSignatureDescriptor` and `ConvAlgorithmDescriptor`) into specific, // low-level template arguments required by the underlying CK device-level -// kernel implementations. This abstraction also enables more complex build +// kernel implementations. This abstraction enables more complex build // time logic and simplifies the kernel specification. // // Key Components: // // Template Metaprogram: // - ConvFactory: The main factory, with specializations for different -// convolution directions. +// convolution directions (currently only forward). // // Template Metaprogram Helpers: -// - ConvTensorLayouts: Maps layout enums to CK layout types. -// - ConvTensorTypes: Maps data type enums to C++ types used by CK. +// - ConvTensorLayouts: Maps layout enums to CK layout types for different +// spatial dimensions (2D/3D) and directions. +// - ConvTensorTypes: Maps data type enums (FP16, BF16, FP32) to C++ types used by CK. // - ConvPassThroughOps: Hard-coded pass-through element-wise operations. +// - ConvSpec: Encapsulates convolution and GEMM specialization enums. // // `constexpr` Helper Functions: -// - SetThreadBlockInfo: Determines thread block dimensions from the algorithm -// descriptor or provides defaults. -// - SetConvTuningInfo: Sets low-level tuning parameters. -// - Set*BlockTransfer: Configures tensor data movement parameters for -// tensors A, B, and C. -// - SetBlockGemmPipelineVersion: Selects the GEMM pipeline version. +// - SetThreadBlockInfo: Determines thread block dimensions and tile sizes. +// - SetConvTuningInfo: Sets XDL and AK1/BK1 tuning parameters. +// - SetFwdConvABlockTransfer: Configures A tensor block transfer parameters. +// - SetFwdConvBBlockTransfer: Configures B tensor block transfer parameters. +// - SetCBlockTransfer: Configures C tensor block transfer parameters. +// - SetBlockGemmPipelineVersion: Maps pipeline version enum to CK types. // -// The primary entry point is the `ConvFactory` struct, which is specialized -// for forward and backward-data convolutions. +// The primary entry point is the `ConvFactory` struct, which is currently +// specialized for forward convolutions and produces instances of +// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3. #pragma once @@ -358,13 +361,13 @@ struct ConvFactory // Check limits for the algorithm parameters. // TODO: Add more limits checks as needed. - // static_assert(InputVectorTransferLimits); - // static_assert(InputVectorTransferLimits); - // static_assert(OutputVectorTransferLimits); - // static_assert(AccessOrderLimits); - // static_assert(AccessOrderLimits); - // static_assert(AccessOrderLimits); - // static_assert(AccessOrderLimits); + static_assert(InputVectorTransferLimits); + static_assert(InputVectorTransferLimits); + static_assert(OutputVectorTransferLimits); + static_assert(AccessOrderLimits); + static_assert(AccessOrderLimits); + static_assert(AccessOrderLimits); + static_assert(AccessOrderLimits); // The forward convolution kernel class instance. using Instance = From c76954b00b311e7fb9c8546419b44e12b11184a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 13:31:42 +0000 Subject: [PATCH 17/39] Remove dead code. --- experimental/builder/include/ck_tile/builder/conv_factory.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 187580e699..12da280f3d 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -149,9 +149,6 @@ struct ConvPassThroughOps template requires( std::is_same_v - // || - // std::is_same_v ) struct ConvSpec { From c3f5097e2e030821688b2912a146ba3b65c879bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 13:36:57 +0000 Subject: [PATCH 18/39] clang formatting. --- .../include/ck_tile/builder/builder_utils.hpp | 21 +-- .../builder/conv_algorithm_concepts.hpp | 15 +- .../ck_tile/builder/conv_algorithm_limits.hpp | 22 ++- .../include/ck_tile/builder/conv_builder.hpp | 2 +- .../include/ck_tile/builder/conv_factory.hpp | 130 +++++++++--------- .../builder/conv_signature_concepts.hpp | 6 +- .../test/impl/conv_algorithm_types.hpp | 5 +- .../test/impl/conv_signature_types.hpp | 2 +- .../test/test_ckb_build_fwd_instances.cpp | 83 ++++++----- 9 files changed, 137 insertions(+), 149 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp index 5f6b4fb013..99ff8d8fc6 100644 --- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -6,8 +6,7 @@ #include "ck/utility/sequence.hpp" #include "ck_tile/builder/types.hpp" -namespace ck_tile::builder -{ +namespace ck_tile::builder { // Convert a static array to a sequence // Usage example: @@ -42,15 +41,21 @@ using to_sequence_v = typename to_sequence_t, // Wrapper function to make constexpr strings a structural type for NTTP. template -struct StringLiteral { +struct StringLiteral +{ char data[N]; - constexpr StringLiteral(const char (&str)[N]) { - for (size_t i = 0; i < N; ++i) data[i] = str[i]; + constexpr StringLiteral(const char (&str)[N]) + { + for(size_t i = 0; i < N; ++i) + data[i] = str[i]; } - constexpr bool operator==(const StringLiteral& other) const { - for (size_t i = 0; i < N; ++i) { - if (data[i] != other.data[i]) { + constexpr bool operator==(const StringLiteral& other) const + { + for(size_t i = 0; i < N; ++i) + { + if(data[i] != other.data[i]) + { return false; } } diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 1646d6c69d..8db6df6e9c 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -9,8 +9,7 @@ #include "ck_tile/builder/types.hpp" -namespace ck_tile::builder -{ +namespace ck_tile::builder { /********************************************************************/ /* Descriptors for individual elements of the algorithm description */ @@ -19,10 +18,10 @@ namespace ck_tile::builder // Concept for thread block dimensions for a GEMM problem. template concept ThreadBlockDescriptor = requires(T t) { - { t.block_size } -> std::convertible_to; - { t.tile_size.m } -> std::convertible_to; - { t.tile_size.n } -> std::convertible_to; - { t.tile_size.k } -> std::convertible_to; + { t.block_size } -> std::convertible_to; + { t.tile_size.m } -> std::convertible_to; + { t.tile_size.n } -> std::convertible_to; + { t.tile_size.k } -> std::convertible_to; }; // Concept for parameters that describe a gridwise GEMM problem. @@ -67,8 +66,8 @@ template concept OutputVectorTransferDescriptor = requires(T t) { { t.m_xdl_per_wave_per_shuffle } -> std::convertible_to; { t.n_xdl_per_wave_per_shuffle } -> std::convertible_to; - { t.scalar_per_vector } -> std::convertible_to; -}; + { t.scalar_per_vector } -> std::convertible_to; +}; // Concept for the thread cluster access order template diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp index 6b3aac0b2f..0662976520 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp @@ -6,34 +6,28 @@ #include #include -namespace ck_tile::builder -{ +namespace ck_tile::builder { // Limits for input vector transfer. template concept InputVectorTransferLimits = requires { - requires Value.src_vector_dim > 0 && - Value.src_scalar_per_vector > 0 && - Value.dest_scalar_per_vector_k1 > 0; + requires Value.src_vector_dim > 0 && Value.src_scalar_per_vector > 0 && + Value.dest_scalar_per_vector_k1 > 0; }; // Limits for output vector transfer. template concept OutputVectorTransferLimits = requires { - requires Value.scalar_per_vector > 0 && - Value.m_xdl_per_wave_per_shuffle > 0 && - Value.n_xdl_per_wave_per_shuffle > 0 ; + requires Value.scalar_per_vector > 0 && Value.m_xdl_per_wave_per_shuffle > 0 && + Value.n_xdl_per_wave_per_shuffle > 0; }; // Limits for access order. Must be a permutation of {0, 1, 2}. template concept AccessOrderLimits = requires { - requires ((Value[0] != Value[1]) && - (Value[0] != Value[2]) && - (Value[1] != Value[2]) && - (Value[0] >= 0 && Value[0] < 3) && - (Value[1] >= 0 && Value[1] < 3) && - (Value[2] >= 0 && Value[2] < 3)); + requires((Value[0] != Value[1]) && (Value[0] != Value[2]) && (Value[1] != Value[2]) && + (Value[0] >= 0 && Value[0] < 3) && (Value[1] >= 0 && Value[1] < 3) && + (Value[2] >= 0 && Value[2] < 3)); }; } // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_builder.hpp b/experimental/builder/include/ck_tile/builder/conv_builder.hpp index 2830d600ee..d74948709b 100644 --- a/experimental/builder/include/ck_tile/builder/conv_builder.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_builder.hpp @@ -30,7 +30,7 @@ template ; + using Factory = ConvFactory; // Output: The kernel class. using Instance = Factory::Instance; }; diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 12da280f3d..26cad0b8e5 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -4,8 +4,8 @@ // A factory for instantiating CK convolution kernels. // // This file translates a semantic description of a convolution operation -// (`ConvSignatureDescriptor` and `ConvAlgorithmDescriptor`) into specific, -// low-level template arguments required by the underlying CK device-level +// (`ConvSignatureDescriptor` and `ConvAlgorithmDescriptor`) into specific, +// low-level template arguments required by the underlying CK device-level // kernel implementations. This abstraction enables more complex build // time logic and simplifies the kernel specification. // @@ -148,8 +148,7 @@ struct ConvPassThroughOps // The algorithm specializations for the convolution and GEMM. template requires( - std::is_same_v - ) + std::is_same_v) struct ConvSpec { CONV_ENUM conv_spec; @@ -170,17 +169,15 @@ struct MNK struct ConvBlock { size_t block_size = 0; - MNK per_block = {}; + MNK per_block = {}; }; template constexpr ConvBlock SetThreadBlockInfo() { constexpr auto& TB = ALGORITHM.thread_block; - return ConvBlock{ - .block_size = TB.block_size, - .per_block = {.m = TB.tile_size.m, .n = TB.tile_size.n, .k = TB.tile_size.k} - }; + return ConvBlock{.block_size = TB.block_size, + .per_block = {.m = TB.tile_size.m, .n = TB.tile_size.n, .k = TB.tile_size.k}}; } // Convolution tuning parameters. @@ -223,40 +220,36 @@ struct BlockTransfer template constexpr BlockTransfer SetFwdConvABlockTransfer() { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; - constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_a; - constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_a; - constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_a; - - BlockTransfer block_transfer{ - .thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, - .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, - .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, - .src_vector_dim = VTD.src_vector_dim, - .src_scalar_per_vector = VTD.src_scalar_per_vector, - .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, - .add_extra = VTD.add_extra - }; + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; + constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_a; + constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_a; + constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_a; + + BlockTransfer block_transfer{.thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, + .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, + .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, + .src_vector_dim = VTD.src_vector_dim, + .src_scalar_per_vector = VTD.src_scalar_per_vector, + .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, + .add_extra = VTD.add_extra}; return block_transfer; } template constexpr BlockTransfer SetFwdConvBBlockTransfer() { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; - constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_b; - constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_b; - constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_b; - - BlockTransfer block_transfer{ - .thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, - .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, - .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, - .src_vector_dim = VTD.src_vector_dim, - .src_scalar_per_vector = VTD.src_scalar_per_vector, - .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, - .add_extra = VTD.add_extra - }; + constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; + constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_b; + constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_b; + constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_b; + + BlockTransfer block_transfer{.thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, + .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, + .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, + .src_vector_dim = VTD.src_vector_dim, + .src_scalar_per_vector = VTD.src_scalar_per_vector, + .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, + .add_extra = VTD.add_extra}; return block_transfer; } @@ -274,18 +267,16 @@ constexpr CBlockTransfer SetCBlockTransfer() { constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_c; constexpr auto& VTC = ALGORITHM.block_transfer.vector_transfer_c; - CBlockTransfer block_transfer - { - .m_xdl_per_wave_per_shuffle = VTC.m_xdl_per_wave_per_shuffle, - .n_xdl_per_wave_per_shuffle = VTC.n_xdl_per_wave_per_shuffle, - .thread_cluster_dims = { - TCL.m_block, - TCL.m_wave_per_xdl, - TCL.n_block, - TCL.n_wave_per_xdl, - }, - .scalar_per_vector = VTC.scalar_per_vector - }; + CBlockTransfer block_transfer{.m_xdl_per_wave_per_shuffle = VTC.m_xdl_per_wave_per_shuffle, + .n_xdl_per_wave_per_shuffle = VTC.n_xdl_per_wave_per_shuffle, + .thread_cluster_dims = + { + TCL.m_block, + TCL.m_wave_per_xdl, + TCL.n_block, + TCL.n_wave_per_xdl, + }, + .scalar_per_vector = VTC.scalar_per_vector}; return block_transfer; } @@ -298,7 +289,7 @@ constexpr ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() case BlockGemmPipelineVersion::V3: return ck::BlockGemmPipelineVersion::v3; case BlockGemmPipelineVersion::V4: return ck::BlockGemmPipelineVersion::v4; case BlockGemmPipelineVersion::V5: return ck::BlockGemmPipelineVersion::v5; - default: return ck::BlockGemmPipelineVersion::v4; + default: return ck::BlockGemmPipelineVersion::v4; } } @@ -320,41 +311,46 @@ template { static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim; - using Layouts = factory_internal::ConvTensorLayouts; - using Types = factory_internal::ConvTensorTypes; - using Ops = factory_internal::ConvPassThroughOps; + using Layouts = + factory_internal::ConvTensorLayouts; + using Types = factory_internal::ConvTensorTypes; + using Ops = factory_internal::ConvPassThroughOps; using AlgorithmType = decltype(ALGORITHM); // Check preconditions for the algorithm description. static_assert(SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "Only 2D and 3D convolutions are supported in this factory."); - static_assert(SpecifiesThreadBlock, + static_assert(SpecifiesThreadBlock, "The convolution algorithm descriptor must specify thread block info."); static_assert(SpecifiesGridwiseGemm, "The convolution algorithm descriptor must specify gridwise GEMM info."); static_assert(SpecifiesBlockTransfer, - "The convolution algorithm descriptor must specify block transfer info."); + "The convolution algorithm descriptor must specify block transfer info."); static_assert(SpecifiesBlockVectorTransfer, - "The convolution algorithm descriptor must specify block vector transfer info."); - static_assert(SpecifiesThreadClusterAccessOrder, - "The convolution algorithm descriptor must specify thread cluster access order info."); + "The convolution algorithm descriptor must specify block vector transfer info."); + static_assert( + SpecifiesThreadClusterAccessOrder, + "The convolution algorithm descriptor must specify thread cluster access order info."); static_assert(SpecifiesSourceAccessOrder, - "The convolution algorithm descriptor must specify source access order info."); + "The convolution algorithm descriptor must specify source access order info."); static_assert(SpecifiesGemmPipelineVersion, - "The convolution algorithm descriptor must specify block gemm pipeline version."); - + "The convolution algorithm descriptor must specify block gemm pipeline version."); static constexpr factory_internal::ConvSpec SPECIALIZATION{ .conv_spec = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, }; - static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); - static constexpr auto TUNING = factory_internal::SetConvTuningInfo(); - static constexpr auto A_BLOCK_TRANSFER = factory_internal::SetFwdConvABlockTransfer(); - static constexpr auto B_BLOCK_TRANSFER = factory_internal::SetFwdConvBBlockTransfer(); - static constexpr auto C_BLOCK_TRANSFER = factory_internal::SetCBlockTransfer(); + static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); + static constexpr auto TUNING = factory_internal::SetConvTuningInfo(); + static constexpr auto A_BLOCK_TRANSFER = + factory_internal::SetFwdConvABlockTransfer(); + static constexpr auto B_BLOCK_TRANSFER = + factory_internal::SetFwdConvBBlockTransfer(); + static constexpr auto C_BLOCK_TRANSFER = + factory_internal::SetCBlockTransfer(); static constexpr auto PIPELINE_SCHEDULER = ck::BlockGemmPipelineScheduler::Intrawave; - static constexpr auto PIPELINE_VERSION = factory_internal::SetBlockGemmPipelineVersion(); + static constexpr auto PIPELINE_VERSION = + factory_internal::SetBlockGemmPipelineVersion(); // Check limits for the algorithm parameters. // TODO: Add more limits checks as needed. diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index bd104948a1..10c2ec55c1 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -22,8 +22,7 @@ #include "ck_tile/builder/types.hpp" -namespace ck_tile::builder -{ +namespace ck_tile::builder { // Constrains convolution to 1D, 2D, or 3D spatial dimensions. template @@ -31,7 +30,8 @@ concept ConvSpatialDim = std::is_integral_v && (N == 1 || N == 2 || // Constrains convolution data types to common floating-point types. template -concept ConvDataType = (T == DataType::FP32) || (T == DataType::FP16) || (T == DataType::BF16) || (T == DataType::FP8) || (T == DataType::I8); +concept ConvDataType = (T == DataType::FP32) || (T == DataType::FP16) || (T == DataType::BF16) || + (T == DataType::FP8) || (T == DataType::I8); // Concept for a type that defines a convolution's operational signature. template diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index 3f9ecc3754..6999b9c5ee 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -5,8 +5,7 @@ #include "ck_tile/builder/conv_algorithm_concepts.hpp" -namespace ck_tile::builder::test -{ +namespace ck_tile::builder::test { namespace ckb = ck_tile::builder; @@ -66,7 +65,7 @@ struct InputVectorTransfer size_t src_vector_dim; size_t src_scalar_per_vector; size_t dest_scalar_per_vector_k1; - bool add_extra; + bool add_extra; }; static_assert(InputVectorTransferDescriptor); diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index 7ad47c6baa..c69ee4220d 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -7,7 +7,7 @@ namespace ck_tile::builder { -struct ConvSignature +struct ConvSignature { int spatial_dim; ConvDirection direction; diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 647b5ff294..db84dd9035 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -13,66 +13,61 @@ TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V using namespace ck_tile::builder; using namespace ck_tile::builder::test; - constexpr ConvSignature FwdConvSignature - { - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout::CHANNELS_LAST, - .data_type = DataType::BF16 - }; + constexpr ConvSignature FwdConvSignature{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout::CHANNELS_LAST, + .data_type = DataType::BF16}; - constexpr ThreadBlock FwdThreadBlock - { - .block_size = 256, - .tile_size = {.m = 256, .n = 256, .k = 32} - }; + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; - constexpr ConvTuningParams FwdTuningParams - { - .ak1 = 8, - .bk1 = 8, - .m_per_xdl=32, - .n_per_xdl = 32, - .m_xdl_per_wave = 4, - .n_xdl_per_wave = 4 - }; + constexpr ConvTuningParams FwdTuningParams{.ak1 = 8, + .bk1 = 8, + .m_per_xdl = 32, + .n_per_xdl = 32, + .m_xdl_per_wave = 4, + .n_xdl_per_wave = 4}; - constexpr InputOutputBlockTransfer FwdBlockTransfer - { - .thread_cluster_dims_a = {.k0 = 4, .m_n = 64, .k1 = 1}, - .thread_cluster_dims_b = {.k0 = 4, .m_n = 64, .k1 = 1}, - .thread_cluster_dims_c = { - .m_block = 1, .m_wave_per_xdl = 32, .n_block = 1, .n_wave_per_xdl = 8}, - .vector_transfer_a = { - .src_vector_dim = 2, .src_scalar_per_vector = 2, .dest_scalar_per_vector_k1 = 8, .add_extra = false}, - .vector_transfer_b = { - .src_vector_dim = 2, .src_scalar_per_vector = 8, .dest_scalar_per_vector_k1 = 8, .add_extra = false}, - .vector_transfer_c = { - .m_xdl_per_wave_per_shuffle = 1, .n_xdl_per_wave_per_shuffle = 1, .scalar_per_vector = 8}, + constexpr InputOutputBlockTransfer FwdBlockTransfer{ + .thread_cluster_dims_a = {.k0 = 4, .m_n = 64, .k1 = 1}, + .thread_cluster_dims_b = {.k0 = 4, .m_n = 64, .k1 = 1}, + .thread_cluster_dims_c = {.m_block = 1, + .m_wave_per_xdl = 32, + .n_block = 1, + .n_wave_per_xdl = 8}, + .vector_transfer_a = {.src_vector_dim = 2, + .src_scalar_per_vector = 2, + .dest_scalar_per_vector_k1 = 8, + .add_extra = false}, + .vector_transfer_b = {.src_vector_dim = 2, + .src_scalar_per_vector = 8, + .dest_scalar_per_vector_k1 = 8, + .add_extra = false}, + .vector_transfer_c = {.m_xdl_per_wave_per_shuffle = 1, + .n_xdl_per_wave_per_shuffle = 1, + .scalar_per_vector = 8}, .thread_cluster_access_order_a = {1, 0, 2}, .thread_cluster_access_order_b = {1, 0, 2}, - .src_access_order_a = {1, 0, 2}, - .src_access_order_b = {1, 0, 2} - }; + .src_access_order_a = {1, 0, 2}, + .src_access_order_b = {1, 0, 2}}; - constexpr ConvAlgorithm FwdConvAlgorithm - { - .thread_block = FwdThreadBlock, - .tuning_params = FwdTuningParams, - .block_transfer = FwdBlockTransfer, + constexpr ConvAlgorithm FwdConvAlgorithm{ + .thread_block = FwdThreadBlock, + .tuning_params = FwdTuningParams, + .block_transfer = FwdBlockTransfer, .pipeline_version = BlockGemmPipelineVersion::V4, }; using Builder = ConvBuilder; - + auto instance = Builder::Instance{}; - + const auto kernel_string = instance.GetTypeString(); std::cout << "Generated kernel: " << kernel_string << std::endl; EXPECT_GT(kernel_string.size(), 0); EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")); - + const auto invoker_ptr = instance.MakeInvokerPointer(); EXPECT_NE(invoker_ptr, nullptr); } From 28f6707e211703bc9425d3b6c403d841ec9b7ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 14:18:24 +0000 Subject: [PATCH 19/39] Add more tests and missing conv specialization argument. --- .../builder/conv_algorithm_concepts.hpp | 5 ++ .../include/ck_tile/builder/conv_factory.hpp | 20 +++++++- .../builder/include/ck_tile/builder/types.hpp | 10 ++++ .../test/impl/conv_algorithm_types.hpp | 2 + .../test/impl/conv_signature_types.hpp | 2 +- .../test/test_ckb_build_fwd_instances.cpp | 49 +++++++++++++++---- 6 files changed, 76 insertions(+), 12 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 8db6df6e9c..a4f7940f98 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -131,4 +131,9 @@ concept SpecifiesGemmPipelineVersion = requires { { T::pipeline_version } -> std::convertible_to; }; +template +concept SpecifiesFwdConcSpecialization = requires { + { T::fwd_specialization } -> std::convertible_to; +}; + } // namespace ck_tile::builder diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 26cad0b8e5..73147b313d 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -293,6 +293,20 @@ constexpr ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() } } +template +constexpr ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdConvSpecialization() +{ + switch(ALGORITHM.fwd_specialization) + { + case ConvFwdSpecialization::FILTER_1X1_PAD0: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + case ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + case ConvFwdSpecialization::ODD_C: return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + case ConvFwdSpecialization::FILTER_3x3: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter3x3; + case ConvFwdSpecialization::DEFAULT: + default: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + } +} + } // namespace ck_tile::builder::factory_internal namespace ck_tile::builder { @@ -335,9 +349,13 @@ struct ConvFactory "The convolution algorithm descriptor must specify source access order info."); static_assert(SpecifiesGemmPipelineVersion, "The convolution algorithm descriptor must specify block gemm pipeline version."); + static_assert(SpecifiesFwdConcSpecialization, + "The convolution algorithm descriptor must specify forward convolution " + "specialization."); + static constexpr auto FWD_CONV_SPECIALIZATION = factory_internal::SetFwdConvSpecialization(); static constexpr factory_internal::ConvSpec SPECIALIZATION{ - .conv_spec = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, + .conv_spec = FWD_CONV_SPECIALIZATION, .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, }; static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index 27bc2570d3..08309e30bb 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -49,4 +49,14 @@ enum class BlockGemmPipelineVersion V5 }; +// Enums for the forward convolution specialization. +enum class ConvFwdSpecialization +{ + DEFAULT, + FILTER_1X1_PAD0, + FILTER_1X1_STRIDE1_PAD0, + ODD_C, + FILTER_3x3 +}; + } // namespace ck_tile::builder diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index 6999b9c5ee..84fdd83d60 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -103,6 +103,7 @@ struct ConvAlgorithm ConvTuningParams tuning_params; InputOutputBlockTransfer block_transfer; BlockGemmPipelineVersion pipeline_version; + ConvFwdSpecialization fwd_specialization; }; static_assert(ckb::ConvAlgorithmDescriptor); static_assert(ckb::SpecifiesThreadBlock); @@ -112,5 +113,6 @@ static_assert(ckb::SpecifiesBlockTransfer); static_assert(ckb::SpecifiesBlockVectorTransfer); static_assert(ckb::SpecifiesThreadClusterAccessOrder); static_assert(ckb::SpecifiesSourceAccessOrder); +static_assert(ckb::SpecifiesFwdConcSpecialization); } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index c69ee4220d..5efb2792e7 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -5,7 +5,7 @@ #include "ck_tile/builder/conv_signature_concepts.hpp" -namespace ck_tile::builder { +namespace ck_tile::builder::test { struct ConvSignature { diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index db84dd9035..b2226d06f5 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -8,19 +8,15 @@ class FwdConvBuilderTest : public ::testing::Test { }; -TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance) +template < + ck_tile::builder::test::ConvSignature FwdConvSignature, + ck_tile::builder::test::ThreadBlock FwdThreadBlock, + ck_tile::builder::ConvFwdSpecialization FwdConvSpecialization> +constexpr void run_test() { using namespace ck_tile::builder; using namespace ck_tile::builder::test; - constexpr ConvSignature FwdConvSignature{.spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout::CHANNELS_LAST, - .data_type = DataType::BF16}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 256, .n = 256, .k = 32}}; - constexpr ConvTuningParams FwdTuningParams{.ak1 = 8, .bk1 = 8, .m_per_xdl = 32, @@ -56,11 +52,12 @@ TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V .tuning_params = FwdTuningParams, .block_transfer = FwdBlockTransfer, .pipeline_version = BlockGemmPipelineVersion::V4, + .fwd_specialization = FwdConvSpecialization }; using Builder = ConvBuilder; - auto instance = Builder::Instance{}; + auto instance = typename Builder::Instance{}; const auto kernel_string = instance.GetTypeString(); std::cout << "Generated kernel: " << kernel_string << std::endl; @@ -71,3 +68,35 @@ TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V const auto invoker_ptr = instance.MakeInvokerPointer(); EXPECT_NE(invoker_ptr, nullptr); } + +TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) +{ + using namespace ck_tile::builder; + using namespace ck_tile::builder::test; + + constexpr ConvSignature FwdConvSignature{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout::CHANNELS_LAST, + .data_type = DataType::BF16}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} + +TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) +{ + using namespace ck_tile::builder; + using namespace ck_tile::builder::test; + + constexpr ConvSignature FwdConvSignature{.spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout::CHANNELS_FIRST, + .data_type = DataType::FP32}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} From fc5caa1baef6eb2327856dd310d28d6f024c1b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 21 Oct 2025 14:19:14 +0000 Subject: [PATCH 20/39] clang formatting. --- .../include/ck_tile/builder/conv_factory.hpp | 15 ++++++---- .../test/impl/conv_signature_types.hpp | 2 +- .../test/test_ckb_build_fwd_instances.cpp | 29 +++++++++---------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 73147b313d..38f806406d 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -298,10 +298,14 @@ constexpr ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdC { switch(ALGORITHM.fwd_specialization) { - case ConvFwdSpecialization::FILTER_1X1_PAD0: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; - case ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; - case ConvFwdSpecialization::ODD_C: return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; - case ConvFwdSpecialization::FILTER_3x3: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter3x3; + case ConvFwdSpecialization::FILTER_1X1_PAD0: + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + case ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0: + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + case ConvFwdSpecialization::ODD_C: + return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + case ConvFwdSpecialization::FILTER_3x3: + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter3x3; case ConvFwdSpecialization::DEFAULT: default: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; } @@ -353,7 +357,8 @@ struct ConvFactory "The convolution algorithm descriptor must specify forward convolution " "specialization."); - static constexpr auto FWD_CONV_SPECIALIZATION = factory_internal::SetFwdConvSpecialization(); + static constexpr auto FWD_CONV_SPECIALIZATION = + factory_internal::SetFwdConvSpecialization(); static constexpr factory_internal::ConvSpec SPECIALIZATION{ .conv_spec = FWD_CONV_SPECIALIZATION, .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index 5efb2792e7..09c2df1398 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -16,4 +16,4 @@ struct ConvSignature }; static_assert(ConvSignatureDescriptor); -} // namespace ck_tile::builder +} // namespace ck_tile::builder::test diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index b2226d06f5..5d1b815161 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -8,10 +8,9 @@ class FwdConvBuilderTest : public ::testing::Test { }; -template < - ck_tile::builder::test::ConvSignature FwdConvSignature, - ck_tile::builder::test::ThreadBlock FwdThreadBlock, - ck_tile::builder::ConvFwdSpecialization FwdConvSpecialization> +template constexpr void run_test() { using namespace ck_tile::builder; @@ -47,13 +46,11 @@ constexpr void run_test() .src_access_order_a = {1, 0, 2}, .src_access_order_b = {1, 0, 2}}; - constexpr ConvAlgorithm FwdConvAlgorithm{ - .thread_block = FwdThreadBlock, - .tuning_params = FwdTuningParams, - .block_transfer = FwdBlockTransfer, - .pipeline_version = BlockGemmPipelineVersion::V4, - .fwd_specialization = FwdConvSpecialization - }; + constexpr ConvAlgorithm FwdConvAlgorithm{.thread_block = FwdThreadBlock, + .tuning_params = FwdTuningParams, + .block_transfer = FwdBlockTransfer, + .pipeline_version = BlockGemmPipelineVersion::V4, + .fwd_specialization = FwdConvSpecialization}; using Builder = ConvBuilder; @@ -69,11 +66,12 @@ constexpr void run_test() EXPECT_NE(invoker_ptr, nullptr); } -TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) { using namespace ck_tile::builder; using namespace ck_tile::builder::test; - + constexpr ConvSignature FwdConvSignature{.spatial_dim = 2, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout::CHANNELS_LAST, @@ -85,11 +83,12 @@ TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V run_test(); } -TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) { using namespace ck_tile::builder; using namespace ck_tile::builder::test; - + constexpr ConvSignature FwdConvSignature{.spatial_dim = 3, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout::CHANNELS_FIRST, From 6ade5a1e2c21440082588e5f44020e044c50832d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Wed, 22 Oct 2025 10:11:27 +0000 Subject: [PATCH 21/39] Add explicit handling of the tensor layouts. --- .../include/ck_tile/builder/builder_utils.hpp | 32 +++++++++++++++-- .../include/ck_tile/builder/conv_factory.hpp | 23 ++++++------ .../builder/conv_signature_concepts.hpp | 11 +++++- .../builder/include/ck_tile/builder/types.hpp | 34 +++++++++++++++--- .../test/impl/conv_signature_types.hpp | 5 ++- .../test/test_ckb_build_fwd_instances.cpp | 36 +++++++++---------- 6 files changed, 102 insertions(+), 39 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp index 99ff8d8fc6..e1510fcd34 100644 --- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -103,12 +103,38 @@ constexpr std::string_view DataTypeToString(DataType dt) } } -constexpr std::string_view LayoutToString(GroupConvLayout layout) +constexpr std::string_view LayoutToString(GroupConvLayout1D layout) { switch(layout) { - case GroupConvLayout::CHANNELS_FIRST: return "Channels-first (NCHW)"; - case GroupConvLayout::CHANNELS_LAST: return "Channels-last (NHWC)"; + case GroupConvLayout1D::GNWC_GKXC_GNWK: return "GNWC_GKXC_GNWK"; + case GroupConvLayout1D::NWGC_GKXC_NWGK: return "NWGC_GKXC_NWGK"; + case GroupConvLayout1D::NGCW_GKXC_NGKW: return "NGCW_GKXC_NGKW"; + case GroupConvLayout1D::NGCW_GKCX_NGKW: return "NGCW_GKCX_NGKW"; + default: return "Unknown"; + } +} + +constexpr std::string_view LayoutToString(GroupConvLayout2D layout) +{ + switch(layout) + { + case GroupConvLayout2D::GNHWC_GKYXC_GNHWK: return "GNHWC_GKYXC_GNHWK"; + case GroupConvLayout2D::NHWGC_GKYXC_NHWGK: return "NHWGC_GKYXC_NHWGK"; + case GroupConvLayout2D::NGCHW_GKYXC_NGKHW: return "NGCHW_GKYXC_NGKHW"; + case GroupConvLayout2D::NGCHW_GKCYX_NGKHW: return "NGCHW_GKCYX_NGKHW"; + default: return "Unknown"; + } +} + +constexpr std::string_view LayoutToString(GroupConvLayout3D layout) +{ + switch(layout) + { + case GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK: return "GNDHWC_GKZYXC_GNDHWK"; + case GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK: return "NDHWGC_GKZYXC_NDHWGK"; + case GroupConvLayout3D::NGCDHW_GKZYXC_NGKDHW: return "NGCDHW_GKZYXC_NGKDHW"; + case GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW: return "NGCDHW_GKCZYX_NGKDHW"; default: return "Unknown"; } } diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 38f806406d..9c86359ae2 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -46,28 +46,29 @@ namespace ck_tile::builder::factory_internal { -// Type mappings from the builder GroupConvLayout enum class to the CK tensor data types. -template - requires(ConvSpatialDim) +// Type mappings from the builder FwdGroupConvLayout enum classes to the CK tensor data types. +template + requires(ConvSpatialDim && ValidConvLayoutForSpatialDim) struct ConvTensorLayouts { // This will trigger if a specialization for the given layout is not found. // We should always catch this in an earlier validation check. + using Layout = decltype(LayoutValue); static_assert(sizeof(Layout) == 0, "Internal error. Unsupported layout for convolution factory."); }; template <> -struct ConvTensorLayouts +struct ConvTensorLayouts { - using ALayout = ck::tensor_layout::convolution::NHWGC; - using BLayout = ck::tensor_layout::convolution::GKCYX; + using ALayout = ck::tensor_layout::convolution::NGCHW; + using BLayout = ck::tensor_layout::convolution::GKYXC; using DsLayout = ck::Tuple<>; using ELayout = ck::tensor_layout::convolution::NGKHW; }; template <> -struct ConvTensorLayouts +struct ConvTensorLayouts { using ALayout = ck::tensor_layout::convolution::NHWGC; using BLayout = ck::tensor_layout::convolution::GKYXC; @@ -76,16 +77,16 @@ struct ConvTensorLayouts -struct ConvTensorLayouts +struct ConvTensorLayouts { - using ALayout = ck::tensor_layout::convolution::NDHWGC; + using ALayout = ck::tensor_layout::convolution::NGCDHW; using BLayout = ck::tensor_layout::convolution::GKCZYX; using DsLayout = ck::Tuple<>; using ELayout = ck::tensor_layout::convolution::NGKDHW; }; template <> -struct ConvTensorLayouts +struct ConvTensorLayouts { using ALayout = ck::tensor_layout::convolution::NDHWGC; using BLayout = ck::tensor_layout::convolution::GKZYXC; @@ -93,6 +94,8 @@ struct ConvTensorLayouts struct ConvTensorTypes diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index 10c2ec55c1..c5e4fd6cfd 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -28,6 +28,13 @@ namespace ck_tile::builder { template concept ConvSpatialDim = std::is_integral_v && (N == 1 || N == 2 || N == 3); +// Constraints for forward convolution layouts. +template +concept ValidConvLayoutForSpatialDim = + (SpatialDim == 1 && std::same_as) || + (SpatialDim == 2 && std::same_as) || + (SpatialDim == 3 && std::same_as); + // Constrains convolution data types to common floating-point types. template concept ConvDataType = (T == DataType::FP32) || (T == DataType::FP16) || (T == DataType::BF16) || @@ -38,7 +45,9 @@ template concept ConvSignatureDescriptor = requires(T t) { { t.spatial_dim } -> std::convertible_to; { t.direction } -> std::convertible_to; - { t.layout } -> std::convertible_to; + requires std::convertible_to || + std::convertible_to || + std::convertible_to; { t.data_type } -> std::convertible_to; }; diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index 08309e30bb..a234cf0de3 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -14,11 +14,37 @@ enum class DataType I8 }; -// Memory layouts for convolution tensors, following PyTorch conventions. -enum class GroupConvLayout +// Memory layouts for 1D convolution tensors. +// G: Group, N: Batch, K: Output Channel, C: Input Channel, W: Width +// Enum defines Input, Weight, and Output tensor layouts respectively. +enum class GroupConvLayout1D { - CHANNELS_LAST, // e.g., NHWGC - CHANNELS_FIRST // e.g., NGCHW + GNWC_GKXC_GNWK, + NWGC_GKXC_NWGK, + NGCW_GKXC_NGKW, + NGCW_GKCX_NGKW +}; + +// Memory layouts for 2D convolution tensors. +// G: Group, N: Batch, K: Output Channel, C: Input Channel, Y: Height, X: Width, H: Height +// Enum defines Input, Weight, and Output tensor layouts respectively. +enum class GroupConvLayout2D +{ + GNHWC_GKYXC_GNHWK, + NHWGC_GKYXC_NHWGK, + NGCHW_GKYXC_NGKHW, + NGCHW_GKCYX_NGKHW +}; + +// Memory layouts for 3D convolution tensors. +// G: Group, N: Batch, K: Output Channel, C: Input Channel, Z: Depth, Y: Height, X: Width, D: Depth, +// H: Height Enum defines Input, Weight, and Output tensor layouts respectively. +enum class GroupConvLayout3D +{ + GNDHWC_GKZYXC_GNDHWK, + NDHWGC_GKZYXC_NDHWGK, + NGCDHW_GKZYXC_NGKDHW, + NGCDHW_GKCZYX_NGKDHW, }; // Direction of the convolution operation. diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index 09c2df1398..40f49652fb 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -7,6 +7,7 @@ namespace ck_tile::builder::test { +template struct ConvSignature { int spatial_dim; @@ -14,6 +15,8 @@ struct ConvSignature GroupConvLayout layout; DataType data_type; }; -static_assert(ConvSignatureDescriptor); +static_assert(ConvSignatureDescriptor>); +static_assert(ConvSignatureDescriptor>); +static_assert(ConvSignatureDescriptor>); } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 5d1b815161..cd87e1acb1 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -8,14 +8,14 @@ class FwdConvBuilderTest : public ::testing::Test { }; -template +using namespace ck_tile::builder; +using namespace test; + +template constexpr void run_test() { - using namespace ck_tile::builder; - using namespace ck_tile::builder::test; - constexpr ConvTuningParams FwdTuningParams{.ak1 = 8, .bk1 = 8, .m_per_xdl = 32, @@ -69,13 +69,11 @@ constexpr void run_test() TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) { - using namespace ck_tile::builder; - using namespace ck_tile::builder::test; - - constexpr ConvSignature FwdConvSignature{.spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout::CHANNELS_LAST, - .data_type = DataType::BF16}; + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, + .data_type = DataType::BF16}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; @@ -86,13 +84,11 @@ TEST_F(FwdConvBuilderTest, TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) { - using namespace ck_tile::builder; - using namespace ck_tile::builder::test; - - constexpr ConvSignature FwdConvSignature{.spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout::CHANNELS_FIRST, - .data_type = DataType::FP32}; + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, + .data_type = DataType::FP32}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; From 806ddac2bf89e1abdfde746c3f7c1a554ed405c5 Mon Sep 17 00:00:00 2001 From: JH-Leon-KIM-AMD Date: Wed, 22 Oct 2025 19:47:46 +0000 Subject: [PATCH 22/39] Add complete 2D/3D layout support to CK Builder - Add missing 2D layouts: GNHWC_GKYXC_GNHWK, NGCHW_GKCYX_NGKHW - Add missing 3D layout: GNDHWC_GKZYXC_GNDHWK - Add 1D layouts (NWGC, NGCW, GNWC, NGCW_GKCX) for future support - Add 3 tests for new 2D/3D layouts - All tests pass (5/5) --- .../include/ck_tile/builder/conv_factory.hpp | 64 ++++++++++++++++++- .../test/test_ckb_build_fwd_instances.cpp | 45 +++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 9c86359ae2..6f838827e3 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -58,6 +58,43 @@ struct ConvTensorLayouts "Internal error. Unsupported layout for convolution factory."); }; +// 1D Forward Convolution Layout Specializations +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NWGC; + using BLayout = ck::tensor_layout::convolution::GKXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NWGK; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NGCW; + using BLayout = ck::tensor_layout::convolution::GKXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NGKW; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::GNWC; + using BLayout = ck::tensor_layout::convolution::GKXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::GNWK; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NGCW; + using BLayout = ck::tensor_layout::convolution::GKCX; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NGKW; +}; + template <> struct ConvTensorLayouts { @@ -76,6 +113,24 @@ struct ConvTensorLayouts +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::GNHWC; + using BLayout = ck::tensor_layout::convolution::GKYXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::GNHWK; +}; + +template <> +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::NGCHW; + using BLayout = ck::tensor_layout::convolution::GKCYX; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::NGKHW; +}; + template <> struct ConvTensorLayouts { @@ -94,7 +149,14 @@ struct ConvTensorLayouts +struct ConvTensorLayouts +{ + using ALayout = ck::tensor_layout::convolution::GNDHWC; + using BLayout = ck::tensor_layout::convolution::GKZYXC; + using DsLayout = ck::Tuple<>; + using ELayout = ck::tensor_layout::convolution::GNDHWK; +}; // Type mappings from builder convolution data type to CK tensor types. template diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index cd87e1acb1..22e215d2bb 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -95,3 +95,48 @@ TEST_F(FwdConvBuilderTest, run_test(); } + +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::GNHWC_GKYXC_GNHWK, + .data_type = DataType::FP16}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} + +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NGCHW_GKCYX_NGKHW, + .data_type = DataType::FP32}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} + +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, + .data_type = DataType::BF16}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} From 2f2e86e9439f53e36e531b8cf645564e243a01b7 Mon Sep 17 00:00:00 2001 From: JH-Leon-KIM-AMD Date: Wed, 22 Oct 2025 20:13:00 +0000 Subject: [PATCH 23/39] Add tests for remaining 2D/3D layouts - Add test for 2D NGCHW_GKYXC_NGKHW (channels-first) with Filter1x1Stride1Pad0 - Add test for 3D NDHWGC_GKZYXC_NDHWGK (channels-last) - All 7 tests pass (complete coverage for all 2D/3D forward layouts) --- .../test/test_ckb_build_fwd_instances.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 22e215d2bb..2807d3c40a 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -140,3 +140,33 @@ TEST_F(FwdConvBuilderTest, run_test(); } + +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_NGCHW_ChannelsFirst) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NGCHW_GKYXC_NGKHW, + .data_type = DataType::BF16}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} + +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP16_NDHWGC_ChannelsLast) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK, + .data_type = DataType::FP16}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} From 89b795410f77f380a66bb6b0a38117a8582ac69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Thu, 23 Oct 2025 06:25:42 +0000 Subject: [PATCH 24/39] Change enum converters to consteval. --- .../include/ck_tile/builder/conv_factory.hpp | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 9c86359ae2..bf5fb75263 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -284,33 +284,40 @@ constexpr CBlockTransfer SetCBlockTransfer() } template -constexpr ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() +consteval ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() { - switch(ALGORITHM.pipeline_version) - { - case BlockGemmPipelineVersion::V1: return ck::BlockGemmPipelineVersion::v1; - case BlockGemmPipelineVersion::V3: return ck::BlockGemmPipelineVersion::v3; - case BlockGemmPipelineVersion::V4: return ck::BlockGemmPipelineVersion::v4; - case BlockGemmPipelineVersion::V5: return ck::BlockGemmPipelineVersion::v5; - default: return ck::BlockGemmPipelineVersion::v4; + constexpr auto version = ALGORITHM.pipeline_version; + + if constexpr (version == BlockGemmPipelineVersion::V1) { + return ck::BlockGemmPipelineVersion::v1; + } else if constexpr (version == BlockGemmPipelineVersion::V3) { + return ck::BlockGemmPipelineVersion::v3; + } else if constexpr (version == BlockGemmPipelineVersion::V4) { + return ck::BlockGemmPipelineVersion::v4; + } else if constexpr (version == BlockGemmPipelineVersion::V5) { + return ck::BlockGemmPipelineVersion::v5; + } else { + static_assert(false, "Unknown BlockGemmPipelineVersion"); } } template -constexpr ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdConvSpecialization() +consteval ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdConvSpecialization() { - switch(ALGORITHM.fwd_specialization) - { - case ConvFwdSpecialization::FILTER_1X1_PAD0: + constexpr auto specialization = ALGORITHM.fwd_specialization; + + if constexpr (specialization == ConvFwdSpecialization::DEFAULT) { + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + } else if constexpr (specialization == ConvFwdSpecialization::FILTER_1X1_PAD0) { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; - case ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0: + } else if constexpr (specialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0) { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; - case ConvFwdSpecialization::ODD_C: + } else if constexpr (specialization == ConvFwdSpecialization::ODD_C) { return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; - case ConvFwdSpecialization::FILTER_3x3: + } else if constexpr (specialization == ConvFwdSpecialization::FILTER_3x3) { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter3x3; - case ConvFwdSpecialization::DEFAULT: - default: return ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + } else { + static_assert(false, "Unknown ConvFwdSpecialization"); } } From 273d50ac68f40e5f232a638edbe27969c519e27f Mon Sep 17 00:00:00 2001 From: JH-Leon-KIM-AMD Date: Thu, 23 Oct 2025 07:45:15 +0000 Subject: [PATCH 25/39] 7 tests with pipeline and specialization| Test # | Dim | Type | Layout | Pipeline | Specialization | |--------|-----|------|----------------------|----------|-------------------------| | 1 | 2D | BF16 | NHWGC_GKYXC_NHWGK | V1 | DEFAULT | | 2 | 2D | FP16 | GNHWC_GKYXC_GNHWK | V3 | FILTER_1X1_PAD0 | | 3 | 2D | FP32 | NGCHW_GKCYX_NGKHW | V4 | FILTER_1X1_STRIDE1_PAD0 | | 4 | 2D | BF16 | NHWGC_GKYXC_NHWGK | V5 | FILTER_3x3 | | 5 | 3D | FP32 | NGCDHW_GKCZYX_NGKDHW | V1 | FILTER_1X1_PAD0 | | 6 | 3D | BF16 | GNDHWC_GKZYXC_GNDHWK | V3 | DEFAULT | | 7 | 3D | FP16 | NDHWGC_GKZYXC_NDHWGK | V4 | FILTER_1X1_PAD0 | --- .../test/test_ckb_build_fwd_instances.cpp | 98 +++++++++++++------ 1 file changed, 68 insertions(+), 30 deletions(-) diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 2807d3c40a..a93637e7fb 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -13,6 +13,7 @@ using namespace test; template constexpr void run_test() { @@ -49,7 +50,7 @@ constexpr void run_test() constexpr ConvAlgorithm FwdConvAlgorithm{.thread_block = FwdThreadBlock, .tuning_params = FwdTuningParams, .block_transfer = FwdBlockTransfer, - .pipeline_version = BlockGemmPipelineVersion::V4, + .pipeline_version = FwdPipelineVersion, .fwd_specialization = FwdConvSpecialization}; using Builder = ConvBuilder; @@ -62,10 +63,37 @@ constexpr void run_test() EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")); + // Verify pipeline version is correct + if(FwdPipelineVersion == BlockGemmPipelineVersion::V1) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v1") != std::string::npos); + else if(FwdPipelineVersion == BlockGemmPipelineVersion::V3) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v3") != std::string::npos); + else if(FwdPipelineVersion == BlockGemmPipelineVersion::V4) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v4") != std::string::npos); + else if(FwdPipelineVersion == BlockGemmPipelineVersion::V5) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v5") != std::string::npos); + + // Verify specialization is correct + if(FwdConvSpecialization == ConvFwdSpecialization::DEFAULT) + EXPECT_TRUE(kernel_string.find("Default") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_PAD0) + EXPECT_TRUE(kernel_string.find("Filter1x1Pad0") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0) + EXPECT_TRUE(kernel_string.find("Filter1x1Stride1Pad0") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_3x3) + EXPECT_TRUE(kernel_string.find("Filter3x3") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::ODD_C) + EXPECT_TRUE(kernel_string.find("OddC") != std::string::npos); + const auto invoker_ptr = instance.MakeInvokerPointer(); EXPECT_NE(invoker_ptr, nullptr); } +//============================================================================== +// 2D Forward Convolution Tests +//============================================================================== + +// Test 1: 2D BF16 NHWGC (channels-last) with Pipeline V1 and DEFAULT TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) { @@ -78,24 +106,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); -} - -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, - .data_type = DataType::FP32}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 128, .n = 128, .k = 32}}; - - run_test(); + run_test(); } +// Test 2: 2D FP16 GNHWC (group-first, channels-last) with Pipeline V3 and FILTER_1X1_PAD0 TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC) { @@ -108,9 +122,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } +// Test 3: 2D FP32 NGCHW_GKCYX (channels-first, different weight layout) with Pipeline V4 and FILTER_1X1_STRIDE1_PAD0 TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) { @@ -123,39 +138,62 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; - run_test(); + run_test(); } +// Test 4: 2D BF16 NHWGC (channels-last) with Pipeline V5 and FILTER_3x3 TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_NHWGC_Filter3x3) { - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, + .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, .data_type = DataType::BF16}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } +//============================================================================== +// 3D Forward Convolution Tests +//============================================================================== + +// Test 5: 3D FP32 NGCDHW (channels-first) with Pipeline V1 and FILTER_1X1_PAD0 TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_NGCHW_ChannelsFirst) + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) { - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, + .data_type = DataType::FP32}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} + +// Test 6: 3D BF16 GNDHWC (group-first, channels-last) with Pipeline V3 and DEFAULT +TEST_F(FwdConvBuilderTest, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NGCHW_GKYXC_NGKHW, + .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, .data_type = DataType::BF16}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } +// Test 7: 3D FP16 NDHWGC (channels-last) with Pipeline V4 and FILTER_1X1_PAD0 TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP16_NDHWGC_ChannelsLast) { @@ -168,5 +206,5 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; - run_test(); + run_test(); } From 275e6884a4a5cfa6502a8c85cc6f688b8bb967c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Thu, 23 Oct 2025 08:26:45 +0000 Subject: [PATCH 26/39] Add missing convolution layouts and provide better compile-time error in instance traits. --- .../builder/reflect/instance_traits_util.hpp | 57 ++++++++++++++++--- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 160a560529..4fd0f94afc 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -60,27 +60,66 @@ consteval std::string_view type_name() template constexpr std::string_view layout_name() { - // Convolution layouts - if constexpr(std::is_same_v) + // 1D Convolution layouts + if constexpr(std::is_same_v) + return "GNWC"; + else if constexpr(std::is_same_v) + return "GKXC"; + else if constexpr(std::is_same_v) + return "GNWK"; + else if constexpr(std::is_same_v) + return "NWGC"; + else if constexpr(std::is_same_v) + return "NWGK"; + else if constexpr(std::is_same_v) + return "NGCW"; + else if constexpr(std::is_same_v) + return "NGKW"; + else if constexpr(std::is_same_v) + return "GKCX"; + + // 2D Convolution layouts + else if constexpr(std::is_same_v) return "GNHWC"; else if constexpr(std::is_same_v) return "GKYXC"; else if constexpr(std::is_same_v) return "GNHWK"; - else if constexpr(std::is_same_v) - return "GKZYXC"; - else if constexpr(std::is_same_v) - return "GNDHWC"; - else if constexpr(std::is_same_v) - return "GNDHWK"; else if constexpr(std::is_same_v) return "NHWGC"; else if constexpr(std::is_same_v) return "KYXGC"; else if constexpr(std::is_same_v) return "NHWGK"; + else if constexpr(std::is_same_v) + return "NGCHW"; + else if constexpr(std::is_same_v) + return "NGKHW"; + else if constexpr(std::is_same_v) + return "GKCYX"; + + // 3D Convolution layouts + else if constexpr(std::is_same_v) + return "GKZYXC"; + else if constexpr(std::is_same_v) + return "GNDHWC"; + else if constexpr(std::is_same_v) + return "GNDHWK"; + else if constexpr(std::is_same_v) + return "NDHWGK"; + else if constexpr(std::is_same_v) + return "NDHWGC"; + else if constexpr(std::is_same_v) + return "NGKDHW"; + else if constexpr(std::is_same_v) + return "GKCZYX"; + else if constexpr(std::is_same_v) + return "NGCDHW"; + else if constexpr(std::is_same_v) + return "KZYXGC"; else - static_assert(false, "unknown_layout"); + // This provide a compile-time error message containing the type of the unrecognized layout. + static_assert(std::is_same_v, "unknown_layout"); } // Convert element-wise operation types to string names From 9d3f88c040a328825e3ba88400b44757e14a794a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Thu, 23 Oct 2025 08:28:06 +0000 Subject: [PATCH 27/39] Fix clang formatting. --- .../include/ck_tile/builder/conv_factory.hpp | 42 ++++++++++++++----- .../builder/reflect/instance_traits_util.hpp | 7 ++-- .../test/test_ckb_build_fwd_instances.cpp | 38 +++++++++++++---- 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 2edd088c6d..586afb81d0 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -350,15 +350,24 @@ consteval ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion() { constexpr auto version = ALGORITHM.pipeline_version; - if constexpr (version == BlockGemmPipelineVersion::V1) { + if constexpr(version == BlockGemmPipelineVersion::V1) + { return ck::BlockGemmPipelineVersion::v1; - } else if constexpr (version == BlockGemmPipelineVersion::V3) { + } + else if constexpr(version == BlockGemmPipelineVersion::V3) + { return ck::BlockGemmPipelineVersion::v3; - } else if constexpr (version == BlockGemmPipelineVersion::V4) { + } + else if constexpr(version == BlockGemmPipelineVersion::V4) + { return ck::BlockGemmPipelineVersion::v4; - } else if constexpr (version == BlockGemmPipelineVersion::V5) { + } + else if constexpr(version == BlockGemmPipelineVersion::V5) + { return ck::BlockGemmPipelineVersion::v5; - } else { + } + else + { static_assert(false, "Unknown BlockGemmPipelineVersion"); } } @@ -368,17 +377,28 @@ consteval ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdC { constexpr auto specialization = ALGORITHM.fwd_specialization; - if constexpr (specialization == ConvFwdSpecialization::DEFAULT) { + if constexpr(specialization == ConvFwdSpecialization::DEFAULT) + { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - } else if constexpr (specialization == ConvFwdSpecialization::FILTER_1X1_PAD0) { + } + else if constexpr(specialization == ConvFwdSpecialization::FILTER_1X1_PAD0) + { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; - } else if constexpr (specialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0) { + } + else if constexpr(specialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0) + { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; - } else if constexpr (specialization == ConvFwdSpecialization::ODD_C) { + } + else if constexpr(specialization == ConvFwdSpecialization::ODD_C) + { return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; - } else if constexpr (specialization == ConvFwdSpecialization::FILTER_3x3) { + } + else if constexpr(specialization == ConvFwdSpecialization::FILTER_3x3) + { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter3x3; - } else { + } + else + { static_assert(false, "Unknown ConvFwdSpecialization"); } } diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 4fd0f94afc..62f090f226 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -77,7 +77,7 @@ constexpr std::string_view layout_name() return "NGKW"; else if constexpr(std::is_same_v) return "GKCX"; - + // 2D Convolution layouts else if constexpr(std::is_same_v) return "GNHWC"; @@ -97,7 +97,7 @@ constexpr std::string_view layout_name() return "NGKHW"; else if constexpr(std::is_same_v) return "GKCYX"; - + // 3D Convolution layouts else if constexpr(std::is_same_v) return "GKZYXC"; @@ -119,7 +119,8 @@ constexpr std::string_view layout_name() return "KZYXGC"; else // This provide a compile-time error message containing the type of the unrecognized layout. - static_assert(std::is_same_v, "unknown_layout"); + static_assert(std::is_same_v, + "unknown_layout"); } // Convert element-wise operation types to string names diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index a93637e7fb..56fb8803b3 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -106,7 +106,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } // Test 2: 2D FP16 GNHWC (group-first, channels-last) with Pipeline V3 and FILTER_1X1_PAD0 @@ -122,10 +125,14 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } -// Test 3: 2D FP32 NGCHW_GKCYX (channels-first, different weight layout) with Pipeline V4 and FILTER_1X1_STRIDE1_PAD0 +// Test 3: 2D FP32 NGCHW_GKCYX (channels-first, different weight layout) with Pipeline V4 and +// FILTER_1X1_STRIDE1_PAD0 TEST_F(FwdConvBuilderTest, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) { @@ -138,7 +145,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; - run_test(); + run_test(); } // Test 4: 2D BF16 NHWGC (channels-last) with Pipeline V5 and FILTER_3x3 @@ -154,7 +164,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } //============================================================================== @@ -174,7 +187,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; - run_test(); + run_test(); } // Test 6: 3D BF16 GNDHWC (group-first, channels-last) with Pipeline V3 and DEFAULT @@ -190,7 +206,10 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; - run_test(); + run_test(); } // Test 7: 3D FP16 NDHWGC (channels-last) with Pipeline V4 and FILTER_1X1_PAD0 @@ -206,5 +225,8 @@ TEST_F(FwdConvBuilderTest, constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; - run_test(); + run_test(); } From c388a876491e8335db8e460bc86163ad7377014f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Thu, 23 Oct 2025 08:29:56 +0000 Subject: [PATCH 28/39] Changed I8 -> S8. --- experimental/builder/include/ck_tile/builder/builder_utils.hpp | 2 +- experimental/builder/include/ck_tile/builder/types.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp index e1510fcd34..772cc438a3 100644 --- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -98,7 +98,7 @@ constexpr std::string_view DataTypeToString(DataType dt) case DataType::FP32: return "FP32"; case DataType::BF16: return "BF16"; case DataType::FP8: return "FP8"; - case DataType::I8: return "I8"; + case DataType::S8: return "S8"; default: return "Unknown"; } } diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index a234cf0de3..3079ccfdfa 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -11,7 +11,7 @@ enum class DataType FP16, BF16, FP8, - I8 + S8 }; // Memory layouts for 1D convolution tensors. From 3a33509b8e9ee7c47c3e6454195d755560dabda1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Thu, 23 Oct 2025 08:31:03 +0000 Subject: [PATCH 29/39] Fix signature. --- .../builder/include/ck_tile/builder/conv_signature_concepts.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index c5e4fd6cfd..4ec6688cbe 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -38,7 +38,7 @@ concept ValidConvLayoutForSpatialDim = // Constrains convolution data types to common floating-point types. template concept ConvDataType = (T == DataType::FP32) || (T == DataType::FP16) || (T == DataType::BF16) || - (T == DataType::FP8) || (T == DataType::I8); + (T == DataType::FP8) || (T == DataType::S8); // Concept for a type that defines a convolution's operational signature. template From b2a13a3253dcaf1d55e7569390f930bbe7329cac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Fri, 24 Oct 2025 11:22:05 +0000 Subject: [PATCH 30/39] Rename concepts and corresponding members. --- .../builder/conv_algorithm_concepts.hpp | 36 +++++------ .../include/ck_tile/builder/conv_factory.hpp | 62 +++++++++---------- .../test/impl/conv_algorithm_types.hpp | 50 +++++++-------- .../test/test_ckb_build_fwd_instances.cpp | 56 ++++++++--------- 4 files changed, 102 insertions(+), 102 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index a4f7940f98..0e18a87d9f 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -35,35 +35,35 @@ concept GridwiseGemmDescriptor = requires(T t) { { t.n_xdl_per_wave } -> std::convertible_to; }; -// Concept for convolution input block transfer. +// Concept for vectorized data transfer for convolution input tensors. template -concept InputBlockTransferDescriptor = requires(T t) { +concept BlockTransferDescriptor = requires(T t) { { t.k0 } -> std::convertible_to; { t.m_n } -> std::convertible_to; { t.k1 } -> std::convertible_to; }; -// Concept for output block transfer. +// Concept for thread cluster dimensions for GEMM output tensor. template -concept OutputBlockTransferDescriptor = requires(T t) { +concept ThreadClusterDescriptor = requires(T t) { { t.m_block } -> std::convertible_to; { t.m_wave_per_xdl } -> std::convertible_to; { t.n_block } -> std::convertible_to; { t.n_wave_per_xdl } -> std::convertible_to; }; -// Concept for the convolution input vector transfer. +// Concept for the LDS padding for the convolution input tensors. template -concept InputVectorTransferDescriptor = requires(T t) { +concept LdsPaddingDescriptor = requires(T t) { { t.src_vector_dim } -> std::convertible_to; { t.src_scalar_per_vector } -> std::convertible_to; { t.dest_scalar_per_vector_k1 } -> std::convertible_to; { t.add_extra } -> std::convertible_to; }; -// Concepts for the convolution output vector transfer. +// Concept for the convolution output tensor epilogue (copy from registers to global memory via LDS). template -concept OutputVectorTransferDescriptor = requires(T t) { +concept EpilogueDescriptor = requires(T t) { { t.m_xdl_per_wave_per_shuffle } -> std::convertible_to; { t.n_xdl_per_wave_per_shuffle } -> std::convertible_to; { t.scalar_per_vector } -> std::convertible_to; @@ -92,30 +92,30 @@ concept SpecifiesThreadBlock = requires { // Concept to check if a struct specifies gridwise GEMM info. template concept SpecifiesGridwiseGemm = requires { - { T::tuning_params } -> GridwiseGemmDescriptor; + { T::gridwise_gemm } -> GridwiseGemmDescriptor; }; // Concept to check if a struct specifies convolution input and output block transfer info. template concept SpecifiesBlockTransfer = requires(T t) { - { T::block_transfer.thread_cluster_dims_a } -> InputBlockTransferDescriptor; - { T::block_transfer.thread_cluster_dims_b } -> InputBlockTransferDescriptor; - { T::block_transfer.thread_cluster_dims_c } -> OutputBlockTransferDescriptor; + { T::block_transfer.block_transfer_a } -> BlockTransferDescriptor; + { T::block_transfer.block_transfer_b } -> BlockTransferDescriptor; + { T::block_transfer.thread_cluster_dims_c } -> ThreadClusterDescriptor; }; // Concept to check if a struct specifies block vector transfer info. template -concept SpecifiesBlockVectorTransfer = requires(T t) { - { T::block_transfer.vector_transfer_a } -> InputVectorTransferDescriptor; - { T::block_transfer.vector_transfer_b } -> InputVectorTransferDescriptor; - { T::block_transfer.vector_transfer_c } -> OutputVectorTransferDescriptor; +concept SpecifiesLdsTransfer = requires(T t) { + { T::block_transfer.lds_padding_a } -> LdsPaddingDescriptor; + { T::block_transfer.lds_padding_b } -> LdsPaddingDescriptor; + { T::block_transfer.epilogue_c } -> EpilogueDescriptor; }; // Concept to check if a struct specifies thread cluster access order info. template concept SpecifiesThreadClusterAccessOrder = requires(T t) { - { T::block_transfer.thread_cluster_access_order_a } -> AccessOrderDescriptor; - { T::block_transfer.thread_cluster_access_order_b } -> AccessOrderDescriptor; + { T::block_transfer.block_transfer_access_order_a } -> AccessOrderDescriptor; + { T::block_transfer.block_transfer_access_order_b } -> AccessOrderDescriptor; }; // Concept to check if a struct specifies source access order info. diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 586afb81d0..10c1286f1c 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -246,7 +246,7 @@ constexpr ConvBlock SetThreadBlockInfo() } // Convolution tuning parameters. -struct ConvTuning +struct GridwiseGemm { size_t ak1 = 0; size_t bk1 = 0; @@ -257,10 +257,10 @@ struct ConvTuning }; template -constexpr ConvTuning SetConvTuningInfo() +constexpr GridwiseGemm SetGridwiseGemmInfo() { - constexpr auto& TP = ALGORITHM.tuning_params; - return ConvTuning{ + constexpr auto& TP = ALGORITHM.gridwise_gemm; + return GridwiseGemm{ .ak1 = TP.ak1, .bk1 = TP.bk1, .m_per_xdl = TP.m_per_xdl, @@ -285,36 +285,36 @@ struct BlockTransfer template constexpr BlockTransfer SetFwdConvABlockTransfer() { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_a; - constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_a; + constexpr auto& TCL = ALGORITHM.block_transfer.block_transfer_a; + constexpr auto& TCO = ALGORITHM.block_transfer.block_transfer_access_order_a; constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_a; - constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_a; + constexpr auto& LDS = ALGORITHM.block_transfer.lds_padding_a; BlockTransfer block_transfer{.thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, - .src_vector_dim = VTD.src_vector_dim, - .src_scalar_per_vector = VTD.src_scalar_per_vector, - .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, - .add_extra = VTD.add_extra}; + .src_vector_dim = LDS.src_vector_dim, + .src_scalar_per_vector = LDS.src_scalar_per_vector, + .dest_scalar_per_vector_k1 = LDS.dest_scalar_per_vector_k1, + .add_extra = LDS.add_extra}; return block_transfer; } template constexpr BlockTransfer SetFwdConvBBlockTransfer() { - constexpr auto& TCL = ALGORITHM.block_transfer.thread_cluster_dims_b; - constexpr auto& TCO = ALGORITHM.block_transfer.thread_cluster_access_order_b; + constexpr auto& TCL = ALGORITHM.block_transfer.block_transfer_b; + constexpr auto& TCO = ALGORITHM.block_transfer.block_transfer_access_order_b; constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_b; - constexpr auto& VTD = ALGORITHM.block_transfer.vector_transfer_b; + constexpr auto& LDS = ALGORITHM.block_transfer.lds_padding_b; BlockTransfer block_transfer{.thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, - .src_vector_dim = VTD.src_vector_dim, - .src_scalar_per_vector = VTD.src_scalar_per_vector, - .dest_scalar_per_vector_k1 = VTD.dest_scalar_per_vector_k1, - .add_extra = VTD.add_extra}; + .src_vector_dim = LDS.src_vector_dim, + .src_scalar_per_vector = LDS.src_scalar_per_vector, + .dest_scalar_per_vector_k1 = LDS.dest_scalar_per_vector_k1, + .add_extra = LDS.add_extra}; return block_transfer; } @@ -331,9 +331,9 @@ template "The convolution algorithm descriptor must specify gridwise GEMM info."); static_assert(SpecifiesBlockTransfer, "The convolution algorithm descriptor must specify block transfer info."); - static_assert(SpecifiesBlockVectorTransfer, - "The convolution algorithm descriptor must specify block vector transfer info."); + static_assert(SpecifiesLdsTransfer, + "The convolution algorithm descriptor must specify LDS transfer info."); static_assert( SpecifiesThreadClusterAccessOrder, "The convolution algorithm descriptor must specify thread cluster access order info."); @@ -456,7 +456,7 @@ struct ConvFactory .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, }; static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); - static constexpr auto TUNING = factory_internal::SetConvTuningInfo(); + static constexpr auto GRIDWISE_GEMM = factory_internal::SetGridwiseGemmInfo(); static constexpr auto A_BLOCK_TRANSFER = factory_internal::SetFwdConvABlockTransfer(); static constexpr auto B_BLOCK_TRANSFER = @@ -500,12 +500,12 @@ struct ConvFactory BLOCK.per_block.m, BLOCK.per_block.n, BLOCK.per_block.k, - TUNING.ak1, - TUNING.bk1, - TUNING.m_per_xdl, - TUNING.n_per_xdl, - TUNING.m_xdl_per_wave, - TUNING.n_xdl_per_wave, + GRIDWISE_GEMM.ak1, + GRIDWISE_GEMM.bk1, + GRIDWISE_GEMM.m_per_xdl, + GRIDWISE_GEMM.n_per_xdl, + GRIDWISE_GEMM.m_xdl_per_wave, + GRIDWISE_GEMM.n_xdl_per_wave, to_sequence_v, to_sequence_v, to_sequence_v, diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index 84fdd83d60..7348eab804 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -28,8 +28,8 @@ struct ThreadBlock }; static_assert(ckb::ThreadBlockDescriptor); -// Describe some convolution tuning parameters. -struct ConvTuningParams +// Describe gridwise GEMM parameters. +struct GridwiseGemm { // NOTE: ak1 and bk1 are difficult to verify in the kernel instantiation!!! size_t ak1 = 0; @@ -39,43 +39,43 @@ struct ConvTuningParams size_t m_xdl_per_wave = 0; size_t n_xdl_per_wave = 0; }; -static_assert(ckb::GridwiseGemmDescriptor); +static_assert(ckb::GridwiseGemmDescriptor); -// Describe A block transfer thread cluster lengths. -struct InputBlockTransferLengths +// Describe Aand B block transfer thread cluster lengths. +struct BlockTransfer { size_t k0; size_t m_n; size_t k1; }; -static_assert(ckb::InputBlockTransferDescriptor); +static_assert(ckb::BlockTransferDescriptor); // Describe C block transfer thread cluster lengths. -struct OutputBlockTransferLengths +struct ThreadCluster { size_t m_block; size_t m_wave_per_xdl; size_t n_block; size_t n_wave_per_xdl; }; -static_assert(OutputBlockTransferDescriptor); +static_assert(ThreadClusterDescriptor); -struct InputVectorTransfer +struct LdsPadding { size_t src_vector_dim; size_t src_scalar_per_vector; size_t dest_scalar_per_vector_k1; bool add_extra; }; -static_assert(InputVectorTransferDescriptor); +static_assert(LdsPaddingDescriptor); -struct OutputVectorTransfer +struct Epilogue { size_t m_xdl_per_wave_per_shuffle; size_t n_xdl_per_wave_per_shuffle; size_t scalar_per_vector; }; -static_assert(OutputVectorTransferDescriptor); +static_assert(EpilogueDescriptor); struct AccessOrder { @@ -83,16 +83,16 @@ struct AccessOrder }; static_assert(AccessOrderDescriptor); -struct InputOutputBlockTransfer +struct BlockTransferABC { - InputBlockTransferLengths thread_cluster_dims_a; - InputBlockTransferLengths thread_cluster_dims_b; - OutputBlockTransferLengths thread_cluster_dims_c; - InputVectorTransfer vector_transfer_a; - InputVectorTransfer vector_transfer_b; - OutputVectorTransfer vector_transfer_c; - AccessOrder thread_cluster_access_order_a; - AccessOrder thread_cluster_access_order_b; + BlockTransfer block_transfer_a; + BlockTransfer block_transfer_b; + ThreadCluster thread_cluster_dims_c; + LdsPadding lds_padding_a; + LdsPadding lds_padding_b; + Epilogue epilogue_c; + AccessOrder block_transfer_access_order_a; + AccessOrder block_transfer_access_order_b; AccessOrder src_access_order_a; AccessOrder src_access_order_b; }; @@ -100,19 +100,19 @@ struct InputOutputBlockTransfer struct ConvAlgorithm { ThreadBlock thread_block; - ConvTuningParams tuning_params; - InputOutputBlockTransfer block_transfer; + GridwiseGemm gridwise_gemm; + BlockTransferABC block_transfer; BlockGemmPipelineVersion pipeline_version; ConvFwdSpecialization fwd_specialization; }; static_assert(ckb::ConvAlgorithmDescriptor); static_assert(ckb::SpecifiesThreadBlock); static_assert(ckb::SpecifiesGridwiseGemm); -static_assert(ckb::SpecifiesGemmPipelineVersion); static_assert(ckb::SpecifiesBlockTransfer); -static_assert(ckb::SpecifiesBlockVectorTransfer); +static_assert(ckb::SpecifiesLdsTransfer); static_assert(ckb::SpecifiesThreadClusterAccessOrder); static_assert(ckb::SpecifiesSourceAccessOrder); +static_assert(ckb::SpecifiesGemmPipelineVersion); static_assert(ckb::SpecifiesFwdConcSpecialization); } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 56fb8803b3..8f28920629 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -17,38 +17,38 @@ template constexpr void run_test() { - constexpr ConvTuningParams FwdTuningParams{.ak1 = 8, - .bk1 = 8, - .m_per_xdl = 32, - .n_per_xdl = 32, - .m_xdl_per_wave = 4, - .n_xdl_per_wave = 4}; - - constexpr InputOutputBlockTransfer FwdBlockTransfer{ - .thread_cluster_dims_a = {.k0 = 4, .m_n = 64, .k1 = 1}, - .thread_cluster_dims_b = {.k0 = 4, .m_n = 64, .k1 = 1}, - .thread_cluster_dims_c = {.m_block = 1, - .m_wave_per_xdl = 32, - .n_block = 1, - .n_wave_per_xdl = 8}, - .vector_transfer_a = {.src_vector_dim = 2, - .src_scalar_per_vector = 2, - .dest_scalar_per_vector_k1 = 8, - .add_extra = false}, - .vector_transfer_b = {.src_vector_dim = 2, - .src_scalar_per_vector = 8, - .dest_scalar_per_vector_k1 = 8, - .add_extra = false}, - .vector_transfer_c = {.m_xdl_per_wave_per_shuffle = 1, - .n_xdl_per_wave_per_shuffle = 1, - .scalar_per_vector = 8}, - .thread_cluster_access_order_a = {1, 0, 2}, - .thread_cluster_access_order_b = {1, 0, 2}, + constexpr GridwiseGemm FwdGemmParams{.ak1 = 8, + .bk1 = 8, + .m_per_xdl = 32, + .n_per_xdl = 32, + .m_xdl_per_wave = 4, + .n_xdl_per_wave = 4}; + + constexpr BlockTransferABC FwdBlockTransfer{ + .block_transfer_a = {.k0 = 4, .m_n = 64, .k1 = 1}, + .block_transfer_b = {.k0 = 4, .m_n = 64, .k1 = 1}, + .thread_cluster_dims_c = {.m_block = 1, + .m_wave_per_xdl = 32, + .n_block = 1, + .n_wave_per_xdl = 8}, + .lds_padding_a = {.src_vector_dim = 2, + .src_scalar_per_vector = 2, + .dest_scalar_per_vector_k1 = 8, + .add_extra = false}, + .lds_padding_b = {.src_vector_dim = 2, + .src_scalar_per_vector = 8, + .dest_scalar_per_vector_k1 = 8, + .add_extra = false}, + .epilogue_c = {.m_xdl_per_wave_per_shuffle = 1, + .n_xdl_per_wave_per_shuffle = 1, + .scalar_per_vector = 8}, + .block_transfer_access_order_a = {1, 0, 2}, + .block_transfer_access_order_b = {1, 0, 2}, .src_access_order_a = {1, 0, 2}, .src_access_order_b = {1, 0, 2}}; constexpr ConvAlgorithm FwdConvAlgorithm{.thread_block = FwdThreadBlock, - .tuning_params = FwdTuningParams, + .gridwise_gemm = FwdGemmParams, .block_transfer = FwdBlockTransfer, .pipeline_version = FwdPipelineVersion, .fwd_specialization = FwdConvSpecialization}; From 9c5f26211074840621775aea062963bc9f2c8e1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Fri, 24 Oct 2025 14:22:14 +0000 Subject: [PATCH 31/39] Rename LDS related parameters. --- .../builder/conv_algorithm_concepts.hpp | 15 ++++++----- .../ck_tile/builder/conv_algorithm_limits.hpp | 2 +- .../include/ck_tile/builder/conv_factory.hpp | 27 ++++++++++--------- .../test/impl/conv_algorithm_types.hpp | 13 ++++----- .../test/test_ckb_build_fwd_instances.cpp | 14 +++++----- 5 files changed, 39 insertions(+), 32 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 0e18a87d9f..0324ffe38a 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -52,13 +52,14 @@ concept ThreadClusterDescriptor = requires(T t) { { t.n_wave_per_xdl } -> std::convertible_to; }; -// Concept for the LDS padding for the convolution input tensors. +// Concept for the LDS transfer for the convolution input tensors. template -concept LdsPaddingDescriptor = requires(T t) { +concept LdsTransferDescriptor = requires(T t) { { t.src_vector_dim } -> std::convertible_to; { t.src_scalar_per_vector } -> std::convertible_to; - { t.dest_scalar_per_vector_k1 } -> std::convertible_to; - { t.add_extra } -> std::convertible_to; + { t.lds_dst_scalar_per_vector } -> std::convertible_to; + { t.is_direct_load } -> std::convertible_to; + { t.lds_padding } -> std::convertible_to; }; // Concept for the convolution output tensor epilogue (copy from registers to global memory via LDS). @@ -103,11 +104,11 @@ concept SpecifiesBlockTransfer = requires(T t) { { T::block_transfer.thread_cluster_dims_c } -> ThreadClusterDescriptor; }; -// Concept to check if a struct specifies block vector transfer info. +// Concept to check if a struct specifies LDS transfer info for tensors A, B, and C. template concept SpecifiesLdsTransfer = requires(T t) { - { T::block_transfer.lds_padding_a } -> LdsPaddingDescriptor; - { T::block_transfer.lds_padding_b } -> LdsPaddingDescriptor; + { T::block_transfer.lds_transfer_a } -> LdsTransferDescriptor; + { T::block_transfer.lds_transfer_b } -> LdsTransferDescriptor; { T::block_transfer.epilogue_c } -> EpilogueDescriptor; }; diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp index 0662976520..7ef8930273 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp @@ -12,7 +12,7 @@ namespace ck_tile::builder { template concept InputVectorTransferLimits = requires { requires Value.src_vector_dim > 0 && Value.src_scalar_per_vector > 0 && - Value.dest_scalar_per_vector_k1 > 0; + Value.lds_dst_scalar_per_vector > 0; }; // Limits for output vector transfer. diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 10c1286f1c..60b4b1ddf2 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -278,8 +278,9 @@ struct BlockTransfer ck::Array src_access_order = {0, 0, 0}; size_t src_vector_dim = 0; size_t src_scalar_per_vector = 0; - size_t dest_scalar_per_vector_k1 = 0; - size_t add_extra = 0; + size_t lds_dst_scalar_per_vector = 0; + bool is_direct_load = false; + bool lds_padding = false; }; template @@ -288,15 +289,16 @@ constexpr BlockTransfer SetFwdConvABlockTransfer() constexpr auto& TCL = ALGORITHM.block_transfer.block_transfer_a; constexpr auto& TCO = ALGORITHM.block_transfer.block_transfer_access_order_a; constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_a; - constexpr auto& LDS = ALGORITHM.block_transfer.lds_padding_a; + constexpr auto& LDS = ALGORITHM.block_transfer.lds_transfer_a; BlockTransfer block_transfer{.thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, .src_vector_dim = LDS.src_vector_dim, .src_scalar_per_vector = LDS.src_scalar_per_vector, - .dest_scalar_per_vector_k1 = LDS.dest_scalar_per_vector_k1, - .add_extra = LDS.add_extra}; + .lds_dst_scalar_per_vector = LDS.lds_dst_scalar_per_vector, + .is_direct_load = LDS.is_direct_load, + .lds_padding = LDS.lds_padding}; return block_transfer; } @@ -306,15 +308,16 @@ constexpr BlockTransfer SetFwdConvBBlockTransfer() constexpr auto& TCL = ALGORITHM.block_transfer.block_transfer_b; constexpr auto& TCO = ALGORITHM.block_transfer.block_transfer_access_order_b; constexpr auto& SAO = ALGORITHM.block_transfer.src_access_order_b; - constexpr auto& LDS = ALGORITHM.block_transfer.lds_padding_b; + constexpr auto& LDS = ALGORITHM.block_transfer.lds_transfer_b; BlockTransfer block_transfer{.thread_cluster_dims = {TCL.k0, TCL.m_n, TCL.k1}, .thread_cluster_order = {TCO.order[0], TCO.order[1], TCO.order[2]}, .src_access_order = {SAO.order[0], SAO.order[1], SAO.order[2]}, .src_vector_dim = LDS.src_vector_dim, .src_scalar_per_vector = LDS.src_scalar_per_vector, - .dest_scalar_per_vector_k1 = LDS.dest_scalar_per_vector_k1, - .add_extra = LDS.add_extra}; + .lds_dst_scalar_per_vector = LDS.lds_dst_scalar_per_vector, + .is_direct_load = LDS.is_direct_load, + .lds_padding = LDS.lds_padding}; return block_transfer; } @@ -511,15 +514,15 @@ struct ConvFactory to_sequence_v, A_BLOCK_TRANSFER.src_vector_dim, A_BLOCK_TRANSFER.src_scalar_per_vector, - A_BLOCK_TRANSFER.dest_scalar_per_vector_k1, - A_BLOCK_TRANSFER.add_extra, + A_BLOCK_TRANSFER.lds_dst_scalar_per_vector, + A_BLOCK_TRANSFER.lds_padding, to_sequence_v, to_sequence_v, to_sequence_v, B_BLOCK_TRANSFER.src_vector_dim, B_BLOCK_TRANSFER.src_scalar_per_vector, - B_BLOCK_TRANSFER.dest_scalar_per_vector_k1, - B_BLOCK_TRANSFER.add_extra, + B_BLOCK_TRANSFER.lds_dst_scalar_per_vector, + B_BLOCK_TRANSFER.lds_padding, C_BLOCK_TRANSFER.m_xdl_per_wave_per_shuffle, C_BLOCK_TRANSFER.n_xdl_per_wave_per_shuffle, to_sequence_v, diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index 7348eab804..2a6ec187dc 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -60,14 +60,15 @@ struct ThreadCluster }; static_assert(ThreadClusterDescriptor); -struct LdsPadding +struct LdsTransfer { size_t src_vector_dim; size_t src_scalar_per_vector; - size_t dest_scalar_per_vector_k1; - bool add_extra; + size_t lds_dst_scalar_per_vector; + bool is_direct_load; + bool lds_padding; }; -static_assert(LdsPaddingDescriptor); +static_assert(LdsTransferDescriptor); struct Epilogue { @@ -88,8 +89,8 @@ struct BlockTransferABC BlockTransfer block_transfer_a; BlockTransfer block_transfer_b; ThreadCluster thread_cluster_dims_c; - LdsPadding lds_padding_a; - LdsPadding lds_padding_b; + LdsTransfer lds_transfer_a; + LdsTransfer lds_transfer_b; Epilogue epilogue_c; AccessOrder block_transfer_access_order_a; AccessOrder block_transfer_access_order_b; diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index 8f28920629..a1c51f1cd6 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -31,14 +31,16 @@ constexpr void run_test() .m_wave_per_xdl = 32, .n_block = 1, .n_wave_per_xdl = 8}, - .lds_padding_a = {.src_vector_dim = 2, + .lds_transfer_a = {.src_vector_dim = 2, .src_scalar_per_vector = 2, - .dest_scalar_per_vector_k1 = 8, - .add_extra = false}, - .lds_padding_b = {.src_vector_dim = 2, + .lds_dst_scalar_per_vector = 8, + .is_direct_load = false, + .lds_padding = false}, + .lds_transfer_b = {.src_vector_dim = 2, .src_scalar_per_vector = 8, - .dest_scalar_per_vector_k1 = 8, - .add_extra = false}, + .lds_dst_scalar_per_vector = 8, + .is_direct_load = false, + .lds_padding = false}, .epilogue_c = {.m_xdl_per_wave_per_shuffle = 1, .n_xdl_per_wave_per_shuffle = 1, .scalar_per_vector = 8}, From 7987f074fcfd577f34dadb4a3cbd8825b9efd25e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 10:15:06 +0000 Subject: [PATCH 32/39] Remove ODD_C specialization. Add V2 pipeline. --- experimental/builder/include/ck_tile/builder/conv_factory.hpp | 4 ---- experimental/builder/include/ck_tile/builder/types.hpp | 2 +- experimental/builder/test/test_ckb_build_fwd_instances.cpp | 2 -- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 60b4b1ddf2..d2a5261918 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -392,10 +392,6 @@ consteval ck::tensor_operation::device::ConvolutionForwardSpecialization SetFwdC { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; } - else if constexpr(specialization == ConvFwdSpecialization::ODD_C) - { - return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; - } else if constexpr(specialization == ConvFwdSpecialization::FILTER_3x3) { return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter3x3; diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index 3079ccfdfa..211252bd51 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -70,6 +70,7 @@ enum class ElementwiseOperation enum class BlockGemmPipelineVersion { V1, + V2, V3, V4, V5 @@ -81,7 +82,6 @@ enum class ConvFwdSpecialization DEFAULT, FILTER_1X1_PAD0, FILTER_1X1_STRIDE1_PAD0, - ODD_C, FILTER_3x3 }; diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index a1c51f1cd6..cdb8358000 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -84,8 +84,6 @@ constexpr void run_test() EXPECT_TRUE(kernel_string.find("Filter1x1Stride1Pad0") != std::string::npos); else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_3x3) EXPECT_TRUE(kernel_string.find("Filter3x3") != std::string::npos); - else if(FwdConvSpecialization == ConvFwdSpecialization::ODD_C) - EXPECT_TRUE(kernel_string.find("OddC") != std::string::npos); const auto invoker_ptr = instance.MakeInvokerPointer(); EXPECT_NE(invoker_ptr, nullptr); From 7df49a8a16a10335895a85232cd0a4b0fa9d441a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 10:26:11 +0000 Subject: [PATCH 33/39] Add missing types. --- .../builder/include/ck_tile/builder/builder_utils.hpp | 3 ++- .../include/ck_tile/builder/conv_signature_concepts.hpp | 2 +- experimental/builder/include/ck_tile/builder/types.hpp | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp index 772cc438a3..a69471c9ed 100644 --- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp +++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp @@ -98,7 +98,8 @@ constexpr std::string_view DataTypeToString(DataType dt) case DataType::FP32: return "FP32"; case DataType::BF16: return "BF16"; case DataType::FP8: return "FP8"; - case DataType::S8: return "S8"; + case DataType::I8: return "I8"; + case DataType::U8: return "U8"; default: return "Unknown"; } } diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index 4ec6688cbe..15d3f474d1 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -38,7 +38,7 @@ concept ValidConvLayoutForSpatialDim = // Constrains convolution data types to common floating-point types. template concept ConvDataType = (T == DataType::FP32) || (T == DataType::FP16) || (T == DataType::BF16) || - (T == DataType::FP8) || (T == DataType::S8); + (T == DataType::FP8) || (T == DataType::I8) || (T == DataType::U8); // Concept for a type that defines a convolution's operational signature. template diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index 211252bd51..7f49e77f81 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -11,7 +11,8 @@ enum class DataType FP16, BF16, FP8, - S8 + I8, + U8 }; // Memory layouts for 1D convolution tensors. @@ -60,6 +61,7 @@ enum class ElementwiseOperation { BIAS, BIAS_CLAMP, + BIAS_BNORM_CLAMP, BILINEAR, CLAMP, SCALE, From 60b265b2b36fbd1e829f6f872ada60af9688add3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 10:50:28 +0000 Subject: [PATCH 34/39] Add elementwise operation to the conv signature. --- .../include/ck_tile/builder/conv_factory.hpp | 14 ++++++++++--- .../builder/conv_signature_concepts.hpp | 1 + .../test/impl/conv_signature_types.hpp | 1 + .../test/test_ckb_build_fwd_instances.cpp | 21 ++++++++++++------- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index d2a5261918..45cd894e80 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -201,9 +201,17 @@ struct ConvTensorTypes using EDataType = float; }; -// Hard-coded pass-through ops. -// TODO: Generalize this for more fused operations. +template struct ConvPassThroughOps +{ + // This will trigger if a specialization for the given DataType is not found. + // We should always catch this in an earlier validation check. + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Internal error. Unsupported data type for convolution factory."); +}; + +template <> +struct ConvPassThroughOps { using AElementwiseOp = ck::tensor_operation::element_wise::PassThrough; using BElementwiseOp = ck::tensor_operation::element_wise::PassThrough; @@ -423,7 +431,7 @@ struct ConvFactory using Layouts = factory_internal::ConvTensorLayouts; using Types = factory_internal::ConvTensorTypes; - using Ops = factory_internal::ConvPassThroughOps; + using Ops = factory_internal::ConvPassThroughOps; using AlgorithmType = decltype(ALGORITHM); // Check preconditions for the algorithm description. diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp index 15d3f474d1..0851f0061e 100644 --- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp @@ -49,6 +49,7 @@ concept ConvSignatureDescriptor = requires(T t) { std::convertible_to || std::convertible_to; { t.data_type } -> std::convertible_to; + { t.elementwise_operation } -> std::convertible_to; }; // Concept to validate a convolution signature's values. diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index 40f49652fb..297f827395 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -14,6 +14,7 @@ struct ConvSignature ConvDirection direction; GroupConvLayout layout; DataType data_type; + ElementwiseOperation elementwise_operation; }; static_assert(ConvSignatureDescriptor>); static_assert(ConvSignatureDescriptor>); diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp index cdb8358000..df0bd3478c 100644 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ b/experimental/builder/test/test_ckb_build_fwd_instances.cpp @@ -101,7 +101,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 2, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, - .data_type = DataType::BF16}; + .data_type = DataType::BF16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; @@ -120,7 +121,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 2, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout2D::GNHWC_GKYXC_GNHWK, - .data_type = DataType::FP16}; + .data_type = DataType::FP16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; @@ -140,7 +142,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 2, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout2D::NGCHW_GKCYX_NGKHW, - .data_type = DataType::FP32}; + .data_type = DataType::FP32, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; @@ -159,7 +162,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 2, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, - .data_type = DataType::BF16}; + .data_type = DataType::BF16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; @@ -182,7 +186,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 3, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, - .data_type = DataType::FP32}; + .data_type = DataType::FP32, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; @@ -201,7 +206,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 3, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, - .data_type = DataType::BF16}; + .data_type = DataType::BF16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 256, .n = 256, .k = 32}}; @@ -220,7 +226,8 @@ TEST_F(FwdConvBuilderTest, .spatial_dim = 3, .direction = ConvDirection::FORWARD, .layout = GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK, - .data_type = DataType::FP16}; + .data_type = DataType::FP16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, .tile_size = {.m = 128, .n = 128, .k = 32}}; From 6db211716d036fd57fc1f851617254acca10edb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 10:58:53 +0000 Subject: [PATCH 35/39] Improve compile-time error message for unsupported elementwise ops. --- .../builder/include/ck_tile/builder/conv_factory.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index 45cd894e80..ad67a7cc33 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -202,16 +202,16 @@ struct ConvTensorTypes }; template -struct ConvPassThroughOps +struct ElementwiseOps { // This will trigger if a specialization for the given DataType is not found. // We should always catch this in an earlier validation check. static_assert(sizeof(UnsupportedEnumValue) == 0, - "Internal error. Unsupported data type for convolution factory."); + "Internal error. Unsupported elementwise operation for convolution factory."); }; template <> -struct ConvPassThroughOps +struct ElementwiseOps { using AElementwiseOp = ck::tensor_operation::element_wise::PassThrough; using BElementwiseOp = ck::tensor_operation::element_wise::PassThrough; @@ -431,7 +431,7 @@ struct ConvFactory using Layouts = factory_internal::ConvTensorLayouts; using Types = factory_internal::ConvTensorTypes; - using Ops = factory_internal::ConvPassThroughOps; + using Ops = factory_internal::ElementwiseOps; using AlgorithmType = decltype(ALGORITHM); // Check preconditions for the algorithm description. From 90357f6739889e4dffba168d00b0379fdfb68d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 12:18:54 +0000 Subject: [PATCH 36/39] Separate different fwd conv builder tests into separate compilation units. --- experimental/builder/test/CMakeLists.txt | 8 +- .../test/conv/test_ckb_conv_fwd_2d_bf16.cpp | 44 ++++ .../test/conv/test_ckb_conv_fwd_2d_fp16.cpp | 23 ++ .../test/conv/test_ckb_conv_fwd_2d_fp32.cpp | 23 ++ .../test/conv/test_ckb_conv_fwd_3d_bf16.cpp | 24 ++ .../test/conv/test_ckb_conv_fwd_3d_fp16.cpp | 24 ++ .../test/conv/test_ckb_conv_fwd_3d_fp32.cpp | 24 ++ .../test/test_ckb_build_fwd_instances.cpp | 239 ------------------ .../test/utils/ckb_conv_test_common.hpp | 102 ++++++++ 9 files changed, 271 insertions(+), 240 deletions(-) create mode 100644 experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp create mode 100644 experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp create mode 100644 experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp create mode 100644 experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp create mode 100644 experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp create mode 100644 experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp delete mode 100644 experimental/builder/test/test_ckb_build_fwd_instances.cpp create mode 100644 experimental/builder/test/utils/ckb_conv_test_common.hpp diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index ec2ddadf85..d0b48cd542 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -7,6 +7,7 @@ function(add_ck_builder_test test_name) target_include_directories(${test_name} PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/include" "${PROJECT_SOURCE_DIR}/include" + "${CMAKE_CURRENT_SOURCE_DIR}" ) target_compile_options(${test_name} PRIVATE -Wno-global-constructors @@ -22,4 +23,9 @@ add_ck_builder_test(test_get_instance_string test_get_instance_string.cpp) add_ck_builder_test(test_ckb_build_fwd_instances - test_ckb_build_fwd_instances.cpp) \ No newline at end of file + conv/test_ckb_conv_fwd_2d_bf16.cpp + conv/test_ckb_conv_fwd_2d_fp16.cpp + conv/test_ckb_conv_fwd_2d_fp32.cpp + conv/test_ckb_conv_fwd_3d_bf16.cpp + conv/test_ckb_conv_fwd_3d_fp16.cpp + conv/test_ckb_conv_fwd_3d_fp32.cpp) \ No newline at end of file diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp new file mode 100644 index 0000000000..2417cf997c --- /dev/null +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp @@ -0,0 +1,44 @@ +#include "utils/ckb_conv_test_common.hpp" + +using namespace ck_tile::builder::test_utils; + +class FwdConv2DBF16Test : public FwdConvBuilderTestBase {}; + +// 2D BF16 NHWGC (channels-last) with Pipeline V1 and DEFAULT +TEST_F(FwdConv2DBF16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, + .data_type = DataType::BF16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} + +// 2D BF16 NHWGC (channels-last) with Pipeline V5 and FILTER_3x3 +TEST_F(FwdConv2DBF16Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_NHWGC_Filter3x3) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, + .data_type = DataType::BF16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp new file mode 100644 index 0000000000..66a27e1db3 --- /dev/null +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp @@ -0,0 +1,23 @@ +#include "utils/ckb_conv_test_common.hpp" + +using namespace ck_tile::builder::test_utils; + +class FwdConv2DFP16Test : public FwdConvBuilderTestBase {}; + +TEST_F(FwdConv2DFP16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::GNHWC_GKYXC_GNHWK, + .data_type = DataType::FP16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp new file mode 100644 index 0000000000..bca9d99e44 --- /dev/null +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp @@ -0,0 +1,23 @@ +#include "utils/ckb_conv_test_common.hpp" + +using namespace ck_tile::builder::test_utils; + +class FwdConv2DFP32Test : public FwdConvBuilderTestBase {}; + +TEST_F(FwdConv2DFP32Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NGCHW_GKCYX_NGKHW, + .data_type = DataType::FP32, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp new file mode 100644 index 0000000000..f47d6ff933 --- /dev/null +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp @@ -0,0 +1,24 @@ +#include "utils/ckb_conv_test_common.hpp" + +using namespace ck_tile::builder::test_utils; + +class FwdConv3DBF16Test : public FwdConvBuilderTestBase {}; + +// 3D BF16 GNDHWC (group-first, channels-last) with Pipeline V3 and DEFAULT +TEST_F(FwdConv3DBF16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, + .data_type = DataType::BF16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + + run_test(); +} diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp new file mode 100644 index 0000000000..75c9b7dff6 --- /dev/null +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp @@ -0,0 +1,24 @@ +#include "utils/ckb_conv_test_common.hpp" + +using namespace ck_tile::builder::test_utils; + +class FwdConv3DFP16Test : public FwdConvBuilderTestBase {}; + +// 3D FP16 NDHWGC (channels-last) with Pipeline V4 and FILTER_1X1_PAD0 +TEST_F(FwdConv3DFP16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP16_NDHWGC_ChannelsLast) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK, + .data_type = DataType::FP16, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp new file mode 100644 index 0000000000..3f4b5809b8 --- /dev/null +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp @@ -0,0 +1,24 @@ +#include "utils/ckb_conv_test_common.hpp" + +using namespace ck_tile::builder::test_utils; + +class FwdConv3DFP32Test : public FwdConvBuilderTestBase {}; + +//3D FP32 NGCDHW (channels-first) with Pipeline V1 and FILTER_1X1_PAD0 +TEST_F(FwdConv3DFP32Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) +{ + constexpr ConvSignature FwdConvSignature{ + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, + .data_type = DataType::FP32, + .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; + + constexpr ThreadBlock FwdThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + + run_test(); +} diff --git a/experimental/builder/test/test_ckb_build_fwd_instances.cpp b/experimental/builder/test/test_ckb_build_fwd_instances.cpp deleted file mode 100644 index df0bd3478c..0000000000 --- a/experimental/builder/test/test_ckb_build_fwd_instances.cpp +++ /dev/null @@ -1,239 +0,0 @@ -#include - -#include "impl/conv_algorithm_types.hpp" -#include "impl/conv_signature_types.hpp" -#include "ck_tile/builder/conv_builder.hpp" - -class FwdConvBuilderTest : public ::testing::Test -{ -}; - -using namespace ck_tile::builder; -using namespace test; - -template -constexpr void run_test() -{ - constexpr GridwiseGemm FwdGemmParams{.ak1 = 8, - .bk1 = 8, - .m_per_xdl = 32, - .n_per_xdl = 32, - .m_xdl_per_wave = 4, - .n_xdl_per_wave = 4}; - - constexpr BlockTransferABC FwdBlockTransfer{ - .block_transfer_a = {.k0 = 4, .m_n = 64, .k1 = 1}, - .block_transfer_b = {.k0 = 4, .m_n = 64, .k1 = 1}, - .thread_cluster_dims_c = {.m_block = 1, - .m_wave_per_xdl = 32, - .n_block = 1, - .n_wave_per_xdl = 8}, - .lds_transfer_a = {.src_vector_dim = 2, - .src_scalar_per_vector = 2, - .lds_dst_scalar_per_vector = 8, - .is_direct_load = false, - .lds_padding = false}, - .lds_transfer_b = {.src_vector_dim = 2, - .src_scalar_per_vector = 8, - .lds_dst_scalar_per_vector = 8, - .is_direct_load = false, - .lds_padding = false}, - .epilogue_c = {.m_xdl_per_wave_per_shuffle = 1, - .n_xdl_per_wave_per_shuffle = 1, - .scalar_per_vector = 8}, - .block_transfer_access_order_a = {1, 0, 2}, - .block_transfer_access_order_b = {1, 0, 2}, - .src_access_order_a = {1, 0, 2}, - .src_access_order_b = {1, 0, 2}}; - - constexpr ConvAlgorithm FwdConvAlgorithm{.thread_block = FwdThreadBlock, - .gridwise_gemm = FwdGemmParams, - .block_transfer = FwdBlockTransfer, - .pipeline_version = FwdPipelineVersion, - .fwd_specialization = FwdConvSpecialization}; - - using Builder = ConvBuilder; - - auto instance = typename Builder::Instance{}; - - const auto kernel_string = instance.GetTypeString(); - std::cout << "Generated kernel: " << kernel_string << std::endl; - EXPECT_GT(kernel_string.size(), 0); - - EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")); - - // Verify pipeline version is correct - if(FwdPipelineVersion == BlockGemmPipelineVersion::V1) - EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v1") != std::string::npos); - else if(FwdPipelineVersion == BlockGemmPipelineVersion::V3) - EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v3") != std::string::npos); - else if(FwdPipelineVersion == BlockGemmPipelineVersion::V4) - EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v4") != std::string::npos); - else if(FwdPipelineVersion == BlockGemmPipelineVersion::V5) - EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v5") != std::string::npos); - - // Verify specialization is correct - if(FwdConvSpecialization == ConvFwdSpecialization::DEFAULT) - EXPECT_TRUE(kernel_string.find("Default") != std::string::npos); - else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_PAD0) - EXPECT_TRUE(kernel_string.find("Filter1x1Pad0") != std::string::npos); - else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0) - EXPECT_TRUE(kernel_string.find("Filter1x1Stride1Pad0") != std::string::npos); - else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_3x3) - EXPECT_TRUE(kernel_string.find("Filter3x3") != std::string::npos); - - const auto invoker_ptr = instance.MakeInvokerPointer(); - EXPECT_NE(invoker_ptr, nullptr); -} - -//============================================================================== -// 2D Forward Convolution Tests -//============================================================================== - -// Test 1: 2D BF16 NHWGC (channels-last) with Pipeline V1 and DEFAULT -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, - .data_type = DataType::BF16, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 256, .n = 256, .k = 32}}; - - run_test(); -} - -// Test 2: 2D FP16 GNHWC (group-first, channels-last) with Pipeline V3 and FILTER_1X1_PAD0 -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::GNHWC_GKYXC_GNHWK, - .data_type = DataType::FP16, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 256, .n = 256, .k = 32}}; - - run_test(); -} - -// Test 3: 2D FP32 NGCHW_GKCYX (channels-first, different weight layout) with Pipeline V4 and -// FILTER_1X1_STRIDE1_PAD0 -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NGCHW_GKCYX_NGKHW, - .data_type = DataType::FP32, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 128, .n = 128, .k = 32}}; - - run_test(); -} - -// Test 4: 2D BF16 NHWGC (channels-last) with Pipeline V5 and FILTER_3x3 -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_NHWGC_Filter3x3) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, - .data_type = DataType::BF16, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 256, .n = 256, .k = 32}}; - - run_test(); -} - -//============================================================================== -// 3D Forward Convolution Tests -//============================================================================== - -// Test 5: 3D FP32 NGCDHW (channels-first) with Pipeline V1 and FILTER_1X1_PAD0 -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, - .data_type = DataType::FP32, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 128, .n = 128, .k = 32}}; - - run_test(); -} - -// Test 6: 3D BF16 GNDHWC (group-first, channels-last) with Pipeline V3 and DEFAULT -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, - .data_type = DataType::BF16, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 256, .n = 256, .k = 32}}; - - run_test(); -} - -// Test 7: 3D FP16 NDHWGC (channels-last) with Pipeline V4 and FILTER_1X1_PAD0 -TEST_F(FwdConvBuilderTest, - Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP16_NDHWGC_ChannelsLast) -{ - constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK, - .data_type = DataType::FP16, - .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; - - constexpr ThreadBlock FwdThreadBlock{.block_size = 256, - .tile_size = {.m = 128, .n = 128, .k = 32}}; - - run_test(); -} diff --git a/experimental/builder/test/utils/ckb_conv_test_common.hpp b/experimental/builder/test/utils/ckb_conv_test_common.hpp new file mode 100644 index 0000000000..924256e346 --- /dev/null +++ b/experimental/builder/test/utils/ckb_conv_test_common.hpp @@ -0,0 +1,102 @@ +#pragma once + +#include +#include "impl/conv_algorithm_types.hpp" +#include "impl/conv_signature_types.hpp" +#include "ck_tile/builder/conv_builder.hpp" + +namespace ck_tile::builder::test_utils { + +using namespace ck_tile::builder; +using namespace test; + +// Common test base class +class FwdConvBuilderTestBase : public ::testing::Test {}; + +// Common test implementation +template +constexpr void run_test() +{ + constexpr GridwiseGemm FwdGemmParams{.ak1 = 8, + .bk1 = 8, + .m_per_xdl = 32, + .n_per_xdl = 32, + .m_xdl_per_wave = 4, + .n_xdl_per_wave = 4}; + + constexpr BlockTransferABC FwdBlockTransfer{ + .block_transfer_a = {.k0 = 4, .m_n = 64, .k1 = 1}, + .block_transfer_b = {.k0 = 4, .m_n = 64, .k1 = 1}, + .thread_cluster_dims_c = {.m_block = 1, + .m_wave_per_xdl = 32, + .n_block = 1, + .n_wave_per_xdl = 8}, + .lds_transfer_a = {.src_vector_dim = 2, + .src_scalar_per_vector = 2, + .lds_dst_scalar_per_vector = 8, + .is_direct_load = false, + .lds_padding = false}, + .lds_transfer_b = {.src_vector_dim = 2, + .src_scalar_per_vector = 8, + .lds_dst_scalar_per_vector = 8, + .is_direct_load = false, + .lds_padding = false}, + .epilogue_c = {.m_xdl_per_wave_per_shuffle = 1, + .n_xdl_per_wave_per_shuffle = 1, + .scalar_per_vector = 8}, + .block_transfer_access_order_a = {1, 0, 2}, + .block_transfer_access_order_b = {1, 0, 2}, + .src_access_order_a = {1, 0, 2}, + .src_access_order_b = {1, 0, 2}}; + + constexpr ConvAlgorithm FwdConvAlgorithm{.thread_block = FwdThreadBlock, + .gridwise_gemm = FwdGemmParams, + .block_transfer = FwdBlockTransfer, + .pipeline_version = FwdPipelineVersion, + .fwd_specialization = FwdConvSpecialization}; + + using Builder = ConvBuilder; + + auto instance = typename Builder::Instance{}; + + const auto kernel_string = instance.GetTypeString(); + std::cout << "Generated kernel: " << kernel_string << std::endl; + EXPECT_GT(kernel_string.size(), 0); + + EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3")); + + // Verify pipeline version is correct + if(FwdPipelineVersion == BlockGemmPipelineVersion::V1) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v1") != std::string::npos); + else if(FwdPipelineVersion == BlockGemmPipelineVersion::V3) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v3") != std::string::npos); + else if(FwdPipelineVersion == BlockGemmPipelineVersion::V4) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v4") != std::string::npos); + else if(FwdPipelineVersion == BlockGemmPipelineVersion::V5) + EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v5") != std::string::npos); + + // Verify specialization is correct + if(FwdConvSpecialization == ConvFwdSpecialization::DEFAULT) + EXPECT_TRUE(kernel_string.find("Default") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_PAD0) + EXPECT_TRUE(kernel_string.find("Filter1x1Pad0") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0) + EXPECT_TRUE(kernel_string.find("Filter1x1Stride1Pad0") != std::string::npos); + else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_3x3) + EXPECT_TRUE(kernel_string.find("Filter3x3") != std::string::npos); + + const auto invoker_ptr = instance.MakeInvokerPointer(); + EXPECT_NE(invoker_ptr, nullptr); +} + +// Common thread block configurations +constexpr ThreadBlock DefaultThreadBlock{.block_size = 256, + .tile_size = {.m = 256, .n = 256, .k = 32}}; + +constexpr ThreadBlock SmallThreadBlock{.block_size = 256, + .tile_size = {.m = 128, .n = 128, .k = 32}}; + +} // namespace ck_tile::builder::test_utils From 2e9d8402b4e11b20e636e054685a558002a3d07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 13:09:59 +0000 Subject: [PATCH 37/39] Fix layout to string and add name to old CK PassThrough elementwise op. --- .../include/ck_tile/builder/reflect/instance_traits_util.hpp | 2 +- .../gpu/element/unary_element_wise_operation.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 01333833dd..0363ef7132 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -62,7 +62,7 @@ consteval std::string_view type_name() template constexpr std::string_view layout_name() { - if constexpr(std::is_base_of_v && requires { + if constexpr(std::is_base_of_v && requires { { T::name } -> std::convertible_to; }) return T::name; diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 81dd5e5dbb..4643c0bcb3 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -552,6 +552,8 @@ struct PassThrough { y = type_convert(x); } + + static constexpr const char* name = "PassThrough"; }; struct UnaryConvert From 3385dc238e8f398247789af7fb927e43b6a7784d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 13:17:55 +0000 Subject: [PATCH 38/39] Enable both CK and CK Tile tensor layouts in instance traits. --- .../include/ck_tile/builder/reflect/instance_traits_util.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 0363ef7132..8d50a52376 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -62,7 +63,9 @@ consteval std::string_view type_name() template constexpr std::string_view layout_name() { - if constexpr(std::is_base_of_v && requires { + if constexpr((std::is_base_of_v || + std::is_base_of_v + ) && requires { { T::name } -> std::convertible_to; }) return T::name; From 0cea23e620792746ee7daa94f9e2d9c45e568191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 27 Oct 2025 13:23:28 +0000 Subject: [PATCH 39/39] Fix clang-format. --- .../builder/conv_algorithm_concepts.hpp | 9 ++-- .../include/ck_tile/builder/conv_factory.hpp | 5 +- .../builder/reflect/instance_traits_util.hpp | 4 +- .../test/conv/test_ckb_conv_fwd_2d_bf16.cpp | 23 ++++---- .../test/conv/test_ckb_conv_fwd_2d_fp16.cpp | 15 +++--- .../test/conv/test_ckb_conv_fwd_2d_fp32.cpp | 15 +++--- .../test/conv/test_ckb_conv_fwd_3d_bf16.cpp | 15 +++--- .../test/conv/test_ckb_conv_fwd_3d_fp16.cpp | 15 +++--- .../test/conv/test_ckb_conv_fwd_3d_fp32.cpp | 17 +++--- .../test/utils/ckb_conv_test_common.hpp | 53 ++++++++++--------- 10 files changed, 96 insertions(+), 75 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp index 0324ffe38a..078c066e55 100644 --- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp @@ -62,7 +62,8 @@ concept LdsTransferDescriptor = requires(T t) { { t.lds_padding } -> std::convertible_to; }; -// Concept for the convolution output tensor epilogue (copy from registers to global memory via LDS). +// Concept for the convolution output tensor epilogue (copy from registers to global memory via +// LDS). template concept EpilogueDescriptor = requires(T t) { { t.m_xdl_per_wave_per_shuffle } -> std::convertible_to; @@ -99,8 +100,8 @@ concept SpecifiesGridwiseGemm = requires { // Concept to check if a struct specifies convolution input and output block transfer info. template concept SpecifiesBlockTransfer = requires(T t) { - { T::block_transfer.block_transfer_a } -> BlockTransferDescriptor; - { T::block_transfer.block_transfer_b } -> BlockTransferDescriptor; + { T::block_transfer.block_transfer_a } -> BlockTransferDescriptor; + { T::block_transfer.block_transfer_b } -> BlockTransferDescriptor; { T::block_transfer.thread_cluster_dims_c } -> ThreadClusterDescriptor; }; @@ -109,7 +110,7 @@ template concept SpecifiesLdsTransfer = requires(T t) { { T::block_transfer.lds_transfer_a } -> LdsTransferDescriptor; { T::block_transfer.lds_transfer_b } -> LdsTransferDescriptor; - { T::block_transfer.epilogue_c } -> EpilogueDescriptor; + { T::block_transfer.epilogue_c } -> EpilogueDescriptor; }; // Concept to check if a struct specifies thread cluster access order info. diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp index ad67a7cc33..ba2087cfa3 100644 --- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp @@ -462,8 +462,9 @@ struct ConvFactory .conv_spec = FWD_CONV_SPECIALIZATION, .gemm_spec = ck::tensor_operation::device::GemmSpecialization::MNKPadding, }; - static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); - static constexpr auto GRIDWISE_GEMM = factory_internal::SetGridwiseGemmInfo(); + static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo(); + static constexpr auto GRIDWISE_GEMM = + factory_internal::SetGridwiseGemmInfo(); static constexpr auto A_BLOCK_TRANSFER = factory_internal::SetFwdConvABlockTransfer(); static constexpr auto B_BLOCK_TRANSFER = diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 8d50a52376..a7ec568b03 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -64,8 +64,8 @@ template constexpr std::string_view layout_name() { if constexpr((std::is_base_of_v || - std::is_base_of_v - ) && requires { + std::is_base_of_v) && + requires { { T::name } -> std::convertible_to; }) return T::name; diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp index 2417cf997c..433b39884b 100644 --- a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp @@ -2,16 +2,19 @@ using namespace ck_tile::builder::test_utils; -class FwdConv2DBF16Test : public FwdConvBuilderTestBase {}; +class FwdConv2DBF16Test : public FwdConvBuilderTestBase +{ +}; // 2D BF16 NHWGC (channels-last) with Pipeline V1 and DEFAULT -TEST_F(FwdConv2DBF16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) +TEST_F(FwdConv2DBF16Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_ChannelsLast) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, - .data_type = DataType::BF16, + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, + .data_type = DataType::BF16, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, @@ -28,10 +31,10 @@ TEST_F(FwdConv2DBF16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_BF16_NHWGC_Filter3x3) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, - .data_type = DataType::BF16, + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NHWGC_GKYXC_NHWGK, + .data_type = DataType::BF16, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp index 66a27e1db3..2b2109a141 100644 --- a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp @@ -2,15 +2,18 @@ using namespace ck_tile::builder::test_utils; -class FwdConv2DFP16Test : public FwdConvBuilderTestBase {}; +class FwdConv2DFP16Test : public FwdConvBuilderTestBase +{ +}; -TEST_F(FwdConv2DFP16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC) +TEST_F(FwdConv2DFP16Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP16_GNHWC) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::GNHWC_GKYXC_GNHWK, - .data_type = DataType::FP16, + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::GNHWC_GKYXC_GNHWK, + .data_type = DataType::FP16, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp index bca9d99e44..3eade37659 100644 --- a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp @@ -2,15 +2,18 @@ using namespace ck_tile::builder::test_utils; -class FwdConv2DFP32Test : public FwdConvBuilderTestBase {}; +class FwdConv2DFP32Test : public FwdConvBuilderTestBase +{ +}; -TEST_F(FwdConv2DFP32Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) +TEST_F(FwdConv2DFP32Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_2D_FP32_NGCHW_GKCYX) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout2D::NGCHW_GKCYX_NGKHW, - .data_type = DataType::FP32, + .spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout2D::NGCHW_GKCYX_NGKHW, + .data_type = DataType::FP32, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp index f47d6ff933..6bc62153cd 100644 --- a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp @@ -2,16 +2,19 @@ using namespace ck_tile::builder::test_utils; -class FwdConv3DBF16Test : public FwdConvBuilderTestBase {}; +class FwdConv3DBF16Test : public FwdConvBuilderTestBase +{ +}; // 3D BF16 GNDHWC (group-first, channels-last) with Pipeline V3 and DEFAULT -TEST_F(FwdConv3DBF16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) +TEST_F(FwdConv3DBF16Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_BF16_GNDHWC) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, - .data_type = DataType::BF16, + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK, + .data_type = DataType::BF16, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp index 75c9b7dff6..c23e58c702 100644 --- a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp @@ -2,16 +2,19 @@ using namespace ck_tile::builder::test_utils; -class FwdConv3DFP16Test : public FwdConvBuilderTestBase {}; +class FwdConv3DFP16Test : public FwdConvBuilderTestBase +{ +}; // 3D FP16 NDHWGC (channels-last) with Pipeline V4 and FILTER_1X1_PAD0 -TEST_F(FwdConv3DFP16Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP16_NDHWGC_ChannelsLast) +TEST_F(FwdConv3DFP16Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP16_NDHWGC_ChannelsLast) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK, - .data_type = DataType::FP16, + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK, + .data_type = DataType::FP16, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp index 3f4b5809b8..deaf2038e2 100644 --- a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp +++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp @@ -2,16 +2,19 @@ using namespace ck_tile::builder::test_utils; -class FwdConv3DFP32Test : public FwdConvBuilderTestBase {}; +class FwdConv3DFP32Test : public FwdConvBuilderTestBase +{ +}; -//3D FP32 NGCDHW (channels-first) with Pipeline V1 and FILTER_1X1_PAD0 -TEST_F(FwdConv3DFP32Test, Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) +// 3D FP32 NGCDHW (channels-first) with Pipeline V1 and FILTER_1X1_PAD0 +TEST_F(FwdConv3DFP32Test, + Create_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_Instance_3D_FP32_ChannelsFirst) { constexpr ConvSignature FwdConvSignature{ - .spatial_dim = 3, - .direction = ConvDirection::FORWARD, - .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, - .data_type = DataType::FP32, + .spatial_dim = 3, + .direction = ConvDirection::FORWARD, + .layout = GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW, + .data_type = DataType::FP32, .elementwise_operation = ElementwiseOperation::PASS_THROUGH}; constexpr ThreadBlock FwdThreadBlock{.block_size = 256, diff --git a/experimental/builder/test/utils/ckb_conv_test_common.hpp b/experimental/builder/test/utils/ckb_conv_test_common.hpp index 924256e346..37ee3a953a 100644 --- a/experimental/builder/test/utils/ckb_conv_test_common.hpp +++ b/experimental/builder/test/utils/ckb_conv_test_common.hpp @@ -11,7 +11,9 @@ using namespace ck_tile::builder; using namespace test; // Common test base class -class FwdConvBuilderTestBase : public ::testing::Test {}; +class FwdConvBuilderTestBase : public ::testing::Test +{ +}; // Common test implementation template