
Commit 2c992ab

block layout op runtime function added
1 parent 3ccfbde commit 2c992ab

5 files changed: +115 −2 lines

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1330,6 +1330,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
   ${NVFUSER_ROOT}/runtime/block_sync_atomic.cu
   ${NVFUSER_ROOT}/runtime/block_sync_default.cu
   ${NVFUSER_ROOT}/runtime/block_welford_outer.cu
+  ${NVFUSER_ROOT}/runtime/block_layout.cu
   ${NVFUSER_ROOT}/runtime/broadcast.cu
   ${NVFUSER_ROOT}/runtime/casts.cu
   ${NVFUSER_ROOT}/runtime/cluster.cu

csrc/kernel.cpp

Lines changed: 4 additions & 0 deletions
@@ -272,6 +272,10 @@ class KernelIrScanner : private IrVisitor {
     summary_.has_argsort = true;
   }
 
+  void handle(GroupedBlockScalingFactorLayoutOp* aop) final {
+    summary_.has_grouped_block_sf_layout = true;
+  }
+
   void handle(TopKOp* top) final {
     summary_.has_topk = true;
   }

csrc/kernel.h

Lines changed: 3 additions & 0 deletions
@@ -142,6 +142,9 @@ struct KernelSummary {
   //! Do we have any argsort op?
   bool has_argsort = false;
 
+  //! Do we have any grouped_block_sf_layout op?
+  bool has_grouped_block_sf_layout = false;
+
   //! Do we have any topk op?
   bool has_topk = false;

csrc/runtime/compiled_kernel.cpp

Lines changed: 8 additions & 2 deletions
@@ -58,6 +58,7 @@
 #include <nvfuser_resources/basic_type_traits.h>
 #include <nvfuser_resources/bf16_support.h>
 #include <nvfuser_resources/bit.h>
+#include <nvfuser_resources/block_layout.h>
 #include <nvfuser_resources/block_reduction.h>
 #include <nvfuser_resources/block_sync_atomic.h>
 #include <nvfuser_resources/block_sync_default.h>
@@ -1158,7 +1159,8 @@ std::string _getStructuredCode(
     std::string kernel_name,
     bool has_argsort = false,
     bool has_topk = false,
-    bool has_scan = false) {
+    bool has_scan = false,
+    bool has_block_layout = false) {
   // generating cuda code;
   std::string code = "";
 
@@ -1194,6 +1196,9 @@ std::string _getStructuredCode(
   if (has_topk) {
     code += nvfuser_resources::topk_cu;
   }
+  if (has_block_layout) {
+    code += nvfuser_resources::block_layout_cu;
+  }
 
   code += "\nnamespace " + CompiledKernel::kernelNamespace() + " {\n\n";
   code += kernel_str;
@@ -1439,7 +1444,8 @@ std::string CompiledKernel::getStructuredCode() const {
       kernelName(),
       kernel()->summary().has_argsort,
       kernel()->summary().has_topk,
-      kernel()->summary().has_scan);
+      kernel()->summary().has_scan,
+      kernel()->summary().has_grouped_block_sf_layout);
 }
 
 std::string CompiledKernel::disassembledKernelSASS() const {
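For context, the sketch below illustrates the conditional-resource pattern these hunks extend: a runtime helper source is only spliced into the generated kernel string when the matching KernelSummary flag is set. It is not part of the commit; `fake_resources` and `buildCode` are made-up stand-ins for the generated `nvfuser_resources` string constants and `_getStructuredCode`.

#include <string>

// Illustrative stand-in for a generated nvfuser_resources string constant.
namespace fake_resources {
constexpr const char* block_layout_cu =
    "/* contents of runtime/block_layout.cu */\n";
}

// Mirrors the idea in _getStructuredCode: the helper source is prepended to
// the kernel string only when the kernel actually uses the op.
std::string buildCode(const std::string& kernel_str, bool has_block_layout) {
  std::string code;
  if (has_block_layout) {
    code += fake_resources::block_layout_cu;
  }
  code += "\nnamespace my_kernel_ns {\n\n";
  code += kernel_str;
  code += "\n} // namespace my_kernel_ns\n";
  return code;
}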

runtime/block_layout.cu

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+namespace nvf::block_layout {
+
+namespace {
+
+// TODO: simplify this maybe?!
+template <int BLOCK_ROW_OUTER, int BLOCK_ROW_INNER, int BLOCK_COL>
+__device__ nvfuser_index_t offsetAfterSwizzlePadding(
+    const nvfuser_index_t row_idx,
+    const nvfuser_index_t col_idx,
+    const nvfuser_index_t padded_col_size) {
+  constexpr nvfuser_index_t BLOCK_ROW_SIZE = BLOCK_ROW_OUTER * BLOCK_ROW_INNER;
+
+  /* logical dimension of matrix [row_size, col_size]
+   *
+   * while the layout is decomposed as
+   *   [ (row_tile * BLOCK_ROW_INNER * BLOCK_ROW_OUTER), (col_tile * BLOCK_COL) ]
+   * where
+   *   row_tile = row_size / (BLOCK_ROW_OUTER * BLOCK_ROW_INNER)
+   *   col_tile = col_size / BLOCK_COL
+   */
+  nvfuser_index_t row_tile_idx = row_idx / BLOCK_ROW_SIZE;
+
+  nvfuser_index_t row_block_idx = row_idx % BLOCK_ROW_SIZE;
+  nvfuser_index_t row_block_inner_idx = row_block_idx / BLOCK_ROW_OUTER;
+  nvfuser_index_t row_block_outer_idx = row_block_idx % BLOCK_ROW_OUTER;
+  nvfuser_index_t col_tile_idx = col_idx / BLOCK_COL;
+  nvfuser_index_t col_block_idx = col_idx % BLOCK_COL;
+
+  /* layout for matrix [row_size, col_size]
+   * it is viewed as
+   *   [row_tile, BLOCK_ROW_INNER, BLOCK_ROW_OUTER, col_tile, BLOCK_COL]
+   * then transposed with axes (1, 3)
+   *   [row_tile, col_tile, BLOCK_ROW_OUTER, BLOCK_ROW_INNER, BLOCK_COL]
+   * and then made contiguous
+   */
+  constexpr nvfuser_index_t COL_TILE_STRIDE = BLOCK_ROW_SIZE * BLOCK_COL;
+  constexpr nvfuser_index_t BLOCK_ROW_OUTER_STRIDE =
+      BLOCK_ROW_INNER * BLOCK_COL;
+  constexpr nvfuser_index_t BLOCK_ROW_INNER_STRIDE = BLOCK_COL;
+
+  return row_tile_idx * padded_col_size * BLOCK_ROW_SIZE +
+      col_tile_idx * COL_TILE_STRIDE +
+      row_block_outer_idx * BLOCK_ROW_OUTER_STRIDE +
+      row_block_inner_idx * BLOCK_ROW_INNER_STRIDE + col_block_idx;
+}
+
+} // namespace
+
+// TODO: I think we can actually not have this handled as an opaque function.
+template <
+    typename T,
+    typename Index_T,
+    int BLOCK_ROW_OUTER,
+    int BLOCK_ROW_INNER,
+    int BLOCK_COL,
+    int UNROLL_FACTOR>
+__device__ void groupedBlockLayout(
+    T* output,
+    const T* input,
+    const nvfuser_index_t row_idx,
+    const nvfuser_index_t col_idx,
+    const Index_T* expert_offsets,
+    const Index_T* output_offsets,
+    const nvfuser_index_t row_size,
+    const nvfuser_index_t col_size,
+    const nvfuser_index_t group_size) {
+  // find the corresponding expert_id
+  int expert_id = 0;
+  for (int i = 0; i < group_size; ++i) {
+    if (row_idx < expert_offsets[i + 1]) {
+      expert_id = i;
+      break;
+    }
+  }
+
+  // row idx within the current matmul group
+  nvfuser_index_t c_row_idx = row_idx - expert_offsets[expert_id];
+  nvfuser_index_t padded_col_size =
+      (col_size + BLOCK_COL - 1) / BLOCK_COL * BLOCK_COL;
+  T* out_group_offset = output + output_offsets[expert_id] * padded_col_size;
+
+  // TODO: vectorized load/store; the logic could be simplified afterwards.
+  for (int i = 0; i < UNROLL_FACTOR && col_idx + i < col_size; ++i) {
+    nvfuser_index_t index =
+        offsetAfterSwizzlePadding<BLOCK_ROW_OUTER, BLOCK_ROW_INNER, BLOCK_COL>(
+            c_row_idx, col_idx + i, padded_col_size);
+    out_group_offset[index] = input[i];
+  }
+}
+
+} // namespace nvf::block_layout
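To build intuition for `offsetAfterSwizzlePadding`, here is a small host-side sketch (not part of the commit) that re-implements the same index arithmetic with toy block sizes and prints where each `(row, col)` element of a 4x4 matrix lands after the `[row_tile, col_tile, BLOCK_ROW_OUTER, BLOCK_ROW_INNER, BLOCK_COL]` reordering. The types and sizes are illustrative only; the device version uses `nvfuser_index_t` and the real block-scaling-factor layout constants.

#include <cstdio>

// Host-side copy of the swizzle math, for visualization only.
template <int BLOCK_ROW_OUTER, int BLOCK_ROW_INNER, int BLOCK_COL>
long swizzledOffset(long row, long col, long padded_col_size) {
  constexpr long BLOCK_ROW_SIZE = BLOCK_ROW_OUTER * BLOCK_ROW_INNER;
  long row_tile = row / BLOCK_ROW_SIZE;
  long row_in_block = row % BLOCK_ROW_SIZE;
  long row_inner = row_in_block / BLOCK_ROW_OUTER;
  long row_outer = row_in_block % BLOCK_ROW_OUTER;
  long col_tile = col / BLOCK_COL;
  long col_in_block = col % BLOCK_COL;
  return row_tile * padded_col_size * BLOCK_ROW_SIZE +
      col_tile * BLOCK_ROW_SIZE * BLOCK_COL +
      row_outer * BLOCK_ROW_INNER * BLOCK_COL +
      row_inner * BLOCK_COL + col_in_block;
}

int main() {
  // Toy configuration: BLOCK_ROW_OUTER=2, BLOCK_ROW_INNER=2, BLOCK_COL=2,
  // padded_col_size=4. Each printed number is the linear offset that the
  // logical element (row, col) is written to in the swizzled output.
  for (long row = 0; row < 4; ++row) {
    for (long col = 0; col < 4; ++col) {
      printf("%4ld", swizzledOffset<2, 2, 2>(row, col, 4));
    }
    printf("\n");
  }
  return 0;
}

Within each row-block of BLOCK_ROW_OUTER * BLOCK_ROW_INNER rows, elements are stored outer-major then inner-major, matching the transposed view described in the comments above, and columns beyond `col_size` simply fall into the padded region of each tile.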
