[layout] refactor fp16/bf16 layout

botbw · botbw · commit 016dd1cc8460 · 2025-10-19T23:15:51.000+08:00
diff --git a/tilelang/intrinsics/mma_layout.py b/tilelang/intrinsics/mma_layout.py
@@ -164,7 +164,7 @@ def mma_load_b_32x16_to_shared_16x32_layout(thread_id, local_id):
     col = 16 * (local_id % 8 // 4) + (thread_id % 4) * 4 + (local_id % 4)
     return row, col
 
-def mma_load_b_32x4_to_shared_16x8_layout_16bit(thread_id, local_id):
+def mma_load_b_32x8_to_shared_16x16_layout(thread_id, local_id):
     """
         groupID           = %laneid >> 2
         threadID_in_group = %laneid % 4
@@ -174,14 +174,10 @@ def mma_load_b_32x4_to_shared_16x8_layout_16bit(thread_id, local_id):
 
         col = groupID
     """
-    row = (thread_id % 4) * 2 + (local_id % 2) + (local_id // 2) * 8
-    col = (thread_id // 4)
+    col = (thread_id % 4) * 2 + (local_id % 2) + (local_id % 4 // 2) * 8  # % 4 keeps col < 16 for local_id 4..7
+    row = (thread_id // 4) + 8 * (local_id // 4)
     return row, col
 
-def mma_load_b_32x8_to_shared_16x16_layout_16bit(thread_id, local_id):
-    row, col = mma_load_b_32x4_to_shared_16x8_layout_16bit(thread_id, local_id % 4)
-    return row, col + 8 * (local_id // 4)
-
 def shared_16x16_to_mma_32x8_smoothlayout(i, j):
     return (i * 2 + j // 8, j % 8)
 
diff --git a/tilelang/intrinsics/mma_macro_generator.py b/tilelang/intrinsics/mma_macro_generator.py
@@ -18,8 +18,7 @@
     shared_16x32_to_mma_32x16_layout_sr_b,
     mma_load_a_32x4_to_shared_16x8_layout,
     mma_load_b_32x4_to_shared_16x8_layout,
-    mma_load_b_32x4_to_shared_16x8_layout_16bit,
-    mma_load_b_32x8_to_shared_16x16_layout_16bit,
+    mma_load_b_32x8_to_shared_16x16_layout,
     mma_load_a_32x16_to_shared_16x32_layout,
     mma_load_b_32x16_to_shared_16x32_layout,
     mma_load_a_32x8_to_shared_16x16_layout,
@@ -290,7 +289,7 @@ def mma_load_layout(i, j):
             if DataType(b_dtype).bits == 8:
                 mma_load_layout = mma_load_b_32x16_to_shared_16x32_layout
             elif DataType(b_dtype).bits == 16:
-                mma_load_layout = mma_load_b_32x8_to_shared_16x16_layout_16bit if replicate_b else mma_load_b_32x4_to_shared_16x8_layout_16bit
+                mma_load_layout = mma_load_b_32x8_to_shared_16x16_layout
             elif DataType(b_dtype).bits == 32:
                 mma_load_layout = mma_load_b_32x4_to_shared_16x8_layout
             else:
@@ -334,8 +333,8 @@ def _warp_ldmatrix_b(
                     # load 16x32 data from shared buffer to local buffer
                     # must be transposed.
                     for j in T.serial(local_size_b):
-                        mk, mi = mma_load_layout(tx, j)
-                        B_local_buf[i * local_size_b + j] =  B_shared_buf[wk + mk, wi + mi] if trans else B_shared_buf[wi + mi, wk + mk]
+                        mi, mk = mma_load_layout(tx, j)
+                        B_local_buf[i * local_size_b + j] = B_shared_buf[wi + mi, wk + mk] if trans else B_shared_buf[wk + mk, wi + mi]
 
         return _warp_ldmatrix_b(B_local_buf, B_shared_buf, ki, thread_binding, rk)
 
diff --git a/tilelang/intrinsics/mma_sp_layout.py b/tilelang/intrinsics/mma_sp_layout.py
@@ -2,22 +2,22 @@
 
 from .mma_layout import (
     mma_load_a_32x8_to_shared_16x16_layout,
-    mma_load_b_32x4_to_shared_16x8_layout_16bit,
-
+    mma_load_b_32x4_to_shared_8x16_layout_16bit,
 )
 
 def mma_sp_load_a_32x8_to_shared_16x32_layout(thread_id, local_id):
     return mma_load_a_32x8_to_shared_16x16_layout(thread_id, local_id)
 
-def mma_sp_load_b_32x8_to_shared_32x8_layout(thread_id, local_id):
-    return mma_load_b_32x4_to_shared_16x8_layout_16bit(thread_id, local_id)
+def mma_sp_load_b_32x8_to_shared_8x64_layout(thread_id, local_id):
+    return mma_load_b_32x8_to_shared_8x32_layout(thread_id, local_id)
 
-def mma_sp_load_b_32x16_to_shared_32x16_layout(thread_id, local_id):
-    row, col = mma_load_b_32x4_to_shared_16x8_layout_16bit(thread_id, local_id % 8)
-    return row, col + 8 * (local_id // 8) 
+def mma_sp_load_b_32x16_to_shared_16x64_layout(thread_id, local_id):
+    row, col =  mma_load_b_32x16_to_shared_16x32_layout(thread_id, local_id % 8)
+    return row, col + 8 * (local_id // 8)
 
+def mma_sp_load_b_32x16_to_shared_16x32_layout(thread_id, local_id):
+    return mma_load_b_32x8_to_shared_16x16_layout(thread_id, local_id)
 
-def get_logical_id(thread_id: int) -> int:
     return (thread_id // 4) * 2 + (thread_id % 4) % 2
 
 def metadata_load_32x4_to_shared_16x4_layout_8bit(thread_id: int, local_id: int) -> Tuple[int, int]:
diff --git a/tilelang/intrinsics/mma_sp_macro_generator.py b/tilelang/intrinsics/mma_sp_macro_generator.py
@@ -380,7 +380,7 @@ def mma_load_layout(i, j):
             # if DataType(b_dtype).bits == 8:
             #     mma_load_layout = mma_load_b_32x16_to_shared_16x32_layout
             if DataType(b_dtype).bits == 16:
-                mma_load_layout = mma_sp_load_b_32x16_to_shared_32x16_layout if replicate_b else mma_sp_load_b_32x8_to_shared_32x8_layout
+                mma_load_layout = mma_sp_load_b_32x16_to_shared_16x32_layout
             # elif DataType(b_dtype).bits == 32:
             #     mma_load_layout = mma_load_b_32x4_to_shared_16x8_layout
             else:
@@ -425,8 +425,8 @@ def _warp_ldmatrix_b(
                     # load 16x32 data from shared buffer to local buffer
                     # must be transposed.
                     for j in T.serial(local_size_b):
-                        mk, mi = mma_load_layout(tx, j)
-                        B_local_buf[i * local_size_b + j] =  B_shared_buf[wk + mk, wi + mi] if trans else B_shared_buf[wi + mi, wk + mk]
+                        mi, mk = mma_load_layout(tx, j)
+                        B_local_buf[i * local_size_b + j] = B_shared_buf[wi + mi, wk + mk] if trans else B_shared_buf[wk + mk, wi + mi]
 
         return _warp_ldmatrix_b(B_local_buf, B_shared_buf, ki, thread_binding, rk)