
Commit 0e10941

kesavanramakrishnan, root, and valechen authored

Added in GQA and 64-bit indexing (#1226)

* Added GQA support for Lean Attention and 64 bit indexing
* reformatted files

Co-authored-by: root <[email protected]>
Co-authored-by: valechen <[email protected]>
1 parent c7e3d34 commit 0e10941

File tree

4 files changed, +268 -126 lines


aiter/ops/triton/_triton_kernels/lean_atten.py

Lines changed: 48 additions & 14 deletions
@@ -114,15 +114,20 @@ def _attention_inner(
     offs_n,
     BLOCK_M,
     BLOCK_N,
-    HEAD_DIM,
+    HEAD_DIM_ORIG: tl.constexpr,
+    HEAD_DIM: tl.constexpr,
     local_iter,
     local_iter_end,
+    use_64_indexing: tl.constexpr,
 ):
     """
     Performs attention calculation for an (maybe partial) output tile
     """
+    # Define head-dimension mask for padded dims
+    offs_k_local = tl.arange(0, HEAD_DIM)
+    mask_k_cols_local = offs_k_local < HEAD_DIM_ORIG
     for l_iter in range(local_iter, local_iter_end):
-        k = tl.load(k_ptrs)
+        k = tl.load(k_ptrs, mask=mask_k_cols_local[:, None], other=0.0)
         qk = tl.dot(q, k) * qk_scale

         if causal:
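
The masked load above is the standard Triton idiom for running the compute at a padded HEAD_DIM while only touching the first HEAD_DIM_ORIG columns in memory. A minimal standalone sketch of that idiom, assuming a CUDA/ROCm device is available (kernel and tensor names here are illustrative, not from the repo):

import torch
import triton
import triton.language as tl

@triton.jit
def _masked_row_copy(x_ptr, out_ptr, N_ORIG: tl.constexpr, N_PAD: tl.constexpr):
    # Read N_PAD lanes, but only the first N_ORIG are real; padded lanes load 0.0.
    offs = tl.arange(0, N_PAD)
    mask = offs < N_ORIG
    x = tl.load(x_ptr + offs, mask=mask, other=0.0)
    tl.store(out_ptr + offs, x, mask=mask)

x = torch.randn(72, device="cuda")   # head_dim = 72, not a power of two
out = torch.zeros_like(x)
_masked_row_copy[(1,)](x, out, N_ORIG=72, N_PAD=triton.next_power_of_2(72))
assert torch.equal(x, out)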
@@ -152,17 +157,24 @@ def _attention_inner(

         # Update accumulator
         acc = acc * alpha[:, None]
-        v = tl.load(v_ptrs)
+        v = tl.load(v_ptrs, mask=mask_k_cols_local[None, :], other=0.0)
         acc += tl.dot(p.to(v.dtype), v)

         # Update stats
         l_ij = tl.sum(p, 1)
         l_i = l_i * alpha + l_ij
         m_i = m_ij.to(m_i.dtype)

-        # update k/v pointer
-        v_ptrs += BLOCK_N * stride_vn
-        k_ptrs += BLOCK_N * stride_kn
+        # update k/v pointer with optional 64-bit indexing to avoid overflow
+        if use_64_indexing:
+            BLOCK_N64 = tl.full((), BLOCK_N, tl.int64)
+            stride_kn64 = tl.full((), stride_kn, tl.int64)
+            stride_vn64 = tl.full((), stride_vn, tl.int64)
+            v_ptrs += BLOCK_N64 * stride_vn64
+            k_ptrs += BLOCK_N64 * stride_kn64
+        else:
+            v_ptrs += BLOCK_N * stride_vn
+            k_ptrs += BLOCK_N * stride_kn
     return m_i, l_i, acc
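
The int64 branch exists because a 32-bit offset silently wraps once the accumulated BLOCK_N * stride displacement passes 2**31 elements, which very long contexts can reach. A host-side sketch of the wraparound this guards against, with made-up numbers:

import ctypes

BLOCK_N = 64
stride_kn = 1_048_576      # illustrative per-row element stride for a large K layout
steps = 40                 # enough BLOCK_N advances to pass 2**31 elements

offset = BLOCK_N * stride_kn * steps      # 2,684,354,560 element offset
print(ctypes.c_int32(offset).value)       # -1610612736: what a 32-bit pointer bump would see
print(offset)                             #  2684354560: what the int64 path preserves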

@@ -222,10 +234,12 @@ def la_persistent(
     stride_om,  # n_ctx_q
     stride_oh,  # Head
     stride_on,  # head_dim
+    n_ctx_q_rows,
     stride_oph,  # total_programs
     stride_opm,  # n_ctx_q
     stride_opn,  # head_dim
     HEADS_PER_XCD: tl.constexpr,
+    HEAD_DIM_ORIG: tl.constexpr,
     HEAD_DIM: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
@@ -243,6 +257,10 @@ def la_persistent(
     tiles_per_head: tl.constexpr,
     num_splits: tl.constexpr,
     max_output_tile_cnt: tl.constexpr,
+    num_heads_q: tl.constexpr,
+    num_heads_k: tl.constexpr,
+    gqa_group_size: tl.constexpr,
+    use_64_indexing: tl.constexpr,
 ):
     if is_pod:
         current_pid = pod_pid
@@ -321,6 +339,7 @@ def la_persistent(
         xcd_id=xcd_id,
         HEADS_PER_XCD=HEADS_PER_XCD,
         HEAD_DIM=HEAD_DIM,
+        HEAD_DIM_ORIG=HEAD_DIM_ORIG,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
         MASKED_BLOCKS=MASKED_BLOCKS,
@@ -335,6 +354,8 @@ def la_persistent(
         max_tiles_per_wg=max_tiles_per_wg,
         tiles_per_head=tiles_per_head,
         num_splits=num_splits,
+        gqa_group_size=gqa_group_size,
+        use_64_indexing=use_64_indexing,
     )

@@ -372,6 +393,7 @@ def la_persistent_inner(
     xcd_id,  # The XCD the pid belongs to
     HEADS_PER_XCD,
     HEAD_DIM: tl.constexpr,
+    HEAD_DIM_ORIG: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     MASKED_BLOCKS: tl.constexpr,
@@ -386,6 +408,8 @@ def la_persistent_inner(
     max_tiles_per_wg: tl.constexpr,
     tiles_per_head: tl.constexpr,
     num_splits: tl.constexpr,
+    gqa_group_size: tl.constexpr,
+    use_64_indexing: tl.constexpr,
 ):

     tl.assume(stride_qm > 0)  # n_ctx_q
@@ -478,10 +502,13 @@ def la_persistent_inner(
     # Q/K/V/O offsets calculation needs global head index.
     # When XCD_REMAP=False, xcd_id=0
     tile_head_idx_global = HEADS_PER_XCD * xcd_id + tile_head_idx
+    # Map Q head index to K/V head index via GQA grouping
+    tile_khead_idx_global = tile_head_idx_global // gqa_group_size

     offs_m = tl.arange(0, BLOCK_M)
     offs_n = tl.arange(0, BLOCK_N)
     offs_k = tl.arange(0, HEAD_DIM)
+    mask_k_cols = offs_k < HEAD_DIM_ORIG

     if causal:
         b_seq_size = tile_batch_idx * num_n_blocks
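
The floor division above is the usual GQA convention: each block of gqa_group_size consecutive Q heads reads the same K/V head. A quick host-side sketch with assumed head counts:

num_heads_q, num_heads_k = 32, 8             # hypothetical GQA configuration
gqa_group_size = num_heads_q // num_heads_k  # 4 Q heads share each K/V head

for q_head in (0, 3, 4, 31):
    kv_head = q_head // gqa_group_size
    print(f"Q head {q_head:2d} -> K/V head {kv_head}")
# Q heads 0-3 -> K/V head 0, Q heads 4-7 -> K/V head 1, ..., Q head 31 -> K/V head 7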
@@ -495,13 +522,13 @@ def la_persistent_inner(

     k_offs = (
         (b_seq_size + local_iter) * BLOCK_N * stride_kn
-        + tile_head_idx_global * stride_kh
+        + tile_khead_idx_global * stride_kh
         + offs_n[None, :] * stride_kn
         + offs_k[:, None] * stride_kk
     )
     v_offs = (
         (b_seq_size + local_iter) * BLOCK_N * stride_vn
-        + tile_head_idx_global * stride_vh
+        + tile_khead_idx_global * stride_vh
         + offs_n[:, None] * stride_vn
         + offs_k[None, :] * stride_vk
     )
@@ -531,7 +558,7 @@ def la_persistent_inner(
     l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
     acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)

-    q = tl.load(q_ptrs)
+    q = tl.load(q_ptrs, mask=mask_k_cols[None, :], other=0.0)

     m_i, l_i, acc = _attention_inner(
         q,
@@ -550,9 +577,11 @@ def la_persistent_inner(
         offs_n,
         BLOCK_M,
         BLOCK_N,
-        HEAD_DIM,
-        local_iter,
-        local_iter_end,
+        HEAD_DIM_ORIG=HEAD_DIM_ORIG,
+        HEAD_DIM=HEAD_DIM,
+        local_iter=local_iter,
+        local_iter_end=local_iter_end,
+        use_64_indexing=use_64_indexing,
     )

     # initialize pointer to m and l
@@ -732,8 +761,13 @@ def la_persistent_inner(

     acc0 = acc0 / l_i[:, None]
     acc1 = acc1 / l_i[:, None]
-    tl.store(o_ptrs0, acc0.to(Out.type.element_ty))
-    tl.store(o_ptrs1, acc1.to(Out.type.element_ty))
+    COLS_HALF: tl.constexpr = HEAD_DIM // 2
+    offs0 = tl.arange(0, COLS_HALF)
+    offs1 = tl.arange(0, COLS_HALF) + COLS_HALF
+    mask_cols0 = offs0 < HEAD_DIM_ORIG
+    mask_cols1 = offs1 < HEAD_DIM_ORIG
+    tl.store(o_ptrs0, acc0.to(Out.type.element_ty), mask=mask_cols0[None, :])
+    tl.store(o_ptrs1, acc1.to(Out.type.element_ty), mask=mask_cols1[None, :])

     # update iter
     iter = iter + (local_iter_end - local_iter)
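
Since the final output is written as two half-width column blocks, each half needs its own column mask against the original head dimension; only the upper half can contain padding. A host-side sketch with assumed widths (128 padded, 96 original):

HEAD_DIM, HEAD_DIM_ORIG = 128, 96            # assumed padded vs. original widths
COLS_HALF = HEAD_DIM // 2                    # 64 columns per store

offs0 = list(range(COLS_HALF))                     # columns 0..63
offs1 = [c + COLS_HALF for c in range(COLS_HALF)]  # columns 64..127

mask_cols0 = [c < HEAD_DIM_ORIG for c in offs0]  # all True: the low half is fully real
mask_cols1 = [c < HEAD_DIM_ORIG for c in offs1]  # True for 64..95, False for the 96..127 padding
print(sum(mask_cols0), sum(mask_cols1))          # 64 32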

aiter/ops/triton/lean_atten.py

Lines changed: 86 additions & 9 deletions
@@ -22,9 +22,10 @@
 from bisect import bisect_right
 import triton
 import triton.language as tl
-from aiter.ops.triton._triton_kernels.lean_atten import la_persistent
+from aiter.ops.triton._triton_kernels.lean_atten import la_persistent, _get_config
 from aiter.ops.triton.utils.logger import AiterTritonLogger
 from aiter.ops.triton.utils.device_info import get_num_xcds
+from aiter.ops.triton.utils._triton import arch_info

 _LOGGER = AiterTritonLogger()

@@ -45,6 +46,7 @@ def persistent_lean_attention(
     sm_scale: torch.float16,
     causal: bool = True,  # causal masking
     config: Optional[dict] = None,
+    program_count: Optional[int] = None,
 ):
     """
     Lean Attention kernel.
@@ -55,7 +57,11 @@ def persistent_lean_attention(
     if config is None:
         config = _get_config(causal=causal, batch_size=batch_size)
     sm_count = arch_info.get_num_sms()
-    total_programs = sm_count * config["SM_CNT_FACTOR"]
+    total_programs = (
+        program_count
+        if program_count is not None
+        else sm_count * config["SM_CNT_FACTOR"]
+    )

     return _persistent_lean_attention(
         q=q,
@@ -112,7 +118,10 @@ def _persistent_lean_attention(
     assert (
         HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V
     ), "Incompatible Q/K/V Hidden Dimensions"
-    assert HEAD_DIM_K in {16, 32, 64, 128, 256}
+    # Allow irregular head dims by padding compute width and masking I/O
+    HEAD_DIM_PADDED = triton.next_power_of_2(HEAD_DIM_K)
+    if HEAD_DIM_PADDED < 16:
+        HEAD_DIM_PADDED = 16

     # MASKED_BLOCKS is used for prefill/causal for BLOCK_M > BLOCK_N
     # For MI300, BLOCK_M=128, BLOCK_N=64 is better for performance
@@ -126,6 +135,9 @@ def _persistent_lean_attention(
     N_CTX_Q = q.shape[0] // batch_size
     N_CTX_K = k.shape[0]  # This is the sum of all ctx_n in a batch
     H = q.shape[1]
+    H_K = k.shape[1]
+    assert H % H_K == 0, "For GQA, the number of Q heads must be divisible by K/V heads"
+    GQA_GROUP_SIZE = H // H_K
     HEADS_PER_XCD = H // NUM_XCDS

     qk_scale = sm_scale * LOG_TWO_E
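
For concreteness, here is what the padded head dimension and GQA group size work out to for made-up tensor shapes (nothing below is taken from the PR):

import torch
import triton

q = torch.empty(1024, 40, 72)   # (batch * n_ctx_q, Q heads, head_dim) - hypothetical
k = torch.empty(4096, 8, 72)    # (sum of ctx_n, K/V heads, head_dim) - hypothetical

HEAD_DIM_K = k.shape[2]
HEAD_DIM_PADDED = max(triton.next_power_of_2(HEAD_DIM_K), 16)  # 72 -> 128, floor of 16
H, H_K = q.shape[1], k.shape[1]
assert H % H_K == 0, "Q heads must be a multiple of K/V heads"
GQA_GROUP_SIZE = H // H_K                                      # 40 // 8 = 5
print(HEAD_DIM_PADDED, GQA_GROUP_SIZE)                         # 128 5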
@@ -145,7 +157,7 @@ def _persistent_lean_attention(
         N_CTX_Q,
         N_CTX_K,
         H,
-        H,
+        H_K,
         BLOCK_M,
         BLOCK_N,
         total_programs,
@@ -178,6 +190,59 @@ def _persistent_lean_attention(
     if DEBUG:
         print(f"max_output_tile_cnt={max_output_tile_cnt}")

+    # Clamp to buffer capacity to avoid deadlocks
+    max_supported = min(
+        int(Mp.shape[0]), int(Lp.shape[0]), int(Op.shape[0]), int(locks.numel())
+    )
+    total_programs = min(total_programs, max_supported)
+
+    # Recompute schedule with clamped total_programs to keep splits consistent
+    (
+        num_m_blocks,
+        num_n_blocks,
+        high_load_wgs,
+        max_tiles_per_wg,
+        tiles_per_head,
+        total_programs,
+        num_splits,
+        even_split,
+    ) = get_num_splits_and_buffer_sizes(
+        causal,
+        batch_size,
+        N_CTX_Q,
+        N_CTX_K,
+        H,
+        H_K,
+        BLOCK_M,
+        BLOCK_N,
+        total_programs,
+        XCD_REMAP,
+        NUM_XCDS,
+    )
+
+    # Runtime safety checks
+    if not (Mp.dim() == 2 and Mp.shape[0] >= total_programs and Mp.shape[1] >= BLOCK_M):
+        raise ValueError(
+            f"Mp must have at least [total_programs, BLOCK_M] >= [{total_programs}, {BLOCK_M}], got {tuple(Mp.shape)}"
+        )
+    if not (Lp.dim() == 2 and Lp.shape[0] >= total_programs and Lp.shape[1] >= BLOCK_M):
+        raise ValueError(
+            f"Lp must have at least [total_programs, BLOCK_M] >= [{total_programs}, {BLOCK_M}], got {tuple(Lp.shape)}"
+        )
+    if not (
+        Op.dim() == 3
+        and Op.shape[0] >= total_programs
+        and Op.shape[1] >= N_CTX_Q
+        and Op.shape[2] >= HEAD_DIM_K
+    ):
+        raise ValueError(
+            f"Op must have shape[0] >= total_programs, rows >= N_CTX_Q, cols >= HEAD_DIM_K; got {tuple(Op.shape)} while required first dim >= {total_programs}, rows >= {N_CTX_Q}, cols >= {HEAD_DIM_K}"
+        )
+    if not (locks.numel() >= total_programs):
+        raise ValueError(
+            f"locks must have length >= total_programs ({total_programs}), got {locks.numel()}"
+        )
+
     max_output_tile_cnt = max_output_tile_cnt + 4

     grid = (total_programs, 1, 1)
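
Those checks fix the minimum shapes of the caller-provided scratch buffers. A hedged allocation sketch that satisfies them; the dtypes and sizes here are assumptions for illustration, not values taken from this diff:

import torch

total_programs = 608     # e.g. SM count * SM_CNT_FACTOR after clamping (assumed)
BLOCK_M = 128
N_CTX_Q = 2048
HEAD_DIM_K = 128

# Shapes chosen to satisfy the runtime checks above; float32/int32 dtypes are an assumption.
Mp = torch.empty((total_programs, BLOCK_M), dtype=torch.float32, device="cuda")
Lp = torch.empty((total_programs, BLOCK_M), dtype=torch.float32, device="cuda")
Op = torch.empty((total_programs, N_CTX_Q, HEAD_DIM_K), dtype=torch.float32, device="cuda")
locks = torch.zeros((total_programs,), dtype=torch.int32, device="cuda")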
@@ -220,10 +285,12 @@ def _persistent_lean_attention(
         o.stride(0),
         o.stride(1),
         o.stride(2),
+        N_CTX_Q,
         Op.stride(0),  # total_programs
         Op.stride(1),  # n_ctx_q
         Op.stride(2),  # head_dim
         HEADS_PER_XCD=HEADS_PER_XCD,
+        HEAD_DIM_ORIG=HEAD_DIM_K,
         HEAD_DIM=HEAD_DIM_K,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
@@ -245,6 +312,16 @@ def _persistent_lean_attention(
         num_warps=num_warps,
         num_stages=1,
         num_ctas=1,
+        num_heads_q=H,
+        num_heads_k=H_K,
+        gqa_group_size=GQA_GROUP_SIZE,
+        use_64_indexing=(
+            (k.stride(0) * N_CTX_K) >= (1 << 31)
+            or (v.stride(0) * N_CTX_K) >= (1 << 31)
+            or (Op.stride(0) * total_programs) >= (1 << 31)
+            or (Op.stride(1) * N_CTX_Q) >= (1 << 31)
+            or (o.stride(0) * N_CTX_Q) >= (1 << 31)
+        ),
         **config,
     )
     """
@@ -257,7 +334,7 @@ def _persistent_lean_attention(
     """
     # print(f"la kernel {la_kernel.n_regs} registers used, {la_kernel.n_spills} spills")
     ms = 0
-    return o, ms
+    return (o, ms)


 def get_num_splits_and_buffer_sizes(
@@ -281,8 +358,7 @@ def get_num_splits_and_buffer_sizes(
     num_m_blocks = (max_seqlen_q + BLOCK_M - 1) // BLOCK_M
     num_n_blocks = (max_seqlen_k + BLOCK_N - 1) // BLOCK_N

-    # TODO: Support Grouped-Query Attention
-    max_seqlen_q = max_seqlen_q * num_heads // num_heads_k
+    # Schedule over Q heads; K/V heads are mapped inside the kernel via gqa_group_size

     # print(f"block_m: {BLOCK_M}, block_n: {BLOCK_N} ")
     # print(f"num_m_block: {num_m_blocks}, num_n_block: {num_n_blocks} ")
@@ -303,10 +379,11 @@ def get_num_splits_and_buffer_sizes(
     # Decode or Not Causal
     tiles_per_head = num_m_blocks * num_n_blocks

+    # Total tiles across all Q heads
     if XCD_REMAP:
-        total_tiles = tiles_per_head * (num_heads_k // NUM_XCDS)
+        total_tiles = tiles_per_head * (num_heads // NUM_XCDS)
     else:
-        total_tiles = tiles_per_head * num_heads_k  # Total tiles across all heads
+        total_tiles = tiles_per_head * num_heads

     # StreamK Lean has as many threadblocks as SMs
     # This should be a function of tile size and number of scratchpad space