Commit 7deefb3

[TRTLLM-7192][feat] optimize MLA chunked prefill && support fp8 mla chunked prefill (#7477)
Signed-off-by: Mingyang Jiang <[email protected]>
1 parent 24fc1f9 commit 7deefb3

22 files changed: +591 -326 lines changed

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 1 addition & 2 deletions
@@ -178,8 +178,7 @@ def test_trtllm_context_mla_attention_fmha(dtype, s):
         check=True)

     # For chunked prefill, we need to enable -save-softmax (dtype: bf16, layout: separate-q-k-v).
-    # Currently fp8 kernel doesn't support saving softmax.
-    if dtype == "-bf16":
+    if dtype in ["-bf16", "-e4m3"]:
         # padding mask
         subprocess.run(
             f"bin/fmha.exe -v 0 -runs 1 -min-s 1024 -s {s} -b 8 -h 8 -d 192 -dv 128 {dtype} "

cpp/kernels/fmha_v2/setup.py

Lines changed: 123 additions & 118 deletions
@@ -3815,124 +3815,126 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
     combinations = product([False, True], \
             [InputLayout.PACKED_QKV, InputLayout.CONTIGUOUS_Q_KV,
              InputLayout.Q_PAGED_KV, InputLayout.SEPARATE_Q_K_V],
-            [False, True])
-    for (alibi, input_layout, enable_attn_logit_softcapping) in combinations:
+            [False, True], [False, True])
+    for (alibi, input_layout, enable_attn_logit_softcapping,
+         return_softmax) in combinations:
         # alibi and bmm1_tanh_scale shouldn't be used together.
         if alibi and enable_attn_logit_softcapping:
             continue
-        # D <= 64: KV_STEP = 256
-        specs.append(
-            kernel_spec(
-                sm=sm,
-                sm_mma=90,
-                dtype=dtype,
-                seq_len=0, # support any sequence length
-                head_size=[32, 40, 48, 64],
-                warps_m=4, #4x1 warpgroups
-                warps_n=1,
-                version=2,
-                interleaved=False,
-                ldgsts_q=
-                False, # for Hopper kernels, ldgsts = False signals TMA usage.
-                ldgsts_k=False,
-                ldgsts_v=False,
-                share_smem_k_v=False,
-                loop_step=64,
-                q_tile_buffers=1, # only used by warp specialized kernels
-                has_noloop=0,
-                noloop_step=64,
-                kv_loop_step=256,
-                kv_tile_buffers=4, # only used by warp specialized kernels
-                unroll_threshold=1,
-                has_scale_max=False,
-                flash_attention=True,
-                warp_specialization=True,
-                alibi=alibi,
-                enable_attn_logit_softcapping=enable_attn_logit_softcapping,
-                return_softmax_stats=
-                False, # return softmax stats is not supported for fp8 now
-                scheduling_mode=scheduling_mode,
-                input_layout=input_layout,
-                sage_block_sizes=sage_block_sizes,
-                output_dtype=output_dtype))
-
-        # 64 < D <=128: KV_STEP = 128
-        specs.append(
-            kernel_spec(
-                sm=sm,
-                sm_mma=90,
-                dtype=dtype,
-                seq_len=0, # support any sequence length
-                head_size=[80, 96, 104, 128],
-                warps_m=4, #4x1 warpgroups
-                warps_n=1,
-                version=2,
-                interleaved=False,
-                ldgsts_q=
-                False, # for Hopper kernels, ldgsts = False signals TMA usage.
-                ldgsts_k=False,
-                ldgsts_v=False,
-                share_smem_k_v=False,
-                loop_step=64,
-                q_tile_buffers=1, # only used by warp specialized kernels
-                has_noloop=0,
-                noloop_step=64,
-                kv_loop_step=256,
-                kv_tile_buffers=2, # only used by warp specialized kernels
-                unroll_threshold=1,
-                has_scale_max=False,
-                flash_attention=True,
-                warp_specialization=True,
-                alibi=alibi,
-                enable_attn_logit_softcapping=enable_attn_logit_softcapping,
-                return_softmax_stats=
-                False, # return softmax stats is not supported for fp8 now
-                scheduling_mode=scheduling_mode,
-                input_layout=input_layout,
-                sage_block_sizes=sage_block_sizes,
-                output_dtype=output_dtype))
-
-        # 128 < D <=256: KV_STEP = 128
-        specs.append(
-            kernel_spec(
-                sm=sm,
-                sm_mma=90,
-                dtype=dtype,
-                seq_len=0, # support any sequence length
-                head_size=[160, 192, 256],
-                warps_m=4, #4x1 warpgroups
-                warps_n=1,
-                version=2,
-                interleaved=False,
-                ldgsts_q=
-                False, # for Hopper kernels, ldgsts = False signals TMA usage.
-                ldgsts_k=False,
-                ldgsts_v=False,
-                share_smem_k_v=False,
-                loop_step=64,
-                q_tile_buffers=1, # only used by warp specialized kernels
-                has_noloop=0,
-                noloop_step=64,
-                kv_loop_step=
-                128, # use 128 kv step size to avoid register spilling
-                kv_tile_buffers=2, # only used by warp specialized kernels
-                unroll_threshold=1,
-                has_scale_max=False,
-                flash_attention=True,
-                warp_specialization=True,
-                alibi=alibi,
-                enable_attn_logit_softcapping=enable_attn_logit_softcapping,
-                return_softmax_stats=
-                False, # return softmax stats is not supported for fp8 now
-                scheduling_mode=scheduling_mode,
-                input_layout=input_layout,
-                sage_block_sizes=sage_block_sizes,
-                output_dtype=output_dtype))
-
-        # context MLA (192x128)
-        # we could use param 'output_dtype' of enumerate_qgmma_flash_warpspec_kernels(),
-        # but it will generate many unnecessary kernels and they are not easy to filter out.
-        for output_type in [None, 'bf16']:
+        # for normal attention, we do not need return softmax for ws fp8 kernels currently.
+        # also fp8 input and bf16 output is only needed for MLA kernel.
+        skip_combination = return_softmax or (output_dtype is not None)
+        # for context mla, we need separate qkv as input layout when returning softmax.
+        skip_mla_combination = return_softmax and input_layout != InputLayout.SEPARATE_Q_K_V
+        if not skip_combination:
+            # D <= 64: KV_STEP = 256
+            specs.append(
+                kernel_spec(
+                    sm=sm,
+                    sm_mma=90,
+                    dtype=dtype,
+                    seq_len=0, # support any sequence length
+                    head_size=[32, 40, 48, 64],
+                    warps_m=4, #4x1 warpgroups
+                    warps_n=1,
+                    version=2,
+                    interleaved=False,
+                    ldgsts_q=
+                    False, # for Hopper kernels, ldgsts = False signals TMA usage.
+                    ldgsts_k=False,
+                    ldgsts_v=False,
+                    share_smem_k_v=False,
+                    loop_step=64,
+                    q_tile_buffers=1, # only used by warp specialized kernels
+                    has_noloop=0,
+                    noloop_step=64,
+                    kv_loop_step=256,
+                    kv_tile_buffers=4, # only used by warp specialized kernels
+                    unroll_threshold=1,
+                    has_scale_max=False,
+                    flash_attention=True,
+                    warp_specialization=True,
+                    alibi=alibi,
+                    enable_attn_logit_softcapping=enable_attn_logit_softcapping,
+                    return_softmax_stats=return_softmax,
+                    scheduling_mode=scheduling_mode,
+                    input_layout=input_layout,
+                    sage_block_sizes=sage_block_sizes,
+                    output_dtype=output_dtype))
+
+            # 64 < D <=128: KV_STEP = 128
+            specs.append(
+                kernel_spec(
+                    sm=sm,
+                    sm_mma=90,
+                    dtype=dtype,
+                    seq_len=0, # support any sequence length
+                    head_size=[80, 96, 104, 128],
+                    warps_m=4, #4x1 warpgroups
+                    warps_n=1,
+                    version=2,
+                    interleaved=False,
+                    ldgsts_q=
+                    False, # for Hopper kernels, ldgsts = False signals TMA usage.
+                    ldgsts_k=False,
+                    ldgsts_v=False,
+                    share_smem_k_v=False,
+                    loop_step=64,
+                    q_tile_buffers=1, # only used by warp specialized kernels
+                    has_noloop=0,
+                    noloop_step=64,
+                    kv_loop_step=256,
+                    kv_tile_buffers=2, # only used by warp specialized kernels
+                    unroll_threshold=1,
+                    has_scale_max=False,
+                    flash_attention=True,
+                    warp_specialization=True,
+                    alibi=alibi,
+                    enable_attn_logit_softcapping=enable_attn_logit_softcapping,
+                    return_softmax_stats=return_softmax,
+                    scheduling_mode=scheduling_mode,
+                    input_layout=input_layout,
+                    sage_block_sizes=sage_block_sizes,
+                    output_dtype=output_dtype))
+
+            # 128 < D <=256: KV_STEP = 128
+            specs.append(
+                kernel_spec(
+                    sm=sm,
+                    sm_mma=90,
+                    dtype=dtype,
+                    seq_len=0, # support any sequence length
+                    head_size=[160, 192, 256],
+                    warps_m=4, #4x1 warpgroups
+                    warps_n=1,
+                    version=2,
+                    interleaved=False,
+                    ldgsts_q=
+                    False, # for Hopper kernels, ldgsts = False signals TMA usage.
+                    ldgsts_k=False,
+                    ldgsts_v=False,
+                    share_smem_k_v=False,
+                    loop_step=64,
+                    q_tile_buffers=1, # only used by warp specialized kernels
+                    has_noloop=0,
+                    noloop_step=64,
+                    kv_loop_step=
+                    128, # use 128 kv step size to avoid register spilling
+                    kv_tile_buffers=2, # only used by warp specialized kernels
+                    unroll_threshold=1,
+                    has_scale_max=False,
+                    flash_attention=True,
+                    warp_specialization=True,
+                    alibi=alibi,
+                    enable_attn_logit_softcapping=enable_attn_logit_softcapping,
+                    return_softmax_stats=return_softmax,
+                    scheduling_mode=scheduling_mode,
+                    input_layout=input_layout,
+                    sage_block_sizes=sage_block_sizes,
+                    output_dtype=output_dtype))
+
+        if not skip_mla_combination:
+            # context MLA (192x128)
             specs.append(
                 kernel_spec(
                     sm=sm,
@@ -3962,12 +3964,11 @@ def enumerate_qgmma_flash_warpspec_kernels(specs,
                     warp_specialization=True,
                     alibi=alibi,
                     enable_attn_logit_softcapping=enable_attn_logit_softcapping,
-                    return_softmax_stats=
-                    False, # return softmax stats is not supported for fp8 now
+                    return_softmax_stats=return_softmax,
                     scheduling_mode=scheduling_mode,
                     input_layout=input_layout,
                     sage_block_sizes=sage_block_sizes,
-                    output_dtype=output_type))
+                    output_dtype=output_dtype))


 def enumerate_igmma_kernels(specs, sm=90):
@@ -6215,6 +6216,10 @@ def enumerate_kernels():
     enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16')
     enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='bf16')
     enumerate_qgmma_flash_warpspec_kernels(specs, sm=90, dtype='e4m3')
+    enumerate_qgmma_flash_warpspec_kernels(specs,
+                                           sm=90,
+                                           dtype='e4m3',
+                                           output_dtype="bf16")

     # For now SageAttention only needs BF16
     # block_size_q should be divisible by 64
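To make the new kernel selection in setup.py easier to follow on its own, here is a minimal, standalone Python sketch of the combination filtering introduced above. The InputLayout enum and the enumerate_combinations helper are illustrative stand-ins that mirror the names in the diff; the real generator emits kernel_spec entries instead of strings.

```python
# Standalone sketch of the combination filtering added in this commit.
from enum import Enum, auto
from itertools import product


class InputLayout(Enum):
    PACKED_QKV = auto()
    CONTIGUOUS_Q_KV = auto()
    Q_PAGED_KV = auto()
    SEPARATE_Q_K_V = auto()


def enumerate_combinations(output_dtype=None):
    """Yield (combo, kinds) describing which spec groups the generator would emit."""
    combinations = product([False, True],      # alibi
                           list(InputLayout),  # input_layout
                           [False, True],      # enable_attn_logit_softcapping
                           [False, True])      # return_softmax (the new axis)
    for alibi, input_layout, softcap, return_softmax in combinations:
        # alibi and bmm1_tanh_scale shouldn't be used together.
        if alibi and softcap:
            continue
        # Normal-attention ws fp8 kernels don't need to return softmax, and
        # fp8-in / bf16-out is only needed for the MLA kernel.
        skip_combination = return_softmax or (output_dtype is not None)
        # Context MLA requires the separate-q-k-v layout when returning softmax.
        skip_mla_combination = (return_softmax
                                and input_layout != InputLayout.SEPARATE_Q_K_V)
        kinds = []
        if not skip_combination:
            kinds.append("head_size groups (D <= 64, <= 128, <= 256)")
        if not skip_mla_combination:
            kinds.append("context MLA (192x128)")
        yield (alibi, input_layout, softcap, return_softmax), kinds


# With output_dtype="bf16" (the new registration in enumerate_kernels), only the
# MLA spec survives, and return_softmax=True exists only for SEPARATE_Q_K_V.
for combo, kinds in enumerate_combinations(output_dtype="bf16"):
    if kinds:
        print(combo, "->", kinds)
```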

cpp/kernels/fmha_v2/src/fmha/fragment.h

Lines changed: 92 additions & 0 deletions
@@ -1749,6 +1749,98 @@ struct Tile_o_normalizer<Ada_qmma_e4m3_fp32_traits, Cta_tile>

     // Default ctor
     Tile_o_normalizer() = default;
+
+    // The fragment accumulator.
+    using Fragment_accu = Fragment_accumulator<Traits>;
+
+    // The Mma tile.
+    using Mma_tile = typename Traits::template Mma_tile<Cta_tile>;
+
+    // The number of MMAs in the M dimension.
+    enum
+    {
+        MMAS_M = Mma_tile::MMAS_M
+    };
+
+    // The number of MMAs in the N dimension.
+    enum
+    {
+        MMAS_N = Mma_tile::VALID_MMAS_N
+    };
+
+    // The number of rows per thread.
+    enum
+    {
+        ROWS_PER_THREAD = 2 * MMAS_M
+    };
+
+    // The number of registers per thread.
+    enum
+    {
+        REGS_PER_THREAD = 8
+    };
+
+    // Warps.
+    enum
+    {
+        WARPS_M = Cta_tile::WARPS_M
+    };
+
+    enum
+    {
+        WARPS_N = Cta_tile::WARPS_N
+    };
+
+    enum
+    {
+        WARPS_K = Cta_tile::WARPS_K
+    };
+
+    // Softmax data bytes.
+    enum
+    {
+        BYTES_PER_ELEMENT = sizeof(float)
+    };
+
+    // Update O after P * V; the only difference from the base class is that we dequantize the sum for the softmax saver.
+    inline __device__ void final_update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], float (&sum)[ROWS_PER_THREAD])
+    {
+
+        constexpr float dequant_scale = Traits::SOFTMAX_FP_DEQUANT_SCALE;
+#pragma unroll
+        for (int mi = 0; mi < MMAS_M; ++mi)
+        {
+
+            // Precompute the scaling factors for the 2 rows.
+            float beta[2];
+#pragma unroll
+            for (int ii = 0; ii < 2; ++ii)
+            {
+                // The row.
+                int jj = 2 * mi + ii;
+
+                // The divisor.
+                beta[ii] = (sum[jj] == 0.f || sum[jj] != sum[jj]) ? 1.f : 1.f / sum[jj];
+                // The softmax saver needs the original sum.
+                sum[jj] = sum[jj] * dequant_scale;
+            }
+
+#pragma unroll
+            for (int ni = 0; ni < MMAS_N; ++ni)
+            {
+#pragma unroll
+                for (int ii = 0; ii < REGS_PER_THREAD; ++ii)
+                {
+                    // The register for O.
+                    float acc_o_f = acc_o[mi][ni].elt(ii);
+                    // Compute the next accumulator.
+                    acc_o_f = acc_o_f * beta[(ii & 2) / 2];
+                    // Update the accumulator.
+                    acc_o[mi][ni].elt(ii) = acc_o_f;
+                }
+            }
+        }
+    }
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////
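In equation form, the new final_update performs the following for each accumulator row j, where l_j is the row's exp-sum as accumulated on the fp8 path and s_dq = Traits::SOFTMAX_FP_DEQUANT_SCALE; reading s_dq as the factor that undoes the fp8 softmax quantization is an inference from the trait name, not something stated in the diff:

```latex
\beta_j =
\begin{cases}
1, & \ell_j = 0 \ \text{or}\ \ell_j \neq \ell_j \ (\text{NaN}),\\
1/\ell_j, & \text{otherwise,}
\end{cases}
\qquad
o_{j,k} \leftarrow \beta_j \, o_{j,k} \ \ \forall k,
\qquad
\ell_j \leftarrow s_{\mathrm{dq}} \, \ell_j .
```

The accumulator rows are normalized by the raw sum, as before, while the sum handed to the softmax saver is rescaled, so downstream consumers of the saved statistics see the true exp-sum rather than the quantized-domain one.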

cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h

Lines changed: 5 additions & 0 deletions
@@ -1318,6 +1318,11 @@ struct Tile_o_epilogue<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
 #else
         float scale = global_sum_mi == 0.f ? 1.0f : 1.0f / global_sum_mi;
 #endif
+        if constexpr (Kernel_traits::RETURN_SOFTMAX_STATS)
+        {
+            // Save the dequant exp sum for softmax saver.
+            global_sum[mi] *= Traits_o::SOFTMAX_FP_DEQUANT_SCALE;
+        }
         // Assume only N has multiple MMAs (MMAS_M = 1).
 #pragma unroll
         for (int mma_ni = 0; mma_ni < Mma_tile_o::MMAS_N; mma_ni++)
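The softmax statistics that these kernels now save (and dequantize in the epilogue above) are what chunked prefill needs in order to combine attention computed over successive KV chunks. The numpy sketch below shows the merge identity in isolation; it is a generic illustration under the assumption that the saved stats are the per-row max and exp-sum, and merge_chunks is a hypothetical helper, not the repository's merge kernel.

```python
# Generic illustration of why chunked prefill saves softmax stats: given each
# chunk's normalized partial output plus its row max m_i and exp-sum l_i, the
# exact full-sequence attention output can be recovered by rescale-and-sum.
import numpy as np


def merge_chunks(outs, maxes, sums):
    """outs[i]: (rows, dv) partial output of chunk i, already divided by sums[i].
    maxes[i], sums[i]: (rows,) row-wise max and exp-sum saved for that chunk."""
    m = np.max(np.stack(maxes), axis=0)                          # global row max
    l = sum(np.exp(mi - m) * li for mi, li in zip(maxes, sums))  # global exp-sum
    o = sum(np.exp(mi - m)[:, None] * li[:, None] * oi           # undo per-chunk norm
            for oi, mi, li in zip(outs, maxes, sums))
    return o / l[:, None]


# Reference check against single-pass attention on random data.
rng = np.random.default_rng(0)
q = rng.standard_normal((8, 64))
k = rng.standard_normal((256, 64))
v = rng.standard_normal((256, 32))
s = q @ k.T
p = np.exp(s - s.max(-1, keepdims=True))
ref = (p / p.sum(-1, keepdims=True)) @ v

outs, maxes, sums = [], [], []
for kc, vc in zip(np.split(k, 2), np.split(v, 2)):
    sc = q @ kc.T
    mc = sc.max(-1)
    pc = np.exp(sc - mc[:, None])
    lc = pc.sum(-1)
    outs.append((pc / lc[:, None]) @ vc)
    maxes.append(mc)
    sums.append(lc)
assert np.allclose(merge_chunks(outs, maxes, sums), ref)
```

This is why the fp8 path must rescale its saved sums by SOFTMAX_FP_DEQUANT_SCALE: the merge only produces the exact result when the stats are in the same (dequantized) domain as the bf16 path.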
