@@ -59,7 +59,7 @@ def eager_attn_fwd(q, k, v, attn_bias, sinks, scale, dropout):
     attn_output = einops.rearrange(attn_output, 'b h s d -> b s h d')
     attn_output = attn_output.contiguous()
 
-    return attn_output, None
+    return attn_output, probs
 
 
 # @torch.compile
@@ -71,20 +71,6 @@ def eager_attn_bwd(q, k, v, attn_bias, sinks, scale, dropout, attn_output, probs
     _k_T = einops.rearrange(k, 'b s h d -> b h s d')
     _v_T = einops.rearrange(v, 'b s h d -> b h d s')
 
-    # recompute probs and slice attn_w from probs
-    if probs is None:
-        _q = einops.rearrange(q, 'b s h d -> b h s d')
-        _k = einops.rearrange(k, 'b s h d -> b h d s')
-        attn_w = torch.matmul(_q, _k) * scale
-        attn_w = attn_w + attn_bias
-        if sinks is None:
-            logits = attn_w
-        else:
-            _sinks = sinks.reshape(1, h, 1, 1).expand(b, -1, sq, 1)
-            logits = torch.cat([attn_w, _sinks], dim=-1)
-        probs = F.softmax(logits, dim=-1, dtype=logits.dtype)
-        del _q, _k, logits
-
     if sinks is None:
         attn_w = probs
     else:
@@ -182,7 +168,7 @@ def forward(
 
         nheads = q.shape[2]
         nheads_k = k.shape[2]
-        heads_k_stride = nheads_k
+        heads_k_stride = 1
        assert nheads % nheads_k == 0 and nheads_k % heads_k_stride == 0
         outs = []
         probs = []
@@ -227,38 +213,30 @@ def forward(
         out = torch.cat(outs, dim=2)
         out = einops.rearrange(out, 'b s h d -> s b h d')
 
-        ctx.save_for_backward(q, k, v)
-        ctx.outs = outs
-        ctx.probs = probs
-        ctx.attention_mask = attention_mask
+        ctx.save_for_backward(q, k, v, attention_mask, *outs, *probs)
         ctx.dropout = attention_dropout
         ctx.scale = softmax_scale
-        ctx.op = None
-        ctx.output_dtype = None
-        ctx.heads_k_stride = heads_k_stride
+        ctx.heads_k_stride = heads_k_stride  # TODO make it configurable
         ctx.pg = pg
 
         return out
 
     @staticmethod
     def backward(ctx, dout):
-        q, k, v = ctx.saved_tensors
-        outs = ctx.outs
-        probs = ctx.probs
-        attention_mask = ctx.attention_mask
-        op = None
-        output_dtype = ctx.output_dtype
+        q, k, v, attention_mask, *rest = ctx.saved_tensors
+        nheads = q.shape[2]
+        nheads_k = k.shape[2]
         heads_k_stride = ctx.heads_k_stride
-        pg = ctx.pg
+        assert nheads_k % heads_k_stride == 0
+        outs = rest[:nheads_k // heads_k_stride]
+        probs = rest[nheads_k // heads_k_stride:]
 
+        pg = ctx.pg
         cp_size = 1
         if pg is not None:
             cp_size = torch.distributed.get_world_size(pg)
         comm = AllGatherComm(group=pg)
 
-        nheads = q.shape[2]
-        nheads_k = k.shape[2]
-
         kv_buffer = torch.empty(
             (2, k.shape[0] * cp_size, k.shape[1], heads_k_stride, k.shape[3]),
             dtype=k.dtype,
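
Note on the save_for_backward change above: tensors stashed as plain attributes on ctx (the old ctx.outs / ctx.probs) bypass autograd's saved-tensor tracking, so routing them through ctx.save_for_backward and re-slicing ctx.saved_tensors in backward is the supported pattern. A minimal sketch of that packing/unpacking, using a hypothetical ChunkedIdentity function rather than the class in this diff:

import torch


class ChunkedIdentity(torch.autograd.Function):
    """Toy example: forward saves a variable-length list of per-chunk tensors."""

    @staticmethod
    def forward(ctx, x, n_chunks):
        # Stand-ins for the per-head-group `outs` / `probs` lists in the diff.
        chunks = list(torch.chunk(x, n_chunks, dim=-1))
        # Tensors go through save_for_backward; only plain Python state stays on ctx.
        ctx.save_for_backward(x, *chunks)
        ctx.n_chunks = n_chunks
        return torch.cat(chunks, dim=-1)

    @staticmethod
    def backward(ctx, dout):
        # Fixed tensors first, then the variable-length tail, mirroring
        # `q, k, v, attention_mask, *rest = ctx.saved_tensors` above.
        x, *chunks = ctx.saved_tensors
        assert len(chunks) == ctx.n_chunks
        # One gradient per forward input (None for the non-tensor n_chunks).
        return dout, None


x = torch.randn(2, 8, requires_grad=True)
ChunkedIdentity.apply(x, 4).sum().backward()

The same splitting logic applies in the diff's backward: the first len(outs) tensors of rest are the per-head-group outputs and the remainder are the softmax probabilities, with the split point given by nheads_k // heads_k_stride.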