megatron/core/transformer/moe/shared_experts.py (2 changes: 1 addition & 1 deletion)

@@ -62,7 +62,7 @@ def __init__(
         else:
             self.gate_weight = None
 
-        if self.config.fp8 and is_te_min_version("2.6.0dev0"):
+        if self.config.fp8 and self.config.fp8_recipe != 'delayed' and is_te_min_version("2.6.0dev0"):
             # For fp8 training, the output of pre_mlp_layernorm is saved by router, and
             # the shared expert linear_fc1 also saves the quantized tensor of this output.
             # Here we set the linear_fc1 to save the original input tensors to avoid the extra
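For context: the hunk shows only the `if` condition (the body is below the fold), but the comment implies the branch enables Transformer Engine's `save_original_input` option on `linear_fc1`, so backward keeps the high-precision layernorm output rather than an extra quantized copy of a tensor the router already saves. The new `fp8_recipe != 'delayed'` check suggests that option is incompatible with the delayed-scaling recipe. Below is a minimal sketch of how such a guard might be wired up; the helper and kwarg plumbing are assumptions for illustration, not the file's actual code.

```python
# Hypothetical sketch; only the `if` condition appears in the hunk above.
from megatron.core.utils import is_te_min_version


def _fc1_extra_kwargs(config):
    """Assumed helper: extra kwargs for the shared experts' linear_fc1."""
    if config.fp8 and config.fp8_recipe != 'delayed' and is_te_min_version("2.6.0dev0"):
        # The router already saves the pre_mlp_layernorm output, so letting
        # linear_fc1 also cache a quantized copy of the same tensor wastes
        # memory. Saving the original input avoids the duplicate. Delayed
        # scaling is excluded here because it depends on the quantized
        # tensor produced during forward.
        return {'save_original_input': True}
    return {}
```

The `is_te_min_version("2.6.0dev0")` gate matches the condition in the diff: `save_original_input` is only meaningful on Transformer Engine versions that expose it, so older TE installs fall through to the default behavior.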