
Commit 00ccbf8

Enable Loss Fn in Graph PP
ghstack-source-id: 56b4b85
Pull Request resolved: #247
Parent: b1c4909

7 files changed: +308 −107 lines

autoparallel/_testing/models/dsv3.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -1565,6 +1565,12 @@ def forward(
         return output


+def dsv3_loss_fn(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    return torch.nn.functional.cross_entropy(
+        pred.flatten(0, 1).float(), labels.flatten(0, 1)
+    )
+
+
 ########################
 # Pipeline stuff start #
 ########################
```
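The new loss flattens the batch and sequence dimensions into one before calling `cross_entropy`, which expects `[N, C]` logits and `[N]` integer targets. A minimal sketch of the shapes involved (the function body is copied from the hunk above; the sizes are made up):

```python
import torch

# Copied from the hunk above, driven with invented sizes to show the shapes.
def dsv3_loss_fn(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.cross_entropy(
        pred.flatten(0, 1).float(), labels.flatten(0, 1)
    )

pred = torch.randn(2, 8, 32)           # assumed [batch=2, seq=8, vocab=32] logits
labels = torch.randint(0, 32, (2, 8))  # assumed [batch=2, seq=8] token ids
loss = dsv3_loss_fn(pred, labels)      # flatten(0, 1) merges batch*seq: [16, 32] vs [16]
print(loss.shape)                      # torch.Size([]) -- a 0-dim scalar loss
```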

autoparallel/api.py

Lines changed: 25 additions & 4 deletions

```diff
@@ -8,7 +8,7 @@
 import warnings
 from contextlib import ExitStack, contextmanager
 from types import MethodType
-from typing import Any, Optional, Union
+from typing import Any, Callable, Optional, Union

 import torch
 from torch._dynamo.functional_export import _dynamo_graph_capture_for_export
@@ -159,7 +159,9 @@ def enable_local_map_wrapping():
     yield


-def _export(model: torch.nn.Module, inputs: tuple[Any]) -> torch.nn.Module:
+def _export(
+    model: torch.nn.Module, model_wrapper: Callable, inputs: tuple[Any]
+) -> torch.fx.GraphModule:
     """
     Thin wrapper around graph capture output that restores the
     original calling convention and attribute fqn. TODO:
@@ -169,7 +171,7 @@ def _export(model: torch.nn.Module, inputs: tuple[Any]) -> torch.nn.Module:
     3) Be more careful about tensor constants names.
     """
     with torch._dynamo.config.patch(install_free_tensors=True):
-        gm = _dynamo_graph_capture_for_export(model)(*inputs)
+        gm = _dynamo_graph_capture_for_export(model_wrapper)(*inputs)
     _restore_state_dict(model, gm)
     return gm

@@ -193,6 +195,7 @@ def __init__(
         ac_stage_size_in_GiB: Optional[Union[float, str]] = "auto",
         reshard_after_forward: bool = True,
         dynamic: bool = False,
+        loss_fn: Optional[Callable] = None,
         **kwargs,
     ):
         self.stack = ExitStack()
@@ -224,6 +227,7 @@ def __init__(
         self.enable_ac = enable_ac
         self.ac_stage_size_in_GiB = ac_stage_size_in_GiB
         self.reshard_after_forward = reshard_after_forward
+        self.loss_fn = loss_fn

         if dynamic:
             self.fake_mode.shape_env = ShapeEnv()
@@ -294,11 +298,27 @@ def build_model_graph(self):
         inputs = self.input_fn()
         if not isinstance(inputs, tuple):
             inputs = (inputs,)
+        model_wrapper: Callable
+        if self.loss_fn is not None:
+
+            def model_with_loss(input, target) -> Any:
+                output = self.model(input)
+                loss = self.loss_fn(output, target)  # type: ignore[misc]
+                return loss
+
+            model_wrapper = model_with_loss
+        else:
+
+            def model_wo_loss(input) -> Any:
+                output = self.model(input)
+                return output
+
+            model_wrapper = model_wo_loss

         with set_dtype_cast(
             True
         ), enable_local_map_wrapping(), torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing():
-            torch_ir_with_fqn = _export(self.model, inputs)
+            torch_ir_with_fqn = _export(self.model, model_wrapper, inputs)
         # TODO Cna't use fake mode here because it clashes with the user level
         # fake mode. Ideally dynamo should reuse the user level fake mode.
         self.joint_with_descriptors = aot_export_joint_with_descriptors(
@@ -326,6 +346,7 @@ def build_model_graph(self):
                 print_output=False, include_stride=True, include_device=True
             ),
         )
+        print(gm.graph)

         self.gm = gm
```
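Taken together, these api.py changes let a caller hand the loss to `AutoParallel` so it is traced into the captured graph: when `loss_fn` is set, `input_fn` must yield an `(input, target)` pair, and the exported graph ends in the loss. A hedged sketch of a call site (the positional arguments and surrounding setup here are assumptions, not the class's documented signature; only the `loss_fn` keyword comes from this commit):

```python
from autoparallel._testing.models.dsv3 import dsv3_loss_fn
from autoparallel.api import AutoParallel

# Hypothetical setup: model, input_fn, and mesh are assumed to be built by
# the caller as elsewhere in this repo.
autop = AutoParallel(
    model,
    input_fn,              # must now return (input, target) when loss_fn is set
    mesh,
    loss_fn=dsv3_loss_fn,  # closed over by model_with_loss during capture
)
autop.build_model_graph()  # traces loss_fn(model(input), target) as one graph
```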

autoparallel/apply_sharding.py

Lines changed: 12 additions & 7 deletions

```diff
@@ -208,13 +208,18 @@ def call_function(self, target, args, kwargs):

         # apply sharding to constructor functions as well
         if target in TENSOR_FACTORY_OPS:
-            val = list(new_args[0])
-            spec = self.sharding_placement[node].output_specs
-            for mesh_size, placement in zip(spec.mesh.shape, spec.placements):
-                if placement.is_shard():
-                    # TODO: fix uneven cases ?
-                    val[placement.dim] //= mesh_size
-            new_args[0] = tuple(val)
+            # scalar_tensor has a scalar as first arg, not a shape
+            if target == torch.ops.aten.scalar_tensor.default:
+                # scalar tensors can't be sharded, so no transformation needed
+                pass
+            else:
+                val = list(new_args[0])
+                spec = self.sharding_placement[node].output_specs
+                for mesh_size, placement in zip(spec.mesh.shape, spec.placements):
+                    if placement.is_shard():
+                        # TODO: fix uneven cases ?
+                        val[placement.dim] //= mesh_size
+                new_args[0] = tuple(val)

         # use DTensor machinery to ensure the view ops are valid
         # otherwise we would end-up forcing global shapes on local tensors
```
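The `else` branch keeps the existing behavior for shape-taking factory ops: divide the global size along each sharded dimension by that mesh dimension's size. A standalone sketch of that arithmetic (the names and the shard-dim encoding are simplified stand-ins for DTensor placements, not the real API):

```python
def localize_shape(global_shape, mesh_shape, shard_dims):
    """Divide sharded dims of a factory op's global shape by the mesh size.

    shard_dims holds, per mesh dim, the tensor dim it shards: None stands in
    for Replicate, an int for Shard(dim) -- mirroring placement.is_shard().
    """
    val = list(global_shape)
    for mesh_size, shard_dim in zip(mesh_shape, shard_dims):
        if shard_dim is not None:
            val[shard_dim] //= mesh_size  # real code still TODO: uneven cases
    return tuple(val)

# A (64, 128) zeros() sharded on dim 0 over a 4-way mesh axis and replicated
# over a 2-way axis becomes a (16, 128) local allocation.
print(localize_shape((64, 128), mesh_shape=(4, 2), shard_dims=(0, None)))
```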

autoparallel/graph_pp_runner.py

Lines changed: 19 additions & 2 deletions

```diff
@@ -275,6 +275,11 @@ def stage_forward(
         # Receive activations for this chunk
         # Activations only come in args form
         composite_args = stage._retrieve_recv_activations(mb_index)
+        if stage.is_last and ctx.target_mbs is not None:
+            assert isinstance(
+                composite_args, tuple
+            ), f"Expected composite args to be a tuple but got {type(composite_args)}"
+            composite_args = composite_args + (ctx.target_mbs[mb_index],)  # type: ignore[index]

         # stage._validate_fwd_input(args, kwargs) Maybe need to validate composite args?
         logger.debug(
@@ -292,6 +297,8 @@ def stage_forward(
         # Output chunks is only used for the last stage since we only merge the output of the last stage
         if stage.is_last:
             stage.output_chunks.append(output)
+            if ctx.target_mbs is not None:
+                ctx.schedule_ref._internal_losses.append(output)

         stage.fwd_cache[mb_index] = (
             output_tuple,  # stage_output
```
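With the loss traced into the last stage's graph, that stage's forward now takes two inputs: the received activations and the microbatch's target. Appending `ctx.target_mbs[mb_index]` to `composite_args` supplies the second argument, and because the stage's output is then the loss itself, it is recorded in `schedule._internal_losses` alongside `output_chunks`.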
```diff
@@ -360,7 +367,7 @@ def stage_full_backward(
         # HACK till we have loss function, we populate the tangents here manually
         bwd_kwargs = {
             "stage_output": loss,
-            "tangents": [torch.randn_like(stage_output[0])],
+            "tangents": [torch.ones_like(stage_output[0])],
             "saved_intermediates": saved_intermediates,
         }
     else:
```
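Switching the seed tangent from `randn_like` to `ones_like` matches what autograd does for a genuine scalar loss: backward starts from dL/dL = 1. The random tangent was only a placeholder from before a loss function existed, as the surviving HACK comment notes.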
```diff
@@ -525,7 +532,9 @@ def _accumulate_stage_grads_and_clear_states(
         stage.state.clear()

     def step(self, *args, **kwargs) -> None:
-
+        has_targets_and_loss = (
+            "losses" in kwargs and "targets" in kwargs if kwargs else False
+        )
         for stage in self.schedule._stages:
             assert isinstance(stage, GraphPipelineStage)
             self._populate_stage_states(stage)
@@ -535,3 +544,11 @@ def step(self, *args, **kwargs) -> None:
         for stage in self.schedule._stages:
             assert isinstance(stage, GraphPipelineStage)
             self._accumulate_stage_grads_and_clear_states(stage)
+            if stage.is_last and has_targets_and_loss:
+                losses = kwargs["losses"]
+                losses.clear()
+                assert (
+                    len(self.schedule._internal_losses) == self.schedule._n_microbatches
+                )
+                losses.extend(self.schedule._internal_losses)
+                self.schedule._internal_losses.clear()
```
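A hedged sketch of driving a step under the new contract (the runner construction and microbatch splitting are assumed; only the `losses`/`targets` kwargs behavior comes from the code above):

```python
import torch

# Hypothetical driver: `runner` (a GraphPPRunner), `inputs`, and `targets` are
# assumed to come from the surrounding training setup. Passing BOTH kwargs is
# what flips has_targets_and_loss and copies the losses back out.
losses: list[torch.Tensor] = []
runner.step(inputs, targets=targets, losses=losses)

# step() clears and refills `losses` with one entry per microbatch, but only
# on ranks that own the last stage; elsewhere the list stays empty.
if losses:
    print(torch.stack(losses).mean())
```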

autoparallel/propagation_rules.py

Lines changed: 44 additions & 6 deletions

```diff
@@ -363,11 +363,8 @@ def randperm_rule(mesh, specs):
     return OpStrategy([OpSpec(spec, input_specs=[spec], redistribute_cost=[[0.0]])])


-# We do a few special things for factory ops
-# - use the factory rule below
-# - fake that they have input schemas so the solver doesn't freak out
-# - convert their sizes to 'local tensor sizes' (divide by mesh dim) during ApplySharding
-TENSOR_FACTORY_OPS = [
+# Factory ops that take a shape as the first argument
+_SHAPE_FACTORY_OPS = [
     torch.ops.aten.zeros.default,
     torch.ops.aten.ones.default,
     torch.ops.aten.full.default,
@@ -376,8 +373,49 @@ def randperm_rule(mesh, specs):
     torch.ops.aten.randn.default,
 ]

+# We do a few special things for factory ops
+# - use the factory rule below
+# - fake that they have input schemas so the solver doesn't freak out
+# - convert their sizes to 'local tensor sizes' (divide by mesh dim) during ApplySharding
+TENSOR_FACTORY_OPS = _SHAPE_FACTORY_OPS + [
+    torch.ops.aten.scalar_tensor.default,  # Special case: creates 0-dim tensor
+]
+
+
+@register_opschema_rule(torch.ops.aten.scalar_tensor.default)
+def scalar_tensor_rule(mesh, op_schema: OpSchema) -> OpStrategy:
+    """
+    Rule for aten.scalar_tensor which creates a scalar (0-dimensional) tensor.
+    Unlike other factory ops, this doesn't take a shape parameter.
+
+    Schema: scalar_tensor(Scalar s, *, ScalarType? dtype=None, ...) -> Tensor
+    """
+    # scalar_tensor creates a 0-dimensional tensor
+    shape = ()
+    stride = ()
+    dtype = torch.get_default_dtype()
+
+    # Check if dtype is specified in kwargs or args
+    if len(op_schema.args_schema) >= 2 and op_schema.args_schema[1] is not None:
+        dtype = op_schema.args_schema[1]  # type: ignore[assignment]
+
+    tensor_meta = TensorMeta(shape, stride, dtype)  # type: ignore[arg-type]
+
+    # For a scalar (0-dim) tensor, we can only replicate across all mesh dimensions
+    placement = (Replicate(),) * mesh.ndim
+    output_specs = DTensorSpec(mesh, placement, tensor_meta=tensor_meta)
+
+    # Similar to factory_rule, we add a dummy input_specs for solver compatibility
+    strategy = OpSpec(
+        output_specs=output_specs,
+        input_specs=[output_specs],
+        redistribute_cost=[[0.0]],
+    )
+
+    return OpStrategy([strategy])
+

-@register_opschema_rule(TENSOR_FACTORY_OPS)
+@register_opschema_rule(_SHAPE_FACTORY_OPS)
 def factory_rule(mesh, op_schema: OpSchema) -> OpStrategy:
     """
     This is an auto-parallel specific util that won't be upstreamed becuase of a UX mismatch.
```
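The key decision in `scalar_tensor_rule` is that a 0-dim tensor admits no `Shard` placement, so the only strategy offered replicates it on every mesh axis. A small sketch of that placement tuple (using the public DTensor `Replicate` type; the mesh dimensionality is made up):

```python
from torch.distributed.tensor import Replicate

mesh_ndim = 2  # assumed 2-D device mesh, e.g. (dp, tp)
placement = (Replicate(),) * mesh_ndim
print(placement)  # (Replicate(), Replicate()) -- no dim exists to shard
```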
