Xccl group #2

Closed
wants to merge 1,085 commits into from
Changes from 1 commit (1,085 commits)
062681a
[Profiler] Torch Profiler distributed info is not JSON serializable (…
sraikund16 Sep 13, 2024
1c04cbf
[BE] Use `C10_UNUSED` (#135914)
malfet Sep 13, 2024
e6b6835
Fix xpu memory stats error (#135818)
guangyey Sep 12, 2024
0ad5677
support allgather_into_tensor_coalesced
Chao1Han Sep 13, 2024
6cdc70b
[ROCm] skip test_fp8_cast_and_t on non-MI300 machines (#135917)
pragupta Sep 13, 2024
0cdc6a8
[DSD] Fix distributed state dict full_state_dict option hang during s…
wz337 Sep 12, 2024
6df91b5
real tensor prop for composite ops (#135717)
pianpwk Sep 13, 2024
eea5e6f
[DCP][DSD] Add a test case to demonstrate the workaround to load full…
wz337 Sep 12, 2024
e54b559
[inductor] More fixes on the keys of `constants` and `signature` dict…
Jokeren Sep 13, 2024
b38be72
[Inductor UT] Generalize inductor UT for intel GPU (Part 2) (#134556)
hoshibara Sep 13, 2024
9fd54d7
[Inductor UT] Generalize device-bias code in test_triton_kernels.py i…
etaf Sep 12, 2024
009e334
support reduce_scatter
Chao1Han Sep 13, 2024
7dc1788
[inductor] Remove the batch fusion passes from being a default (#135922)
anijain2305 Sep 13, 2024
ecbd989
refine test cases
Chao1Han Sep 13, 2024
a23ffb2
update ut
Chao1Han Sep 13, 2024
1d02dfe
add mpi check
Chao1Han Sep 13, 2024
c485bd8
update datatype map
Chao1Han Sep 13, 2024
2d1ae87
update
Chao1Han Sep 13, 2024
04226de
Merge branch 'xccl' into xccl-group
Chao1Han Sep 13, 2024
6184261
update
Chao1Han Sep 13, 2024
b346e99
remove fast_flush arguments (#135387)
int3 Sep 13, 2024
e504fb7
[Dynamo] Use custom backend to reenter metadata tf mode when tracing …
mlazos Sep 11, 2024
fafdd58
[Dynamo] Trace torch function modes entered outside of torch.compile …
mlazos Sep 12, 2024
30b007b
[Dynamo] Support thread local setattr (#135443)
mlazos Sep 12, 2024
0c080cb
[Dynamo] Simplify torch function mode stack guard (#135444)
mlazos Sep 12, 2024
2af3b8f
[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
mlazos Sep 12, 2024
7d5e0dd
[Dynamo] Remove ignored modes workaround (#135502)
mlazos Sep 12, 2024
c56728b
[Dynamo] Remove ignored modes from torch function mode stack guard (#…
mlazos Sep 12, 2024
31007cf
[Distributed] add FP8 support to NaN checker (#135891)
kwen2501 Sep 13, 2024
91d26d9
update
Chao1Han Sep 13, 2024
2f53d57
Update document for autocast on CPU (#135299)
CaoE Sep 13, 2024
ea2ecab
[AOTI][reland] Fix assert_function call in cpu autotune template (#13…
desertfire Sep 13, 2024
b5c52e9
Revert "[dynamo] Fix support for classmethod(property(...)) (#134968)"
pytorchmergebot Sep 13, 2024
1cdf658
Revert "[PT2][inductor][Optimus] Add pad_aten_mm_pass pattern to reso…
pytorchmergebot Sep 13, 2024
dc71e7a
Revert "[Dynamo] Remove ignored modes from torch function mode stack …
pytorchmergebot Sep 13, 2024
fca58bf
Revert "[Dynamo] Remove ignored modes workaround (#135502)"
pytorchmergebot Sep 13, 2024
ac16979
Revert "[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)"
pytorchmergebot Sep 13, 2024
4734e35
Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
pytorchmergebot Sep 13, 2024
3f30360
Revert "[Dynamo] Support thread local setattr (#135443)"
pytorchmergebot Sep 13, 2024
eb7dd91
Revert "[Dynamo] Trace torch function modes entered outside of torch.…
pytorchmergebot Sep 13, 2024
7ed0563
Revert "[Dynamo] Use custom backend to reenter metadata tf mode when …
pytorchmergebot Sep 13, 2024
ba6e0f3
Remove cycle dependency by localizing the import. (#135926)
laithsakka Sep 13, 2024
2519e5a
[CUDA][FP8] Skip rowwise scaling test on sm89 (#135718)
eqy Sep 13, 2024
21ffa18
Fix "expand: SymIntArrayRef expected to contain only concrete integer…
ezyang Sep 13, 2024
ad2f0e9
Add remote cache time saved to compilation metrics (#135490)
jamesjwu Sep 13, 2024
ae02d66
[FlexAttention] Fix output layout (#135882)
drisspg Sep 13, 2024
564d00f
Revert "Fix clang-tidy warnings in Caffe2 code (#134935)"
pytorchmergebot Sep 13, 2024
a157745
[ROCm] Enable ROCm support for inductor's dynamic_rblock_scaling (#1…
jataylo Sep 13, 2024
6ef49fe
Revert "Pass ideep:lowp_kind to matmul_forward::compute on cache miss…
pytorchmergebot Sep 13, 2024
7834c0b
[AOTI][Tooling] Add stats summary (mean/min/max, etc) for jit inducto…
YUNQIUGUO Sep 13, 2024
bc0f330
[trymerge] Manually close merged PR when Github fails (#135890)
clee2000 Sep 13, 2024
18f9331
Revert "[aoti] Fix workspace generation for triton (#135552)"
pytorchmergebot Sep 13, 2024
3f69410
[gpu-profiler] Expose active and repeat in os env var (#135757)
dshi7 Sep 13, 2024
deee21c
Revert "[Inductor] Rename `cpp_wrapper_cuda.py` as `cpp_wrapper_gpu.p…
pytorchmergebot Sep 13, 2024
b6d6aa4
Revert "Validate input types for `torch.nn.Linear` and `torch.nn.Bili…
pytorchmergebot Sep 13, 2024
835e7bb
fix requirements.txt installation failure issue on Windows (#134567)
jingxu10 Sep 13, 2024
b856f35
Fix script name in the comments (#135507)
kit1980 Sep 13, 2024
4312794
[reland][export] fix re-export custom metadata (#135720)
yiming0416 Sep 13, 2024
a3d827a
Use python 3.11 for Large Wheel build (#136042)
atalman Sep 13, 2024
2e461e5
Add gpu and gpu_dynamic versions of add_loop (#135809)
laithsakka Sep 13, 2024
4f407c1
Only measure compile time instruction count for sum_floordiv benchmar…
laithsakka Sep 13, 2024
46935c8
Reduce default iterations to 5 . (#135773)
laithsakka Sep 13, 2024
a30d5ba
Fix bug in split-build workflows codegen (#136043)
malfet Sep 13, 2024
db5e1b4
Fix inductor-micro-benchmark results upload (take 2) (#136052)
huydhn Sep 13, 2024
baff86d
[MTIA tensor] allow shallow copy between CPU and MTIA tensors (#135871)
jvandebon Sep 13, 2024
e2d3af4
[ONNX] Remove logging apis from public (#133825)
justinchuby Sep 13, 2024
3c5d44d
Cleanup unused runner variants (#136058)
ZainRizvi Sep 13, 2024
aad556a
[PT2][Inductor][Optimus] Fix a corner case in remove_split_with_size_…
mengluy0125 Sep 13, 2024
b8eef50
Fix attr check for quantization spec (#135736)
jerryzh168 Sep 12, 2024
b608ff3
[Easy] Dont match to mm_plus_mm if not in max autotune (#135929)
eellison Sep 13, 2024
a00faf4
[3.13] fix 3.13 pickle error in serialization.py (#136034)
williamwen42 Sep 13, 2024
4237592
[Distributed] add pack-check method for float8_e4m3fn (#135961)
kwen2501 Sep 13, 2024
081c4a9
[BE] Use squeeze/unsqueeze in im2col (#136006)
malfet Sep 14, 2024
06bc717
Fix sum() forward for NJT (#131945)
jbschlosser Sep 13, 2024
2a83d68
update
Chao1Han Sep 14, 2024
7f62b86
update
Chao1Han Sep 14, 2024
5de4cb8
[Inductor UT] Generalize inductor UT for intel GPU (Part 3) (#135827)
hoshibara Sep 14, 2024
95496e4
[CI] Check that PyTorch is built with OpenMP (#136060)
malfet Sep 14, 2024
2e8d431
Fix tensor.data_ptr() representation overflow (#135567)
guangyey Sep 10, 2024
c48f5eb
Support reduce_scatter_base
Chao1Han Sep 14, 2024
51c5206
Use _amp_foreach_non_finite_check_and_unscale_ for CPU grads of Shard…
CaoE Sep 14, 2024
1786a17
Revert "Use _amp_foreach_non_finite_check_and_unscale_ for CPU grads …
pytorchmergebot Sep 14, 2024
731b178
[Dynamo] Use custom backend to reenter metadata tf mode when tracing …
mlazos Sep 13, 2024
4528777
[Dynamo] Trace torch function modes entered outside of torch.compile …
mlazos Sep 13, 2024
149d0b7
[Dynamo] Support thread local setattr (#135443)
mlazos Sep 13, 2024
ce3c74f
[Dynamo] Simplify torch function mode stack guard (#135444)
mlazos Sep 13, 2024
7743149
[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
mlazos Sep 13, 2024
5c67cf1
[Dynamo] Remove ignored modes workaround (#135502)
mlazos Sep 13, 2024
e77bd0e
[Dynamo] Remove ignored modes from torch function mode stack guard (#…
mlazos Sep 13, 2024
9b17dc4
Support reduce_scatter_tensor_coalesced
Chao1Han Sep 14, 2024
6cb3227
support barrier
Chao1Han Sep 14, 2024
911a43f
[TCPStore] Remove deprecated constructor (#136004)
fduwjj Sep 13, 2024
b9b6094
[ROCm] Skip pointwise associative scan tests due to regression (#135995)
jataylo Sep 14, 2024
e59f051
Merge branch 'xccl' into xccl-group
Chao1Han Sep 14, 2024
1a67e2b
[MPS] Add native im2col (#135706)
malfet Sep 14, 2024
d858c81
update
Chao1Han Sep 14, 2024
fea20f5
update
Chao1Han Sep 14, 2024
44dd218
Disable garbage collection during compile_time_instructions count in …
laithsakka Sep 13, 2024
a9bef85
[CI] Increase open file handles limit to 16K on MacOS (#136061)
malfet Sep 14, 2024
5a2be19
[Traceable FSDP2] Don't register RegisterPostBackwardFunction if user…
yf225 Sep 13, 2024
3352c9a
Add higher order operator name to the cache bypass exception (#135876)
oulgen Sep 13, 2024
e0e27f3
update
Chao1Han Sep 14, 2024
a815611
[Traceable FSDP2][Partitioner] Must save AC output if output has a ba…
yf225 Sep 14, 2024
f96a073
Use _amp_foreach_non_finite_check_and_unscale_ for CPU grads of Shard…
CaoE Sep 14, 2024
41b58a1
OpenReg: Fix issue when copying on the same device (#135956)
Zhenbin-8 Sep 14, 2024
72b868d
Revert "[Dynamo] Remove ignored modes from torch function mode stack …
pytorchmergebot Sep 14, 2024
838c912
Revert "[Dynamo] Remove ignored modes workaround (#135502)"
pytorchmergebot Sep 14, 2024
f3180f0
Revert "[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)"
pytorchmergebot Sep 14, 2024
7975ec3
Revert "[Dynamo] Simplify torch function mode stack guard (#135444)"
pytorchmergebot Sep 14, 2024
46f5037
Revert "[Dynamo] Support thread local setattr (#135443)"
pytorchmergebot Sep 14, 2024
8c8a308
Revert "[Dynamo] Trace torch function modes entered outside of torch.…
pytorchmergebot Sep 14, 2024
23dec79
Revert "[Dynamo] Use custom backend to reenter metadata tf mode when …
pytorchmergebot Sep 14, 2024
db393fb
Add Half support for reflection and replication padding on CPU (#135931)
CaoE Sep 14, 2024
f97cccf
[3.13] fix 3.13 pickle error in torch/package (#136049)
williamwen42 Sep 13, 2024
b863750
[Pytorch] Consolidate Strobelight compile time profiler between OSS a…
kollasb Sep 14, 2024
b82122b
Only keep ListOfLinears module in basic_modules_benchmarks and add gp…
laithsakka Sep 13, 2024
b4c84c3
[AOTI] Fix a fallback op returning None issue (#135997)
desertfire Sep 13, 2024
228760b
[Dynamo] Use custom backend to reenter metadata tf mode when tracing …
mlazos Sep 14, 2024
5c5c33a
[Dynamo] Trace torch function modes entered outside of torch.compile …
mlazos Sep 14, 2024
14cabdf
[Dynamo] Support thread local setattr (#135443)
mlazos Sep 14, 2024
06caa2d
[Dynamo] Simplify torch function mode stack guard (#135444)
mlazos Sep 14, 2024
1b9daeb
[Dynamo] Trace enter/exit of TorchFunctionModes (#135422)
mlazos Sep 14, 2024
860838e
[Dynamo] Remove ignored modes workaround (#135502)
mlazos Sep 14, 2024
8df01c8
[Dynamo] Remove ignored modes from torch function mode stack guard (#…
mlazos Sep 14, 2024
7f5abb4
[BE][Ez]: Update pybind11 to 2.13.6. Exposes new conduit cross-compat…
Skylion007 Sep 14, 2024
c64ae60
[dynamo] Fix support for classmethod(property(...)) (#134968)
jansel Sep 14, 2024
55299cf
[BE]: Update mypy to 1.11.2 (#133816)
Skylion007 Sep 14, 2024
e498b02
Add Triton CPU as an Inductor backend (#133408)
int3 Sep 13, 2024
426580a
Add CI for Triton CPU backend (#135342)
int3 Sep 13, 2024
5b21d91
Fix dividing Mul by factor (#136079)
isuruf Sep 14, 2024
391f2d6
use a fast expand algorithm (#135999)
isuruf Sep 13, 2024
a5eb43d
Add TensorReferenceAnalysis and some tests (#135886)
bobrenjc93 Sep 14, 2024
a1a57a4
Optimize dict reconstruct to not codegen untouched values (#134876)
guilhermeleobas Sep 12, 2024
8072ebc
SKIP llama for dynamic size testing (#135960)
leslie-fang-intel Sep 13, 2024
386884e
[Traceable FSDP2] Ignore FSDP2 forward hook side-effects in AC; Suppo…
yf225 Sep 14, 2024
e1abd34
[audio hash update] update the pinned audio hash (#136106)
pytorchupdatebot Sep 15, 2024
31e42a4
Fix redundant move warnings by g++ (#134987)
cyyever Sep 15, 2024
357b7fb
Revert "[Pytorch] Consolidate Strobelight compile time profiler betwe…
pytorchmergebot Sep 15, 2024
382fad5
Deprecate _preserve_ops and consolidate with decomp_table (#135080)
tugsbayasgalan Sep 14, 2024
1904b09
Create export_for_inference API and expose core_aten as public facing…
tugsbayasgalan Sep 15, 2024
dec3403
Add some doc for export_for_training (#135918)
tugsbayasgalan Sep 15, 2024
a141c6b
[pytorch][monitoring] Dynamic backend for WaitCounter (#135967)
andriigrynenko Sep 15, 2024
ab9a7ea
Add decomposition for permute_copy (#130944)
rec Sep 10, 2024
e501ed7
Update link in distributed.tensor.parallel.rst (#136103)
H-Huang Sep 15, 2024
d2207c5
[Distributed] add pack-check method for float8_e5m2 (#136115)
kwen2501 Sep 15, 2024
9961aaa
[dynamo] simplify implementation for `functools.reduce` (#133778)
XuehaiPan Sep 14, 2024
951c21d
[dynamo] simplify implementation for `builtins.sum` (#133779)
XuehaiPan Sep 14, 2024
3117f2c
Revert "[BE]: Update mypy to 1.11.2 (#133816)"
pytorchmergebot Sep 16, 2024
bbc3fdb
Add python 3.13.0t build to Docker images (#136001)
atalman Sep 16, 2024
a803cb0
[AOTI] Refactor how cpp_wrapper specific options are set (#136035)
desertfire Sep 14, 2024
d833f49
[reland][Inductor] Rename `cpp_wrapper_cuda.py` as `cpp_wrapper_gpu.p…
desertfire Sep 16, 2024
13bd125
Delete stable prototype (#135911)
bigfootjon Sep 16, 2024
c33b058
Add decomposition for squeeze_copy (#130941)
rec Sep 16, 2024
090046b
[effects] Turn off dtype promotion for with_effects lowering (#136039)
IvanKobzarev Sep 13, 2024
0aa41eb
[ONNX] Run type promotion test in CI and update the table (#135915)
justinchuby Sep 16, 2024
b491e29
[BE][Ez]: Add full half/bfloat16 dtype for `unique` and `isin` (#136114)
Skylion007 Sep 16, 2024
0199fd4
Revert "[inductor] More fixes on the keys of `constants` and `signatu…
pytorchmergebot Sep 16, 2024
5193f23
[Pytorch] Cleanup Strobelight URL and shorten for readability (#136102)
kollasb Sep 16, 2024
23c0d26
[BE][Ez]: Fix missing float16 coverage for adaptive_pool3d_cpu (#136091)
Skylion007 Sep 16, 2024
7fe004f
Revert "Add CI for Triton CPU backend (#135342)"
pytorchmergebot Sep 16, 2024
d0cebed
Revert "Add Triton CPU as an Inductor backend (#133408)"
pytorchmergebot Sep 16, 2024
d3647d1
Remove accidentally committed code (#136154)
malfet Sep 16, 2024
f89ce4d
`torch.nn.MultiheadAttention`: docs: improvement (#136111)
kuraga Sep 16, 2024
717fca2
Drop outdated section 'Running clang-tidy' in CONTRIBUTING.md (#136146)
eugenekoran Sep 16, 2024
c977bb7
[Distributed] fix FileSystemWriter __init__ (#136135)
kwen2501 Sep 16, 2024
38caf10
[EZ] Fix spelling typo (#136157)
malfet Sep 16, 2024
31715be
[BE]: Update mypy to 1.11.2 (#133816)
Skylion007 Sep 16, 2024
7537f74
Refactor FxGraphCache.load into separate functions, so that AOTAutogr…
jamesjwu Sep 16, 2024
a0c7029
[c10d][Reland] Remove Option for ProcessGroup and Expose backend Opti…
fduwjj Sep 11, 2024
abd16a8
[torch/multiprocessing] Use multiprocessing.reduction.register Forki…
kiukchung Sep 16, 2024
3c97b0a
Use ncclAlltoAllv and ncclAlltoAll API when supported (#134499)
dsjohns2 Sep 16, 2024
bfbcdf4
Revert "[dynamo] Fix support for classmethod(property(...)) (#134968)"
pytorchmergebot Sep 16, 2024
b76d1b7
Add scaling arguments to bsr_dense_addmm (#136104)
pearu Sep 16, 2024
c12536b
[ONNX] Treat CompositeImplicitAutograd ops as normal ops in decomp (#…
justinchuby Sep 16, 2024
071da87
use csv extention for test report in order for it to be uploaded to s…
laithsakka Sep 16, 2024
37a08b3
Revert "fix compiled_autograd deadlock throw (#135795)"
pytorchmergebot Sep 16, 2024
3f74310
Back out "Flip triton kernel default layout constraint to "needs_fixe…
tissue3 Sep 17, 2024
d463a81
inductor: dont use default_dtype during rng functionalization (#136041)
bdhirsh Sep 14, 2024
dc82d27
make view.dtype always return an alias (#136074)
bdhirsh Sep 14, 2024
408fe41
[DSD][EZ] Minor update in _state_dict_utils.py (#136165)
wz337 Sep 16, 2024
e248c1d
Update real device in FSDP state_dict_utils (#134994)
ankurneog Sep 17, 2024
3b5e268
Revert "Optimize dict reconstruct to not codegen untouched values (#1…
pytorchmergebot Sep 17, 2024
2c4ae81
Revert "Add decomposition for squeeze_copy (#130941)"
pytorchmergebot Sep 17, 2024
462b727
Revert "Add decomposition for permute_copy (#130944)"
pytorchmergebot Sep 17, 2024
913f97e
Don't run reshape pattern match on dynamic shape size tensor (#136100)
ezyang Sep 17, 2024
ece8267
Add back optim type hints that were lost when *.pyi files were remove…
mauvilsa Sep 17, 2024
67b14ce
[ONNX] Fix numpy method to return the correct type (#136162)
justinchuby Sep 17, 2024
63dc5df
[Fix]: Update CPUINFO submodule to fix support for NON-SVE ARM Hardwa…
ng-05 Sep 17, 2024
8e5bb35
[PT2] Port merge_concats_pass to PT2 pre_grad passes (#135527)
huxintong Sep 17, 2024
cc365fd
[MTIA] Support torch.cuda.get_device_capability equivalent API on MTI…
ttrung149 Sep 17, 2024
785e987
Delete links to non-existing `run_plan_mpi.cc` (#136204)
malfet Sep 17, 2024
a838284
Support rms_norm() for NJT (#135872)
jbschlosser Sep 17, 2024
ea10c07
[export] Deserialize args with python keyword names (#136036)
angelayi Sep 17, 2024
a4e9a1c
[TorchRec][PT2 IR][APF] short circuit the flatten/unflatten between E…
TroyGarden Sep 17, 2024
e3aa5e2
[NCCL] Don't override `waitUntilInitialized`'s setting of `comm->init…
eqy Sep 17, 2024
48d18fb
[PyTorch CUDA Allocator] Allow reuse of non-split blocks with better …
banitag1 Sep 17, 2024
a575ce0
[PyTorch Pinned Allocator] Add support of background thread to proces…
banitag1 Sep 17, 2024
f6f1504
[MPS] Fix 5D+ reductions over negative dimentions (#136198)
malfet Sep 17, 2024
cccf500
[c10d] remove sleep from watchdogHandler (#135760)
c-p-i-o Sep 18, 2024
b18ba94
[AO][Inductor] Enable WOQ fusion pattern with permute (#135928)
leslie-fang-intel Sep 13, 2024
6682327
[BE] Make `NestedTensorTransformerFunctions.cu` compilable without wa…
malfet Sep 18, 2024
029026d
add ut
Chao1Han Sep 18, 2024
8895f69
[torch/numpy][numpy2.0 compat] Additional changes for tests to run u…
kiukchung Sep 18, 2024
9aa22ea
[CI] Make linux-aarch64 shards actually running different tests (#136…
malfet Sep 18, 2024
a0207c8
[dynamo] Fix support for classmethod(property(...)) (#134968)
jansel Sep 17, 2024
083c914
Reland D62220158 (#136213)
mengluy0125 Sep 18, 2024
b5be4d8
Fix ROCm skip decorator for test_ddp_tp and multiprocess UTs (#136161)
pragupta Sep 18, 2024
701ba52
[Inductor] Increase multiplier to 3 for Inductor AMP FP16 benchmark c…
jiayisunx Sep 13, 2024
c8d152c
Fix fast_expand recursion error (#136163)
isuruf Sep 16, 2024
6a6f5b2
Add _addmm_activation to lower precision cast policy on AutocastCPU (…
CaoE Sep 18, 2024
605f2d8
[PyTorch] Remove unnecessary include of c10/util/Exception.h in irang…
swolchok Sep 17, 2024
3efaa01
[c10d] Make test compatible for new pytest (#136158)
fduwjj Sep 17, 2024
bad6904
[ROCm] upgrade ROCm CI builds to py3.10 (#134108)
jataylo Sep 18, 2024
5a6ddbc
Extending the Pytorch vec backend for SVE (ARM) (#119571)
maajidkhann Sep 18, 2024
68a7246
[cuDNN][conv][A100] Bump tolerances for `vmap_autograd_grad` `conv2d`…
eqy Sep 18, 2024
aae68e2
Add wait counter for nccl abort (#136067)
atuljangra Sep 18, 2024
1a86d8a
Fix calling Add._from_args and Mul._from_args (#136143)
isuruf Sep 16, 2024
bc9597b
[Traceable FSDP2] Minor refactor to traceable FSDP2 unit tests (#136219)
yf225 Sep 18, 2024
f1ad680
[dynamo]Remove stream hardcoding in dynamo VariableBuilder (#131763)
siju-samuel Sep 18, 2024
b9a197d
[BE][MPS] Delete duplicated code in `View.mm` (#136295)
malfet Sep 18, 2024
068c80e
[BE][MPS] Fix deprecation warnings on MacOS 15.0 (#136292)
malfet Sep 18, 2024
f2b0fc8
Add uint16 support for observer (#136238)
jerryzh168 Sep 18, 2024
e037bb3
[dynamo] fix crash in InspectSignatureVariable (#136010)
williamwen42 Sep 17, 2024
7755176
Add type checks for Tensor.add_ (#135864)
DuyguA Sep 19, 2024
001dac2
use lintrunner format code
Chao1Han Sep 19, 2024
db80b98
XFAIL test_segfault (#136252)
huydhn Sep 19, 2024
f13b449
rm allgatherv align with nccl
Chao1Han Sep 19, 2024
156c2ac
update
Chao1Han Sep 19, 2024
908a568
Return unsafe_view instead of view from matmul when folding occurs (#…
jwieczorekhabana Sep 19, 2024
bce52d0
[CODEMOD][caffe2] use npt.NDArray instead of np.ndarray in type annot…
igorsugak Sep 19, 2024
4ea741d
Revert "Reland D62220158 (#136213)"
pytorchmergebot Sep 19, 2024
65df26f
[FSDP2] Fixed 2D mismatched grad placements (#136237)
awgu Sep 18, 2024
803ce50
Log structured logging overhead to dynamo compile (kinda) (#136142)
jamesjwu Sep 19, 2024
8d9c427
Type _sympy/functions.py [1/n] (#136205)
bobrenjc93 Sep 19, 2024
ccca3de
[ROCm] Enable Flex attention tests on AMD gpus (#136245)
jerrymannil Sep 19, 2024
49723a8
fix stride compare failed when size value equal to one in ForeachUtil…
Shan19900305 Sep 19, 2024
8cba0ec
[AOTI][Tooling][8/n] Add option to pinpoint kernel names in debug pri…
YUNQIUGUO Sep 19, 2024
b71802f
add basic_modules_ListOfLinears_inductor_gpu_force_shape_pad (#136175)
laithsakka Sep 17, 2024
7bbdf87
[22/N] Fix clang-tidy warnings in jit (#134829)
cyyever Sep 19, 2024
172ecf7
DTensor: dont hash symint tensor input in propagate_tensor_meta (#136…
bdhirsh Sep 18, 2024
9b424aa
[CI][CUSPARSELT] Extend cusparselt installation script to support cud…
nWEIdia Sep 19, 2024
79fd17e
Merge branch 'xccl' into xccl-group
Chao1Han Sep 20, 2024
bebf530
TCPStoreLibUvBackend: trace operations (#136320)
d4l3k Sep 20, 2024
1dfa07e
passing FileTimerRequests.to_json() to log_debug_info_for_expired_tim…
felixsu2006 Sep 20, 2024
d45b015
Add deterministic path for CUDA `cumsum` (#136224)
kurtamohler Sep 20, 2024
fe0e9fb
Fix flaky SIGSEGV crash in test_profile_memory (#136304)
huydhn Sep 20, 2024
652da01
Xccl process group for Pytorch
Chao1Han Aug 29, 2024
0cb0016
Merge remote-tracking branch 'upstream/main' into xccl-bak
Chao1Han Sep 20, 2024
a71d69a
Align latest
Chao1Han Sep 20, 2024
a1c2d6b
Merge branch 'xccl-bak' into xccl-group
Chao1Han Sep 20, 2024
[Traceable FSDP2] Ignore FSDP2 forward hook side-effects in AC; Support FSDP2 + AC (pytorch#134997)

> Ignore FSDP2 forward hook side-effects in AC

Under AC, FSDP2 does not rely on the forward hook to all-gather weights for recomputation; instead, it relies on the pre-backward hook to do this job:
https://github.com/pytorch/pytorch/blob/451eaf0ff247090ca5a9648fd1e17c3c011737e1/torch/distributed/_composable/fsdp/_fsdp_state.py#L219-L220

So when we use `speculate_subgraph` to trace the utils.checkpoint AC region, we don't actually need to worry about the FSDP2 forward hook's side effects and can safely ignore them, because we do not (and do not expect to) re-run the FSDP2 forward hook during backward recomputation.
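
For orientation, a minimal sketch (not taken from this PR) of the scenario being enabled: an FSDP2-sharded model whose blocks wrap their forward in `torch.utils.checkpoint` and are compiled with `torch.compile`. It assumes a multi-GPU run already launched via `torchrun` with an NCCL process group; the `Block` module and dimensions are illustrative.

```python
# Sketch only: FSDP2 + activation checkpointing + torch.compile.
# Assumes torchrun has initialized a NCCL process group and CUDA is available.
import torch
import torch.nn as nn
from torch.distributed._composable.fsdp import fully_shard
from torch.utils.checkpoint import checkpoint


class Block(nn.Module):
    def __init__(self, dim: int = 128):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))

    def forward(self, x):
        # AC region: recomputed in backward. FSDP2 re-gathers the sharded
        # weights via its pre-backward hook, not by re-running the forward hook.
        return checkpoint(self.ff, x, use_reentrant=False)


def build_compiled_model(dim: int = 128, n_blocks: int = 3):
    model = nn.Sequential(*[Block(dim) for _ in range(n_blocks)]).cuda()
    for block in model:
        fully_shard(block)  # shard each block (FSDP2 composable API)
    fully_shard(model)      # root wrapping
    return torch.compile(model, backend="inductor", fullgraph=True)
```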

----

Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`

Pull Request resolved: pytorch#134997
Approved by: https://github.com/zou3519
ghstack dependencies: pytorch#135727
yf225 authored and pytorchmergebot committed Sep 15, 2024
commit 386884e5534bc812e4f90dcc94d420e148f20f2b
106 changes: 79 additions & 27 deletions test/distributed/_composable/fsdp/test_fully_shard_compile.py
@@ -154,25 +154,64 @@ def f(x):
torch.compile(f, backend="aot_eager")(x)
self.assertEqual(x, ref_x)

def _assert_no_aliased_graph_inputs(self, graph: torch.fx.Graph) -> None:
def _assert_no_aliased_unsharded_params_in_graph_inputs(
self, model, graph: torch.fx.Graph
) -> None:
# FSDP2 unsharded params are mutated in the graph without going through functionalization.
# Therefore, we want to make sure they don't have aliases in the graph inputs, to make it easier
# for us to do the replacement of unsharded params with the all-gathered temporary buffer directly
# in downstream users in the graph.
storage_id_to_graph_inputs = defaultdict(list)
unsharded_param_graph_inputs = set()
for node in graph.nodes:
if node.op == "placeholder" and isinstance(
node.meta.get("val", None), torch.Tensor
if (
node.op == "call_function"
and node.target
in [
torch.ops.inductor.resize_storage_bytes_.default,
torch.ops.fsdp.copy_.default,
]
and node.args[0].op == "placeholder"
):
storage_id_to_graph_inputs[
id(node.meta["val"].untyped_storage())
].append(node)
no_aliased_graph_inputs = True
unsharded_param_graph_inputs.add(node.args[0])
assert len(unsharded_param_graph_inputs) > 0
assert len(unsharded_param_graph_inputs) == len(
list(model.parameters())
), """\
Expected all model parameters to be wrapped by FSDP2 and
have their unsharded version as graph input, but it's not true!
"""
no_aliased_unsharded_params_in_graph_inputs = True
err_msg = ""
for aliased_graph_inputs in storage_id_to_graph_inputs.values():
if len(aliased_graph_inputs) > 1:
no_aliased_graph_inputs = False
if len(aliased_graph_inputs) > 1 and any(
x in unsharded_param_graph_inputs for x in aliased_graph_inputs
):
no_aliased_unsharded_params_in_graph_inputs = False
err_msg += f"""\n
Found aliased graph inputs: {aliased_graph_inputs},
Found aliased unsharded param in graph inputs: {aliased_graph_inputs},
val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
"""
self.assertTrue(no_aliased_graph_inputs, err_msg)
self.assertTrue(no_aliased_unsharded_params_in_graph_inputs, err_msg)

def _remove_fsdp2_unsharded_param_graph_input_usage_with_optional_checks(
self, model, fullgraph
):
def _run_with_checks(graph, orig_fn):
self._assert_no_aliased_unsharded_params_in_graph_inputs(model, graph)
orig_fn(graph)

if fullgraph:
return mock.patch.object(
comms,
"remove_fsdp2_unsharded_param_graph_input_usage",
functools.partial(
_run_with_checks,
orig_fn=comms.remove_fsdp2_unsharded_param_graph_input_usage,
),
)
else:
return contextlib.nullcontext()

def _check_fsdp_copy_and_resize_ops_count_in_graph(
self,
@@ -359,7 +398,11 @@ def inductor_code_check_fsdp_reduce_scatter(
return file_check

def _test_traceable_fsdp(
self, model_init_fn, input_creation_fn, backend, fullgraph
self,
model_init_fn,
input_creation_fn,
backend,
fullgraph,
):
def compiler_fn(compiled_autograd_backend):
def _fn(gm):
@@ -401,13 +444,18 @@ def test_compiled():
# FSDP2 does lazy init using 1st run, so run it once to init using eager mode
run_iters(model, optim, n_iter=1)

model_compiled = torch.compile(model, backend=backend, fullgraph=fullgraph)
res = run_iters(
model_compiled,
optim,
compiled_autograd_backend=backend,
)
return res
with self._remove_fsdp2_unsharded_param_graph_input_usage_with_optional_checks(
model, fullgraph
):
model_compiled = torch.compile(
model, backend=backend, fullgraph=fullgraph
)
res = run_iters(
model_compiled,
optim,
compiled_autograd_backend=backend,
)
return res

def test_eager():
model, optim = model_init_fn()
@@ -421,17 +469,15 @@ def test_eager():
inline_inbuilt_nn_modules=True,
skip_fsdp_hooks=False,
), torch._functorch.config.patch(
recompute_views=True, cse=False
recompute_views=True,
cse=False,
), torch._inductor.config.patch(
reorder_for_compute_comm_overlap=True,
reorder_for_compute_comm_overlap_passes=[
"sink_waits",
"raise_comms",
"reorder_compute_for_overlap",
],
post_grad_custom_pre_pass=self._assert_no_aliased_graph_inputs
if fullgraph
else None,
):
losses_compiled = test_compiled()
losses_eager = test_eager()
@@ -677,7 +723,9 @@ def test_nested_fully_shard_backend_inductor(self):
"Expected at least 3 separate lowerings to Triton code, which means at least 1 graph break in FWD graph",
)

def _create_transformer_factory_fns(self, all_requires_grad):
def _create_transformer_factory_fns(
self, all_requires_grad, *, activation_checkpoint=False
):
seq_len = 16
vocab_size = 8
n_layers = 3
@@ -689,6 +737,7 @@ def model_init_fn():
model_args = ModelArgs(
vocab_size=vocab_size,
n_layers=n_layers,
checkpoint_activations=activation_checkpoint,
)
model = Transformer(model_args)
if not all_requires_grad:
@@ -775,9 +824,11 @@ def test_transformer_backend_aot_eager_decomp_partition(self):
@torch._inductor.config.patch(fallback_random=True)
def test_transformer_backend_inductor(self):
# TODO: enable fullgraph=False case
for fullgraph, all_requires_grad in itertools.product([True], [True, False]):
for fullgraph, all_requires_grad, activation_checkpoint in itertools.product(
[True], [True, False], [True, False]
):
log.warning(
f"fullgraph={fullgraph}, all_requires_grad={all_requires_grad}" # noqa: G004, G001
f"fullgraph={fullgraph}, all_requires_grad={all_requires_grad}, activation_checkpoint={activation_checkpoint}" # noqa: G004, G001
)
with self._maybe_add_graph_break_to_sdpa(
fullgraph
@@ -802,7 +853,8 @@ def test_transformer_backend_inductor(self):
_, triton_codes = run_and_get_code(
lambda: self._test_traceable_fsdp(
*self._create_transformer_factory_fns(
all_requires_grad=all_requires_grad
all_requires_grad=all_requires_grad,
activation_checkpoint=activation_checkpoint,
),
"inductor",
fullgraph=fullgraph,
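
The `_assert_no_aliased_unsharded_params_in_graph_inputs` helper above groups graph placeholders by the id of their `untyped_storage()` to detect aliasing. A standalone sketch of the same idea on plain tensors (the `group_aliased` helper is hypothetical, not part of the test):

```python
# Sketch: detect aliasing by grouping tensors that share an untyped storage.
from collections import defaultdict

import torch


def group_aliased(tensors):
    groups = defaultdict(list)
    for idx, t in enumerate(tensors):
        groups[id(t.untyped_storage())].append(idx)
    return [idxs for idxs in groups.values() if len(idxs) > 1]


base = torch.randn(4, 4)
view = base[:2]        # shares storage with base
clone = base.clone()   # owns its own storage
print(group_aliased([base, view, clone]))  # [[0, 1]]
```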
10 changes: 9 additions & 1 deletion torch/_dynamo/output_graph.py
@@ -326,7 +326,7 @@ def __init__(
] = collections.defaultdict(list)
# Stores the full fqn of a param or buffer to the relevant source.
self.param_name_to_source: Optional[Dict[str, Source]] = {}
self.side_effects = SideEffects()
self.side_effects = SideEffects(self)
# Cached variable trackers. This makes symbolic analysis of LOAD_GLOBAL
# and LOAD_ATTR for same python objects free.
self.variable_tracker_cache = VariableTrackerCache()
@@ -1834,6 +1834,14 @@ def __init__(
# Dicts maintain the order of args for the HigherOrderOperator call.
self.lifted_freevars = {}
self.prev_inst = None
# True if this tracer is currently tracing into torch.utils.checkpoint
# as part of speculate_subgraph.
self.under_activation_checkpoint = False
# True if we want to allow side-effects (doesn't throw error on their existence)
# during this tracer's tracing of torch.utils.checkpoint (via speculate_subgraph).
# Only safe if we know for sure that *NOT* replaying these side-effects during
# backward recomputation of the checkpoint region doesn't affect its correctness.
self.allow_side_effects_under_checkpoint = False

self._cur_code = None
self._orig_gm_meta = None
26 changes: 26 additions & 0 deletions torch/_dynamo/side_effects.py
@@ -1,7 +1,9 @@
# mypy: allow-untyped-defs
import contextlib
import functools
import inspect
import warnings
import weakref
from collections.abc import MutableMapping
from typing import Any, Dict, List, Optional, Type, Union

@@ -79,13 +81,15 @@ class SideEffects:

def __init__(
self,
output_graph,
id_to_variable=None,
store_attr_mutations=None,
keepalive=None,
save_for_backward=None,
tensor_hooks=None,
):
super().__init__()
self.output_graph_weakref = weakref.ref(output_graph)
self.id_to_variable = id_to_variable or {}
self.store_attr_mutations = store_attr_mutations or {}
self.keepalive = keepalive or []
@@ -130,6 +134,7 @@ def diff(self, other: "SideEffects") -> Optional[str]:
def clone(self):
"""Create a shallow copy"""
return self.__class__(
output_graph=self.output_graph_weakref(),
id_to_variable=dict(self.id_to_variable),
store_attr_mutations={
k: dict(v) for k, v in self.store_attr_mutations.items()
@@ -145,13 +150,23 @@ def __contains__(self, item):
def __getitem__(self, item):
return self.id_to_variable[id(item)]

def should_allow_side_effects_under_checkpoint(self):
output_graph = self.output_graph_weakref()
return (
output_graph
and output_graph.current_tx.output.current_tracer.under_activation_checkpoint
and output_graph.current_tx.output.current_tracer.allow_side_effects_under_checkpoint
)

def check_allowed_side_effect(self, item):
from torch._dynamo.variables.misc import AutogradFunctionContextVariable

# People do things like self.dim = dim inside autograd.Function.
# These are benign.
if isinstance(item, AutogradFunctionContextVariable):
return True
if self.should_allow_side_effects_under_checkpoint():
return True
if not is_side_effect_safe(item.mutable_local):
unimplemented(
"HigherOrderOperator: Mutating a variable not in the current scope (SideEffects)"
@@ -725,3 +740,14 @@ def is_empty(self):
def clear(self):
self.keepalive.clear()
self.id_to_variable.clear()


@contextlib.contextmanager
def allow_side_effects_under_checkpoint(tx: "InstructionTranslator"): # type: ignore[name-defined] # noqa: F821
assert tx.output.current_tracer.under_activation_checkpoint
orig_val = tx.output.current_tracer.allow_side_effects_under_checkpoint
try:
tx.output.current_tracer.allow_side_effects_under_checkpoint = True
yield
finally:
tx.output.current_tracer.allow_side_effects_under_checkpoint = orig_val
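
`SideEffects` now keeps only a weak reference to its `OutputGraph` and dereferences it in `should_allow_side_effects_under_checkpoint` before consulting the tracer flags. A minimal sketch of that weakref pattern, using stand-in classes rather than the real Dynamo types:

```python
import weakref


class Graph:  # stand-in for OutputGraph
    pass


class Effects:  # stand-in for SideEffects
    def __init__(self, output_graph):
        # Weak reference: Effects does not keep the graph alive.
        self.output_graph_weakref = weakref.ref(output_graph)

    def graph(self):
        return self.output_graph_weakref()  # None once the graph is gone


g = Graph()
e = Effects(g)
assert e.graph() is g
del g                    # CPython frees the object once its refcount hits 0
assert e.graph() is None
```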
15 changes: 14 additions & 1 deletion torch/_dynamo/variables/functions.py
@@ -322,7 +322,20 @@ def call_function(
return invoke_and_store_as_constant(
tx, self.fn, self.get_name(), args, kwargs
)

if (
tx.output.current_tracer.under_activation_checkpoint
and not tx.output.current_tracer.allow_side_effects_under_checkpoint
):
try:
from torch.distributed._composable.fsdp._fsdp_state import FSDPState
except Exception:
FSDPState = None
if FSDPState is not None and self.fn in [
FSDPState._pre_forward,
FSDPState._post_forward,
]:
with torch._dynamo.side_effects.allow_side_effects_under_checkpoint(tx):
return super().call_function(tx, args, kwargs)
return super().call_function(tx, args, kwargs)
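
The branch above lifts the side-effect restriction only for the two FSDP2 hook functions; everything else still goes through the normal check. A small sketch of that shape, guarded import plus function-identity whitelist, with an illustrative helper name:

```python
# Sketch: optional-dependency guard plus function-identity whitelist,
# mirroring the branch above (is_fsdp2_hook is illustrative).
try:
    from torch.distributed._composable.fsdp._fsdp_state import FSDPState
except Exception:  # distributed may be unavailable in this build
    FSDPState = None


def is_fsdp2_hook(fn) -> bool:
    if FSDPState is None:
        return False
    # Compare by identity against the unbound class attributes.
    return fn in (FSDPState._pre_forward, FSDPState._post_forward)
```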


32 changes: 29 additions & 3 deletions torch/_dynamo/variables/higher_order_ops.py
@@ -71,6 +71,16 @@ def dynamo_enable_grad(tx: "InstructionTranslator", enable=True):
GradModeVariable.create(tx, org_value, initialized=True)


@contextlib.contextmanager
def dynamo_under_activation_checkpoint(tx: "InstructionTranslator"):
orig_val = tx.output.current_tracer.under_activation_checkpoint
try:
tx.output.current_tracer.under_activation_checkpoint = True
yield
finally:
tx.output.current_tracer.under_activation_checkpoint = orig_val


def only_consist_of(var, types, allow_none=False):
if isinstance(var, types):
return True
@@ -388,6 +398,7 @@ def speculate_subgraph(
set_subgraph_inputs="automatic",
restore_side_effects=True,
should_flatten_outputs=False,
under_activation_checkpoint=False,
# Pass in an originating tracer - this is needed for preserving context
# across fwd-bwd for autograd.Function
tracer=None,
@@ -439,6 +450,11 @@ def speculate_subgraph(
if enable_grad is not None
else contextlib.nullcontext()
)
checkpoint_ctx = (
dynamo_under_activation_checkpoint(tx)
if under_activation_checkpoint
else contextlib.nullcontext()
)

# For handling side effects, we can make an argument that we don't
# have to do anything here. The side effects infra does a good job
@@ -458,7 +474,7 @@ def speculate_subgraph(
if restore_side_effects:
prev_side_effects = tx.output.side_effects.clone()

with autograd_ctx:
with autograd_ctx, checkpoint_ctx:
output = f.call_function(tx, args, sub_kwargs)

if restore_side_effects:
@@ -1504,7 +1520,12 @@ def call_function(

class WrapHigherOrderVariable(TorchHigherOrderOperatorVariable):
def create_wrapped_node(
self, tx: "InstructionTranslator", args, kwargs, description
self,
tx: "InstructionTranslator",
args,
kwargs,
description,
under_activation_checkpoint=False,
):
# See NOTE [HigherOrderOperator tracing design] for more details

@@ -1520,6 +1541,7 @@ def create_wrapped_node(
description,
source_target=self.value,
should_flatten_outputs=True,
under_activation_checkpoint=under_activation_checkpoint,
)

body_gmod = torch.fx.GraphModule(tx.output.nn_modules, body_graph)
@@ -1856,7 +1878,11 @@ def call_function(
treespec,
checkpointed_gmod,
) = self.create_wrapped_node(
tx, args, gmod_kwargs, "torch.utils.checkpoint.checkpoint"
tx,
args,
gmod_kwargs,
"torch.utils.checkpoint.checkpoint",
under_activation_checkpoint=True,
)
if context_fn is not None:
checkpointed_gmod.meta["_checkpoint_context_fn"] = context_fn
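
The `checkpoint_ctx` added above follows the same pattern as the existing `autograd_ctx`: an optional context manager that degrades to `contextlib.nullcontext()` when the feature is off and restores the tracer flag on exit. A self-contained sketch of that pattern with illustrative names (not Dynamo internals):

```python
import contextlib


class Tracer:
    under_activation_checkpoint = False


@contextlib.contextmanager
def under_checkpoint(tracer):
    prev = tracer.under_activation_checkpoint
    try:
        tracer.under_activation_checkpoint = True
        yield
    finally:
        tracer.under_activation_checkpoint = prev  # restore even on error


def speculate(tracer, fn, *, under_activation_checkpoint=False):
    ctx = (
        under_checkpoint(tracer)
        if under_activation_checkpoint
        else contextlib.nullcontext()
    )
    with ctx:
        return fn()


t = Tracer()
print(speculate(t, lambda: t.under_activation_checkpoint))  # False
print(speculate(t, lambda: t.under_activation_checkpoint, under_activation_checkpoint=True))  # True
print(t.under_activation_checkpoint)  # False (restored after tracing)
```

Restoring the previous value in `finally` keeps the flag correct even if tracing raises, and makes nested uses safe.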