
Commit ba40125
Add Activation Checkpointing Pass
Work in progress: copied the latest code over from `whc/hack_aot` and tweaked the way it gets hooked up a bit; not tested yet. We still need to discuss whether the AC pass should be popped back off inductor's passes earlier, or kept installed until `__exit__` of AutoParallel; the two options are sketched below.
1 parent badffa7 commit ba40125
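
To make the open question concrete, the two options look roughly like this (a hypothetical simplification of the api.py changes below, not code from this commit):

    # Option A: pop the pass back off right after the graph is built, in __enter__
    prev = torch._inductor.config.joint_custom_post_pass
    torch._inductor.config.joint_custom_post_pass = ac_joint_pass
    try:
        self.build_model_graph()
    finally:
        torch._inductor.config.joint_custom_post_pass = prev

    # Option B (what this commit does): leave the pass installed for the whole
    # lifetime of the context manager and restore the previous value only in
    # __exit__, so it also applies to any compilation triggered inside the
    # `with` block.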

File tree

2 files changed (+306, -1 lines):
  autoparallel/activation_checkpointing.py
  autoparallel/api.py
autoparallel/activation_checkpointing.py

Lines changed: 288 additions & 0 deletions
@@ -0,0 +1,288 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import logging
from collections import defaultdict
from dataclasses import dataclass

import torch
from torch._functorch.partitioners import _has_tag_is_backward, _size_of
from torch.utils._ordered_set import OrderedSet
from torch.utils.checkpoint import CheckpointPolicy

logger: logging.Logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


# Reimplement torch._functorch.partitioners.must_recompute to only check for
# the MUST_RECOMPUTE tag, and not PREFER_RECOMPUTE. For now the partitioner
# makes no distinction between the two, and I think this is a bug.
def must_recompute(node: torch.fx.Node) -> bool:
    return node.meta.get("recompute", None) is CheckpointPolicy.MUST_RECOMPUTE


def is_graph_input(node: torch.fx.Node) -> bool:
    return node.op == "placeholder"


def is_wait_tensor(node: torch.fx.Node) -> bool:
    return (
        node.op == "call_function"
        and node.target == torch.ops._c10d_functional.wait_tensor.default
    )


def is_all_gather_into_tensor(node: torch.fx.Node) -> bool:
    return (
        node.op == "call_function"
        and node.target == torch.ops._c10d_functional.all_gather_into_tensor.default
    )


def is_wait_tensor_from_fsdp(node: torch.fx.Node) -> bool:
    if is_wait_tensor(node) and is_all_gather_into_tensor(node.args[0]):
        # TODO: this needs to be improved; the commented-out assertion below
        # fires in the autoparallel "2D" case, where the input to the
        # all_gather is itself a wait (maybe just 2D FSDP).
        # ag_node = node.args[0]
        # assert is_graph_input(ag_node.args[0]) or (
        #     ag_node.args[0].op == "call_function"
        #     and ag_node.args[0].target == torch.ops.prims.convert_element_type.default
        #     and is_graph_input(ag_node.args[0].args[0])
        # ), (
        #     "Assume all_gather_into_tensor input is either graph input "
        #     + f"or dtype conversion of graph input, but got {ag_node.args[0]}"
        # )
        return True
    return False


# mypy: ignore-errors


def force_recompute_fsdp_all_gather(graph: torch.fx.Graph) -> None:
    """
    Force recomputation of all_gather nodes from simple FSDP in the graph.

    This pass should be added in torch._inductor.config.joint_custom_post_pass
    """

    def force_recompute_node(node):
        node.meta["recompute"] = CheckpointPolicy.MUST_RECOMPUTE
        if "ac_graph_id" not in node.meta:
            # ac_graph_id is used in the partitioner to decide whether two
            # nodes which have AC applied come from different AC regions.
            # This is needed because nodes on the boundary of two AC regions
            # are marked as MUST_SAVE. In our case we just add a large value
            # of ac_graph_id so that all nodes we tag for recomputation do
            # indeed get recomputed and are not influenced by other nodes in
            # the graph with nearby ac_graph_id values.
            node.meta["ac_graph_id"] = 100000

    # Make all-gather nodes (and related nodes) recomputable, to circumvent
    # https://github.com/pytorch/pytorch/issues/136433
    for node in graph.nodes:
        if is_wait_tensor_from_fsdp(node):
            ag_node = node.args[0]
            force_recompute_node(ag_node)  # all_gather
            force_recompute_node(node)  # wait_tensor
            # Force-recompute the slice that comes after the wait
            for user in node.users:
                if (
                    user.op == "call_function"
                    and user.target == torch.ops.aten.slice.Tensor
                ):
                    force_recompute_node(user)
            # Force-recompute potential dtype casts feeding the all_gather
            if (
                ag_node.all_input_nodes[0].op == "call_function"
                and ag_node.args[0].target
                == torch.ops.prims.convert_element_type.default
            ):
                force_recompute_node(ag_node.all_input_nodes[0])
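

# For orientation, the graph pattern targeted above (as produced by tracing
# simple FSDP) looks roughly like this -- an illustrative sketch, not code:
#
#   param (placeholder)
#     -> [optional] torch.ops.prims.convert_element_type.default    (dtype cast)
#     -> torch.ops._c10d_functional.all_gather_into_tensor.default  (all_gather)
#     -> torch.ops._c10d_functional.wait_tensor.default             (wait)
#     -> [optional] torch.ops.aten.slice.Tensor                     (unpad)
#
# Every node in the chain is tagged MUST_RECOMPUTE, so the gathered
# (unsharded) parameter is re-materialized in the backward pass instead of
# being saved as an activation.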


def mark_nodes_as_must_save_to_stage_recomputation(
    joint_graph: torch.fx.Graph,
    stage_size_in_GiB: float = -1,
) -> None:
    """
    Marks specific nodes as "must save" to optimize memory usage during recomputation.

    With aggressive recomputation strategies, we often encounter situations where long
    chains of forward nodes must be recomputed before executing backward-pass nodes,
    causing high peak memory usage. This function breaks these recomputation chains
    into smaller stages by periodically saving intermediate nodes, keeping peak
    memory usage bounded.

    Args:
        joint_graph: The joint graph containing both forward and backward nodes
        stage_size_in_GiB: Target memory size per stage in GiB (-1 to disable staging)
    """

    if stage_size_in_GiB < 0:
        return

    INT_INF = int(1e9)

    def get_required_fwd_nodes(
        joint_graph: torch.fx.Graph,
    ) -> OrderedSet[torch.fx.Node]:
        """
        Return the set of nodes that are required in the forward graph.

        NOTE: this does something similar to classify_nodes() in
        _functorch/partitioners.py, where nodes are classified into three
        types -- fwd, bwd, and unclaimed; both bwd and unclaimed nodes have
        partitioner_tag equal to "is_backward".
        """
        required_fwd_nodes: OrderedSet[torch.fx.Node] = OrderedSet()
        for node in joint_graph.nodes:
            if node.op == "placeholder" and "tangents" in node.target:
                continue
            if node.op == "output":
                continue
            if _has_tag_is_backward(node):
                continue
            required_fwd_nodes.add(node)
        return required_fwd_nodes

    def get_node_distance_to_bwd(
        joint_graph: torch.fx.Graph,
        required_fwd_nodes: OrderedSet[torch.fx.Node],
    ) -> dict[torch.fx.Node, int]:
        """
        Compute and return the distance of every node to the closest backward node.
        If a node is not an ancestor of a backward node, its distance is INT_INF.

        NOTE: this is adapted from
        https://github.com/pytorch/pytorch/blob/3196a3aca0f16792820158cfd451cb977f99ac7e/torch/_functorch/partitioners.py#L2089-L2097
        """
        dist_from_bw = {}
        for node in reversed(joint_graph.nodes):
            if node.op == "output":
                dist_from_bw[node] = INT_INF
            elif node not in required_fwd_nodes:
                dist_from_bw[node] = 0
            else:
                dist_from_bw[node] = INT_INF
                for user in node.users:
                    dist_from_bw[node] = min(dist_from_bw[node], dist_from_bw[user] + 1)
        return dist_from_bw

    def get_all_recomputable_forward_nodes(
        joint_graph: torch.fx.Graph,
    ) -> OrderedSet[torch.fx.Node]:
        """
        Return the set of all forward nodes that are recomputable.
        """
        required_fwd_nodes = get_required_fwd_nodes(joint_graph)
        dist_from_bw = get_node_distance_to_bwd(joint_graph, required_fwd_nodes)
        fwd_recomputable_nodes: OrderedSet[torch.fx.Node] = OrderedSet()
        for node in joint_graph.nodes:
            if (
                node in required_fwd_nodes
                and dist_from_bw[node] < INT_INF
                and node.op != "placeholder"
            ):
                fwd_recomputable_nodes.add(node)
        return fwd_recomputable_nodes

    def mark_nodes_as_must_save(must_save_nodes: list[torch.fx.Node]) -> None:
        """
        Given a list of nodes, mark them as must-save.
        """
        logger.info("mark_nodes_as_must_save: %s", must_save_nodes)
        for node in must_save_nodes:
            node.meta["recompute"] = CheckpointPolicy.MUST_SAVE

    fwd_recomputable_nodes = get_all_recomputable_forward_nodes(joint_graph)

    # Initialize all nodes as 'prefer recompute' and then adjust only the
    # must-save ones below
    for node in fwd_recomputable_nodes:
        if node.meta.get("recompute", None) is not None:
            # do not mess with all-gather nodes that have already been marked
            # for recomputation!
            continue
        node.meta["recompute"] = CheckpointPolicy.PREFER_RECOMPUTE
        # add an arbitrarily large graph id. I'm assuming 100000 here, which
        # should be fine and is the same value we add for the all-gather nodes
        node.meta["ac_graph_id"] = 100000

    # get the mapping between node name and node
    name_to_node_mapping = {}
    for node in fwd_recomputable_nodes:
        name_to_node_mapping[node.name] = node

    # populate node_to_predecessors, accounting for must_recompute nodes. In
    # particular, if a node is marked as must-recompute, then for its users,
    # their predecessors should be updated to instead be the predecessors of
    # the must-recompute node.
    node_to_predecessors = defaultdict(OrderedSet)
    for node in fwd_recomputable_nodes:
        node_to_predecessors[node] = OrderedSet(
            [pred for pred in node.all_input_nodes if pred in fwd_recomputable_nodes]
        )
    for node in fwd_recomputable_nodes:
        if not must_recompute(node):
            continue
        for user in node.users:
            if user in fwd_recomputable_nodes:
                node_to_predecessors[user].remove(node)
                node_to_predecessors[user].update(node_to_predecessors[node])

    # populate node_to_last_usage:
    # if A is last used by B, then A is in node_to_last_usage[B]
    node_to_last_usage = defaultdict(OrderedSet)
    last_used_by = {}
    for node in fwd_recomputable_nodes:
        last_used_by[node] = node
        for pred in node_to_predecessors[node]:
            last_used_by[pred] = node
    for producer, consumer in last_used_by.items():
        node_to_last_usage[consumer].add(producer)

    # loop through the nodes in forward-graph order and track, for each node,
    # which nodes' outputs are still in memory right after its execution.
    @dataclass
    class NodeCutScore:
        tot_mem: float
        alive_node_names: OrderedSet[str]

    alive_nodes = OrderedSet()
    node2score = {}
    for node in fwd_recomputable_nodes:
        if not must_recompute(node):
            alive_nodes.add(node)
        for a in node_to_last_usage[node]:
            # use discard rather than remove: must-recompute nodes appear in
            # node_to_last_usage but are never added to alive_nodes
            alive_nodes.discard(a)
        tot_mem = sum(_size_of(node) for node in alive_nodes)
        node2score[node] = NodeCutScore(
            tot_mem, OrderedSet([n.name for n in alive_nodes])
        )
262+
# divide the graph into stages with roughly equal memory usage.
263+
stages = defaultdict(OrderedSet)
264+
cum_mem_so_far = 0
265+
curr_stage_idx = 0
266+
target_mem = stage_size_in_GiB * 2**30
267+
for node in fwd_recomputable_nodes:
268+
stages[curr_stage_idx].add(node)
269+
if not must_recompute(node):
270+
cum_mem_so_far += _size_of(node)
271+
if cum_mem_so_far >= target_mem:
272+
curr_stage_idx += 1
273+
cum_mem_so_far = 0
274+
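
    # Worked example (illustrative numbers): with stage_size_in_GiB = 2.0,
    # target_mem is 2 GiB. If successive node outputs are 0.9, 0.8 and 0.5
    # GiB, the cumulative sizes are 0.9, 1.7 and 2.2 GiB; the third node
    # crosses the 2 GiB target, so it closes stage 0 and the next node
    # starts stage 1 with the counter reset to 0.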

    # loop through each stage and pick the best node to cut on, and save
    # the nodes that will be marked as must-save.
    nodes_to_save = OrderedSet()
    for stage in stages.values():
        best_node = min(stage, key=lambda x: node2score[x].tot_mem)
        nodes_to_save.update(node2score[best_node].alive_node_names)
    mark_nodes_as_must_save([name_to_node_mapping[n] for n in nodes_to_save])


def ac_joint_pass(graph: torch.fx.Graph, ac_stage_size_in_GiB: float = 2.0):
    force_recompute_fsdp_all_gather(graph)
    mark_nodes_as_must_save_to_stage_recomputation(
        graph, stage_size_in_GiB=ac_stage_size_in_GiB
    )
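
Since ac_joint_pass only mutates node.meta in place, it can also be pointed at a joint graph directly. A minimal sketch, assuming `joint_gm` is a joint forward-backward torch.fx.GraphModule produced by AOTAutograd:

    from autoparallel.activation_checkpointing import ac_joint_pass

    ac_joint_pass(joint_gm.graph, ac_stage_size_in_GiB=2.0)
    # Afterwards each tagged node carries node.meta["recompute"] set to one of
    # CheckpointPolicy.MUST_RECOMPUTE, PREFER_RECOMPUTE or MUST_SAVE, which the
    # min-cut partitioner consumes when splitting the joint graph into forward
    # and backward.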

autoparallel/api.py

Lines changed: 18 additions & 1 deletion
@@ -6,6 +6,7 @@
 import copy
 import itertools
 from contextlib import ExitStack
+from functools import partial
 from types import MethodType
 from typing import Optional

@@ -24,6 +25,7 @@
 from torch.export._unlift import _assign_attr
 from torch.export.unflatten import _AttrKind

+from .activation_checkpointing import ac_joint_pass
 from .apply_sharding import apply_sharding_to_model
 from .cast_parametrization import apply_dtype_cast, canonicalize_mp, set_dtype_cast
 from .init_weights import hook_params_setters

@@ -163,6 +165,8 @@ def __init__(
         input_fn,
         mesh: DeviceMesh,
         mp_policy: Optional[MixedPrecisionPolicy] = None,
+        enable_ac: bool = True,
+        ac_stage_size_in_GiB: float = 2.0,
     ):
         self.stack = ExitStack()
         self.fake_mode = (

@@ -187,14 +191,23 @@ def __init__(
         self.model = move_to_fake(model, self.fake_mode, device)
         self.input_fn = input_fn
         self.mesh = mesh
+        self.enable_ac = enable_ac
+        self.ac_stage_size_in_GiB = ac_stage_size_in_GiB

         # NB: rest of the construction happens in __enter__
-
         self.active = False

     def __enter__(self):
         assert self.active is False

+        if self.enable_ac:
+            self.orig_inductor_joint_custom_post_pass = (
+                torch._inductor.config.joint_custom_post_pass
+            )
+            torch._inductor.config.joint_custom_post_pass = partial(
+                ac_joint_pass, ac_stage_size_in_GiB=self.ac_stage_size_in_GiB
+            )
+
         self.build_model_graph()

         rescale_grad_comm_cost_for_mp = 1.0

@@ -225,6 +238,10 @@ def __enter__(self):

     def __exit__(self, exc_type, exc_val, exc_tb):
         self.active = None
+        if self.enable_ac:
+            torch._inductor.config.joint_custom_post_pass = (
+                self.orig_inductor_joint_custom_post_pass
+            )
         return self.stack.__exit__(exc_type, exc_val, exc_tb)

     def _assert_entered(self):
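
For context, a hypothetical end-to-end sketch of the new knobs (the constructor arguments before input_fn and the body of the `with` block are assumptions; only enable_ac and ac_stage_size_in_GiB come from this diff):

    from torch.distributed.device_mesh import init_device_mesh
    from autoparallel.api import AutoParallel

    mesh = init_device_mesh("cuda", (8,))
    with AutoParallel(
        model,                     # assumed: the module to parallelize
        input_fn,                  # callable producing example inputs
        mesh,
        enable_ac=True,            # installs ac_joint_pass on __enter__
        ac_stage_size_in_GiB=2.0,  # ~2 GiB of saved activations per stage
    ) as autop:
        ...                        # sharding/compilation happens here
    # on __exit__, the previous joint_custom_post_pass is restored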
