Commit c7bca21

fsdp working
1 parent 784d087 · commit c7bca21

2 files changed: +86 −32 lines

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 0 additions & 1 deletion
@@ -41,7 +41,6 @@ def _scaled_grouped_mm(
         out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
         use_triton_for_per_group_scales (bool): Whether to use custom triton kernels to compute per-group scales. Default is True.
     """
-    print("SCALED_GROUPED_MM")
     return _Float8GroupedMM.apply(
         A,
         B_t,
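
For orientation, here is a minimal call sketch of this prototype entry point. It assumes the positional argument order suggested by the _Float8GroupedMM.apply(A, B_t, ...) call and the parameter docstring above; the shapes, the transposed-weight layout, and the offsets format are illustrative assumptions, not guaranteed by this diff.

# Hedged sketch only: the exact signature and layout expectations of
# _scaled_grouped_mm are inferred from this diff, not from documentation.
import torch
from torchao.prototype.moe_training import _scaled_grouped_mm

num_experts, dim, hidden, tokens = 4, 256, 512, 1024
A = torch.randn(tokens, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)
B = torch.randn(num_experts, hidden, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)
B_t = B.transpose(-2, -1)  # per-expert transposed weights, matching the B_t argument name
offs = torch.arange(  # end offset of each token group, one group per expert
    tokens // num_experts, tokens + 1, tokens // num_experts, dtype=torch.int32, device="cuda"
)
out = _scaled_grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)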
Lines changed: 86 additions & 31 deletions
@@ -1,9 +1,28 @@
+from typing import Any, Optional, Tuple
+
 import torch
-from torch.utils._pytree import tree_map
+import torch.utils._pytree as pytree
+from torch._prims_common import suggest_memory_format
 
 from torchao.prototype.moe_training import _scaled_grouped_mm
 
-
+# FSDP pads its local tensor on dim-0. The subclass should be preserved such
+# that the padded local tensor (and any transformations like copying to GPU)
+# is of the subclass as well.
+_ops_to_preserve_subclass = {
+    torch.ops.aten.empty_like.default,
+    torch.ops.aten.new_zeros.default,
+    torch.ops.aten.slice.Tensor,
+    torch.ops.aten.copy_.default,
+    torch.ops.aten.view.default,
+    torch.ops.aten.as_strided.default,
+    torch.ops.aten._to_copy.default,
+    torch.ops.aten._pin_memory.default,
+    torch.ops.aten.split.Tensor,
+    torch.ops.aten.clone.default,
+}
+
+
 class ScaledGroupedMMTensor(torch.Tensor):
     """
     ScaledGroupedMMTensor is a simple tensor subclass that wraps a regular tensor
@@ -13,22 +32,34 @@ class ScaledGroupedMMTensor(torch.Tensor):
 
     grouped_mm_func_name = "_grouped_mm"
     offs_arg_name = "offs"
-    use_triton_for_per_group_scales = True
 
-    def __init__(
-        self, data: torch.Tensor, use_triton_for_per_group_scales: bool = True
+    @staticmethod
+    def __new__(
+        cls,
+        tensor: torch.Tensor,
     ):
-        self._data = data
-        self._use_triton_for_per_group_scales = use_triton_for_per_group_scales
+        return torch.Tensor._make_wrapper_subclass(
+            cls,
+            tensor.size(),
+            strides=tensor.stride(),
+            storage_offset=tensor.storage_offset(),
+            memory_format=suggest_memory_format(tensor),
+            dtype=tensor.dtype,
+            layout=tensor.layout,
+            device=tensor.device,
+            pin_memory=tensor.is_pinned(),
+            requires_grad=tensor.requires_grad,
+        )
 
-    def __repr__(self):
-        return f"ScaledGroupedMMTensor(use_triton_for_per_group_scales={self._use_triton_for_per_group_scales}, {self._data})"
-
-    def __repr__(self):
-        return f"ScaledGroupedMMTensor(data={self._data})"
+    def __init__(
+        self,
+        tensor: torch.Tensor,
+    ):
+        self._data = tensor
 
     @classmethod
     def __torch_function__(cls, func, types, args, kwargs={}):
+        # override the grouped mm op to use the differentiable _scaled_grouped_mm
         if func.__name__ == cls.grouped_mm_func_name:
             # Use torchao scaled grouped mm with dynamic quant for
             # "2d x 3d with offsets" case (used for routed experts).
@@ -42,32 +73,56 @@ def __torch_function__(cls, func, types, args, kwargs={}):
             B_is_3d = B.dim() == 3
             has_offs = kwargs.get(cls.offs_arg_name) is not None
             if A_is_2d and B_is_3d and has_offs:
-                # prefer to use B to check use_triton, as that will be the weight/nn.Parameter
-                # that is converted to ScaledGroupedMMTensor
-                use_triton = (
-                    B._use_triton_for_per_group_scales
-                    if isinstance(B, cls)
-                    else A._use_triton_for_per_group_scales
-                )
                 return _scaled_grouped_mm(
                     *args,
-                    use_triton_for_per_group_scales=use_triton,
                     **kwargs,
                 )
 
-        # Disable torch_function by hand because we don't want
+        # Disable torch_function by hand because we don't want
         # the wrapping behavior of the super() impl, go directly to dispatch
-        with torch._C.DisableTorchFunction():
+        # wrap = lambda x: ScaledGroupedMMTensor(x)
+        # wrapped_args, wrapped_kwargs = pytree.tree_map_only(torch.Tensor, wrap, (args, kwargs))
+        with torch._C.DisableTorchFunctionSubclass():
             return func(*args, **kwargs)
 
-
     @classmethod
     def __torch_dispatch__(cls, func, types, args, kwargs={}):
-        unwrap = lambda x: x._data if isinstance(x, cls) else x
-        wrap = lambda x: cls(x) if isinstance(x, torch.Tensor) else x
-        unwrapped_args, unwrapped_kwargs = tree_map(unwrap, (args, kwargs))
-        output = super().__torch_dispatch__(func, types, unwrapped_args, unwrapped_kwargs)
-        wrapped_output = tree_map(wrap, output)
-        print(func.__name__)
-        print(wrapped_output)
-        return wrapped_output
+        # detach is special case
+        if func == torch.ops.aten.detach.default:
+            return ScaledGroupedMMTensor(args[0]._data)
+
+        # unwrap args and kwargs
+        unwrap = lambda tensor: tensor._data
+        args, kwargs = pytree.tree_map_only(
+            ScaledGroupedMMTensor, unwrap, (args, kwargs or {})
+        )
+
+        # perform op
+        out = func(*args, **kwargs)
+
+        # return regular tensors for ops that don't preserve subclass
+        if func not in _ops_to_preserve_subclass:
+            return out
+
+        # wrap outputs back into ScaledGroupedMMTensor for ops that do preserve subclass
+        return pytree.tree_map_only(
+            torch.Tensor,
+            lambda x: ScaledGroupedMMTensor(x),
+            out,
+        )
+
+    def fsdp_pre_all_gather(self, mesh):
+        return (self._data,), ()
+
+    def fsdp_post_all_gather(
+        self,
+        all_gather_outputs: Tuple[torch.Tensor, ...],
+        metadata: Any,
+        param_dtype: torch.dtype,
+        *,
+        out: Optional[torch.Tensor] = None,
+    ):
+        (data,) = all_gather_outputs
+        return ScaledGroupedMMTensor(
+            data,
+        ), (data,)
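
To show how the pieces above are meant to fit together, here is a hedged usage sketch. The module, parameter layout, and the fully_shard import path are assumptions for illustration; only ScaledGroupedMMTensor, its torch_function/torch_dispatch overrides, and the fsdp_pre_all_gather / fsdp_post_all_gather hooks come from the diff itself.

# Hedged sketch: wrap an expert-weight parameter so that (a) calls to the
# private torch._grouped_mm op are routed to torchao's _scaled_grouped_mm via
# __torch_function__, and (b) FSDP2 all-gathers the underlying data through
# fsdp_pre_all_gather / fsdp_post_all_gather. Module shape and import paths
# are illustrative, not part of this commit.
import torch
import torch.nn as nn
from torch.distributed._composable.fsdp import fully_shard  # FSDP2 entry point

class RoutedExperts(nn.Module):
    def __init__(self, num_experts: int, dim: int, hidden: int):
        super().__init__()
        w = torch.randn(num_experts, dim, hidden, dtype=torch.bfloat16)
        # Wrapping the weight keeps every op on it inside the subclass.
        self.w = nn.Parameter(ScaledGroupedMMTensor(w))

    def forward(self, x: torch.Tensor, offs: torch.Tensor) -> torch.Tensor:
        # 2D activations x 3D expert weights with offsets: intercepted by
        # ScaledGroupedMMTensor.__torch_function__ and sent to _scaled_grouped_mm.
        return torch._grouped_mm(x, self.w, offs=offs)

model = RoutedExperts(num_experts=4, dim=256, hidden=512).cuda()
fully_shard(model)  # FSDP2 pads and shards the local tensor on dim-0; the aten ops
                    # in _ops_to_preserve_subclass keep the wrapper intact end to end.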
