This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 682c2e8

add to delayed linear
1 parent 27a6a7f commit 682c2e8

File tree: 3 files changed, +71 −6 lines changed


float8_experimental/float8_dynamic_linear.py

Lines changed: 13 additions & 3 deletions
@@ -53,8 +53,13 @@ def forward(self, x):
 
         # y = torch.nn.functional.linear(x_fp8, w_fp8, self.bias)
         weight_scale = tensor_to_scale(self.weight, torch.float8_e4m3fn)
-        y = float8_linear.apply(
-            x_fp8, self.weight, weight_scale, None, self.emulate, False
+        y = float8_linear(
+            x_fp8,
+            self.weight,
+            weight_scale,
+            None,
+            self.emulate,
+            self.recompute_weight_cast,
         )
         # Cast gradY to float8_e5m2 during backward
         y = self.cast_to_float8e5m2_bw(y)
@@ -72,17 +77,22 @@ def cast_to_float8e5m2_bw(self, gradY):
         return NoopFwToFloat8E5M2Bw.apply(gradY, self.emulate)
 
     @classmethod
-    def from_float(cls, mod, emulate: bool = False):
+    def from_float(
+        cls, mod, emulate: bool = False, recompute_weight_cast: bool = False
+    ):
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear
 
         Args:
             mod (torch.nn.Linear): nn.Linear to convert
             emulate (bool): whether to emulate fp8 matmul logic in float32
+            recompute_weight_cast (bool): whether to recompute the weight cast on every
+                backwards pass
         """
         with torch.device("meta"):
             new_mod = cls(mod.in_features, mod.out_features, bias=False)
         new_mod.weight = mod.weight
         new_mod.bias = mod.bias
         new_mod.emulate = emulate
+        new_mod.recompute_weight_cast = recompute_weight_cast
         return new_mod
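
For context, a minimal usage sketch of the new from_float signature. The Float8DynamicLinear class name is assumed from the file name and is not shown in this diff; emulate=True is used so the sketch does not depend on fp8-capable hardware, and the overall setup is illustrative rather than part of the commit.

# Hypothetical usage sketch (not part of the commit): convert a plain nn.Linear
# to the dynamic fp8 linear defined in float8_dynamic_linear.py and opt into
# re-casting the weight during the backward pass.
import torch
import torch.nn as nn

from float8_experimental.float8_dynamic_linear import Float8DynamicLinear  # assumed class name

mod = nn.Linear(64, 32, bias=False)

# recompute_weight_cast=True saves only the original weight for backward and
# re-casts it to float8 in the backward pass (less memory, extra cast work).
fp8_mod = Float8DynamicLinear.from_float(
    mod, emulate=True, recompute_weight_cast=True
)

x = torch.randn(16, 64)
y = fp8_mod(x)
y.sum().backward()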

float8_experimental/float8_linear.py

Lines changed: 33 additions & 2 deletions
@@ -19,6 +19,7 @@
 import float8_experimental.config as config
 
 import torch
+from float8_experimental.float8_ops import float8_linear
 
 from float8_experimental.float8_tensor import Float8Tensor
 
@@ -172,6 +173,13 @@ def __init__(self, *args, **kwargs):
         # and torch.compile, this option can disable them
         self.enable_pre_and_post_forward = config.enable_pre_and_post_forward
 
+        # This flag controls what gets saved for backward. The default, False, saves the
+        # casted weight for backward; this typically increases memory usage because both
+        # the weight parameter and the casted weight are kept on device. If True, only the
+        # weight parameter is saved and it is re-cast to fp8 during the backward pass. For
+        # traditional FSDP this should be True so the un-sharded weight is not saved.
+        self.recompute_weight_cast = False
+
     def register_always_float32_buffer(
         self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True
     ) -> None:
@@ -214,6 +222,20 @@ def cast_x_to_float8(
         )
         return x_fp8
 
+    def _maybe_init_amaxes_scales_weight(
+        self, w: torch.Tensor, is_amax_initialized: bool
+    ):
+        scale_fn_name = self.recipe.scale_fn_name
+        _maybe_initialize_amaxes_scales_for_float8_cast(
+            w,
+            self.fp8_amax_w,
+            self.fp8_amax_history_w,
+            self.fp8_scale_w,
+            scale_fn_name,
+            torch.float8_e4m3fn,
+            is_amax_initialized,
+        )
+
     def cast_w_to_float8(
         self, w: torch.Tensor, is_amax_initialized: bool
     ) -> torch.Tensor:
@@ -284,9 +306,18 @@ def forward(self, x):
         self.float8_pre_forward(x)
 
         x_fp8 = self.cast_x_to_float8(x, self.is_amax_initialized)
-        w_fp8 = self.cast_w_to_float8(self.weight, self.is_amax_initialized)
+        # w_fp8 = self.cast_w_to_float8(self.weight, self.is_amax_initialized)
+        self._maybe_init_amaxes_scales_weight(self.weight, self.is_amax_initialized)
 
-        y = torch.matmul(x_fp8, w_fp8.t())
+        y = float8_linear(
+            x_fp8,
+            self.weight,
+            self.fp8_scale_w,
+            self.fp8_amax_w,
+            self.emulate,
+            self.recompute_weight_cast,
+        )
+        # y = torch.matmul(x_fp8, w_fp8.t())
 
         # Cast gradY to float8_e5m2 during backward
         y = self.cast_y_to_float8_in_bw(y, self.emulate)
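
The save-vs-recompute trade-off described in the new comment can be illustrated with a small, library-independent autograd.Function. The names and the stand-in cast below are illustrative and are not the library's implementation; the cast is treated as straight-through for the weight gradient, as fp8 training recipes typically do.

# Minimal sketch of the trade-off: either save the casted weight for backward
# (faster backward, more memory) or save the original weight and re-cast it
# during backward (less memory, extra cast work).
import torch


def cast_like_fp8(w: torch.Tensor) -> torch.Tensor:
    # Stand-in for the fp8 cast; a real implementation would scale and convert
    # to torch.float8_e4m3fn.
    return w.half().float()


class RecomputeCastLinear(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, weight, recompute_cast: bool):
        w_cast = cast_like_fp8(weight)
        ctx.recompute_cast = recompute_cast
        if recompute_cast:
            # Save only the original weight; re-cast it in backward.
            ctx.save_for_backward(x, weight)
        else:
            # Save the casted weight, trading memory for backward speed.
            ctx.save_for_backward(x, w_cast)
        return x @ w_cast.t()

    @staticmethod
    def backward(ctx, grad_out):
        x, w = ctx.saved_tensors
        w_cast = cast_like_fp8(w) if ctx.recompute_cast else w
        grad_x = grad_out @ w_cast
        # Straight-through estimate: gradient w.r.t. the casted weight is used
        # as the gradient w.r.t. the original weight.
        grad_w = grad_out.t() @ x
        return grad_x, grad_w, None


x = torch.randn(8, 4, requires_grad=True)
w = torch.randn(3, 4, requires_grad=True)
y = RecomputeCastLinear.apply(x, w, True)
y.sum().backward()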

float8_experimental/float8_ops.py

Lines changed: 25 additions & 1 deletion
@@ -164,7 +164,7 @@ def autocast_to_copy(aten_op, args, kwargs=None):
     )
 
 
-class float8_linear(torch.autograd.Function):
+class _float8_linear(torch.autograd.Function):
     """Custom autograd function for computing torch.nn.Linear on Float8Tensor.
 
     This is needed for a couple reasons, we want to have fine grained control over the
@@ -238,3 +238,27 @@ def backward(ctx, go_fp8: torch.Tensor):
 
         empty_grads = None, None, None, None, None, None, None, None, None
         return dL_dX, dL_dW, *empty_grads
+
+
+# Need to allow_in_graph because:
+# (1) the forward returns a plain tensor
+# (2) the backward accepts a Float8Tensor subclass
+# dynamo has no good way to be told what the type of
+# the grad_out is today, so it (incorrectly) assumes it is also a plain tensor.
+@torch._dynamo.allow_in_graph
+def float8_linear(
+    x_fp8: torch.Tensor,
+    original_weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_amax_buffer: Optional[torch.Tensor],
+    emulate: bool,
+    recompute_float8_weight: bool,
+):
+    return _float8_linear.apply(
+        x_fp8,
+        original_weight,
+        weight_scale,
+        weight_amax_buffer,
+        emulate,
+        recompute_float8_weight,
+    )
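
The wrapper pattern added here is general: a plain function decorated with torch._dynamo.allow_in_graph keeps the autograd.Function call as a single opaque node during tracing, so dynamo does not second-guess the type of grad_out in the custom backward. A self-contained sketch of the same pattern, with illustrative names (not from this commit):

import torch


class _ScaledMatmul(torch.autograd.Function):
    # Toy stand-in for _float8_linear: scale the input, then matmul.
    @staticmethod
    def forward(ctx, x, w, scale):
        ctx.save_for_backward(x, w, scale)
        return (x * scale) @ w.t()

    @staticmethod
    def backward(ctx, grad_out):
        x, w, scale = ctx.saved_tensors
        grad_x = (grad_out @ w) * scale
        grad_w = grad_out.t() @ (x * scale)
        return grad_x, grad_w, None


# The plain-function wrapper is what callers use; allow_in_graph tells dynamo
# to treat the call as one traceable unit instead of tracing into .apply().
@torch._dynamo.allow_in_graph
def scaled_matmul(x, w, scale):
    return _ScaledMatmul.apply(x, w, scale)


@torch.compile(backend="eager")  # "eager" backend keeps the sketch toolchain-free
def f(x, w, scale):
    return scaled_matmul(x, w, scale).sum()


x = torch.randn(8, 4, requires_grad=True)
w = torch.randn(3, 4, requires_grad=True)
scale = torch.tensor(2.0)
f(x, w, scale).backward()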
