This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 436102b

Andrew Gu authored and facebook-github-bot committed
Added changes for FSDP fp8 all-gather (#130)
Summary: This adds two changes: 1. `Float8LinearMixin` saves a constructor for FSDP to use to construct the `Float8Tensor` for `w_fp8`. This is needed for FSDP to manage the unsharded gradient and since FSDP prefers to own the underlying data/storage for all-gather. 2. `Float8Linear.forward()` checks if `self._w_fp8` has been set (by FSDP) and skips the cast itself if so. I have tested this with P865757339 (not cleaned up), but I do not think we need to land yet. (This does mean there is a chance for changes to this repo to break FSDP fp8 all-gather, but I think it is fine for now.) Pull Request resolved: #130 Reviewed By: awgu Differential Revision: D50754666 Pulled By: drisspg fbshipit-source-id: 9f7a9bfc9f2b3cb7455cd8b8642f0c4e4a55ee64
1 parent fcb9011 commit 436102b

File tree

1 file changed: +11 -1 lines

float8_experimental/float8_linear.py

Lines changed: 11 additions & 1 deletion
@@ -214,6 +214,13 @@ def __init__(self, *args, **kwargs):
         # Note: this is not used in non-TP code.
         self.use_sequence_parallel = False
 
+        # Save the Float8Tensor constructor for FSDP.
+        # N.B. Do not partially apply the scale into the constructor because
+        # buffer Python IDs are not preserved by `nn.Module.to()` and the
+        # module could be moved to GPU after this constructor. Instead, FSDP
+        # will access the scale when it has ensured that it is on GPU.
+        self._float8_tensor_ctor = lambda *args, **kwargs: Float8Tensor(*args, **kwargs)
+
     def cast_x_to_float8(self, x, is_amax_initialized):
         # Duplicate the autocast logic for F.linear, so that the output
         # of our module has the right original precision
@@ -305,7 +312,10 @@ def forward(self, x):
         self.float8_pre_forward(x)
 
         x_fp8 = self.cast_x_to_float8(x, self.is_amax_initialized)
-        w_fp8 = self.cast_w_to_float8(self.weight, self.is_amax_initialized)
+        if getattr(self, "_w_fp8", None) is not None:  # FSDP handled the cast
+            w_fp8 = self._w_fp8
+        else:
+            w_fp8 = self.cast_w_to_float8(self.weight, self.is_amax_initialized)
         y = self.float8_mm(x_fp8, w_fp8, self.is_amax_initialized)
         y = self.cast_y_to_float8_in_bw(y)
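For context, below is a minimal sketch of how an FSDP hook might consume these two changes: it builds the unsharded `Float8Tensor` with the constructor saved as `_float8_tensor_ctor` and then sets `_w_fp8` on the module so that `forward()` skips its own cast. The function name `fsdp_pre_forward_cast`, the `all_gathered_fp8_data` argument, the `fp8_scale_w` buffer name, and the `Float8Tensor` argument order are illustrative assumptions and not part of this commit; the only interfaces the commit provides are `module._float8_tensor_ctor` and the `_w_fp8` attribute checked in `forward()`.

import torch

def fsdp_pre_forward_cast(module, all_gathered_fp8_data: torch.Tensor) -> None:
    # Hypothetical FSDP-side hook: FSDP owns the all-gathered fp8 storage, so
    # it constructs the unsharded Float8Tensor itself via the constructor saved
    # by Float8LinearMixin, rather than letting the module re-cast the weight.
    w_fp8 = module._float8_tensor_ctor(
        all_gathered_fp8_data,  # fp8 data produced by the all-gather
        module.fp8_scale_w,     # weight scale, read once it is on GPU (buffer name assumed)
        module.weight.dtype,    # original precision metadata (argument order assumed)
    )
    # Float8Linear.forward() sees that _w_fp8 is set and skips cast_w_to_float8.
    module._w_fp8 = w_fp8

After freeing the unsharded weight, FSDP would be expected to reset `module._w_fp8 = None` so that non-FSDP invocations fall back to the module's own `cast_w_to_float8` path.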