This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit b099049

y-sq authored and facebook-github-bot committed
Combine amax reduction calls (#163)
Summary:

~~Add an option to combine the amax sync reduction~~ (Use combined reduction as the default behavior.)

- Combine the reduction calls for each type of amax scaling factor (3 all_reduce calls in total). We could further combine them into one single call.
- Verified that other tests still pass, so the existing benchmark code does not need to change:
  - pytest test/test_base.py
  - ./test/test_fsdp.sh
- Tested the new option using small llama models with 8 FSDP groups. Time taken by sync_float8_amax_and_scale_history reduced from 29ms [1] to 3ms [2].

[1] Trace without combined reduction: https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/trace.138932292910521.json.gz&bucket=acadia
[2] https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/trace.202842416426594.json.gz&bucket=acadia

\* Trace [2] was updated after addressing the review comments.
\*\* Meta internal access is needed to open these traces.

Pull Request resolved: #163

Reviewed By: drisspg

Differential Revision: D52271595

Pulled By: y-sq

fbshipit-source-id: 65d27d32cb4d291dc6fbe62b7a916cf2e32e6482
1 parent: c40de9b
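
The commit replaces the per-layer all_reduce calls (three per Float8Linear layer) with three model-wide collectives, one per amax buffer type (fp8_amax_x, fp8_amax_w, fp8_amax_dL_dY). The sketch below illustrates the packing pattern for a single buffer type; it is a simplified illustration rather than the commit's code, and `sync_amax_combined` is a hypothetical helper name.

```python
import torch
import torch.distributed as dist


def sync_amax_combined(amax_buffers: list[torch.Tensor]) -> None:
    """Reduce many scalar amax buffers with a single all_reduce(MAX)."""
    if not dist.is_initialized() or len(amax_buffers) == 0:
        return
    # Pack all per-layer amax scalars into one flat tensor on the GPU.
    packed = torch.tensor(
        [buf.item() for buf in amax_buffers],
        dtype=torch.float32,
        device="cuda",
        requires_grad=False,
    )
    # One collective call instead of one per layer.
    dist.all_reduce(packed, op=dist.ReduceOp.MAX)
    # Write the globally synced maxima back into each layer's buffer.
    for buf, synced in zip(amax_buffers, packed):
        buf.copy_(synced)
```

With N fp8 layers this turns 3·N small collectives into 3 (or, if the three packed tensors were concatenated, a single call), which is consistent with the 29ms → 3ms improvement reported in the summary.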


float8_experimental/float8_linear_utils.py

Lines changed: 53 additions & 14 deletions
```diff
@@ -87,8 +87,20 @@ def swap_linear_with_float8_linear(
         swap_linear_with_float8_linear(child, module, emulate)
 
 
+def get_float8_layers(model: torch.nn.Module, fp8_classes=None):
+    if fp8_classes is None:
+        fp8_classes = Float8Linear
+
+    # Get all fp8 layers and tensors
+    fp8_layers = [
+        child for name, child in model.named_modules() if isinstance(child, fp8_classes)
+    ]
+
+    return fp8_layers
+
+
 def sync_float8_amax_and_scale_history(
-    model: torch.nn.Module, fp8_classes=None
+    model: torch.nn.Module, fp8_classes=None, fp8_layers=None
 ) -> None:
     """
     Manages the float8 amax and scale bookkeeping. In detail, it does the
@@ -103,27 +115,54 @@ def sync_float8_amax_and_scale_history(
 
     Args:
         model (torch.nn.Module): The model to track amaxes for
+        fp8_classes (optional): The fp8 classes to look for in the model.
+            The default is Float8Linear.
+            When using with TP, users can pass in the customized TP classes instead.
+        fp8_layers (optional): If fp8_layers are provided, fp8_classes are ignored,
+            and we loop over all fp8_layers to sync and update amax scale histories.
+            Users can use get_float8_layers to get all fp8 layers.
     """
 
     # For now, this is written in a naive way to maximize code readability.
-    # TODO(future): benchmark and optimize as needed, we can combine all
-    # the reductions into one and probably make the history update faster.
-    # Lazy import to avoid circular dependency
-
-    if fp8_classes is None:
-        fp8_classes = Float8Linear
-
-    for name, child in model.named_modules():
-        if not isinstance(child, fp8_classes):
-            continue
+    # TODO(future): benchmark and optimize as needed, we have combined all
+    # the reductions into one and we can probably try other optimizatons to
+    # make the history update faster.
+
+    if fp8_layers is None:
+        fp8_layers = get_float8_layers(model, fp8_classes)
+
+    if dist.is_initialized():
+        fp8_amax_x_tensor = torch.tensor(
+            [child.fp8_amax_x for child in fp8_layers],
+            dtype=torch.float32,
+            device="cuda",
+            requires_grad=False,
+        )
+        fp8_amax_w_tensor = torch.tensor(
+            [child.fp8_amax_w for child in fp8_layers],
+            dtype=torch.float32,
+            device="cuda",
+            requires_grad=False,
+        )
+        fp8_amax_dL_dY_tensor = torch.tensor(
+            [child.fp8_amax_dL_dY for child in fp8_layers],
+            dtype=torch.float32,
+            device="cuda",
+            requires_grad=False,
+        )
+        dist.all_reduce(fp8_amax_x_tensor, op=dist.ReduceOp.MAX)
+        dist.all_reduce(fp8_amax_w_tensor, op=dist.ReduceOp.MAX)
+        dist.all_reduce(fp8_amax_dL_dY_tensor, op=dist.ReduceOp.MAX)
 
+    for idx in range(len(fp8_layers)):
+        child = fp8_layers[idx]
         #
         # 1. in distributed contexts, syncs amax values across workers
         #
         if dist.is_initialized():
-            dist.all_reduce(child.fp8_amax_x, op=dist.ReduceOp.MAX)
-            dist.all_reduce(child.fp8_amax_w, op=dist.ReduceOp.MAX)
-            dist.all_reduce(child.fp8_amax_dL_dY, op=dist.ReduceOp.MAX)
+            child.fp8_amax_x = fp8_amax_x_tensor[idx]
+            child.fp8_amax_w = fp8_amax_w_tensor[idx]
+            child.fp8_amax_dL_dY = fp8_amax_dL_dY_tensor[idx]
 
         #
         # 2. adds the `amax` values to history
```
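
Based on the new get_float8_layers helper and fp8_layers argument shown in the diff, a caller can collect the fp8 layers once and reuse the list every iteration instead of re-walking the module tree per step. The following is a hedged usage sketch; the training loop, data_loader, and build_fp8_model helper are placeholders rather than code from this repository, and the import path is assumed.

```python
from float8_experimental.float8_linear_utils import (  # assumed import path
    get_float8_layers,
    sync_float8_amax_and_scale_history,
)

# Assume `model` already has its nn.Linear modules swapped to Float8Linear
# (e.g. via swap_linear_with_float8_linear).
model = build_fp8_model()  # hypothetical helper

# Collect the fp8 layers once, instead of scanning named_modules() every step.
fp8_layers = get_float8_layers(model)

for batch in data_loader:  # hypothetical iterable of input tensors
    loss = model(batch).sum()
    loss.backward()
    # Passing fp8_layers skips the per-step module-tree scan; the combined
    # all_reduce path runs automatically when torch.distributed is initialized.
    sync_float8_amax_and_scale_history(model, fp8_layers=fp8_layers)
```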
