Commit 716c19b

Improve compute cost for all2all (#159)
* Add compute cost for all_to_all. The all2all implementation performs additional input/output copies depending on the in_shard / out_shard dims; see https://github.com/pytorch/pytorch/blob/afdd4247a2251b3f4c2f4b402cb625f46d6784ba/torch/csrc/distributed/c10d/Functional.cpp#L597-L617 for more details, and the sketch below.
* Add the .contiguous() cost as well. We still need a way of deciding whether the input is contiguous or not.
* Refactor the read/write time estimate into a helper function.
* Cleanup.
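
A minimal standalone sketch of the copy counting described above (the helper name and the fixed dim-0 convention are illustrative only, mirroring the diff below; this is not code from the commit):

def estimate_all_to_all_copies(in_shard_dim, out_shard_dim, is_contiguous):
    # all_to_all_single exchanges data along dim 0, so a shard placed on any
    # other dim needs an extra reshuffle copy on that side; a non-contiguous
    # input additionally pays for a .contiguous() copy
    num_copies = 0
    if not is_contiguous:
        num_copies += 1
    if in_shard_dim != 0:
        num_copies += 1
    if out_shard_dim != 0:
        num_copies += 1
    return num_copies

# e.g. Shard(1) -> Shard(2) with a (conservatively assumed) non-contiguous
# input gives 3 extra read/write passes over the tensor
assert estimate_all_to_all_copies(1, 2, is_contiguous=False) == 3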
1 parent ff9c574 commit 716c19b

2 files changed: +37 −26 lines changed

autoparallel/collective_runtime_estimation.py

Lines changed: 20 additions & 14 deletions

@@ -15,7 +15,7 @@
 )
 from torch.distributed.tensor.placement_types import Partial, Shard

-from .compute_estimation import _get_device_gmem_bandwidth
+from .compute_estimation import compute_read_write_time


 def all_to_all_cost(bytes_gb: float, mesh_topo: MeshTopoInfo, mesh_dim: int) -> float:
@@ -65,16 +65,11 @@ def redistribute_cost(
     comm_bytes_gb = (
         spec_to_bytes(current_spec) / current_spec.num_shards / 1024 / 1024 / 1024
     )
-    gpu_memory_bandwidth = _get_device_gmem_bandwidth() / 1024**3  # GB/s
     # Transformation that considered for redistribute cost:
     # 1. allgather 2. alltoall
     # 3. allreduce 4. reduce_scatter
     curr_placements = [current_spec.placements[i] for i in order]
     tgt_placements = [target_spec.placements[i] for i in order]
-
-    # suppose 70% efficiency for the non-collective operators
-    read_write_efficiency = 0.70
-    kernel_launch_overhead = 7  # us
     for i, current, target in zip(order, curr_placements, tgt_placements):
         if current == target:
             continue
@@ -90,15 +85,29 @@ def redistribute_cost(
             # which corresponds to reshuffling the whole output tensor
             # we multiply the cost by 2 because we need to count input and output
             # reads for the reshuffle
-            compute_cost = comm_bytes_gb * 2 / gpu_memory_bandwidth * 1e6  # us
-            compute_cost = max(
-                compute_cost / read_write_efficiency, kernel_launch_overhead
-            )
+            compute_cost = compute_read_write_time(comm_bytes_gb * 2 * 1024**3)
             cost += compute_cost
         elif current.is_shard() and target.is_shard():
+            current = cast(Shard, current)
+            target = cast(Shard, target)
             # should be alltoall comm, since we haven't implement it yet, add penalty
             # to favor allgather instead
             cost += all_to_all_cost(comm_bytes_gb, mesh_topo, i)  # us
+
+            num_copies = 0
+            is_contiguous = False
+            if not is_contiguous:
+                num_copies += 1
+
+            if current.dim != 0:
+                num_copies += 1
+
+            if target.dim != 0:
+                num_copies += 1
+
+            compute_cost = compute_read_write_time(comm_bytes_gb * 2 * 1024**3)
+            cost += num_copies * compute_cost
+
         elif current.is_partial() and target.is_replicate():
             # add up allreduce comm cost
             cost += allreduce_cost(comm_bytes_gb, mesh_topo, i)
@@ -111,10 +120,7 @@ def redistribute_cost(
             # which corresponds to reshuffling the whole input tensor
             # we multiply the cost by 2 because we need to count input and output
             # reads for the reshuffle
-            compute_cost = comm_bytes_gb * 2 / gpu_memory_bandwidth * 1e6  # us
-            compute_cost = max(
-                compute_cost / read_write_efficiency, kernel_launch_overhead
-            )
+            compute_cost = compute_read_write_time(comm_bytes_gb * 2 * 1024**3)
             cost += compute_cost
             # after reduce_scatter the comm bytes for further collectives halved.
             comm_bytes_gb /= num_devices_on_mesh_dim

autoparallel/compute_estimation.py

Lines changed: 17 additions & 12 deletions

@@ -281,6 +281,21 @@ def _compute_flops(fn, *args, **kwargs):
     return flop_counter.get_total_flops(), out


+def compute_read_write_time(read_write_bytes):
+    gpu_memory_bandwidth = _get_device_gmem_bandwidth()
+    read_write_time = read_write_bytes / gpu_memory_bandwidth * 1e6  # us
+
+    # suppose 70% efficiency for the operator
+    read_write_efficiency = 0.70
+
+    kernel_launch_overhead = 7  # us
+
+    read_write_time = max(
+        read_write_time / read_write_efficiency, kernel_launch_overhead
+    )
+    return read_write_time
+
+
 def estimate_strategy_runtime_cost(node, strategy):
     """
     This function estimates the runtime cost of a given strategy
@@ -297,17 +312,7 @@ def estimate_strategy_runtime_cost(node, strategy):
     flops, out = _compute_flops(node.target, *args, **kwargs)

     read_write_bytes = compute_memory_cost(node.target, args, out)
-    gpu_memory_bandwidth = _get_device_gmem_bandwidth()
-    read_write_time = read_write_bytes / gpu_memory_bandwidth * 1e6  # us
-
-    # suppose 70% efficiency for the operator
-    read_write_efficiency = 0.70
-
-    kernel_launch_overhead = 7  # us
-
-    read_write_time = max(
-        read_write_time / read_write_efficiency, kernel_launch_overhead
-    )
+    read_write_time = compute_read_write_time(read_write_bytes)

     if flops == 0:
         return read_write_time
@@ -320,7 +325,7 @@ def estimate_strategy_runtime_cost(node, strategy):
     # suppose 70% efficiency for the operator
     compute_efficiency = 0.70
     compute_time = flops / gpu_flops * 1e6  # us
-    compute_time = max(compute_time / compute_efficiency, kernel_launch_overhead)
+    compute_time = compute_time / compute_efficiency

     return max(compute_time, read_write_time)
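
For reference, the new helper's behavior on example sizes (a standalone re-implementation for illustration; the real bandwidth comes from _get_device_gmem_bandwidth() and is device dependent, so the 2 TB/s figure below is only an assumption):

def compute_read_write_time_sketch(read_write_bytes, gpu_memory_bandwidth=2.0e12):
    # bytes / (bytes/s) -> seconds, * 1e6 -> us, derated by the assumed 70%
    # efficiency, and floored at the 7 us kernel launch overhead
    read_write_time = read_write_bytes / gpu_memory_bandwidth * 1e6  # us
    return max(read_write_time / 0.70, 7.0)

# tiny transfer: the bandwidth term (~0.07 us) is below the floor -> 7 us
print(compute_read_write_time_sketch(100 * 1024))
# 1 GiB moved means 2 GiB of reads + writes (hence comm_bytes_gb * 2 * 1024**3
# at the call sites); at the assumed 2 TB/s this is ~1074 us / 0.7 ~= 1534 us
print(compute_read_write_time_sketch(2 * 1024**3))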

0 commit comments
