
Commit 4dbd816

tushar00jain authored and facebook-github-bot committed
integrate torchcomms
Differential Revision: D86343575
1 parent 15bd67c commit 4dbd816

File tree: 3 files changed, +703, -4 lines

torchft/manager.py

Lines changed: 21 additions & 4 deletions
@@ -51,6 +51,7 @@
 
 import torch
 import torch.distributed as dist
+import torchcomms
 from torch.distributed import ReduceOp, TCPStore
 from torch.distributed.distributed_c10d import AllreduceOptions, ReduceOp, Work
 
@@ -63,6 +64,7 @@
 
 if TYPE_CHECKING:
     from torchft.process_group import ProcessGroup
+    from torchft.torchcomms import TorchComm
 
 IS_TRITON_AVAILABLE = True
 try:
@@ -163,7 +165,7 @@ class Manager:
 
     def __init__(
         self,
-        pg: "ProcessGroup",
+        pg: Union["ProcessGroup", "TorchComm"],
         load_state_dict: Optional[Callable[[T], None]],
         state_dict: Optional[Callable[[], T]],
         min_replica_size: int,
@@ -188,6 +190,7 @@ def __init__(
     ) -> None:
        """
        Args:
+            pg: process group or torchcomms wrapper to use for communication.
            load_state_dict: function to load the state dict when recovering
            state_dict: function to save the state dict with recovering
            min_replica_size: minimum number of replicas on each step
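
The widened `pg` parameter means code that touches the wrapped communicator has to narrow the union before calling API that only one side exposes. A minimal sketch of that narrowing pattern, mirroring the TYPE_CHECKING-only imports in this file; `describe_comm` is a hypothetical helper, not part of this commit:

```python
# Hypothetical helper illustrating the Union["ProcessGroup", "TorchComm"]
# annotation added above; not part of torchft.
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    # Mirrors the TYPE_CHECKING-only imports in torchft/manager.py.
    from torchft.process_group import ProcessGroup
    from torchft.torchcomms import TorchComm


def describe_comm(pg: Union["ProcessGroup", "TorchComm"]) -> str:
    # Import at call time so the annotation can stay string-based, as in manager.py.
    from torchft.process_group import ProcessGroup

    return "process_group" if isinstance(pg, ProcessGroup) else "torchcomms"
```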
@@ -456,7 +459,9 @@ def allreduce(
         try:
             # Run the allreduce async and save the work object so we can wait on
             # it later.
+            # TODO: Support quantization with torchcomms
             if should_quantize and IS_TRITON_AVAILABLE:
+                assert isinstance(self._pg, ProcessGroup)
                 work = allreduce_quantized(
                     [tensor],
                     pg_reduce_op,
@@ -465,9 +470,21 @@ def allreduce(
                     torch.accelerator.current_stream(),
                 )
             else:
-                opts = AllreduceOptions()
-                opts.reduceOp = pg_reduce_op
-                work = self._pg.allreduce([tensor], opts)
+                # Check if we're using torchcomms or ProcessGroup
+                if isinstance(self._pg, TorchComm):
+                    # Convert PyTorch ReduceOp to torchcomms ReduceOp
+                    if pg_reduce_op == ReduceOp.SUM:
+                        tc_op = torchcomms.ReduceOp.SUM
+                    elif pg_reduce_op == ReduceOp.AVG:
+                        tc_op = torchcomms.ReduceOp.AVG
+                    else:
+                        raise AssertionError("unsupported reduce op")
+
+                    work = self._pg.allreduce(tensor, tc_op)
+                else:
+                    opts = AllreduceOptions()
+                    opts.reduceOp = pg_reduce_op
+                    work = self._pg.allreduce([tensor], opts)
 
             # schedule grad normalization as a continuation
             # on the Future
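
The core of this hunk is the new else-branch: a torchcomms wrapper takes the tensor directly plus a torchcomms.ReduceOp, while a ProcessGroup keeps the list-of-tensors plus AllreduceOptions signature. A standalone sketch of that dispatch, assuming only the torchcomms calls already shown in the diff; the helper name `dispatch_allreduce` is illustrative, not a torchft API:

```python
# Illustrative restatement of the dispatch in Manager.allreduce above;
# dispatch_allreduce is not part of torchft.
from typing import Union

import torch
import torchcomms
from torch.distributed import ReduceOp
from torch.distributed.distributed_c10d import AllreduceOptions

from torchft.process_group import ProcessGroup
from torchft.torchcomms import TorchComm


def dispatch_allreduce(
    pg: Union[ProcessGroup, TorchComm],
    tensor: torch.Tensor,
    pg_reduce_op: ReduceOp,
):
    if isinstance(pg, TorchComm):
        # torchcomms takes the tensor directly and its own ReduceOp enum.
        if pg_reduce_op == ReduceOp.SUM:
            tc_op = torchcomms.ReduceOp.SUM
        elif pg_reduce_op == ReduceOp.AVG:
            tc_op = torchcomms.ReduceOp.AVG
        else:
            raise AssertionError("unsupported reduce op")
        return pg.allreduce(tensor, tc_op)

    # ProcessGroup keeps the original list-of-tensors + options signature.
    opts = AllreduceOptions()
    opts.reduceOp = pg_reduce_op
    return pg.allreduce([tensor], opts)
```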

torchft/process_group.py

Lines changed: 2 additions & 0 deletions
@@ -1331,6 +1331,7 @@ class ManagedProcessGroup(ProcessGroupWrapper):
     """
 
     def __init__(self, manager: "Manager") -> None:
+        assert isinstance(manager._pg, ProcessGroup)
         super().__init__(pg=manager._pg)
 
         self._manager = manager
@@ -1350,6 +1351,7 @@ def size(self) -> int:
         return self._manager.num_participants()
 
     def getBackendName(self) -> str:
+        assert isinstance(self._manager._pg, ProcessGroup)
         return self._manager._pg.getBackendName()

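Both asserts encode the same invariant: ManagedProcessGroup delegates to ProcessGroup-specific API such as getBackendName(), so the Manager it wraps must hold a real ProcessGroup rather than a TorchComm. A hedged sketch of that guard written as a plain function; `backend_name_or_none` is not part of the codebase, and `manager` is assumed to be an already-constructed torchft Manager:

```python
# Illustrative guard mirroring the asserts added above; not part of torchft.
from typing import Optional

from torchft.manager import Manager
from torchft.process_group import ProcessGroup


def backend_name_or_none(manager: Manager) -> Optional[str]:
    # Only a ProcessGroup is known to expose getBackendName(); the TorchComm
    # wrapper is not assumed to, which is why ManagedProcessGroup asserts the
    # type before delegating.
    if isinstance(manager._pg, ProcessGroup):
        return manager._pg.getBackendName()
    return None
```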