Commit a5396a8

committed
test allreduce failures for diloco
Summary:
- test when allreduce fails but no new nodes join
- added another event of type `AllreduceFailure`
- This new event required modifying some manager code to inject the failure
1 parent 17c7cb9 commit a5396a8
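
To make the mechanism in the summary concrete, here is a small self-contained sketch of the pattern this commit uses. This is plain Python, not torchft code; the names FakeManager and FakeInjector are illustrative stand-ins for Manager and EventInjector. The injector holds a reference to a manager-like object, arms a one-shot flag at a chosen step, and the next "allreduce" observes the flag and fails exactly once.

    from typing import Optional


    class FakeManager:
        """Stand-in for the real Manager: owns a one-shot failure flag."""

        def __init__(self) -> None:
            self._should_fail_allreduce = False

        def fail_next_allreduce(self) -> None:
            # Arm the one-shot failure, analogous to TEST_fail_allreduce().
            self._should_fail_allreduce = True

        def allreduce(self, value: int) -> int:
            if self._should_fail_allreduce:
                # Reset before raising so only a single allreduce fails.
                self._should_fail_allreduce = False
                raise RuntimeError("injected allreduce failure")
            return value


    class FakeInjector:
        """Stand-in for EventInjector: arms the manager's flag at a given step."""

        def __init__(self, fail_at_step: int) -> None:
            self._fail_at_step = fail_at_step
            self._manager: Optional[FakeManager] = None

        def set_manager(self, manager: FakeManager) -> None:
            self._manager = manager

        def check(self, step: int) -> None:
            if step == self._fail_at_step and self._manager is not None:
                self._manager.fail_next_allreduce()


    manager = FakeManager()
    injector = FakeInjector(fail_at_step=1)
    injector.set_manager(manager)

    for step in range(3):
        injector.check(step)
        try:
            manager.allreduce(step)
            print(f"step {step}: allreduce ok")
        except RuntimeError as err:
            print(f"step {step}: {err}")  # only step 1 fails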

File tree

3 files changed: +112 -0 lines changed

- torchft/local_sgd_integ_test.py
- torchft/manager.py
- torchft/manager_integ_test.py

torchft/local_sgd_integ_test.py

Lines changed: 77 additions & 0 deletions
@@ -210,6 +210,7 @@ def state_dict() -> Dict[str, Dict[str, object]]: # pyre-ignore[53]
         # pyre-fixme[6]: Incompatible parameter type
         **runner.manager_args,
     )
+    runner.event_injector.set_manager(manager)
     stack.callback(manager.shutdown)
     # initialize default group for device mesh to work
     if not torch.distributed.is_initialized():
@@ -669,3 +670,79 @@ def test_streaming_diloco_upscale(
 
         for event_injector in event_injectors:
             self.assertEqual(event_injectors[1].count[EventInjectorEvent.Barrier], 1)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @skipIf(sys.platform == "darwin", "not reliable on mac")
+    @parameterized.expand(CONFIG)
+    def test_streaming_diloco_commit_failure(
+        self, use_cuda: bool, n_fragments: int, fragment_sync_delay: int
+    ) -> None:
+        # Skip the test if use_cuda is True and there are not enough GPUs
+        if use_cuda and torch.cuda.device_count() < 2:
+            self.skipTest("Not enough GPUs for CUDA test")
+
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=2,
+        )
+        num_replicas = 2
+        futures = []
+        executors = []
+
+        event_injectors = [
+            EventInjector().fail_allreduce_at(0, 1),
+            EventInjector().fail_allreduce_at(0, 1),
+        ]
+
+        torch.manual_seed(42)
+        # Initialize the model so we can pass in the state_dict
+        m: nn.Module = MultiMyModel(2, 3, n_fragments)
+
+        for replica_id, event_injector in zip(range(num_replicas), event_injectors):
+            executor = ThreadPoolExecutor(max_workers=1)
+            executors.append(executor)
+            runner = Runner(
+                replica_id=replica_id,
+                num_replicas=num_replicas,
+                lighthouse_address=lighthouse.address(),
+                event_injector=event_injector,
+                train_loop=diloco_train_loop,
+                train_loop_args={
+                    "model_state_dict": m.state_dict(),
+                    "n_fragments": n_fragments,
+                    "diloco_args": {
+                        "fragment_sync_delay": fragment_sync_delay,
+                        "sync_every": 4,
+                    },
+                },
+            )
+            futures.append(executor.submit(runner.run_replica))
+
+        state_dicts = []
+
+        for fut in as_completed(futures):
+            continue
+
+        for fut in futures:
+            try:
+                state_dicts.append(fut.result()[0])
+            except Exception as e:
+                print(e)
+                raise
+
+        lighthouse.shutdown()
+
+        rep0, rep1 = state_dicts
+
+        assert_equal_global_state(rep0, rep1)
+
+        for step in rep0.keys():
+            print(step, rep0[step]["user"]["local_step"])
+            self.assertEqual(
+                rep0[step]["user"]["local_step"], rep1[step]["user"]["local_step"]
+            )
+
+        for event_injector in event_injectors:
+            self.assertEqual(
+                event_injector.count[EventInjectorEvent.AllreduceFailure], 1
+            )

torchft/manager.py

Lines changed: 13 additions & 0 deletions
@@ -268,6 +268,15 @@ def __init__(
         self._participating_replica_rank: Optional[int] = None
         self._participating_replica_world_size: int = 0
 
+        # used to artificially fail the next allreduce by tests
+        self._TEST_should_fail_allreduce = False
+
+    def TEST_fail_allreduce(self) -> None:
+        """
+        Fails the next allreduce. This is used for testing.
+        """
+        self._TEST_should_fail_allreduce = True
+
     def register_state_dict_fn(
         self,
         key: str,
@@ -356,6 +365,10 @@ def callback(
         ) -> torch.Tensor:
             nonlocal tensor, stream, num_participants
 
+            if self._TEST_should_fail_allreduce:
+                self._TEST_should_fail_allreduce = False
+                raise
+
             # change the stream to avoid making the callback stream
             # dependent on process group stream running the allreduce
             with torch.cuda.stream(stream) if stream is not None else nullcontext():
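
One detail worth noting: the bare `raise` in the new branch runs with no exception being handled, so Python raises `RuntimeError: No active exception to re-raise`; that exception is what propagates out of the allreduce callback and makes the injected allreduce fail. A minimal illustration of that behavior:

    # Bare `raise` with no active exception raises RuntimeError.
    try:
        raise
    except RuntimeError as err:
        print(err)  # "No active exception to re-raise"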

torchft/manager_integ_test.py

Lines changed: 22 additions & 0 deletions
@@ -76,10 +76,13 @@ class InjectedFailure(Exception):
 
 
 class EventInjectorEvent(Enum):
+    # Crashes a rank
     Failure = auto()
     # Used to wait for a rank to reach a certain step before continuing.
     # Users need to make sure the size of the barrier is appropriately set.
     Barrier = auto()
+    # Fails the allreduce call made by a rank
+    AllreduceFailure = auto()
 
 
 class EventInjectorInfo:
@@ -90,10 +93,15 @@ def __init__(self, event: EventInjectorEvent, data: object) -> None:
 
 class EventInjector:
     def __init__(self) -> None:
+        self._manager: Optional[Manager] = None
         self._lock = threading.Lock()
         self._events: Dict[Tuple[int, int], EventInjectorInfo] = {}
        self.count: dict[EventInjectorEvent, int] = defaultdict(int)
 
+    def set_manager(self, manager: Manager) -> None:
+        with self._lock:
+            self._manager = manager
+
     def fail_at(self, rank: int, step: int) -> "EventInjector":
         with self._lock:
             assert (rank, step) not in self._events
@@ -102,6 +110,14 @@ def fail_at(self, rank: int, step: int) -> "EventInjector":
             )
         return self
 
+    def fail_allreduce_at(self, rank: int, step: int) -> "EventInjector":
+        with self._lock:
+            assert (rank, step) not in self._events
+            self._events[(rank, step)] = EventInjectorInfo(
+                EventInjectorEvent.AllreduceFailure, None
+            )
+        return self
+
     def barrier_at(
         self, rank: int, step: int, barrier: threading.Barrier
     ) -> "EventInjector":
@@ -124,6 +140,12 @@ def check(self, rank: int, step: int) -> None:
             print(f"injecting failure {rank=} {step=}")
             raise InjectedFailure(f"injected failure {rank=} {step=}")
 
+        if event_info.event == EventInjectorEvent.AllreduceFailure:
+            print(f"injecting allreduce failure {rank=} {step=}")
+            assert self._manager is not None
+            self._manager.TEST_fail_allreduce()
+            return
+
         if event_info.event == EventInjectorEvent.Barrier:
             print(f"waiting for barrier {rank=} {step=}")
             cast(threading.Barrier, event_info.data).wait()
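
Taken together, the flow added by this commit is: the integration test builds each replica with EventInjector().fail_allreduce_at(0, 1); the test harness hands the replica's Manager to the injector via event_injector.set_manager(manager) (the one-line change in local_sgd_integ_test.py above); the per-step check(rank, step) call then finds the AllreduceFailure event and calls manager.TEST_fail_allreduce(), so the next allreduce issued through that manager raises; finally the test asserts that each injector recorded exactly one EventInjectorEvent.AllreduceFailure and that both replicas still end up with equal global state.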
