
Commit b037523

Author: tushar00jain
✨ add profiling to manager (#178)
Summary: Fixes #137. Add profiler annotations to manager.py

Test Plan:
[profiler trace screenshot]
https://github.com/user-attachments/assets/b34b3701-66b1-4b90-9cab-eec8db35bc38

Reviewers: @d4l3k

Co-authored-by: tushar00jain <tushar00jain@devvm5549.pnb0.facebook.com>
Parent: d7f6d1b

File tree: 2 files changed (+57, -22 lines)

torchft/manager.py

Lines changed: 36 additions & 22 deletions
@@ -448,6 +448,7 @@ def wait_quorum(self) -> None:
         ), "must call start_quorum before wait_quorum"
         self._quorum_future.result()
 
+    @torch.profiler.record_function("torchft::manager::_async_quorum")
     def _async_quorum(
         self,
         allow_heal: bool,
@@ -459,14 +460,17 @@ def _async_quorum(
 
         if curr_device >= 0 and torch.cuda.is_available():
             torch.cuda.set_device(curr_device)
-        quorum = self._client._quorum(
-            rank=self._rank,
-            step=self._step,
-            checkpoint_metadata=self._checkpoint_transport.metadata(),
-            shrink_only=shrink_only,
-            timeout=quorum_timeout,
-            init_sync=self._init_sync,
-        )
+
+        quorum = None
+        with torch.profiler.record_function("torchft::manager::_client::_quorum"):
+            quorum = self._client._quorum(
+                rank=self._rank,
+                step=self._step,
+                checkpoint_metadata=self._checkpoint_transport.metadata(),
+                shrink_only=shrink_only,
+                timeout=quorum_timeout,
+                init_sync=self._init_sync,
+            )
 
         quorum_id = quorum.quorum_id
         replica_rank = quorum.replica_rank
@@ -505,7 +509,10 @@ def _async_quorum(
             self._logger.info(f"reconfiguring for {quorum_id=} {store_prefixed_addr=}")
             # We use the replica rank and world as we want all replicas in the PG.
             # TODO: handle configure errors
-            self._pg.configure(store_prefixed_addr, replica_rank, replica_world_size)
+            with torch.profiler.record_function("torchft::manager::_pg.configure"):
+                self._pg.configure(
+                    store_prefixed_addr, replica_rank, replica_world_size
+                )
             self._quorum_id = quorum_id
 
         if allow_heal:
@@ -520,12 +527,15 @@ def _async_quorum(
                 self._logger.info(
                     f"peers need recovery from us {quorum.recover_dst_ranks}"
                 )
-                self._checkpoint_transport.send_checkpoint(
-                    dst_ranks=quorum.recover_dst_ranks,
-                    step=max_step,
-                    state_dict=self._manager_state_dict(),
-                    timeout=self._timeout,
-                )
+                with torch.profiler.record_function(
+                    "torchft::manager::_checkpoint_transport::send_checkpoint"
+                ):
+                    self._checkpoint_transport.send_checkpoint(
+                        dst_ranks=quorum.recover_dst_ranks,
+                        step=max_step,
+                        state_dict=self._manager_state_dict(),
+                        timeout=self._timeout,
+                    )
 
             # See manager.rs for healing conditions
             if heal:
@@ -551,14 +561,17 @@ def _async_quorum(
 
                 # we apply the user state dict only when safe from the main thread
                 # save it for now
-                self._pending_state_dict = (
-                    self._checkpoint_transport.recv_checkpoint(
-                        src_rank=recover_src_rank,
-                        metadata=checkpoint_metadata,
-                        step=max_step,
-                        timeout=self._timeout,
+                with torch.profiler.record_function(
+                    "torchft::manager::_checkpoint_transport::recv_checkpoint"
+                ):
+                    self._pending_state_dict = (
+                        self._checkpoint_transport.recv_checkpoint(
+                            src_rank=recover_src_rank,
+                            metadata=checkpoint_metadata,
+                            step=max_step,
+                            timeout=self._timeout,
+                        )
                     )
-                )
 
                 # pyre-fixme[6]: got object
                 self.load_state_dict(self._pending_state_dict["torchft"])
@@ -584,6 +597,7 @@ def _apply_pending_state_dict(self) -> None:
         self._pending_state_dict = None
        self._logger.info("Loaded state dict.")
 
+    @torch.profiler.record_function("torchft::manager::should_commit")
     def should_commit(self, timeout: Optional[timedelta] = None) -> bool:
         """
         .. note::
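
For reference, torch.profiler.record_function marks a region of code as a named range in profiler output; the diff above uses it both as a decorator (on _async_quorum and should_commit) and as a context manager (around the quorum, process-group configure, and checkpoint-transport calls). A minimal standalone sketch of both patterns follows — the example:: labels and the matmul workload are illustrative, not part of torchft:

import torch
from torch.profiler import ProfilerActivity, profile, record_function

# Decorator form: every call to the function becomes a named range.
@record_function("example::expensive_step")
def expensive_step(x: torch.Tensor) -> torch.Tensor:
    return x @ x

x = torch.randn(256, 256)
with profile(activities=[ProfilerActivity.CPU]) as prof:
    expensive_step(x)
    # Context-manager form: label an arbitrary block inline.
    with record_function("example::inline_block"):
        x.sum()

# Both labels show up as rows in the aggregated table.
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

Either form adds a row to key_averages() and a span to an exported Chrome trace, which is how the manager annotations above surface in the test-plan screenshot.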

train_ddp.py

Lines changed: 21 additions & 0 deletions
@@ -143,10 +143,30 @@ def forward(self, x):
     num_params = sum(p.numel() for p in m.parameters())
     print(f"Total number of parameters: {num_params}")
 
+    sort_by_keyword = "self_" + device + "_time_total"
+
+    def trace_handler(p):
+        output = p.key_averages().table(
+            sort_by=sort_by_keyword,
+            row_limit=100,
+        )
+        print(output)
+        p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")
+
     # You can use an epoch based training but with faults it's easier to use step
     # based training.
+    prof = torch.profiler.profile(
+        schedule=torch.profiler.schedule(wait=5, warmup=1, active=10, repeat=2),
+        on_trace_ready=trace_handler,
+        record_shapes=True,
+        profile_memory=True,
+    )
+
+    prof.start()
     while True:
         for i, (inputs, labels) in enumerate(trainloader):
+            prof.step()
+
             inputs = inputs.to(device)
             labels = labels.to(device)
 
@@ -178,6 +198,7 @@ def forward(self, x):
 
             if manager.current_step() >= 10000:
                 # complete training
+                prof.stop()
                 exit()
 
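
The schedule here drives a per-step state machine: with wait=5, warmup=1, active=10, repeat=2, each cycle skips 5 steps, warms up for 1, records 10, and then calls trace_handler once at the end of the active window; the cycle runs twice and profiling goes idle afterwards. A self-contained sketch of the same pattern — the matmul stand-in for a training step is illustrative:

import torch

def trace_handler(p: torch.profiler.profile) -> None:
    # Invoked once per completed `active` window (twice here, since repeat=2).
    print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))
    p.export_chrome_trace(f"/tmp/trace_{p.step_num}.json")

prof = torch.profiler.profile(
    # Per cycle: first 5 steps idle, 1 warmup, 10 recorded; repeats twice.
    schedule=torch.profiler.schedule(wait=5, warmup=1, active=10, repeat=2),
    on_trace_ready=trace_handler,
    record_shapes=True,
    profile_memory=True,
)

prof.start()
for step in range(40):  # enough iterations to cover both profiling cycles
    prof.step()  # advance the schedule once per training iteration
    torch.randn(128, 128) @ torch.randn(128, 128)  # stand-in for the real step
prof.stop()

Calling prof.step() at the top of the loop, as the diff does, keeps the schedule aligned with training iterations; the explicit start()/stop() pair is used instead of a context manager because the training loop only exits via exit().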
