Commit c5a3407

Authored and committed by tushar00jain (facebook-github-bot)
reset flight recorder trace (#283)
Summary:
- Call the FR API to reset the trace after every quorum. We reset so that after every quorum we start a fresh FR trace, since the PGs could have changed and we already dumped the FR trace from previous errors.
- Change the env var that's used to determine the dump file after every quorum.
- Return replica ids in the quorum response so we can determine global ranks in the PG. This is used to set the metadata on the PG for flight recorder to work.

Reviewed By: d4l3k

Differential Revision: D84260745
1 parent cc91bfb commit c5a3407
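
For illustration, here is a minimal Python sketch of how the replica ids returned in the quorum response map to global ranks, mirroring the extract_trailing_digits helper and the ranks_in_quorum computation in torchft/manager.py below. The replica id strings, group world size, and group rank used here are made-up example values, not values from the commit; the "<name><index>:<uuid>" shape is only assumed from the split(":") in the diff.

# Sketch: deriving global ranks from replica ids (illustrative values only).
def extract_trailing_digits(s: str) -> int:
    """Return the integer formed by the trailing digits of s, or 0 if there are none."""
    i = len(s) - 1
    while i >= 0 and s[i].isdigit():
        i -= 1
    return int(s[i + 1 :]) if i < len(s) - 1 else 0

group_world_size = 8  # assumed number of ranks per replica group
group_rank = 3        # assumed rank of this process within its group

# Assumed replica id format for the example.
replica_ids = ["train_replica0:aaaa", "train_replica1:bbbb", "train_replica2:cccc"]

ranks_in_quorum = [
    extract_trailing_digits(rid.split(":")[0]) * group_world_size + group_rank
    for rid in replica_ids
]
print(ranks_in_quorum)  # [3, 11, 19] for the assumed values above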

File tree: 6 files changed, +177 −14 lines changed

proto/torchft.proto

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ message ManagerQuorumResponse {
   int64 replica_world_size = 10;
   bool heal = 11;
   int64 commit_failures = 12;
+  repeated string replica_ids = 13;
 }

 message CheckpointMetadataRequest {

src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -213,6 +213,7 @@ impl ManagerClient {
                 max_replica_rank: resp.max_replica_rank,
                 max_world_size: resp.max_world_size,
                 heal: resp.heal,
+                replica_ids: resp.replica_ids,
             })
         })
     }
@@ -293,6 +294,7 @@ struct QuorumResult {
     max_replica_rank: Option<i64>,
     max_world_size: i64,
     heal: bool,
+    replica_ids: Vec<String>,
 }

 #[pymethods]
@@ -311,6 +313,7 @@ impl QuorumResult {
             max_replica_rank: None,
             max_world_size: 1,
             heal: false,
+            replica_ids: Vec::new(),
         }
     }
 }

src/manager.rs

Lines changed: 1 addition & 0 deletions
@@ -620,6 +620,7 @@ fn compute_quorum_results(
             .map(|p| p.commit_failures)
             .max()
             .unwrap_or(0),
+        replica_ids: participants.iter().map(|p| p.replica_id.clone()).collect(),
     })
 }

torchft/_torchft.pyi

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ class QuorumResult:
     max_world_size: int
     heal: bool
     commit_failures: int
+    replica_ids: list[str]

 class ManagerServer:
     def __init__(

torchft/manager.py

Lines changed: 64 additions & 4 deletions
@@ -88,6 +88,8 @@
 # crash if call to quorum fails, all replicas will crash.
 QUORUM_RETRIES_ENV: str = "TORCHFT_QUORUM_RETRIES"

+TORCH_FR_DUMP_TEMP_FILE_ENV: str = "TORCH_FR_DUMP_TEMP_FILE"
+
 T = TypeVar("T")

@@ -109,6 +111,17 @@ def get_timeout(
     return default_timeout_sec


+def extract_trailing_digits(s: str) -> int:
+    """
+    Extracts the trailing digits from the end of the string s.
+    Returns 0 if no trailing digits are found.
+    """
+    i = len(s) - 1
+    while i >= 0 and s[i].isdigit():
+        i -= 1
+    return int(s[i + 1 :]) if i < len(s) - 1 else 0
+
+
 class WorldSizeMode(Enum):
     """
     This controls the numerics for the job when doing allreduces across replicas
@@ -223,6 +236,9 @@ def __init__(
         self._load_state_dict_fns: Dict[str, Callable[[object], None]] = {}
         self._user_state_dicts: Dict[str, Callable[[], object]] = {}

+        self._original_fr_dump_temp_file: Optional[str] = os.environ.get(
+            TORCH_FR_DUMP_TEMP_FILE_ENV
+        )
         self._replica_id = replica_id

         # Protects state dict
@@ -257,7 +273,7 @@ def __init__(
         store_port = store_port or int(os.environ["MASTER_PORT"])
         self._group_rank: int = rank if rank is not None else int(os.environ["RANK"])
         group_rank = self._group_rank
-        group_world_size = world_size or int(os.environ["WORLD_SIZE"])
+        self._group_world_size: int = world_size or int(os.environ["WORLD_SIZE"])
         self._min_replica_size = min_replica_size

         if checkpoint_transport is None:
@@ -310,7 +326,7 @@ def __init__(
             hostname=hostname,
             bind=bind,
             store_addr=f"{store_addr}:{store_port}",
-            world_size=group_world_size,
+            world_size=self._group_world_size,
             heartbeat_interval=heartbeat_interval,
             connect_timeout=connect_timeout,
             quorum_retries=self._quorum_retries,
@@ -338,6 +354,17 @@ def __init__(
         self._participating_replica_world_size: int = 0
         self._is_state_dict_read_allowed = True

+        self._global_rank: int = (
+            self._group_rank
+            if self._replica_id is None
+            else (
+                extract_trailing_digits(self._replica_id) * self._group_world_size
+                + self._group_rank
+            )
+        )
+
+        self._update_fr_path()
+
     def allow_state_dict_read(self) -> None:
         if self._is_state_dict_read_allowed:
             return
@@ -446,7 +473,7 @@ def allreduce(
         # on the Future
         @torch.profiler.record_function("torchft::manager::allreduce::callback")
         def callback(
-            fut: torch.futures.Future[list[torch.Tensor]],
+            fut: torch.futures.Future[torch.Tensor],
         ) -> torch.Tensor:
             nonlocal tensor
             if reduce_op == ReduceOp.AVG:
@@ -455,6 +482,7 @@ def callback(

         managed_work = _ManagedWork(self, work, tensor)
         fut = managed_work.get_future()
+        fut = cast(torch.futures.Future[torch.Tensor], fut)
         fut = fut.then(callback)
         return managed_work

@@ -634,6 +662,13 @@ def _async_quorum(
            max_replica_rank = quorum.max_replica_rank
            max_replica_world_size = quorum.max_world_size
            heal = quorum.heal
+            replica_ids = quorum.replica_ids
+
+            ranks_in_quorum = [
+                extract_trailing_digits(replica_id.split(":")[0]) * self._group_world_size
+                + self._group_rank
+                for replica_id in replica_ids
+            ]

            # When using async quorum we need to take the recovered workers.
            # When not using async quorum we need to take the max world size as all
@@ -674,16 +709,30 @@ def _async_quorum(
            self._logger.info(f"reconfiguring for {quorum_id=} {store_prefixed_addr=}")
            # We use the replica rank and world as we want all replicas in the PG.
            try:
+                self._quorum_id = quorum_id
                with torch.profiler.record_function("torchft::manager::_pg::configure"):
+                    # Reset GPU state for Flight Recorder
                    if torch.accelerator.is_available():
                        torch.accelerator.synchronize()
+
                    self._pg.configure(
                        store_prefixed_addr,
                        self._replica_id if self._replica_id is not None else "0",
                        replica_rank,
                        replica_world_size,
+                        quorum_id,
+                        self._group_rank,
+                        self._group_world_size,
+                        ranks_in_quorum,
                    )
-                self._quorum_id = quorum_id
+
+                # We need to reset the trace after reconfiguring the PG because that
+                # calls abort which may trigger a dump
+                self._logger.info(
+                    f"resetting fr recording for quorum id {self._quorum_id}"
+                )
+                self._update_fr_path()
+                torch._C._distributed_c10d._reset_fr_recording_nccl()  # pyre-ignore
            except Exception as e:
                self._logger.exception(f"got exception in pg configure: {e}")
                self.report_error(e)
@@ -758,6 +807,17 @@ def _async_quorum(
                else None
            )

+    def _update_fr_path(self) -> None:
+        """
+        Update the path that flight recorder will dump the traces to.
+        The format is
+        <TORCH_FR_DUMP_TEMP_FILE_ENV>_quorum_<quorum_id>/<global_rank>
+        """
+        if self._original_fr_dump_temp_file is not None:
+            folder = f"{self._original_fr_dump_temp_file}_quorum_{self._quorum_id}"
+            os.makedirs(folder, exist_ok=True)
+            os.environ[TORCH_FR_DUMP_TEMP_FILE_ENV] = f"{folder}/{self._global_rank}"
+
    def _apply_pending_state_dict(self) -> None:
        assert self._healing, "must be in healing state"
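
As a follow-up illustration, a small standalone sketch of how the flight recorder dump path is rebuilt per quorum, following the _update_fr_path method added above; the base path, quorum id, and global rank below are placeholder values, not values from the commit.

import os

TORCH_FR_DUMP_TEMP_FILE_ENV = "TORCH_FR_DUMP_TEMP_FILE"

# Placeholder values; in the manager these come from the original env var,
# the current quorum id, and the computed global rank.
original_fr_dump_temp_file = "/tmp/fr_trace"
quorum_id = 5
global_rank = 11

folder = f"{original_fr_dump_temp_file}_quorum_{quorum_id}"
os.makedirs(folder, exist_ok=True)
os.environ[TORCH_FR_DUMP_TEMP_FILE_ENV] = f"{folder}/{global_rank}"
# Flight recorder on this rank would then dump to /tmp/fr_trace_quorum_5/11.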
