meta-pytorch · tushar00jain · Nov 7, 2025 · Nov 7, 2025
diff --git a/proto/torchft.proto b/proto/torchft.proto
@@ -96,6 +96,7 @@ message ManagerQuorumResponse {
   int64 replica_world_size = 10;
   bool heal = 11;
   int64 commit_failures = 12;
+  repeated string replica_ids = 13;
 }
 
 message CheckpointMetadataRequest {

diff --git a/src/lib.rs b/src/lib.rs
@@ -213,6 +213,7 @@ impl ManagerClient {
                 max_replica_rank: resp.max_replica_rank,
                 max_world_size: resp.max_world_size,
                 heal: resp.heal,
+                replica_ids: resp.replica_ids,
             })
         })
     }
@@ -293,6 +294,7 @@ struct QuorumResult {
     max_replica_rank: Option<i64>,
     max_world_size: i64,
     heal: bool,
+    replica_ids: Vec<String>,
 }
 
 #[pymethods]
@@ -311,6 +313,7 @@ impl QuorumResult {
             max_replica_rank: None,
             max_world_size: 1,
             heal: false,
+            replica_ids: Vec::new(),
         }
     }
 }

diff --git a/src/manager.rs b/src/manager.rs
@@ -620,6 +620,7 @@ fn compute_quorum_results(
             .map(|p| p.commit_failures)
             .max()
             .unwrap_or(0),
+        replica_ids: participants.iter().map(|p| p.replica_id.clone()).collect(),
     })
 }
 

diff --git a/torchft/_test/diloco_trainer.py b/torchft/_test/diloco_trainer.py
@@ -1,15 +1,13 @@
 import copy
 import logging
 import os
-from contextlib import ExitStack
 from datetime import timedelta
-from typing import Any, cast, Dict, List
+from typing import Any, Dict
 
 import torch
 from torch import nn
-from torch.distributed.tensor import DTensor
+from torch.distributed.tensor import DeviceMesh, DTensor
 
-from torchft.device_mesh import ft_init_device_mesh, ManagedDeviceMesh
 from torchft.local_sgd import DiLoCo
 from torchft.manager import Manager
 from torchft.manager_integ_test import MyModel, Runner
@@ -113,7 +111,7 @@ def __init__(
 
         self.manager: Manager = self.setup_manager()
 
-        self.ft_device_mesh: None | ManagedDeviceMesh = None
+        self.device_mesh: None | DeviceMesh = None
         self.setup_distributed()
 
         self.criterion: nn.CrossEntropyLoss = nn.CrossEntropyLoss()
@@ -197,12 +195,9 @@ def setup_distributed(self) -> None:
                 os.environ["WORLD_SIZE"] = str(self.runner.world_size)
                 os.environ["RANK"] = str(self.rank)
 
-        self.ft_device_mesh = ft_init_device_mesh(
-            device_type=self.device.type,
-            mesh_shape=(self.runner.world_size, 1),
-            mesh_dim_names=("replicate", "none"),
-            replicate_dim=0,
-            manager=self.manager,
+        self.device_mesh = DeviceMesh(
+            self.device.type,
+            torch.arange(self.runner.world_size),
         )
 
         # Convert model parameters to DTensor
@@ -211,7 +206,7 @@ def setup_distributed(self) -> None:
                 for param in layer.parameters():
                     param = DTensor.from_local(
                         param,
-                        device_mesh=self.ft_device_mesh,
+                        device_mesh=self.device_mesh,
                     )
 
     def load_state_dict(self, state_dict: Dict[str, Dict[str, object]]) -> None:

diff --git a/torchft/_torchft.pyi b/torchft/_torchft.pyi
@@ -36,6 +36,7 @@ class QuorumResult:
     max_world_size: int
     heal: bool
     commit_failures: int
+    replica_ids: list[str]
 
 class ManagerServer:
     def __init__(
-Original file line number
+Diff line change
@@ -620,6 +620,7 @@ fn compute_quorum_results(
                 .map(|p| p.commit_failures)
                 .max()
                 .unwrap_or(0),
+            replica_ids: participants.iter().map(|p| p.replica_id.clone()).collect(),
         })
     }
@@ Expand Down @@