Skip to content

Commit eb482e5

Browse files
Merge branch 'main' into HTTP/2_Multi_Room_Lighthouse
2 parents fedd473 + 93c230b commit eb482e5

19 files changed

+891
-169
lines changed

.github/workflows/lint.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ jobs:
2323
2424
sudo apt-get install -y protobuf-compiler
2525
26-
# use RC build
27-
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
28-
2926
pip install lintrunner lintrunner-adapters
3027
lintrunner init
3128

.github/workflows/unittest.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@ jobs:
1515
- runs-on: "linux.2xlarge"
1616
gpu-arch-type: "cpu"
1717
gpu-arch-version: ""
18-
torch-version: "test"
18+
torch-version: "stable"
1919
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
2020
gpu-arch-type: "cuda"
2121
gpu-arch-version: "12.4"
22-
torch-version: "test"
22+
torch-version: "stable"
2323
- runs-on: "linux.g5.12xlarge.nvidia.gpu"
2424
gpu-arch-type: "cuda"
2525
gpu-arch-version: "12.4"

README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,44 @@ for i in range(1000):
208208
optimizer.step()
209209
```
210210

211+
### Running DDP
212+
213+
After starting the lighthouse server by running:
214+
215+
```sh
216+
RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000
217+
```
218+
219+
A test DDP script can be launched with torchX with:
220+
221+
```sh
222+
torchx run
223+
```
224+
225+
See [.torchxconfig](.torchxconfig), [torchx.py](./torchft/torchx.py) and the [torchX documentation](https://pytorch.org/torchx/latest/) to understand how DDP is being run.
226+
227+
`torchx.py` can also launch HSDP jobs when `workers_per_replica` is set to > 1, provided the training script supports it. For an example HSDP training implementation with torchFT enabled, see [torchtitan](https://github.com/pytorch/torchtitan).
228+
229+
Alternatively, to test on a node with two GPUs, you can launch two replica groups running [train_ddp.py](./train_ddp.py) by:
230+
231+
On shell 1 (one replica group starts initial training):
232+
```sh
233+
export REPLICA_GROUP_ID=0
234+
export NUM_REPLICA_GROUPS=2
235+
236+
CUDA_VISIBLE_DEVICES=0 TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port=29600 --nnodes=1 --nproc_per_node=1 -- train_ddp.py
237+
```
238+
239+
On shell 2 (a second replica group joins):
240+
```sh
241+
export REPLICA_GROUP_ID=1
242+
export NUM_REPLICA_GROUPS=2
243+
244+
CUDA_VISIBLE_DEVICES=1 TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port=29601 --nnodes=1 --nproc_per_node=1 -- train_ddp.py
245+
```
246+
247+
By watching the outputs from both shells, you should observe process group reconfiguration and live checkpoint recovery.
248+
211249
### Example Parameter Server
212250

213251
torchft has a fault-tolerant parameter server implementation built on its

proto/torchft.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ message QuorumMember {
4242
int64 step = 4;
4343
uint64 world_size = 5;
4444
bool shrink_only = 6;
45+
int64 commit_failures = 8;
4546
// User passing in data stored as JSON string.
4647
string data = 7;
4748
}
@@ -77,6 +78,7 @@ message ManagerQuorumRequest {
7778
string checkpoint_metadata = 3;
7879
bool shrink_only = 4;
7980
bool init_sync = 5;
81+
int64 commit_failures = 6;
8082
}
8183

8284
message ManagerQuorumResponse {
@@ -93,6 +95,7 @@ message ManagerQuorumResponse {
9395
int64 replica_rank = 9;
9496
int64 replica_world_size = 10;
9597
bool heal = 11;
98+
int64 commit_failures = 12;
9699
}
97100

98101
message CheckpointMetadataRequest {

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ classifiers = [
1212
]
1313
dynamic = ["version"]
1414
dependencies = [
15-
"torch"
15+
"torch>=2.7"
1616
]
1717

1818
[project.urls]

src/lib.rs

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ use crate::torchftpb::lighthouse_service_server::LighthouseServiceServer;
4141
use crate::torchftpb::manager_service_client::ManagerServiceClient;
4242
use crate::torchftpb::LighthouseHeartbeatRequest;
4343
use crate::torchftpb::{
44-
CheckpointMetadataRequest, LighthouseQuorumRequest, ManagerQuorumRequest, ShouldCommitRequest,
44+
CheckpointMetadataRequest, LighthouseHeartbeatRequest, LighthouseQuorumRequest,
45+
ManagerQuorumRequest, ShouldCommitRequest,
4546
};
4647
use pyo3::prelude::*;
4748
use pyo3::types::{PyDict, PyString};
@@ -182,6 +183,7 @@ impl ManagerClient {
182183
checkpoint_metadata: String,
183184
shrink_only: bool,
184185
init_sync: bool,
186+
commit_failures: i64,
185187
timeout: Duration,
186188
) -> Result<QuorumResult, StatusError> {
187189
py.allow_threads(move || {
@@ -191,6 +193,7 @@ impl ManagerClient {
191193
checkpoint_metadata: checkpoint_metadata,
192194
shrink_only: shrink_only,
193195
init_sync: init_sync,
196+
commit_failures: commit_failures,
194197
});
195198

196199
// This timeout is processed on the server side so we also enable
@@ -562,6 +565,7 @@ impl LighthouseClient {
562565
world_size: world_size,
563566
shrink_only: shrink_only,
564567
data: data_string,
568+
commit_failures: 0,
565569
}),
566570
});
567571

@@ -615,6 +619,7 @@ impl LighthouseClient {
615619
}
616620
req
617621
}
622+
618623
}
619624

620625
/// LighthouseServer is a GRPC server for the lighthouse service.
@@ -741,11 +746,17 @@ fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
741746
.debug(Color::Blue)
742747
.trace(Color::Magenta);
743748
let level_filter = match env::var("RUST_LOG").as_deref() {
744-
Ok("error") => LevelFilter::Error,
745-
Ok("warn") => LevelFilter::Warn,
746-
Ok("info") => LevelFilter::Info,
747-
Ok("debug") => LevelFilter::Debug,
748-
Ok("trace") => LevelFilter::Trace,
749+
Ok(value) => {
750+
let value_lower = value.to_lowercase();
751+
match value_lower.as_str() {
752+
"error" => LevelFilter::Error,
753+
"warn" => LevelFilter::Warn,
754+
"info" => LevelFilter::Info,
755+
"debug" => LevelFilter::Debug,
756+
"trace" => LevelFilter::Trace,
757+
_ => LevelFilter::Info,
758+
}
759+
}
749760
_ => LevelFilter::Info,
750761
};
751762
fern::Dispatch::new()

0 commit comments

Comments
 (0)