 
 from autoparallel.api import AutoParallel
 
+world_size = 256
+
+fake_store = FakeStore()
+torch.distributed.init_process_group(
+    "fake", store=fake_store, rank=0, world_size=world_size
+)
+mesh = torch.distributed.device_mesh.init_device_mesh(
+    "cuda",
+    (world_size // 32, 8, 4),
+    mesh_dim_names=(
+        "dp",
+        "tp",
+        "cp",
+    ),
+)
+assert mesh.ndim == 3, "Please also update local_map"
+
 
 def policy_fn(ctx, op, *args, **kwargs):
     if (
@@ -37,7 +54,7 @@ def policy_fn(ctx, op, *args, **kwargs):
     ),
     redistribute_inputs=True,
     in_grad_placements=None,
-    device_mesh=None,
+    device_mesh=mesh,
 )
 def replicate_linear(w, x):
     return torch.matmul(x, w.t())
@@ -54,7 +71,7 @@ def replicate_linear(w, x):
     ),
     redistribute_inputs=True,
     in_grad_placements=None,
-    device_mesh=None,
+    device_mesh=mesh,
 )
 def sharded_pointwise(x, scalar):
     return x + scalar, scalar
@@ -69,7 +86,7 @@ def sharded_pointwise(x, scalar):
     ),
     redistribute_inputs=True,
     in_grad_placements=None,
-    device_mesh=None,
+    device_mesh=mesh,
 )
 def context_parallel_attention(query, key, value):
     out = nn.functional.scaled_dot_product_attention(
@@ -128,22 +145,6 @@ def forward(self, x):
         return o
 
 
-world_size = 256
-
-fake_store = FakeStore()
-torch.distributed.init_process_group(
-    "fake", store=fake_store, rank=0, world_size=world_size
-)
-mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda",
-    (world_size // 32, 8, 4),
-    mesh_dim_names=(
-        "dp",
-        "tp",
-        "cp",
-    ),
-)
-
 bs = 8 * mesh.shape[0]
 seq_len = 256
 nheads = 48