pytorch · zpcore · Jul 24, 2024 · Jul 12, 2024 · Jul 12, 2024 · Jul 15, 2024
@@ -77,7 +77,7 @@ for input, target in data:
     # Exits the context manager before backward pass
     scaler.scale(loss).backward()
     gradients = xm._fetch_gradients(optimizer)
-    xm.all_reduce('sum', gradients, scale=1.0 / xm.xrt_world_size())
+    xm.all_reduce('sum', gradients, scale=1.0 / xm.pjrt_world_size())
     scaler.step(optimizer)
     scaler.update()
 ```

@@ -36,7 +36,7 @@ dist.init_process_group("xla", rank=rank, world_size=world_size)
 
 ```
 new_rank = xm.get_ordinal()
-world_size = xm.xrt_world_size()
+world_size = xm.pjrt_world_size()
 ```
 
 4. Pass `gradient_as_bucket_view=True` to the DDP wrapper.
@@ -96,7 +96,7 @@ def demo_basic(rank):
     # xla specific APIs to get rank, world_size.
     new_rank = xm.get_ordinal()
     assert new_rank == rank
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
 
     print(f"Running basic DDP example on rank {rank}.")
     setup(rank, world_size)

@@ -45,7 +45,7 @@ ckpt = {
     'shard_metadata': model.get_shard_metadata(),
     'optimizer': optimizer.state_dict(),
 }
-ckpt_path = f'/tmp/rank-{xm.get_ordinal()}-of-{xm.xrt_world_size()}.pth'
+ckpt_path = f'/tmp/rank-{xm.get_ordinal()}-of-{xm.pjrt_world_size()}.pth'
 xm.save(ckpt, ckpt_path, master_only=False)
 ```
 * The checkpoint consolidation script can also be launched from the command line as follows.

@@ -70,7 +70,7 @@ Sample diff from XRT to PJRT:
 
  def _mp_fn(index):
    device = xm.xla_device()
 -  dist.init_process_group('xla', rank=xm.get_ordinal(), world_size=xm.xrt_world_size())
 +  dist.init_process_group('xla', init_method='xla://')
 
    torch.manual_seed(42)

@@ -56,6 +56,7 @@ xla_model
 .. autofunction:: xla_device
 .. autofunction:: xla_device_hw
 .. autofunction:: is_master_ordinal
+.. autofunction:: pjrt_world_size
 .. autofunction:: all_reduce
 .. autofunction:: all_gather
 .. autofunction:: all_to_all

diff --git a/test/distributed_util.py b/test/distributed_util.py
@@ -91,7 +91,7 @@ def ddp_correctness(init_method: str = 'env://',
                     debug: bool = False):
   if init_method == 'env://':
     rank = xm.get_ordinal()
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
     dist.init_process_group(
         "xla", init_method=init_method, rank=rank, world_size=world_size)
   else:

diff --git a/test/pjrt/test_collective_ops_tpu.py b/test/pjrt/test_collective_ops_tpu.py
@@ -74,7 +74,7 @@ def test_all_gather(self, pin_layout):
   @staticmethod
   def _reduce_scatter(pin_layout):
     device = xm.xla_device()
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
     tensor = -torch.arange(world_size, dtype=torch.float32).to(device)
 
     out = xm.reduce_scatter(
@@ -99,7 +99,7 @@ def test_reduce_scatter(self, pin_layout):
   @staticmethod
   def _all_to_all(pin_layout):
     device = xm.xla_device()
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
 
     tensor = torch.cat(
         [

diff --git a/test/pjrt/test_runtime.py b/test/pjrt/test_runtime.py
@@ -63,7 +63,7 @@ def test_num_global_devices(self):
                    xr.global_device_count())
 
   def test_world_size(self):
-    self.assertEqual(xm.xrt_world_size(), xr.world_size())
+    self.assertEqual(xm.pjrt_world_size(), xr.world_size())
 
   def test_xla_device_error(self):
     with self.assertRaises(IndexError):

diff --git a/test/pjrt/test_runtime_multi_gpu.py b/test/pjrt/test_runtime_multi_gpu.py
@@ -210,7 +210,7 @@ def test_all_gather(self, pin_layout):
   @staticmethod
   def _reduce_scatter(pin_layout):
     device = xm.xla_device()
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
     tensor = -torch.arange(world_size, dtype=torch.float32).to(device)
 
     out = xm.reduce_scatter(
@@ -236,7 +236,7 @@ def test_reduce_scatter(self, pin_layout):
   @staticmethod
   def _all_to_all(pin_layout):
     device = xm.xla_device()
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
 
     tensor = torch.cat(
         [

diff --git a/test/pjrt/test_train_hf_transformer.py b/test/pjrt/test_train_hf_transformer.py
@@ -33,7 +33,7 @@ def finetune(rank, train_dataset, test_dataset, tokenizer, flags):
 
   train_sampler = torch.utils.data.distributed.DistributedSampler(
       train_dataset,
-      num_replicas=xm.xrt_world_size(),
+      num_replicas=xm.pjrt_world_size(),
       rank=xm.get_ordinal(),
       shuffle=True)
 

diff --git a/test/spmd/test_xla_spmd_python_api_interaction.py b/test/spmd/test_xla_spmd_python_api_interaction.py
@@ -27,7 +27,7 @@ def test_get_xla_supported_devices(self):
     self.assertEqual(len(devices), 1)
 
   def test_world_size(self):
-    self.assertEqual(xm.xrt_world_size(), 1)
+    self.assertEqual(xm.pjrt_world_size(), 1)
 
   def test_get_ordinal(self):
     self.assertEqual(xm.get_ordinal(), 0)

diff --git a/test/test_mp_all_gather.py b/test/test_mp_all_gather.py
@@ -12,7 +12,7 @@ def all_gather(tensor, dim):
 
 def _mp_fn(index):
   device = xm.xla_device()
-  world_size = xm.xrt_world_size()
+  world_size = xm.pjrt_world_size()
   input_list_size = 5
   if xm.xla_device_hw(device) in ('TPU', 'CUDA', 'NEURON'):
     # Testing with a single replica group

diff --git a/test/test_mp_all_to_all.py b/test/test_mp_all_to_all.py
@@ -9,17 +9,17 @@ def _mp_fn(index):
   device = xm.xla_device()
   if xm.xla_device_hw(device) == 'TPU':
     slots_per_device = 4
-    size = slots_per_device * xm.xrt_world_size()
+    size = slots_per_device * xm.pjrt_world_size()
     ordinal = xm.get_ordinal()
     value = torch.tensor([ordinal] * size, dtype=torch.int32, device=device)
     result_tensor = xm.all_to_all(
         value,
         split_dimension=0,
         concat_dimension=0,
-        split_count=xm.xrt_world_size())
+        split_count=xm.pjrt_world_size())
 
     result = result_tensor.cpu().tolist()
-    for i in range(0, xm.xrt_world_size()):
+    for i in range(0, xm.pjrt_world_size()):
       expected = [i] * slots_per_device
       if expected != result[i * slots_per_device:(i + 1) * slots_per_device]:
         print(

diff --git a/test/test_mp_collective_permute.py b/test/test_mp_collective_permute.py
@@ -8,7 +8,7 @@
 def _mp_fn(index):
   device = xm.xla_device()
   if xm.xla_device_hw(device) == 'TPU':
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
     ordinal = xm.get_ordinal()
     value = torch.tensor([ordinal] * 100, dtype=torch.int32, device=device)
     pairs = []

diff --git a/test/test_mp_distributed_mm.py b/test/test_mp_distributed_mm.py
@@ -10,7 +10,7 @@ def _mp_fn(index):
   device = xm.xla_device()
 
   if xm.xla_device_hw(device) in ('TPU', 'CUDA'):
-    world_size = xm.xrt_world_size()
+    world_size = xm.pjrt_world_size()
     torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
         use_full_mat_mul_precision=True)
     torch.manual_seed(11)

diff --git a/test/test_mp_early_exit.py b/test/test_mp_early_exit.py
@@ -18,7 +18,7 @@ def _mp_fn():
     train_loader = pl.MpDeviceLoader(train_loader, device)
     max_steps = 10
     for step, inputs in enumerate(train_loader):
-      xm.all_reduce('sum', [inputs], scale=1.0 / xm.xrt_world_size())
+      xm.all_reduce('sum', [inputs], scale=1.0 / xm.pjrt_world_size())
       if step > max_steps:
         break
   else:

diff --git a/test/test_mp_mesh_reduce.py b/test/test_mp_mesh_reduce.py
@@ -12,7 +12,7 @@ def reduce_add(vlist):
   svalue = 1.25
   rvalue = xm.mesh_reduce('test_mp_mesh_reduce._test_scalar', svalue,
                           reduce_add)
-  assert rvalue == svalue * xm.xrt_world_size()
+  assert rvalue == svalue * xm.pjrt_world_size()
 
 
 def _test_tensor():
@@ -23,7 +23,7 @@ def reduce_add(vlist):
   tvalue = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=torch.float32)
   rvalue = xm.mesh_reduce('test_mp_mesh_reduce._test_tensor', tvalue,
                           reduce_add)
-  assert rvalue.allclose(tvalue * xm.xrt_world_size())
+  assert rvalue.allclose(tvalue * xm.pjrt_world_size())
 
 
 def _mp_fn(index):

diff --git a/test/test_mp_reduce_scatter.py b/test/test_mp_reduce_scatter.py
@@ -6,7 +6,7 @@
 
 def _mp_fn(index):
   device = xm.xla_device()
-  world_size = xm.xrt_world_size()
+  world_size = xm.pjrt_world_size()
   scale = 1 / world_size
   scatter_dim = 1
   shard_size = 2

diff --git a/test/test_mp_rendezvous.py b/test/test_mp_rendezvous.py
@@ -5,7 +5,7 @@
 
 
 def _get_replica_group(index):
-  world_size = xm.xrt_world_size()
+  world_size = xm.pjrt_world_size()
   split = world_size // 2
   gid = index // split if split > 0 else 0
   return list(range(0, split)) if index < split else list(

diff --git a/test/test_mp_replication.py b/test/test_mp_replication.py
@@ -11,7 +11,7 @@ def all_reduce(tensor):
 
 def _mp_fn(index):
   device = xm.xla_device()
-  world_size = xm.xrt_world_size()
+  world_size = xm.pjrt_world_size()
   if world_size > 1:
     ones = torch.ones((2, 3))
     twos = ones + 1.0

diff --git a/test/test_mp_sync_batch_norm.py b/test/test_mp_sync_batch_norm.py
@@ -22,10 +22,10 @@ def run_step(model: torch.nn.Module, batch: torch.Tensor) -> torch.Tensor:
     xm.optimizer_step(optimizer)
   else:
     # Scale as we scale within xm.optimizer_step()
-    loss = loss / xm.xrt_world_size()
+    loss = loss / xm.pjrt_world_size()
     loss.backward()
     optimizer.step()
-    split_size = batch.shape[0] // xm.xrt_world_size()
+    split_size = batch.shape[0] // xm.pjrt_world_size()
     result = result.split(split_size, dim=0)[xm.get_ordinal()]
 
   return result
@@ -45,7 +45,7 @@ def _sync_bn1d_no_channel(rank):
     torch.manual_seed(1)
     bsz = 32
     length = 64
-    t_global = torch.rand((xm.xrt_world_size() * bsz, length))
+    t_global = torch.rand((xm.pjrt_world_size() * bsz, length))
 
     # XLA SyncBatchNorm
     device = xm.xla_device()
@@ -70,7 +70,7 @@ def _sync_bn1d_multi_channel(rank):
     bsz = 64
     features = 20
     length = 128
-    t_global = torch.rand((xm.xrt_world_size() * bsz, features, length))
+    t_global = torch.rand((xm.pjrt_world_size() * bsz, features, length))
 
     # XLA SyncBatchNorm
     device = xm.xla_device()
@@ -95,7 +95,7 @@ def _sync_bn2d(rank):
     bsz = 8
     features = 10
     h, w = 64, 64
-    t_global = torch.rand((xm.xrt_world_size() * bsz, features, h, w))
+    t_global = torch.rand((xm.pjrt_world_size() * bsz, features, h, w))
 
     # XLA SyncBatchNorm
     device = xm.xla_device()
@@ -120,7 +120,7 @@ def _sync_bn3d(rank):
     bsz = 16
     features = 32
     d, h, w = 16, 32, 32
-    t_global = torch.rand((xm.xrt_world_size() * bsz, features, d, h, w))
+    t_global = torch.rand((xm.pjrt_world_size() * bsz, features, d, h, w))
 
     # XLA SyncBatchNorm
     device = xm.xla_device()

diff --git a/test/test_profile_mp_mnist.py b/test/test_profile_mp_mnist.py
@@ -99,12 +99,12 @@ def train_mnist(flags,
         data=(torch.zeros(flags.batch_size, 1, 28,
                           28), torch.zeros(flags.batch_size,
                                            dtype=torch.int64)),
-        sample_count=600000 // flags.batch_size // xm.xrt_world_size())
+        sample_count=600000 // flags.batch_size // xm.pjrt_world_size())
     test_loader = xu.SampleGenerator(
         data=(torch.zeros(flags.batch_size, 1, 28,
                           28), torch.zeros(flags.batch_size,
                                            dtype=torch.int64)),
-        sample_count=100000 // flags.batch_size // xm.xrt_world_size())
+        sample_count=100000 // flags.batch_size // xm.pjrt_world_size())
   else:
     train_dataset = datasets.MNIST(
         os.path.join(flags.datadir, str(xm.get_ordinal())),
@@ -121,10 +121,10 @@ def train_mnist(flags,
             [transforms.ToTensor(),
              transforms.Normalize((0.1307,), (0.3081,))]))
     train_sampler = None
-    if xm.xrt_world_size() > 1:
+    if xm.pjrt_world_size() > 1:
       train_sampler = torch.utils.data.distributed.DistributedSampler(
           train_dataset,
-          num_replicas=xm.xrt_world_size(),
+          num_replicas=xm.pjrt_world_size(),
           rank=xm.get_ordinal(),
           shuffle=True)
     train_loader = torch.utils.data.DataLoader(
@@ -142,7 +142,7 @@ def train_mnist(flags,
         num_workers=flags.num_workers)
 
   # Scale learning rate to num cores
-  lr = flags.lr * xm.xrt_world_size()
+  lr = flags.lr * xm.pjrt_world_size()
 
   device = xm.xla_device()
   model = MNIST().to(device)

diff --git a/test/test_train_mp_imagenet.py b/test/test_train_mp_imagenet.py
@@ -190,11 +190,11 @@ def train_imagenet():
         data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim),
               torch.zeros(FLAGS.batch_size, dtype=torch.int64)),
         sample_count=train_dataset_len // FLAGS.batch_size //
-        xm.xrt_world_size())
+        xm.pjrt_world_size())
     test_loader = xu.SampleGenerator(
         data=(torch.zeros(FLAGS.test_set_batch_size, 3, img_dim, img_dim),
               torch.zeros(FLAGS.test_set_batch_size, dtype=torch.int64)),
-        sample_count=50000 // FLAGS.batch_size // xm.xrt_world_size())
+        sample_count=50000 // FLAGS.batch_size // xm.pjrt_world_size())
   else:
     normalize = transforms.Normalize(
         mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
@@ -221,15 +221,15 @@ def train_imagenet():
         ]))
 
     train_sampler, test_sampler = None, None
-    if xm.xrt_world_size() > 1:
+    if xm.pjrt_world_size() > 1:
       train_sampler = torch.utils.data.distributed.DistributedSampler(
           train_dataset,
-          num_replicas=xm.xrt_world_size(),
+          num_replicas=xm.pjrt_world_size(),
           rank=xm.get_ordinal(),
           shuffle=True)
       test_sampler = torch.utils.data.distributed.DistributedSampler(
           test_dataset,
-          num_replicas=xm.xrt_world_size(),
+          num_replicas=xm.pjrt_world_size(),
           rank=xm.get_ordinal(),
           shuffle=False)
     train_loader = torch.utils.data.DataLoader(
@@ -273,7 +273,7 @@ def train_imagenet():
       momentum=FLAGS.momentum,
       weight_decay=1e-4)
   num_training_steps_per_epoch = train_dataset_len // (
-      FLAGS.batch_size * xm.xrt_world_size())
+      FLAGS.batch_size * xm.pjrt_world_size())
   lr_scheduler = schedulers.wrap_optimizer_with_scheduler(
       optimizer,
       scheduler_type=getattr(FLAGS, 'lr_scheduler_type', None),