
Commit bce07de

XilunWu authored and pytorchmergebot committed
[dtensor][cp][experiment] add CP experimental API to choose rotate method (pytorch#142093)

**Summary** This PR adds a new experimental API, `set_rotate_method`, for Context Parallel. The API lets users choose the communication method used for shard rotation: all-to-all or all-gather.

**Test** `pytest test/distributed/_tensor/test_attention.py`

Pull Request resolved: pytorch#142093
Approved by: https://github.com/fegin
1 parent eb84788 · commit bce07de
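A minimal usage sketch of the API described above, assuming the import path of the file touched in this commit (the module is experimental and private, so the path may change):

from torch.distributed.tensor.experimental._attention import set_rotate_method

# Rotate kv shards with the all-to-all collective during Context Parallel SDPA.
set_rotate_method("alltoall")

# "allgather" is the default if set_rotate_method is never called.
set_rotate_method("allgather")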

File tree

2 files changed: +50 −4 lines

test/distributed/_tensor/test_attention.py

Lines changed: 13 additions & 3 deletions

@@ -15,6 +15,7 @@
     _RotateMethod,
     context_parallel,
     context_parallel_unshard,
+    set_rotate_method,
 )
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.distributed.tensor.parallel import parallelize_module
@@ -48,6 +49,12 @@
     backends.append(SDPBackend.EFFICIENT_ATTENTION)


+rotater_enum_to_str = {
+    _RotateMethod.ALL_GATHER: "allgather",
+    _RotateMethod.ALL_TO_ALL: "alltoall",
+}  # mapping from _RotateMethod enum to string
+
+
 class RingAttentionTest(DTensorTestBase):
     @property
     def world_size(self) -> int:
@@ -76,7 +83,8 @@ def test_ring_attention_sdpa(
         load_balance: bool,
         rotater: _RotateMethod,
     ) -> None:
-        _cp_options.rotate_method = rotater
+        set_rotate_method(rotater_enum_to_str[rotater])
+        self.assertEqual(_cp_options.rotate_method, rotater)
         device_mesh = DeviceMesh(self.device_type, torch.arange(0, self.world_size))
         dtype = torch.bfloat16
         bs = 8
@@ -230,7 +238,8 @@ def test_ring_attention_native_transformer(
         self, is_causal: bool, rotater: _RotateMethod
     ) -> None:
         _cp_options.enable_load_balance = is_causal
-        _cp_options.rotate_method = rotater
+        set_rotate_method(rotater_enum_to_str[rotater])
+        self.assertEqual(_cp_options.rotate_method, rotater)
         device_mesh = DeviceMesh(
             self.device_type,
             torch.arange(0, self.world_size),
@@ -314,7 +323,8 @@ def test_ring_attention_native_transformer(
     @sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
     @parametrize("rotater", [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL])
     def test_ring_attention_custom_transformer(self, rotater: _RotateMethod) -> None:
-        _cp_options.rotate_method = rotater
+        set_rotate_method(rotater_enum_to_str[rotater])
+        self.assertEqual(_cp_options.rotate_method, rotater)
         device_mesh = DeviceMesh(
             self.device_type,
             torch.arange(0, self.world_size),

torch/distributed/tensor/experimental/_attention.py

Lines changed: 37 additions & 1 deletion

@@ -31,7 +31,7 @@
 from torch.distributed.tensor.parallel.style import ParallelStyle


-__all__ = ["context_parallel"]
+__all__ = ["context_parallel", "set_rotate_method"]


 class _CausalBehavior(Enum):
@@ -1284,10 +1284,46 @@ def context_parallel_unshard(
 ) -> List[torch.Tensor]:
     """
     Unshard the tensors (e.g., output) that are sharded due to context parallelism.
+
+    Args:
+        mesh (:class:`DeviceMesh`): the device mesh for the context parallelism.
+        buffers (List[torch.Tensor]): the buffers to be unsharded.
+        seq_dims (List[int]): the sequence dimensions of ``buffers``. This list
+            must have the same length as ``buffers``.
+
+    Returns:
+        List[torch.Tensor]: the unsharded buffers.
     """
     sharder = (
         _RoundRobinLoadBalancer
         if _cp_options.enable_load_balance
         else _SequentialSharder
     )
     return [sharder.unshard(b, mesh, dim) for b, dim in zip(buffers, seq_dims)]
+
+
+def set_rotate_method(rotate_method: str) -> None:
+    """
+    Context Parallel SDPA requires the rotation of kv shards. Users can call this
+    API to specify which rotation method to use. "alltoall" shuffles the kv shards
+    using all-to-all collective. While "allgather" gathers the kv shards using
+    all-gather collective after the first sub-SDPA computation. If this API has not
+    been called, the default rotate method is "allgather".
+
+    Args:
+        rotate_method (str): the rotate method to use. Currently only supports
+            "allgather" and "alltoall". If a different string other than these two
+            is passed in, the function will raise an error.
+
+    Returns:
+        None
+    """
+    if rotate_method == "allgather":
+        _cp_options.rotate_method = _RotateMethod.ALL_GATHER
+    elif rotate_method == "alltoall":
+        _cp_options.rotate_method = _RotateMethod.ALL_TO_ALL
+    else:
+        raise NotImplementedError(
+            "Context Parallel does not support "
+            f"using {rotate_method} for kv shards rotation"
+        )
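As a rough sketch of how the new API interacts with the internal options, mirroring the assertion pattern used in the updated tests above (`_cp_options` and `_RotateMethod` are private internals and may change; "ring" below is just an example of an unsupported value):

from torch.distributed.tensor.experimental._attention import (
    _cp_options,
    _RotateMethod,
    set_rotate_method,
)

# Setting the method updates the internal context-parallel options.
set_rotate_method("alltoall")
assert _cp_options.rotate_method == _RotateMethod.ALL_TO_ALL

# Any string other than "allgather" or "alltoall" raises NotImplementedError.
try:
    set_rotate_method("ring")
except NotImplementedError as exc:
    print(exc)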
