
[Core][Distributed] add coordinator to reduce code duplication in tp and pp #5293

Merged: 43 commits, merged Jun 13, 2024
Changes shown from 1 commit

Commits (43):
e0101c7
add GroupCoordinator
youkaichao Jun 5, 2024
bc09989
declare fields
youkaichao Jun 5, 2024
bece27b
add tp group
youkaichao Jun 5, 2024
446a952
add allgather
youkaichao Jun 5, 2024
292f850
add gather
youkaichao Jun 5, 2024
4d4f53d
move broadcast in coordinator
youkaichao Jun 5, 2024
390fd0f
move graph_capture into parallel state
youkaichao Jun 5, 2024
9caa4bc
move broadcast_tensor_dict into coordinator
youkaichao Jun 5, 2024
0137663
add first rank and last rank in coordinator
youkaichao Jun 5, 2024
6043382
add next rank and prev rank in coordinator
youkaichao Jun 5, 2024
838773b
remove _PP_GLOBAL_RANKS
youkaichao Jun 5, 2024
3cd1926
add barrier in coordinator
youkaichao Jun 5, 2024
e284955
remove get_cpu_world_group
youkaichao Jun 5, 2024
1ea37d5
clean up
youkaichao Jun 5, 2024
26fff33
remove unused functions
youkaichao Jun 5, 2024
06a94a9
keep writing
youkaichao Jun 5, 2024
38121c8
add _WORLD
youkaichao Jun 5, 2024
617a030
remove _LOCAL_RANK
youkaichao Jun 5, 2024
1a1ff60
add destroy_distributed_environment
youkaichao Jun 5, 2024
b1d4973
remove get local rank
youkaichao Jun 5, 2024
30533a0
enforce device arg
youkaichao Jun 5, 2024
846365e
add comments
youkaichao Jun 5, 2024
2a89158
fix import
youkaichao Jun 5, 2024
15a7f15
hf runner does not need cleanup
youkaichao Jun 5, 2024
cc2c446
add check for torch destroy
youkaichao Jun 5, 2024
dff75cc
fix yield
youkaichao Jun 6, 2024
bc215e4
fix tests
youkaichao Jun 6, 2024
666ad18
fix prefix caching test (only one LLM can exist)
youkaichao Jun 6, 2024
07128a1
allow multiple call of init_distributed_environment
youkaichao Jun 6, 2024
78bf7c3
revert prefix caching test
youkaichao Jun 6, 2024
51409a4
fix arg of broadcast_tensor_dict
youkaichao Jun 6, 2024
3e8ba06
allow calling broadcast_tensor_dict without initializing distributed
youkaichao Jun 6, 2024
36d48d1
remove world warmup
youkaichao Jun 6, 2024
65aa2f9
use coordinator check
youkaichao Jun 7, 2024
9857942
Merge branch 'main' into coordinator_impl
youkaichao Jun 8, 2024
9621f60
Merge branch 'main' into coordinator_impl
youkaichao Jun 10, 2024
cd6d652
fix documentation build error
youkaichao Jun 10, 2024
9b02ec0
Merge branch 'main' into coordinator_impl
youkaichao Jun 12, 2024
c7a8614
add
youkaichao Jun 12, 2024
e22b656
add back cleanup
youkaichao Jun 12, 2024
a976d59
add comment
youkaichao Jun 12, 2024
ef78f38
use get_tp_group get_pp_group get_world_group
youkaichao Jun 12, 2024
2afc1f1
add comments
youkaichao Jun 12, 2024
remove get_cpu_world_group
youkaichao committed Jun 5, 2024
commit e284955ca4498a3f6f8c2e3b91a70e9eb8b9c111
8 changes: 4 additions & 4 deletions tests/distributed/test_pynccl.py
@@ -10,7 +10,7 @@
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             graph_capture,
+                                             get_world, graph_capture,
                                              init_distributed_environment)
 from vllm.utils import update_environment_variables

@@ -54,7 +54,7 @@ def wrapped_fn(env):

 @worker_fn_wrapper
 def worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world().cpu_group)
     tensor = torch.ones(16, 1024, 1024,
                         dtype=torch.float32).cuda(pynccl_comm.rank)
     with pynccl_comm.change_state(enable=True):
@@ -130,7 +130,7 @@ def test_pynccl_multiple_allreduce_with_vllm():
 def worker_fn_with_cudagraph():
     with torch.no_grad():
         graph = torch.cuda.CUDAGraph()
-        pynccl_comm = PyNcclCommunicator()
+        pynccl_comm = PyNcclCommunicator(get_world().cpu_group)
         # run something in the default stream to initialize torch engine
         a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
         torch.cuda.synchronize()
@@ -155,7 +155,7 @@ def test_pynccl_with_cudagraph():

 @worker_fn_wrapper
 def send_recv_worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world().cpu_group)
     if pynccl_comm.rank == 0:
         tensor = torch.ones(16, 1024, 1024,
                             dtype=torch.float32).cuda(pynccl_comm.rank)
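
The test changes above show the new calling convention: PyNcclCommunicator no longer falls back to a global CPU group and must be handed one explicitly. A minimal sketch, assuming the distributed environment is already initialized inside a worker process (get_world() is the accessor at this commit; later commits in this PR rename the accessors to get_world_group()/get_tp_group()/get_pp_group()):

# Minimal usage sketch; assumes init_distributed_environment has run.
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.parallel_state import get_world

# The group argument is now required; the communicator attaches to the
# world coordinator's CPU (gloo) group instead of an implicit global one.
pynccl_comm = PyNcclCommunicator(get_world().cpu_group)
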
5 changes: 2 additions & 3 deletions vllm/distributed/device_communicators/pynccl.py
@@ -9,7 +9,7 @@
 from vllm.distributed.device_communicators.pynccl_wrapper import (
     NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
     ncclRedOpTypeEnum, ncclUniqueId)
-from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
+from vllm.distributed.parallel_state import get_local_rank
 from vllm.logger import init_logger

 logger = init_logger(__name__)
@@ -19,7 +19,7 @@ class PyNcclCommunicator:

     def __init__(
         self,
-        group: Optional[ProcessGroup] = None,
+        group: ProcessGroup,
         device: Optional[Union[int, str, torch.device]] = None,
         library_path: Optional[str] = None,
     ):
@@ -35,7 +35,6 @@ def __init__(
         is bind to a unique device.
         """
         assert dist.is_initialized()
-        group = get_cpu_world_group() if group is None else group
         assert dist.get_backend(group) != dist.Backend.NCCL, (
             "PyNcclCommunicator should be attached to a non-NCCL group.")
         self.group = group
6 changes: 0 additions & 6 deletions vllm/distributed/parallel_state.py
@@ -678,12 +678,6 @@ def model_parallel_is_initialized():
     return (_TP is not None and _PP_DEVICE_GROUP is not None)


-def get_cpu_world_group():
-    """Get the CPU world group."""
-    assert _CPU_WORLD_GROUP is not None, ("CPU world group is not initialized")
-    return _CPU_WORLD_GROUP
-
-
 def get_pipeline_model_parallel_group():
     """Get the pipeline model parallel group the caller rank belongs to."""
     assert _PP_DEVICE_GROUP is not None, (
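
For any other callers of the removed helper, the migration is mechanical: ask the world coordinator for its CPU group. A short before/after sketch, under the same naming caveat as above:

# Before this commit:
#   from vllm.distributed.parallel_state import get_cpu_world_group
#   cpu_group = get_cpu_world_group()

# After this commit, the CPU group lives on the world coordinator:
from vllm.distributed.parallel_state import get_world

cpu_group = get_world().cpu_group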