Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Distributed] Add send and recv helpers #5719

Merged
merged 14 commits into from
Jun 23, 2024
Prev Previous commit
Next Next commit
Refactor send and recv functions and add new test
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
  • Loading branch information
andoorve committed Jun 22, 2024
commit 31ce144feedb6239b36cc1c4435ab42aeb3e6016
25 changes: 24 additions & 1 deletion tests/distributed/test_comm_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,28 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
assert torch.allclose(recv_dict["f"], test_dict["f"])


@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                          distributed_init_port: str):
    """Exercise the pipeline-parallel point-to-point send/recv helpers.

    Every rank except the last pipeline rank sends a fixed tensor to its
    successor; every rank except the first receives one from its
    predecessor and checks it matches the sent payload.

    Args:
        tp_size: tensor-parallel world size for the test environment.
        pp_size: pipeline-parallel world size for the test environment.
        rank: this worker's global rank.
        distributed_init_port: free port used to rendezvous the process group.
    """
    # Ray pins CUDA_VISIBLE_DEVICES per actor; clear it so set_device below
    # can address the GPU by global rank.
    del os.environ["CUDA_VISIBLE_DEVICES"]
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)

    size = 64
    # Use `size` (not a second hard-coded 64) and the device selected above,
    # so the payload length and placement cannot drift apart.
    test_tensor = torch.arange(size, dtype=torch.float32, device=device)

    if not is_pipeline_model_parallel_first_rank():
        recv_tensor = get_pp_group().recv(size, dtype=torch.float32)

    if not is_pipeline_model_parallel_last_rank():
        get_pp_group().send(test_tensor)

    if not is_pipeline_model_parallel_first_rank():
        assert torch.allclose(test_tensor, recv_tensor)

@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tp_size", [2])
Expand All @@ -160,6 +182,7 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [send_recv_tensor_dict_test_worker])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(pp_size, test_target):
    """Run *test_target* across ``pp_size`` pipeline-parallel worker
    processes (tensor-parallel size fixed at 1).

    ``test_target`` is one of the ray-remote worker functions defined above;
    ``multi_process_parallel`` launches one worker per rank and waits for
    all of them to finish.
    """
    multi_process_parallel(1, pp_size, test_target)
Loading