Skip to content

Commit 4e5affe

Browse files
authored
[CI] Add Decode Context Parallelism (DCP) test to CI (#24487)
Signed-off-by: Ming Yang <minos.future@gmail.com>
1 parent e4f0b4c commit 4e5affe

File tree

2 files changed

+21
-7
lines changed

2 files changed

+21
-7
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -946,7 +946,6 @@ steps:
946946
commands:
947947
- pytest -v -s distributed/test_pp_cudagraph.py
948948
- pytest -v -s distributed/test_pipeline_parallel.py
949-
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
950949

951950
- label: LoRA TP Test (Distributed) # 17 min
952951
timeout_in_minutes: 30
@@ -1020,9 +1019,21 @@ steps:
10201019
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
10211020
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
10221021

1023-
- label: Qwen MoE EP Test # optional
1022+
##### H200 test #####
1023+
- label: Distributed Tests (H200) # optional
10241024
gpu: h200
10251025
optional: true
1026+
working_dir: "/vllm-workspace/"
1027+
num_gpus: 2
1028+
commands:
1029+
- pytest -v -s tests/distributed/test_context_parallel.py
1030+
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
1031+
1032+
##### B200 test #####
1033+
- label: Distributed Tests (B200) # optional
1034+
gpu: b200
1035+
optional: true
1036+
working_dir: "/vllm-workspace/"
10261037
num_gpus: 2
10271038
commands:
1028-
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
1039+
- pytest -v -s tests/distributed/test_context_parallel.py

tests/distributed/test_context_parallel.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,13 @@ def detailed(
7171
parallel_setups = []
7272
for eager_mode_val in [False]:
7373
for pp_multiplier in [1]:
74-
for dcp_multiplier in [2, 4]:
74+
for dcp_multiplier in [0.5, 1]:
7575
for chunked_prefill_val in [True]:
7676
parallel_setups.append(
7777
ParallelSetup(tp_size=tp_base,
7878
pp_size=pp_multiplier * pp_base,
79-
dcp_size=dcp_multiplier * dcp_base,
79+
dcp_size=int(dcp_multiplier *
80+
tp_base),
8081
eager_mode=eager_mode_val,
8182
chunked_prefill=chunked_prefill_val))
8283
return CPTestSettings(
@@ -223,7 +224,9 @@ def _compare_cp_with_tp(
223224

224225
CP_TEXT_GENERATION_MODELS = {
225226
# [MLA attention only]
226-
"deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
227+
"deepseek-ai/DeepSeek-V2-Lite-Chat":
228+
[CPTestSettings.detailed(),
229+
CPTestSettings.detailed(tp_base=2)],
227230
}
228231

229232
CP_TEST_MODELS = [
@@ -238,7 +241,7 @@ def _compare_cp_with_tp(
238241
"runner", "test_options"),
239242
[
240243
params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
241-
for params in settings.iter_params(model_id)
244+
for setting in settings for params in setting.iter_params(model_id)
242245
if model_id in CP_TEST_MODELS
243246
],
244247
)

0 commit comments

Comments
 (0)