
Commit ecd9f20

[Spec Decode] Make speculative decoding compatible with pipeline parallelism

Signed-off-by: Xin Yang <xyangx@amazon.com>

1 parent 374ee28 commit ecd9f20

16 files changed: +346 −95 lines
Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests which cover integration of the speculative decoding framework with
pipeline parallelism.
"""

from typing import Optional

import pytest
import torch

from vllm.platforms import current_platform

from .conftest import run_equality_correctness_test_tp


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [[
        # Skip cuda graph recording for fast test.
        "--enforce-eager",
        "--pipeline-parallel-size",
        "2",

        # precision
        "--dtype",
        "bfloat16",
    ]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
                         [("JackFram/llama-68m", [
                             "--speculative-model",
                             "JackFram/llama-68m",
                             "--num-speculative-tokens",
                             "5",
                             "--speculative-draft-pipeline-parallel-size",
                             "1",
                         ]),
                          ("ibm-granite/granite-3b-code-instruct", [
                              "--speculative-model",
                              "ibm-granite/granite-3b-code-instruct",
                              "--num-speculative-tokens",
                              "5",
                              "--speculative-draft-pipeline-parallel-size",
                              "1",
                          ])])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_pp_lt_target_model_pp2(model, common_llm_kwargs,
                                            per_test_common_llm_kwargs,
                                            baseline_llm_kwargs,
                                            test_llm_kwargs, batch_size: int,
                                            seed: int):
    """Verify spec decode works well with smaller pp for draft models.
    """
    if current_platform.is_rocm():
        pytest.skip("hip is not well-supported yet")
    run_equality_correctness_test_tp(model,
                                     common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs,
                                     test_llm_kwargs,
                                     batch_size,
                                     max_output_len=32,
                                     seed=seed,
                                     temperature=0.0)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [[
        # Skip cuda graph recording for fast test.
        "--enforce-eager",
        "--pipeline-parallel-size",
        "2",

        # precision
        "--dtype",
        "bfloat16",
    ]])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [["--enable-chunked-prefill", "False"],
     [
         "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
         "--max-num-seqs", "4"
     ]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
                         [("JackFram/llama-68m", [
                             "--speculative-model",
                             "JackFram/llama-68m",
                             "--num-speculative-tokens",
                             "3",
                             "--speculative-draft-pipeline-parallel-size",
                             "1",
                         ])])
@pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_pp2(model, common_llm_kwargs,
                                         per_test_common_llm_kwargs,
                                         baseline_llm_kwargs, test_llm_kwargs,
                                         logprobs: Optional[int],
                                         batch_size: int, seed: int):
    """Verify spec decode works well with same and different PP size for
    the draft model with chunked prefill.
    """
    if logprobs:
        test_llm_kwargs.extend(
            ["--disable-logprobs-during-spec-decoding", "False"])
    run_equality_correctness_test_tp(model,
                                     common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs,
                                     test_llm_kwargs,
                                     batch_size,
                                     max_output_len=32,
                                     seed=seed,
                                     temperature=0.0,
                                     logprobs=logprobs)
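For readers unfamiliar with the harness: run_equality_correctness_test_tp lives in the local conftest and is not part of this diff. A minimal sketch of what such an equality check is assumed to do — run the baseline and the speculative configuration on the same prompts with greedy sampling and require identical text — using vLLM's offline API (the helper below is hypothetical, not the real conftest code):

# Illustrative only: the real harness parses CLI-style kwargs and manages
# processes; this sketch shows just the equality contract being tested.
from vllm import LLM, SamplingParams

def equality_check_sketch(model: str, baseline_kwargs: dict,
                          test_kwargs: dict, prompts: list,
                          max_output_len: int = 32) -> None:
    greedy = SamplingParams(temperature=0.0, max_tokens=max_output_len)

    baseline = LLM(model=model, **baseline_kwargs)
    expected = [o.outputs[0].text for o in baseline.generate(prompts, greedy)]
    del baseline  # release GPU memory before building the second engine

    test = LLM(model=model, **test_kwargs)
    actual = [o.outputs[0].text for o in test.generate(prompts, greedy)]

    # Rejection-sampling spec decode is lossless under greedy sampling, so
    # the speculative engine must reproduce the baseline exactly.
    assert actual == expected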

tests/spec_decode/test_multi_step_worker.py

Lines changed: 13 additions & 12 deletions
@@ -12,7 +12,7 @@
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob,
                            get_all_seq_ids)
-from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
+from vllm.spec_decode.draft_model_runner import TP1PP1DraftModelRunner
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
@@ -91,7 +91,7 @@ def test_same_output_for_single_step():
         block_size,
         num_gpu_blocks,
         seed,
-        model_runner_cls=TP1DraftModelRunner,
+        model_runner_cls=TP1PP1DraftModelRunner,
     )
     worker = create_worker(
         Worker,
@@ -304,7 +304,7 @@ def test_multi_step_with_batch_expansion_correct_output():
         block_size,
         num_gpu_blocks,
         seed,
-        model_runner_cls=TP1DraftModelRunner,
+        model_runner_cls=TP1PP1DraftModelRunner,
     )
     multi_step_worker.set_include_gpu_probs_tensor()
     worker = create_worker(
@@ -399,7 +399,7 @@ def test_multi_step_with_batch_expansion_incorrect_output():
         block_size,
         num_gpu_blocks,
         seed,
-        model_runner_cls=TP1DraftModelRunner,
+        model_runner_cls=TP1PP1DraftModelRunner,
     )
     multi_step_worker.set_include_gpu_probs_tensor()
     worker = create_worker(
@@ -502,13 +502,14 @@ def test_multi_step_correct_kvcache(num_steps, attn_backend):

     with global_force_attn_backend_context_manager(attn_backend):
         dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
-        multi_step_worker = create_worker(MultiStepWorker,
-                                          model_name,
-                                          block_size,
-                                          num_gpu_blocks,
-                                          seed,
-                                          model_runner_cls=TP1DraftModelRunner,
-                                          dtype=dtype)
+        multi_step_worker = create_worker(
+            MultiStepWorker,
+            model_name,
+            block_size,
+            num_gpu_blocks,
+            seed,
+            model_runner_cls=TP1PP1DraftModelRunner,
+            dtype=dtype)
         multi_step_worker.set_include_gpu_probs_tensor()
         worker = create_worker(Worker,
                                model_name,
@@ -771,7 +772,7 @@ def test_use_draft_model_runner_advance_step():
         block_size,
         num_gpu_blocks,
         seed,
-        model_runner_cls=TP1DraftModelRunner,
+        model_runner_cls=TP1PP1DraftModelRunner,
     )

     # Mock "_gpu_advance_step" to raise an exception when called.
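(The rename from TP1DraftModelRunner to TP1PP1DraftModelRunner reflects that this optimized draft-model runner is only used when the draft model runs with both tensor parallel size 1 and, now, pipeline parallel size 1.)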

tests/spec_decode/test_spec_decode_worker.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SequenceOutput
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
-from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
+from vllm.spec_decode.draft_model_runner import TP1PP1DraftModelRunner
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                       SpecDecodeWorkerMetrics)
@@ -929,7 +929,7 @@ def test_correctly_load_weight_for_eagle():
         block_size,
         num_gpu_blocks,
         seed,
-        model_runner_cls=TP1DraftModelRunner,
+        model_runner_cls=TP1PP1DraftModelRunner,
     )

     spec_decode_sampler = mock_spec_decode_sampler("rejection_sampler")

vllm/config.py

Lines changed: 34 additions & 3 deletions
@@ -1855,6 +1855,7 @@ def maybe_create_spec_config(
         speculative_model: Optional[str],
         speculative_model_quantization: Optional[str],
         speculative_draft_tensor_parallel_size: Optional[int],
+        speculative_draft_pipeline_parallel_size: Optional[int],
         num_speculative_tokens: Optional[int],
         speculative_disable_mqa_scorer: Optional[bool],
         speculative_max_model_len: Optional[int],
@@ -1887,6 +1888,8 @@ def maybe_create_spec_config(
                 None, we assume the model weights are not quantized.
             speculative_draft_tensor_parallel_size (Optional[int]): The degree
                 of the tensor parallelism for the draft model.
+            speculative_draft_pipeline_parallel_size (Optional[int]): The degree
+                of the pipeline parallelism for the draft model.
             num_speculative_tokens (Optional[int]): The number of speculative
                 tokens, if provided. Will default to the number in the draft
                 model config if present, otherwise is required.
@@ -2029,6 +2032,12 @@ def maybe_create_spec_config(
                     speculative_draft_tensor_parallel_size,
                     draft_hf_config
                 )
+            speculative_draft_pipeline_parallel_size = \
+                SpeculativeConfig._verify_and_get_draft_model_pipeline_parallel_size(
+                    target_parallel_config,
+                    speculative_draft_pipeline_parallel_size,
+                    draft_hf_config
+                )

             draft_model_config.max_model_len = (
                 SpeculativeConfig._maybe_override_draft_max_model_len(
@@ -2040,7 +2049,8 @@ def maybe_create_spec_config(
             draft_parallel_config = (
                 SpeculativeConfig.create_draft_parallel_config(
                     target_parallel_config,
-                    speculative_draft_tensor_parallel_size, draft_hf_config))
+                    speculative_draft_tensor_parallel_size,
+                    speculative_draft_pipeline_parallel_size, draft_hf_config))

             if num_speculative_tokens is None:
                 raise ValueError(
@@ -2136,19 +2146,40 @@ def _verify_and_get_draft_model_tensor_parallel_size(
                 f"other value than 1 or target model tensor_parallel_size")
         return speculative_draft_tensor_parallel_size

+    @staticmethod
+    def _verify_and_get_draft_model_pipeline_parallel_size(
+            target_parallel_config: ParallelConfig,
+            speculative_draft_pipeline_parallel_size: Optional[int],
+            draft_hf_config: PretrainedConfig) -> int:
+        """
+        Verifies and adjusts the pipeline parallel size for a draft model
+        specified using speculative_draft_pipeline_parallel_size.
+        """
+        # If speculative_draft_pipeline_parallel_size is unset then set it
+        # appropriately, else verify that it is set correctly.
+        if speculative_draft_pipeline_parallel_size is None:
+            speculative_draft_pipeline_parallel_size = \
+                target_parallel_config.pipeline_parallel_size
+        elif speculative_draft_pipeline_parallel_size not in (
+                1, target_parallel_config.pipeline_parallel_size):
+            raise ValueError(
+                f"{speculative_draft_pipeline_parallel_size=} cannot be "
+                f"other value than 1 or target model pipeline_parallel_size")
+        return speculative_draft_pipeline_parallel_size
+
     @staticmethod
     def create_draft_parallel_config(
         target_parallel_config: ParallelConfig,
         speculative_draft_tensor_parallel_size: int,
+        speculative_draft_pipeline_parallel_size: int,
         draft_hf_config: PretrainedConfig,
     ) -> ParallelConfig:
         """Create a parallel config for use by the draft worker.

         This is mostly a copy of the target parallel config, except the tp_size.
         """
         draft_parallel_config = ParallelConfig(
-            pipeline_parallel_size=target_parallel_config.
-            pipeline_parallel_size,
+            pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
             tensor_parallel_size=speculative_draft_tensor_parallel_size,
             distributed_executor_backend=target_parallel_config.
             distributed_executor_backend,
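The validation rule added above is compact enough to restate in isolation: leaving the draft pp size unset inherits the target's pp size, while an explicit value must be either 1 or exactly the target's pp size. A standalone sketch of the same rule (hypothetical function name, plain ints instead of ParallelConfig):

from typing import Optional

def resolve_draft_pp_size(target_pp_size: int,
                          draft_pp_size: Optional[int]) -> int:
    """Mirror of the check in SpeculativeConfig (illustration only)."""
    if draft_pp_size is None:
        return target_pp_size  # unset: inherit the target's pp degree
    if draft_pp_size not in (1, target_pp_size):
        raise ValueError(
            f"{draft_pp_size=} cannot be other value than 1 or "
            f"target model pipeline_parallel_size")
    return draft_pp_size

assert resolve_draft_pp_size(2, None) == 2  # unset -> inherit
assert resolve_draft_pp_size(2, 1) == 1     # collapse draft onto one stage
assert resolve_draft_pp_size(2, 2) == 2     # match the target
# resolve_draft_pp_size(2, 3) raises ValueError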

vllm/distributed/parallel_state.py

Lines changed: 14 additions & 2 deletions
@@ -1024,11 +1024,13 @@ def model_parallel_is_initialized():


 _TP_STATE_PATCHED = False
+_PP_STATE_PATCHED = False


 @contextmanager
-def patch_tensor_parallel_group(tp_group: GroupCoordinator):
-    """Patch the tp group temporarily until this function ends.
+def patch_model_parallel_group(tp_group: GroupCoordinator,
+                               pp_group: GroupCoordinator):
+    """Patch the tp and pp group temporarily until this function ends.

     This method is for draft workers of speculative decoding to run draft model
     with different tp degree from that of target model workers.
@@ -1039,16 +1041,26 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator):
     global _TP_STATE_PATCHED
     assert not _TP_STATE_PATCHED, "Should not call when it's already patched"

+    global _PP_STATE_PATCHED
+    assert not _PP_STATE_PATCHED, "Should not call when it's already patched"
+
     _TP_STATE_PATCHED = True
     old_tp_group = get_tp_group()
     global _TP
     _TP = tp_group
+
+    _PP_STATE_PATCHED = True
+    old_pp_group = get_pp_group()
+    global _PP
+    _PP = pp_group
     try:
         yield
     finally:
         # restore the original state
         _TP_STATE_PATCHED = False
         _TP = old_tp_group
+        _PP_STATE_PATCHED = False
+        _PP = old_pp_group


 def get_tensor_model_parallel_world_size():
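A sketch of how a draft worker is assumed to use the widened context manager (the class and method names below are illustrative, not part of this commit): it builds GroupCoordinators spanning only the draft model's ranks once, then installs them just for the duration of each draft forward pass so collectives inside the draft model resolve to the smaller tp/pp world.

# Hypothetical sketch; real usage lives in vLLM's smaller-TP proposer worker.
from vllm.distributed.parallel_state import patch_model_parallel_group

class DraftWorkerSketch:

    def __init__(self, draft_tp_group, draft_pp_group):
        # GroupCoordinators covering only the ranks the draft model uses.
        self._tp_group = draft_tp_group
        self._pp_group = draft_pp_group

    def execute_model(self, execute_model_req):
        # Inside the context, get_tp_group()/get_pp_group() return the draft
        # groups; the target model's groups are restored on exit even if the
        # draft forward pass raises.
        with patch_model_parallel_group(self._tp_group, self._pp_group):
            return self._run_draft(execute_model_req)

    def _run_draft(self, execute_model_req):
        raise NotImplementedError  # stand-in for the draft forward pass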

vllm/engine/arg_utils.py

Lines changed: 10 additions & 0 deletions
@@ -181,6 +181,7 @@ class EngineArgs:
     speculative_model: Optional[str] = None
     speculative_model_quantization: Optional[str] = None
     speculative_draft_tensor_parallel_size: Optional[int] = None
+    speculative_draft_pipeline_parallel_size: Optional[int] = None
     num_speculative_tokens: Optional[int] = None
     speculative_disable_mqa_scorer: Optional[bool] = False
     speculative_max_model_len: Optional[int] = None
@@ -812,6 +813,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.speculative_draft_tensor_parallel_size,
             help='Number of tensor parallel replicas for '
             'the draft model in speculative decoding.')
+        parser.add_argument(
+            '--speculative-draft-pipeline-parallel-size',
+            '-spec-draft-pp',
+            type=int,
+            default=EngineArgs.speculative_draft_pipeline_parallel_size,
+            help='Number of pipeline parallel replicas for '
+            'the draft model in speculative decoding.')

         parser.add_argument(
             '--speculative-max-model-len',
@@ -1266,6 +1274,8 @@ def create_engine_config(
             self.speculative_model_quantization,
             speculative_draft_tensor_parallel_size = \
                 self.speculative_draft_tensor_parallel_size,
+            speculative_draft_pipeline_parallel_size = \
+                self.speculative_draft_pipeline_parallel_size,
             num_speculative_tokens=self.num_speculative_tokens,
             speculative_disable_mqa_scorer=self.speculative_disable_mqa_scorer,
             speculative_disable_by_batch_size=self.
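Since the offline LLM entry point forwards keyword arguments through EngineArgs, the new flag should also be reachable without the CLI. A sketch mirroring the test configuration above (assuming, as usual, that the kwarg names match the CLI flags with dashes replaced by underscores):

from vllm import LLM, SamplingParams

# Target model on 2 pipeline stages; draft model collapsed onto one stage
# (pp=1), matching --speculative-draft-pipeline-parallel-size 1 in the tests.
llm = LLM(
    model="JackFram/llama-68m",
    pipeline_parallel_size=2,
    speculative_model="JackFram/llama-68m",
    num_speculative_tokens=5,
    speculative_draft_pipeline_parallel_size=1,
    enforce_eager=True,
    dtype="bfloat16",
)

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)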

vllm/model_executor/models/eagle.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

+from .interfaces import SupportsPP
 from .utils import maybe_prefix

 logger = init_logger(__name__)
@@ -41,7 +42,7 @@ def forward(self, x, residual):
         return x + residual, None


-class EAGLE(nn.Module):
+class EAGLE(nn.Module, SupportsPP):
     """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077
     Reference implementation: https://github.com/SafeAILab/EAGLE
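Declaring SupportsPP marks the EAGLE draft head as pipeline-parallel capable so the runner will route IntermediateTensors through it. Roughly, the contract that interface implies looks like the sketch below (illustrative shape only; the helper names are hypothetical and this is not the EAGLE forward from this file):

from vllm.sequence import IntermediateTensors

class PPCapableSketch:
    """Rough shape of a forward pass that satisfies SupportsPP."""

    def forward(self, input_ids, positions, intermediate_tensors=None):
        if intermediate_tensors is not None:
            # Non-first stages resume from the previous stage's activations.
            hidden = intermediate_tensors["hidden_states"]
        else:
            hidden = self.embed(input_ids)  # hypothetical helper
        hidden = self.run_layers(hidden, positions)  # hypothetical helper
        if not self.is_last_stage:  # hypothetical flag
            # Hand activations off to the next pipeline stage.
            return IntermediateTensors({"hidden_states": hidden})
        return hidden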