vllm-project · RuBing-Yang · Aug 18, 2025 · Aug 18, 2025 · Aug 18, 2025 · Oct 13, 2025
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
@@ -16,8 +16,12 @@
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VllmConfig)
 from vllm.model_executor.models.llama import LlamaForCausalLM
+from vllm.model_executor.utils import ThinkSettings
 from vllm.platforms import current_platform
+from vllm.sampling_params import SamplingParams
 from vllm.v1.spec_decode.eagle import EagleProposer
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
 eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
@@ -532,3 +536,92 @@ def create_deterministic_logits(token_ids, k: int):
 
     # Verify that the draft tokens match our expectations.
     assert torch.equal(result, expected_tokens)
+
+
+@pytest.mark.parametrize("early_stop_method,early_stop_signal,should_stop", [
+    ("confidence", [0.9], True),
+    ("confidence", [0.7], False),
+    ("remain", [50.0], True),
+    ("remain", [150.0], False),
+    ("progress", [0.5], True),
+    ("progress", [0.1], False),
+    ("confidence_progress_remain", [0.9, 0.5, 50.0], True),
+    ("confidence_progress_remain", [0.7, 0.5, 50.0], False),
+    ("confidence_progress_remain", [0.9, 0.1, 50.0], False),
+    ("confidence_progress_remain", [0.9, 0.5, 150.0], False),
+])
+def test_speculative_early_exit(early_stop_method, early_stop_signal,
+                                should_stop):
+    """Check that early-stop thinking truncates sampled tokens when triggered.
+
+    Calls ``GPUModelRunner._apply_early_stop_thinking`` on a mocked runner
+    holding one request and compares the tokens and ``thinking_state`` left
+    behind against ``should_stop``.
+    """
+    target_device = torch.device(current_platform.device_type)
+
+    # Drafter configured for the early-stop method under test.
+    drafter = _create_proposer("eagle3", num_speculative_tokens=5)
+    spec_config = drafter.speculative_config
+    spec_config.early_stop_thinking = True
+    spec_config.early_stop_method = early_stop_method
+
+    # Think settings carrying one threshold per signal name.
+    settings = ThinkSettings(
+        start_think_id=100,
+        stop_think_id=101,
+        min_think_tokens=0,
+        think_prob_threshold={
+            "confidence": 0.8,
+            "progress": 0.3,
+            "remain": 100.0
+        },
+    )
+    settings.step_split_token_ids.add(200)
+
+    # Mocked runner wired to the drafter and the think settings.
+    mock_runner = mock.MagicMock(spec=GPUModelRunner)
+    mock_runner.model = mock.MagicMock()
+    mock_runner.model.think_settings = settings
+    mock_runner.drafter = drafter
+    mock_runner.speculative_config = spec_config
+
+    # One request created with thinking_state=True and the given signal.
+    generated = [1, 2, 3, 4, 5, 6]
+    request_id = "test_req_1"
+    request = CachedRequestState(req_id=request_id,
+                                 prompt_token_ids=[0],
+                                 sampling_params=SamplingParams(),
+                                 pooling_params=None,
+                                 mm_kwargs=[],
+                                 mm_positions=[],
+                                 block_ids=([], ),
+                                 generator=None,
+                                 num_computed_tokens=len(generated),
+                                 output_token_ids=generated,
+                                 thinking_state=True)
+    request.early_stop_ewma_score = early_stop_signal
+
+    batch = InputBatch(max_num_reqs=1,
+                       max_model_len=1024,
+                       max_num_batched_tokens=1024,
+                       device=torch.device(target_device),
+                       pin_memory=True,
+                       vocab_size=1024,
+                       block_sizes=[16])
+    batch.add_request(request)
+    mock_runner.requests = {request_id: request}
+    mock_runner.input_batch = batch
+
+    # Sampled tokens start with the registered step-split token (200).
+    untouched = [200, 10, 11, 12, 13]
+    sampled = [list(untouched)]
+    GPUModelRunner._apply_early_stop_thinking(mock_runner, sampled)
+
+    if should_stop:
+        # stop_think_id follows the split token; the rest is dropped.
+        expected = [200, settings.stop_think_id]
+    else:
+        expected = untouched
+    assert sampled[0] == expected
+    assert bool(request.thinking_state) != should_stop
@@ -1985,6 +1985,11 @@ class SpeculativeConfig:
         ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
+    early_stop_thinking: bool = False
+
+    early_stop_method: Optional[Literal["confidence", "remain", "progress",
+                                        "confidence_progress_remain"]] = None
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,

@@ -20,7 +20,15 @@
 from openai.types.responses import (ResponseFunctionToolCall,
                                     ResponseInputItemParam, ResponseOutputItem,
                                     ResponsePrompt, ResponseReasoningItem,
-                                    ResponseStatus, ResponseTextConfig)
+                                    ResponseStatus)
+
+# Backward compatibility for OpenAI client versions
+try:  # For older openai versions (< 1.100.0)
+    from openai.types.responses import ResponseTextConfig
+except ImportError:  # For newer openai versions (>= 1.100.0)
+    from openai.types.responses import (ResponseFormatTextConfig as
+                                        ResponseTextConfig)
+
 from openai.types.responses.response import ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning

diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
@@ -5,12 +5,13 @@
                                            PackedvLLMParameter)
 from vllm.model_executor.sampling_metadata import (SamplingMetadata,
                                                    SamplingMetadataCache)
-from vllm.model_executor.utils import set_random_seed
+from vllm.model_executor.utils import ThinkSettings, set_random_seed
 
 __all__ = [
     "SamplingMetadata",
     "SamplingMetadataCache",
     "set_random_seed",
     "BasevLLMParameter",
     "PackedvLLMParameter",
+    "ThinkSettings",
 ]
@@ -762,11 +762,11 @@ def __init__(
         self.global_num_experts = num_experts + num_redundant_experts
 
         # we padding globally so EP buffer allocation works
-        if (quant_config and quant_config.get_name() == "mxfp4"
-                and (current_platform.is_rocm()
-                     or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-                     or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16)):
-            hidden_size = round_up(hidden_size, 256)
+        if quant_config and quant_config.get_name() == "mxfp4":
+            from vllm.model_executor.layers.quantization.mxfp4 import (  # noqa: E501
+                should_use_flashinfer_mxfp4)
+            if current_platform.is_rocm() or should_use_flashinfer_mxfp4():
+                hidden_size = round_up(hidden_size, 256)
 
         # For smuggling this layer into the fused moe custom op
         compilation_config = vllm_config.compilation_config

@@ -6,6 +6,7 @@
 from torch.nn.parameter import Parameter
 
 from vllm import envs
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                   FusedMoEMethodBase)
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
@@ -26,12 +27,38 @@
 from vllm.scalar_type import scalar_types
 from vllm.utils import (has_triton_kernels, is_torch_equal_or_newer,
                         next_power_of_2, round_up)
+from vllm.utils.flashinfer import has_flashinfer
 
-if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
-    # from flashinfer.fused_moe import cutlass_fused_moe
-    from flashinfer import (mxfp8_quantize, shuffle_matrix_a,
-                            shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe)
+logger = init_logger(__name__)
+
+
+def _should_use_flashinfer_mxfp4_bf16():
+    """Return whether the FlashInfer MXFP4 BF16 backend should be used."""
+    # An explicit VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 setting always wins.
+    if envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"):
+        return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
+
+    # Default on only for capability-100 + FlashInfer with MXFP8 unset.
+    if (not current_platform.is_device_capability(100) or not has_flashinfer()
+            or envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8")):
+        return False
+
+    logger.info_once(
+        "Enabling FlashInfer MXFP4 BF16 backend by default for Blackwell. "
+        "For faster performance, consider setting "
+        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, "
+        "though this may impact accuracy.")
+    return True
+
+
+def _should_use_flashinfer_mxfp4_mxfp8():
+    """Return the value of ``envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8``."""
+    return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+
+
+def should_use_flashinfer_mxfp4():
+    use_mxfp8 = _should_use_flashinfer_mxfp4_mxfp8()  # MXFP8 is checked first
+    return use_mxfp8 or _should_use_flashinfer_mxfp4_bf16()
 
 
 class Mxfp4Config(QuantizationConfig):
@@ -87,12 +114,18 @@ def __init__(self, moe: FusedMoEConfig):
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
 
+        if current_platform.is_device_capability(100) and not has_flashinfer():
+            logger.warning_once(
+                "MXFP4 MoE is enabled on Blackwell but FlashInfer "
+                "is not available. This may result in degraded performance. "
+                "Please `pip install vllm[flashinfer]` for best results.")
+
     def _should_use_marlin(self):
         if envs.VLLM_MXFP4_USE_MARLIN is not None:
             return envs.VLLM_MXFP4_USE_MARLIN
         if current_platform.is_cuda() and \
-                not current_platform.has_device_capability(100):
-            if not current_platform.is_device_capability(90):
+                not current_platform.is_device_capability(100):
+            if not current_platform.has_device_capability(90):
                 # marlin kernel has better performance on ampere
                 return True
             if not has_triton_kernels():
@@ -138,8 +171,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
             layer.hidden_size = hidden_size
             layer.intermediate_size_per_partition = \
                 intermediate_size_per_partition_after_pad
-        elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-              or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        elif should_use_flashinfer_mxfp4():
             # pad the intermediate size to be a multiple of 2 * mxfp4_block
             # for to hold non-uniform sharded tensor as well as swizzling
             # other padding to increase performance
@@ -230,8 +262,8 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
     def process_weights_after_loading(self, layer):
         if self.use_marlin:
             prepare_moe_fp4_layer_for_marlin(layer)
-        elif (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-              or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        elif should_use_flashinfer_mxfp4():
+            from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a
             layer.gemm1_alpha = Parameter(torch.tensor(
                 [1.702] * self.num_experts, dtype=torch.float32).cuda(),
                                           requires_grad=False)
@@ -478,11 +510,11 @@ def apply(
             logical_replica_count), (
                 "MXFP4 are not supported with this configuration.")
 
-        if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-                or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+        if should_use_flashinfer_mxfp4():
+            from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe
             assert not self.moe.use_ep, (
                 "EP is not supported for flashinfer mxfp4 moe backend yet.")
-            if envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16:
+            if _should_use_flashinfer_mxfp4_bf16():
                 assert x.dtype == torch.bfloat16
                 x_quant = x
                 x_scale = None

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
@@ -28,7 +28,7 @@
 
 import torch
 from torch import nn
-from transformers import LlamaConfig
+from transformers import AutoTokenizer, LlamaConfig
 
 from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
@@ -47,6 +47,7 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.utils import ThinkSettings
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
@@ -549,6 +550,30 @@ def __init__(self,
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
+        tokenizer = AutoTokenizer.from_pretrained(
+            vllm_config.model_config.model)
+        start_think_ids = tokenizer.encode("<think>", add_special_tokens=False)
+        stop_think_ids = tokenizer.encode("</think>", add_special_tokens=False)
+        assert len(start_think_ids) == 1 and len(stop_think_ids) == 1, \
+            f"Invalid think IDs: " \
+            f"<think> {start_think_ids}, " \
+            f"</think> {stop_think_ids}"
+
+        self.think_settings = ThinkSettings(
+            start_think_id=start_think_ids[0],
+            stop_think_id=stop_think_ids[0],
+        )
+
+        for text in self.think_settings.step_split_tokens:
+            encoded_tokens = tokenizer.encode(text, add_special_tokens=False)
+            if len(encoded_tokens) == 1:
+                self.think_settings.step_split_token_ids.add(encoded_tokens[0])
+        for text in self.think_settings.discourse_marker_tokens:
+            encoded_tokens = tokenizer.encode(text, add_special_tokens=False)
+            if len(encoded_tokens) == 1:
+                self.think_settings.discourse_marker_token_ids.add(
+                    encoded_tokens[0])
+
     def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None:
         self.model.aux_hidden_state_layers = layers
 

diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
@@ -127,13 +127,25 @@ def __init__(
             )
         ])
         if hasattr(self.config, "target_hidden_size"):
-            self.fc = torch.nn.Linear(self.config.target_hidden_size * 3,
-                                      self.config.hidden_size,
-                                      bias=False)
+            fc_input_size = self.config.target_hidden_size * 3
         else:
-            self.fc = torch.nn.Linear(self.config.hidden_size * 3,
-                                      self.config.hidden_size,
-                                      bias=False)
+            fc_input_size = self.config.hidden_size * 3
+
+        fc_output_size = self.config.hidden_size
+
+        if hasattr(self.config, "early_stop_method"):
+            early_stop_method = self.config.early_stop_method
+            if early_stop_method in ["confidence", "progress", "remain"]:
+                fc_output_size = self.config.hidden_size + 1
+            elif early_stop_method == "confidence_progress_remain":
+                fc_output_size = self.config.hidden_size + 3
+            else:
+                logger.error("Unknown confidence loss type: %s",
+                             early_stop_method)
+        logger.info("eagle fc_output_size=%d", fc_output_size)
+
+        self.fc = torch.nn.Linear(fc_input_size, fc_output_size, bias=False)
+
         self.norm = RMSNorm(
             self.config.hidden_size,
             eps=self.config.rms_norm_eps,