defer return_hidden_states speculation methods

NickLucche · NickLucche · commit 8b88b8a36668 · 2024-10-14T10:30:27.000Z
diff --git a/vllm/config.py b/vllm/config.py
@@ -1292,6 +1292,14 @@ def maybe_create_spec_config(
                 "speculative_model unless the draft model config contains an "
                 "n_predict parameter.")
 
+        if enable_chunked_prefill and draft_hf_config.model_type in [
+                "medusa", "mlp_speculator", "eagle"
+        ]:
+            raise ValueError(
+                "Chunked prefill and hidden-state based draft models are not "
+                "yet compatible."
+            )
+
         if typical_acceptance_sampler_posterior_threshold is None:
             typical_acceptance_sampler_posterior_threshold = 0.09
         if typical_acceptance_sampler_posterior_alpha is None:
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
@@ -195,10 +195,6 @@ def _contract_batch(
         else:
             all_hidden_states = None
 
-        # TODO fix with `return_hidden_states=True` where hidden states are full size,
-        # and we'll need all indices prior to selecting `do_sample=True`,
-        # while logits are indexed by `selected_token_indices` True
-
         # Rule out prefills that are in `non_spec_indices` but produce no tokens.
         non_spec_indices = [
             idx for idx in non_spec_indices
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
@@ -584,6 +584,9 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest,
         hidden_states = sampler_output.hidden_states
         if hidden_states is not None:
             # remove hidden_states for prompt tokens
+            # TODO Enable `return_hidden_states`: prefill chunks hidden states are
+            # pruned by the logits processor. Also, they should be arranged back into
+            # full-prefill latent. Address it to enable MLPSpeculator.
             if any(seq.is_prompt
                    for seq in execute_model_req.seq_group_metadata_list):
                 hidden_states = hidden_states[
@@ -698,7 +701,7 @@ def _run_speculative_decoding_step(
         # TODO skip this if chunking is not enabled
         if len(non_spec_indices):
             all_hidden_states = proposal_scores.hidden_states
-            # TODO fix `return_hidden_states`
+            # TODO fix `return_hidden_states`, same as in `_run_no_spec`
             if all_hidden_states is not None:
                 prefill_hidden_states = all_hidden_states[non_spec_indices]
                 execute_model_req.previous_hidden_states = prepare_prefill_hidden_states(