Better defaults for assisted generation #40976
@@ -2217,6 +2217,7 @@ def _extract_generation_mode_kwargs(
             "assistant_tokenizer": kwargs.pop("assistant_tokenizer", None),
             "assistant_model": assistant_model,
             "streamer": streamer,
+            "assistant_temperature": kwargs.pop("assistant_temperature", None),
         }
         generation_mode_kwargs["synced_gpus"] = (
Comment on lines 2218 to 2222

Contributor (Author):
No need to change the generate signature!! It gets automatically forwarded. In fact, we could remove

Contributor:
No signature change, but it's still an argument (that should be documented). In any case, I'd rather have it controlled by

Contributor:
+1, it should lie within the assistant's generation config if possible. That would be cleaner.
             (is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)) and dist.get_world_size() > 1
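The reviewers above suggest steering this through the assistant model's own generation config rather than a new keyword argument. A minimal sketch of what that could look like from the user side, assuming the PR is reworked that way (the checkpoint names are placeholders, not part of this PR):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoints; any main/assistant pair sharing a tokenizer works.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
assistant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

# Under the reviewers' proposal, the assistant would read its temperature from
# its own generation config, so a hotter value here would only affect candidate
# drafting, not the main model's sampling.
assistant.generation_config.do_sample = True
assistant.generation_config.temperature = 1.5

inputs = tokenizer("Speculative decoding works by", return_tensors="pt")
outputs = model.generate(**inputs, assistant_model=assistant, do_sample=True, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```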
@@ -3457,6 +3458,7 @@ def _assisted_decoding(
         assistant_model: Optional["PreTrainedModel"] = None,
         assistant_tokenizer: Optional["PreTrainedTokenizerBase"] = None,
         tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+        assistant_temperature: Optional[float] = None,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -3491,6 +3493,9 @@ def _assisted_decoding(
                 The tokenizer used for the assistant model. If not provided, the token space is assumed to be the same.
             tokenizer (`PreTrainedTokenizerBase`, *optional*):
                 The tokenizer used for the main model. If not provided, the token space is assumed to be the same.
+            assistant_temperature (`float`, *optional*):
+                The temperature to use for the assistant model. If not provided and the main generation temperature is
+                below 1.5, it will be set to 1.5 (to improve decoding speed).
             model_kwargs:
                 Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                 If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
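A minimal usage sketch of the keyword argument documented above, assuming it is forwarded through `generate()` as this PR proposes; it reuses the `model`, `assistant`, `tokenizer`, and `inputs` from the earlier sketch:

```python
# Main sampling stays at 0.7; only the drafted candidates would use 1.2.
outputs = model.generate(
    **inputs,
    assistant_model=assistant,
    do_sample=True,
    temperature=0.7,            # main model's sampling temperature
    assistant_temperature=1.2,  # overrides the PR's 1.5 default for the assistant
    max_new_tokens=64,
)

# Omitting assistant_temperature with a main temperature below 1.5 would,
# per this PR, bump the assistant to 1.5 and emit the warning shown below.
```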
@@ -3511,6 +3516,20 @@ def _assisted_decoding(
             and any(getattr(l, "is_compileable", False) for l in model_kwargs["past_key_values"].layers)
         ):
             raise ValueError("assisted generate is not supported with Static cache classes")
+        # Prefer a slightly higher temperature for the assistant when not explicitly provided
+        idx = next((i for i, p in enumerate(logits_processor) if isinstance(p, TemperatureLogitsWarper)), None)
+        temp_processor = logits_processor.pop(idx) if idx is not None else TemperatureLogitsWarper(temperature=1.0)
+
+        if assistant_temperature is None and temp_processor is not None and temp_processor.temperature < 1.5:
+            logger.warning_once(
+                f"The assistant's sampling temperature comes from the main generation loop and is set to {temp_processor.temperature}, "
+                "but speculative decoding benefits from slightly hotter candidate generation (see #40976), so we are setting it "
+                "to 1.5. This should improve decoding speed in most cases. Use `assistant_temperature` to override this value."
+            )
+            assistant_temperature = 1.5
+
+        if assistant_temperature is not None:
+            logits_processor.insert(0, TemperatureLogitsWarper(temperature=assistant_temperature))
Comment on lines +3519 to +3532

Contributor:
Doesn't this change the temperature for both models? 👀 (

Contributor:
Yeah, that's a good question, we only up the base temperature, no? We could also just modify the temperature in place if that's the case.
         # Get the candidate generator, given the parameterization
         candidate_generator = self._get_candidate_generator(
             generation_config=generation_config,
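To make the thread above concrete, here is a standalone sketch of the processor shuffle added in this hunk, using the public `LogitsProcessorList` and `TemperatureLogitsWarper` classes. Because the pop/insert happens on one shared list, any later consumer of `logits_processor` sees the hotter temperature, which is the behaviour the reviewers are asking about:

```python
import torch
from transformers import LogitsProcessorList, TemperatureLogitsWarper

# The main generation loop's temperature (0.7 here) arrives as a warper in the list.
logits_processor = LogitsProcessorList([TemperatureLogitsWarper(temperature=0.7)])

# Same lookup as the new code path: find and pop the existing temperature warper.
idx = next((i for i, p in enumerate(logits_processor) if isinstance(p, TemperatureLogitsWarper)), None)
temp_processor = logits_processor.pop(idx) if idx is not None else TemperatureLogitsWarper(temperature=1.0)

# No explicit override and 0.7 < 1.5, so the PR would insert a hotter warper.
assistant_temperature = 1.5
logits_processor.insert(0, TemperatureLogitsWarper(temperature=assistant_temperature))

# The shared list now carries 1.5 instead of 0.7.
print(temp_processor.temperature, logits_processor[0].temperature)  # 0.7 1.5

# The mutated list still works as a normal processor over dummy scores.
dummy_ids = torch.zeros((1, 1), dtype=torch.long)
dummy_scores = torch.randn(1, 32)
_ = logits_processor(dummy_ids, dummy_scores)
```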
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Length is controlled by main model's generation loop, so we should just discard those on the assistant right? @gante
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes 👍
(see comment on L175-176)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So we essentially remove the thrown error? Not sure if this is really relevant to this PR, more of a shortener no?
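For context on the length discussion, a hedged sketch of how length is typically split in assisted generation (assuming current transformers behaviour; `num_assistant_tokens` is an existing generation-config field, not something this PR adds): the main call's `max_new_tokens` bounds the final output, while the assistant only controls how many candidate tokens it drafts per step, so length limits configured on the assistant itself are redundant.

```python
# Reusing model / assistant / tokenizer / inputs from the earlier sketch.

# Speculation window: how many candidate tokens the assistant drafts per step.
assistant.generation_config.num_assistant_tokens = 10

outputs = model.generate(
    **inputs,
    assistant_model=assistant,
    max_new_tokens=128,  # total length budget, enforced by the main generation loop
)
```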