From d0169e1b0fa44a80ba40baf92dd2cedd3611076b Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 7 Jan 2025 11:05:17 +0800
Subject: [PATCH] [Model] Future-proof Qwen2-Audio multi-modal processor
 (#11776)

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/qwen2_audio.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index a7bb3425ed17c..576b01776e5de 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -227,12 +227,14 @@ def get_replacement_qwen2_audio(item_idx: int):
         ]
 
     def _always_apply_prompt_replacements(self) -> bool:
-        # HF never applies prompt replacements, so we have to do it ourselves.
+        # Qwen2-Audio processor will start inserting placeholder tokens
+        # in an upcoming release:
+        # https://github.com/huggingface/transformers/pull/35534
         # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF
         # has already performed processing for multi-audio input when the input
         # audios are short (the corresponding placeholders may take up fewer
         # tokens than the number of audio items)
-        return True
+        return not hasattr(self._get_hf_processor(), "audio_token")
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
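
For context, the new return value is a feature-detection check against the installed Transformers version: once the HF Qwen2-Audio processor exposes an `audio_token` attribute (and therefore inserts placeholder tokens itself, per the linked PR), vLLM no longer needs to force its own prompt replacements. Below is a minimal illustrative sketch of that pattern; it is not part of the patch, and the `LegacyProcessor` / `NewProcessor` classes are hypothetical stand-ins for whatever `self._get_hf_processor()` returns.

# Illustrative sketch only: feature-detect whether the HF processor already
# handles placeholder insertion via an `audio_token` attribute, mirroring the
# check added in the diff above.

class LegacyProcessor:           # hypothetical stand-in for an older HF processor
    pass

class NewProcessor:              # hypothetical stand-in for a newer HF processor
    audio_token = "<|AUDIO|>"

def always_apply_prompt_replacements(processor) -> bool:
    # Older processors lack `audio_token`, so the caller must insert
    # placeholder tokens itself.
    return not hasattr(processor, "audio_token")

assert always_apply_prompt_replacements(LegacyProcessor()) is True
assert always_apply_prompt_replacements(NewProcessor()) is False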