From e691918e3bd75a05bc473c77577c494aa6442640 Mon Sep 17 00:00:00 2001
From: SangBin Cho
Date: Sat, 15 Jun 2024 23:59:36 +0900
Subject: [PATCH] [misc] Do not allow to use lora with chunked prefill. (#5538)

Co-authored-by: Cyrus Leung
---
 vllm/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index d9e4a619ee010..54f36e1d66783 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1092,6 +1092,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
                 "Due to limitations of the custom LoRA CUDA kernel, "
                 "max_num_batched_tokens must be <= 65528 when "
                 "LoRA is enabled.")
+        if scheduler_config.chunked_prefill_enabled:
+            raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
 @dataclass
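For context on what the two added lines do: the check lives in the LoRA config's `verify_with_scheduler_config` method, which is invoked while the engine wires its config objects together, so an unsupported combination now fails fast at startup rather than misbehaving at runtime. Below is a minimal, self-contained sketch of that cross-config validation pattern. The `SchedulerConfig` and `LoRAConfig` classes here are simplified stand-ins for illustration, not vLLM's real classes, and only the two `raise` sites are taken from the patch itself.

```python
from dataclasses import dataclass


@dataclass
class SchedulerConfig:
    """Stand-in for vllm.config.SchedulerConfig (illustration only)."""
    max_num_batched_tokens: int = 512
    chunked_prefill_enabled: bool = False


@dataclass
class LoRAConfig:
    """Stand-in for vllm.config.LoRAConfig (illustration only)."""
    max_lora_rank: int = 16

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        # Mirrors the guard shown in the patch above: the custom LoRA CUDA
        # kernel caps the batched-token count, and as of this patch LoRA
        # cannot be combined with chunked prefill at all.
        if scheduler_config.max_num_batched_tokens > 65528:
            raise ValueError(
                "Due to limitations of the custom LoRA CUDA kernel, "
                "max_num_batched_tokens must be <= 65528 when "
                "LoRA is enabled.")
        if scheduler_config.chunked_prefill_enabled:
            raise ValueError("LoRA is not supported with chunked prefill yet.")


if __name__ == "__main__":
    lora = LoRAConfig()
    # Fine: chunked prefill is disabled by default.
    lora.verify_with_scheduler_config(SchedulerConfig())
    # Raises ValueError after this patch.
    lora.verify_with_scheduler_config(
        SchedulerConfig(chunked_prefill_enabled=True))
```

From the user's side, the practical effect should be that enabling both LoRA and chunked prefill together (e.g. via `--enable-lora` and `--enable-chunked-prefill`) is rejected with this `ValueError` during engine construction.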