Commit 078ac25

rkooo567 authored and DarkLight1337 committed
[misc] Do not allow to use LoRA with chunked prefill. (vllm-project#5538)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
1 parent dd5f535 commit 078ac25

File tree

1 file changed: +2 -0 lines changed

vllm/config.py

Lines changed: 2 additions & 0 deletions

@@ -1092,6 +1092,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
                 "Due to limitations of the custom LoRA CUDA kernel, "
                 "max_num_batched_tokens must be <= 65528 when "
                 "LoRA is enabled.")
+        if scheduler_config.chunked_prefill_enabled:
+            raise ValueError("LoRA is not supported with chunked prefill yet.")


 @dataclass
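The two added lines guard against enabling LoRA together with chunked prefill by raising during config verification. A minimal standalone sketch of the same validation pattern, using simplified stand-in config classes (the class shapes and defaults here are assumptions; only the field names and error messages come from the diff):

```python
from dataclasses import dataclass


@dataclass
class SchedulerConfig:
    # Stand-in for vLLM's SchedulerConfig; only the two fields
    # referenced by the diff are modeled here.
    max_num_batched_tokens: int = 2048
    chunked_prefill_enabled: bool = False


@dataclass
class LoRAConfig:
    # Stand-in for vLLM's LoRAConfig.
    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig) -> None:
        # Pre-existing check from the surrounding context lines:
        # the custom LoRA CUDA kernel caps the batched-token count.
        if scheduler_config.max_num_batched_tokens > 65528:
            raise ValueError(
                "Due to limitations of the custom LoRA CUDA kernel, "
                "max_num_batched_tokens must be <= 65528 when "
                "LoRA is enabled.")
        # The two lines added by this commit: reject the combination
        # of LoRA and chunked prefill outright.
        if scheduler_config.chunked_prefill_enabled:
            raise ValueError("LoRA is not supported with chunked prefill yet.")


lora_config = LoRAConfig()
lora_config.verify_with_scheduler_config(SchedulerConfig())  # ok
try:
    lora_config.verify_with_scheduler_config(
        SchedulerConfig(chunked_prefill_enabled=True))
except ValueError as e:
    print(e)  # LoRA is not supported with chunked prefill yet.
```

Failing fast at configuration time, rather than at request time, surfaces the unsupported combination before the engine starts serving.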

0 commit comments