2 changes: 1 addition & 1 deletion vllm/config/lora.py
@@ -103,7 +103,7 @@ def __post_init__(self):

# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
-possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
+possible_max_ranks = (1, 8, 16, 32, 64, 128, 256, 320, 512)
possible_lora_extra_vocab_size = (256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
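
For context, here is a minimal, self-contained sketch of the validation behavior this hunk changes. The names (POSSIBLE_MAX_RANKS, LoRAConfigSketch) are simplified stand-ins for illustration only, not the actual classes in vllm/config/lora.py.

# Simplified stand-in for the check in LoRAConfig.__post_init__.
from dataclasses import dataclass

POSSIBLE_MAX_RANKS = (1, 8, 16, 32, 64, 128, 256, 320, 512)


@dataclass
class LoRAConfigSketch:
    max_lora_rank: int = 16

    def __post_init__(self):
        # Reject any rank that is not in the whitelist, mirroring the real check.
        if self.max_lora_rank not in POSSIBLE_MAX_RANKS:
            raise ValueError(
                f"max_lora_rank ({self.max_lora_rank}) must be one of "
                f"{POSSIBLE_MAX_RANKS}."
            )


LoRAConfigSketch(max_lora_rank=1)   # accepted after this change
# LoRAConfigSketch(max_lora_rank=2) # still rejected: 2 is not in the tuple
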
8 changes: 4 additions & 4 deletions vllm/v1/worker/lora_model_runner_mixin.py
@@ -28,8 +28,6 @@

# Defined as a mixin for GPUModelRunner
class LoRAModelRunnerMixin:
-LORA_WARMUP_RANK = 8

def load_lora_model(
self, model: nn.Module, vllm_config: VllmConfig, device: torch.device
) -> nn.Module:
@@ -96,7 +94,9 @@ def maybe_setup_dummy_loras(
assert self.lora_manager is not None, "LoRA is not enabled"

num_loras = lora_config.max_loras

+lora_warmup_rank = (
+    lora_config.max_lora_rank if lora_config.max_lora_rank < 8 else 8
+)
Comment on lines +97 to +99 (Contributor, severity: high)

This logic for determining lora_warmup_rank is correct for supporting max_lora_rank=1, but it perpetuates a significant performance issue for CUDA graph capture when max_lora_rank > 8.

The _dummy_run method, which uses this lora_warmup_rank, is responsible for both profiling and CUDA graph capture. By capping the warmup rank at 8, any captured CUDA graph will be specific to LoRA ranks up to 8. If the engine is configured with max_lora_rank > 8 (e.g., 16) and receives requests with LoRAs of a rank greater than 8, the captured graph will not be used, causing a fallback to eager execution and negating the performance benefits of CUDA graphs.

To ensure CUDA graphs are captured for the maximum configured rank, the warmup rank should be lora_config.max_lora_rank during graph capture. A simple fix is to always use max_lora_rank, which would also make it consistent with the TPU runner. This might slow down warm-up for large ranks, but it ensures that CUDA graphs are effective for all configured LoRA ranks.

            lora_warmup_rank = lora_config.max_lora_rank
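
For reference, a small illustrative sketch of the two warm-up-rank policies under discussion: the capped policy in this diff versus the reviewer's suggestion. The function names are invented for illustration and do not exist in vLLM.

def warmup_rank_capped(max_lora_rank: int) -> int:
    # Policy in this diff: warm up at the configured rank when it is below 8
    # (covers max_lora_rank=1), otherwise cap the warm-up rank at 8.
    return max_lora_rank if max_lora_rank < 8 else 8


def warmup_rank_full(max_lora_rank: int) -> int:
    # Reviewer's suggestion: warm up (and capture CUDA graphs) at the maximum
    # configured rank so captured graphs cover every allowed adapter rank.
    return max_lora_rank


assert warmup_rank_capped(1) == 1    # new rank-1 case
assert warmup_rank_capped(64) == 8   # capped; graphs would be captured at rank 8
assert warmup_rank_full(64) == 64    # graphs captured at the configured maximum
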

# Make dummy lora requests
lora_requests: set[LoRARequest] = {
LoRARequest(
@@ -111,7 +111,7 @@ def maybe_setup_dummy_loras(
# Add the dummy LoRAs here so _set_active_loras doesn't try to
# load from disk.
for lr in lora_requests:
-self.lora_manager.add_dummy_lora(lr, rank=self.LORA_WARMUP_RANK)
+self.lora_manager.add_dummy_lora(lr, rank=lora_warmup_rank)

yield

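As a rough end-to-end sketch of what rank-1 support enables from the user side, the snippet below uses vLLM's offline LLM API with max_lora_rank=1. The base model name and adapter path are placeholders, not taken from this PR.

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholder base model and adapter path; substitute a real rank-1 adapter.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    enable_lora=True,
    max_lora_rank=1,  # accepted now that 1 is in possible_max_ranks
    max_loras=1,
)

outputs = llm.generate(
    ["Write a haiku about low-rank adapters."],
    SamplingParams(max_tokens=64),
    lora_request=LoRARequest("rank1-adapter", 1, "/path/to/rank1/adapter"),
)
print(outputs[0].outputs[0].text)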