diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index d8cc68d5e9599..ad86f7bdf6101 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -420,7 +420,7 @@ def create_random_embedding_layer(): @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("stage", STAGES) def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index 7e73ea67ee5f4..5bf3f72e7d97b 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -25,7 +25,7 @@ BATCHES = [1, 4, 16, 32] NUM_LORA = [1, 4, 8, 16, 32, 64, 128] DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] +MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] SCALES = [0.5] SEED = [0] CUDA_DEVICES = [f"cuda:{0}"] diff --git a/vllm/config.py b/vllm/config.py index 4b968f549d902..3cc197f3d655f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1311,8 +1311,9 @@ class LoRAConfig: long_lora_scaling_factors: Optional[Tuple[float]] = None def __post_init__(self): - # TODO: Increase the range of rank - possible_max_ranks = (8, 16, 32, 64) + # Setting the maximum rank to 256 should be able to satisfy the vast + # majority of applications. 
+ possible_max_ranks = (8, 16, 32, 64, 128, 256) possible_lora_extra_vocab_size = (0, 256, 512) if self.max_lora_rank not in possible_max_ranks: raise ValueError( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index d3978ff6f4ff1..e3316059dc6d1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1073,10 +1073,10 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: - # TODO: Verify if this condition can be relaxed - if 32000 < self.base_layer.vocab_size > 128512: + # TODO: Verify if this condition can be further relaxed + if 32000 < self.base_layer.vocab_size > 257024: raise ValueError("When using LoRA, vocab size must be " - "32000 >= vocab_size <= 128512") + "32000 <= vocab_size <= 257024") self.lora_a_stacked = torch.zeros( ( max_loras,