
Commit b189ed9

ashwin-phadke authored and jeejeelee committed
Remove LoRA bias support (vllm-project#25807)
Signed-off-by: Ashwin Phadke <ashwinphadke12@rediffmail.com>
Signed-off-by: Ashwin Phadke <23502062+ashwin-phadke@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: 0xrushi <6279035+0xrushi@users.noreply.github.com>
1 parent cddd04a commit b189ed9

20 files changed, +35 −366 lines

tests/entrypoints/openai/test_lora_adapters.py
Lines changed: 0 additions & 5 deletions

@@ -23,11 +23,6 @@
        {"r": 1024},
        "is greater than max_lora_rank",
    ),
-    (
-        "test_bias",
-        {"bias": "all"},
-        "Adapter bias cannot be used without bias_enabled",
-    ),
    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    (
        "test_modules_to_save",

tests/lora/test_peft_helper.py
Lines changed: 0 additions & 5 deletions

@@ -16,11 +16,6 @@
        {"r": 1024},
        "is greater than max_lora_rank",
    ),
-    (
-        "test_bias",
-        {"bias": "all"},
-        "Adapter bias cannot be used without bias_enabled",
-    ),
    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    (
        "test_modules_to_save",

tests/lora/test_utils.py
Lines changed: 2 additions & 13 deletions

@@ -21,7 +21,6 @@ class LoRANameParserTestConfig(NamedTuple):
    name: str
    module_name: str
    is_lora_a: bool
-    is_bias: bool
    weights_mapper: Optional[WeightsMapper] = None


@@ -37,44 +36,37 @@ def test_parse_fine_tuned_lora_name_valid():
            "base_model.model.model.embed_tokens.lora_embedding_A",
            "model.embed_tokens",
            True,
-            False,
        ),
        LoRANameParserTestConfig(
            "base_model.model.model.embed_tokens.lora_embedding_B",
            "model.embed_tokens",
            False,
-            False,
        ),
        LoRANameParserTestConfig(
            "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
            "model.layers.9.mlp.down_proj",
            True,
-            False,
        ),
        LoRANameParserTestConfig(
            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
            "model.layers.9.mlp.down_proj",
            False,
-            False,
        ),
        LoRANameParserTestConfig(
            "language_model.layers.9.mlp.down_proj.lora_A.weight",
            "language_model.layers.9.mlp.down_proj",
            True,
-            False,
        ),
        LoRANameParserTestConfig(
            "language_model.layers.9.mlp.down_proj.lora_B.weight",
            "language_model.layers.9.mlp.down_proj",
            False,
-            False,
        ),
        # Test with WeightsMapper
        LoRANameParserTestConfig(
            "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
            "language_model.model.layers.9.mlp.down_proj",
            True,
-            False,
            weights_mapper=WeightsMapper(
                orig_to_new_prefix={"model.": "language_model.model."}
            ),
@@ -83,7 +75,6 @@ def test_parse_fine_tuned_lora_name_valid():
            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
            "language_model.model.layers.9.mlp.down_proj",
            False,
-            False,
            weights_mapper=WeightsMapper(
                orig_to_new_prefix={"model.": "language_model.model."}
            ),
@@ -92,7 +83,6 @@ def test_parse_fine_tuned_lora_name_valid():
            "model.layers.9.mlp.down_proj.lora_A.weight",
            "language_model.model.layers.9.mlp.down_proj",
            True,
-            False,
            weights_mapper=WeightsMapper(
                orig_to_new_prefix={"model.": "language_model.model."}
            ),
@@ -101,14 +91,13 @@ def test_parse_fine_tuned_lora_name_valid():
            "model.layers.9.mlp.down_proj.lora_B.weight",
            "language_model.model.layers.9.mlp.down_proj",
            False,
-            False,
            weights_mapper=WeightsMapper(
                orig_to_new_prefix={"model.": "language_model.model."}
            ),
        ),
    ]
-    for name, module_name, is_lora_a, is_bias, weights_mapper in fixture:
-        assert (module_name, is_lora_a, is_bias) == parse_fine_tuned_lora_name(
+    for name, module_name, is_lora_a, weights_mapper in fixture:
+        assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(
            name, weights_mapper
        )
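For reference, a minimal usage sketch of the updated helper, mirroring the assertion above; the import path is assumed here (it is not shown in this diff) and the example name is taken from the fixture:

# Sketch only: parse_fine_tuned_lora_name now returns a 2-tuple, with no is_bias flag.
from vllm.lora.utils import parse_fine_tuned_lora_name  # assumed import path

name = "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight"
module_name, is_lora_a = parse_fine_tuned_lora_name(name, None)  # no WeightsMapper
assert module_name == "model.layers.9.mlp.down_proj"
assert is_lora_a  # ".lora_A." marks the LoRA A matrix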

vllm/config/lora.py
Lines changed: 1 addition & 7 deletions

@@ -70,12 +70,6 @@ class LoRAConfig:
    per prompt. When run in offline mode, the lora IDs for n modalities
    will be automatically assigned to 1-n with the names of the modalities
    in alphabetic order."""
-    bias_enabled: bool = Field(
-        default=False,
-        deprecated="`bias_enabled` is deprecated and will be removed in v0.12.0.",
-    )
-    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
-    removed in v0.12.0."""

    def compute_hash(self) -> str:
        """
@@ -96,7 +90,7 @@ def compute_hash(self) -> str:
        factors.append(self.lora_dtype)
        factors.append(self.lora_extra_vocab_size)
        factors.append(self.lora_vocab_padding_size)
-        factors.append(self.bias_enabled)
+
        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str
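With the field gone, a LoRA config is built from the remaining options only. A minimal construction sketch (the module path follows the file path above, the values are illustrative, and all other fields keep their defaults):

# Sketch only: LoRAConfig no longer has a bias_enabled field.
from vllm.config.lora import LoRAConfig  # module path per the file above

lora_config = LoRAConfig(
    max_lora_rank=16,  # illustrative value
    max_loras=4,       # illustrative value
)
# bias_enabled is no longer appended to the hash factors, so it cannot
# influence the hash computed here.
print(lora_config.compute_hash())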

vllm/engine/arg_utils.py
Lines changed: 0 additions & 3 deletions

@@ -439,7 +439,6 @@ class EngineArgs:
    video_pruning_rate: float = MultiModalConfig.video_pruning_rate
    # LoRA fields
    enable_lora: bool = False
-    enable_lora_bias: bool = LoRAConfig.bias_enabled
    max_loras: int = LoRAConfig.max_loras
    max_lora_rank: int = LoRAConfig.max_lora_rank
    default_mm_loras: Optional[dict[str, str]] = LoRAConfig.default_mm_loras
@@ -916,7 +915,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
            action=argparse.BooleanOptionalAction,
            help="If True, enable handling of LoRA adapters.",
        )
-        lora_group.add_argument("--enable-lora-bias", **lora_kwargs["bias_enabled"])
        lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
        lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"])
        lora_group.add_argument(
@@ -1515,7 +1513,6 @@ def create_engine_config(

        lora_config = (
            LoRAConfig(
-                bias_enabled=self.enable_lora_bias,
                max_lora_rank=self.max_lora_rank,
                max_loras=self.max_loras,
                default_mm_loras=self.default_mm_loras,
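On the engine side, --enable-lora-bias is removed while --enable-lora, --max-loras, and --max-lora-rank remain. A minimal sketch of the corresponding EngineArgs fields after this change (the model name and values are illustrative):

# Sketch only: enabling LoRA through EngineArgs without any bias option.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model
    enable_lora=True,
    max_loras=4,
    max_lora_rank=16,
)
# create_engine_config() now builds LoRAConfig(max_lora_rank=..., max_loras=..., ...)
# with no bias_enabled argument, as shown in the hunk above.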

vllm/lora/layers/base.py
Lines changed: 0 additions & 1 deletion

@@ -45,7 +45,6 @@ def set_lora(
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
    ):
        """Overwrites lora tensors at index."""
        ...

vllm/lora/layers/base_linear.py
Lines changed: 2 additions & 38 deletions

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional, cast
+from typing import Optional

import torch
from transformers import PretrainedConfig
@@ -29,7 +29,6 @@ def __init__(self, base_layer: LinearBase):
        self.tp_size = self.base_layer.tp_size
        self.tp_rank = self.base_layer.tp_rank
        self.device = _get_lora_device(self.base_layer)
-        self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None
        self.output_slices: tuple[int, ...]
        self.output_size: int
        self.n_slices: int
@@ -86,38 +85,19 @@ def create_lora_weights(
            )
            for _ in range(self.n_slices)
        )
-        if lora_config.bias_enabled:
-            lora_bias_out_size = lora_b_out_size
-            self.lora_bias_stacked = tuple(
-                torch.zeros(
-                    max_loras,
-                    1,
-                    lora_bias_out_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                )
-                for _ in range(self.n_slices)
-            )
        self.output_slices = (self.lora_b_stacked[0].shape[2],)

    def reset_lora(self, index: int):
        for s_index in range(self.n_slices):
            self.lora_a_stacked[s_index][index] = 0
            self.lora_b_stacked[s_index][index] = 0
-            if self.lora_config.bias_enabled:
-                # Make mypy happy
-                self.lora_bias_stacked = cast(
-                    tuple[torch.Tensor, ...], self.lora_bias_stacked
-                )
-                self.lora_bias_stacked[s_index][index] = 0

    def set_lora(
        self,
        index: int,
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: Optional[torch.Tensor],
-        lora_bias: Optional[torch.Tensor] = None,
    ):
        # Except for QKVParallelLinearWithLoRA and
        # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
@@ -131,23 +111,13 @@ def set_lora(
        if self.tp_size > 1:
            lora_a = self.slice_lora_a(lora_a)
            lora_b = self.slice_lora_b(lora_b)
-            if lora_bias is not None:
-                lora_bias = self.slice_bias(lora_bias)

        self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
            lora_a, non_blocking=True
        )
        self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
            lora_b, non_blocking=True
        )
-        if lora_bias is not None:
-            self.lora_bias_stacked = cast(
-                tuple[torch.Tensor, ...], self.lora_bias_stacked
-            )
-            assert len(self.lora_bias_stacked)
-            self.lora_bias_stacked[0][index, 0, : lora_bias.shape[0]].copy_(
-                lora_bias, non_blocking=True
-            )

    def apply(
        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
@@ -162,13 +132,7 @@ def apply(
            x = x.flatten(0, 1)

        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_linear(
-            output,
-            x,
-            self.lora_a_stacked,
-            self.lora_b_stacked,
-            self.lora_bias_stacked,
-            1.0,
-            self.output_slices,
+            output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, self.output_slices
        )
        if not current_platform.can_update_inplace():
            output = lora_output
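The retained apply() path therefore computes only the base projection plus the low-rank LoRA delta; no per-adapter bias is added back in. A generic single-adapter PyTorch sketch of that math (illustrative shapes, not vLLM's stacked punica layout):

# Sketch only: y = x @ W^T + (x @ A^T) @ B^T, with no LoRA bias term.
import torch

in_features, out_features, rank = 64, 128, 8
x = torch.randn(4, in_features)                  # batch of activations
weight = torch.randn(out_features, in_features)  # frozen base weight
lora_a = torch.randn(rank, in_features)          # LoRA A (down-projection)
lora_b = torch.randn(out_features, rank)         # LoRA B (up-projection)

base_out = x @ weight.T
lora_delta = (x @ lora_a.T) @ lora_b.T
y = base_out + lora_delta                        # formerly "+ lora_bias" when enabled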

vllm/lora/layers/column_parallel_linear.py
Lines changed: 1 addition & 66 deletions

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional, Union, cast
+from typing import Optional, Union

import torch
import torch.nn as nn
@@ -32,8 +32,6 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
        == len(layer.lora_b_stacked)
        == len(layer.output_slices)
    )
-    if layer.lora_bias_stacked is not None:
-        assert layer.n_slices == len(layer.lora_bias_stacked)

    output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)

@@ -61,7 +59,6 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
        output,
        buffers,
        layer.lora_b_stacked,
-        layer.lora_bias_stacked,
        layer.output_slices,
        offset_start=0,
        add_input=True,
@@ -122,16 +119,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
        lora_b = lora_b[start_idx:end_idx, :]
        return lora_b

-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        # TODO: Fix the slicing logic of bias.
-        if bias is None:
-            return bias
-        shard_size = self.output_size
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        bias = bias[start_idx:end_idx]
-        return bias
-
    def forward(
        self, input_: torch.Tensor
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
@@ -238,17 +225,6 @@ def create_lora_weights(
            )
            for output_size in self.output_slices
        )
-        if lora_config.bias_enabled:
-            self.lora_bias_stacked = tuple(
-                torch.zeros(
-                    max_loras,
-                    1,
-                    output_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                )
-                for output_size in self.output_slices
-            )

    def slice_lora_a(
        self, lora_a: list[Union[torch.Tensor, None]]
@@ -268,31 +244,18 @@ def slice_lora_b(
        ]
        return sliced_lora_b

-    def slice_bias(
-        self, bias: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
-        for i, (shard_id, shard_size) in enumerate(
-            zip(self.output_ids, self.output_slices)
-        ):
-            if (bias_i := bias[i]) is not None:
-                bias[i] = bias_i[shard_size * shard_id : shard_size * (shard_id + 1)]
-        return bias
-
    def set_lora(
        self,
        index: int,
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: Optional[torch.Tensor],
-        lora_bias: Optional[torch.Tensor] = None,
    ):
        self.reset_lora(index)

        if self.tp_size > 1:
            lora_a = self.slice_lora_a(lora_a)
            lora_b = self.slice_lora_b(lora_b)
-            if lora_bias is not None:
-                lora_bias = self.slice_bias(lora_bias)

        for i in range(self.n_slices):
            if (lora_a_i := lora_a[i]) is not None:
@@ -304,16 +267,6 @@ def set_lora(
                index, 0, : lora_b_i.shape[0], : lora_b_i.shape[1]
            ].copy_(lora_b_i, non_blocking=True)

-        if lora_bias is not None:
-            self.lora_bias_stacked = cast(
-                tuple[torch.Tensor, ...], self.lora_bias_stacked
-            )
-            for i in range(self.n_slices):
-                if (lora_bias_i := lora_bias[i]) is not None:
-                    self.lora_bias_stacked[i][index, 0, : lora_bias_i.shape[0]].copy_(
-                        lora_bias_i, non_blocking=True
-                    )
-
    @classmethod
    @_not_fully_sharded_can_replace
    def can_replace_layer(
@@ -380,24 +333,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
        lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=0)
        return lora_b

-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        bias_q = bias[
-            self.q_proj_shard_size * self.q_shard_id : self.q_proj_shard_size
-            * (self.q_shard_id + 1)
-        ]
-        k_offset = self.q_proj_total_size
-        bias_k = bias[
-            k_offset + self.kv_proj_shard_size * self.kv_shard_id : k_offset
-            + self.kv_proj_shard_size * (self.kv_shard_id + 1)
-        ]
-        v_offset = k_offset + self.kv_proj_total_size
-        bias_v = bias[
-            v_offset + self.kv_proj_shard_size * self.kv_shard_id : v_offset
-            + self.kv_proj_shard_size * (self.kv_shard_id + 1)
-        ]
-        bias = torch.cat([bias_q, bias_k, bias_v], dim=1)
-        return bias
-
    @classmethod
    @_not_fully_sharded_can_replace
    def can_replace_layer(
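For context, the deleted slice_bias() helpers applied the same column-parallel sharding to the bias vector that slice_lora_b() still applies to the LoRA B matrix: each tensor-parallel rank keeps one contiguous slice of the output dimension. A generic sketch of that slicing (illustrative sizes):

# Sketch only: column-parallel sharding of a LoRA B matrix across TP ranks.
import torch

out_features, rank, tp_size = 128, 8, 4
lora_b = torch.randn(out_features, rank)

shard_size = out_features // tp_size
for tp_rank in range(tp_size):
    start_idx = tp_rank * shard_size
    end_idx = (tp_rank + 1) * shard_size
    lora_b_shard = lora_b[start_idx:end_idx, :]  # mirrors slice_lora_b above
    assert lora_b_shard.shape == (shard_size, rank)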
