
Commit 3c33e99

Initial commit for finite loras implementation
Signed-off-by: Jou-An Chen <quic_jouachen@quicinc.com>
1 parent 08ca83c commit 3c33e99

15 files changed (+993, -21 lines)

QEfficient/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -24,5 +24,6 @@
     "QEffAutoModel",
     "QEFFAutoModelForCausalLM",
     "QEffAutoPeftModelForCausalLM",
+    "QEffAutoLoraModelForCausalLM",
     "QEFFCommonLoader",
 ]
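
This hunk only extends __all__; the binding itself presumably comes from an import elsewhere in the package (the new QEfficient.lora subpackage below re-exports the class). Assuming that wiring is in place, the new name becomes importable from the package root, a minimal sketch:

    # Sketch: pulls the class added by this commit from the package root,
    # assuming QEfficient/__init__.py also imports it (not shown in this hunk).
    from QEfficient import QEffAutoLoraModelForCausalLM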

QEfficient/exporter/export_hf_to_cloud_ai_100.py

Lines changed: 24 additions & 15 deletions
@@ -14,7 +14,7 @@
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

 import QEfficient
-from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader
+from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
 from QEfficient.transformers.modeling_utils import get_lists_of_cb_qeff_models
@@ -149,6 +149,7 @@ def convert_to_cloud_kvstyle(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     onnx_dir_path: str,
     seq_len: int,
+    max_num_adapters: int,
 ) -> str:
     """
     API to convert model with kv retention and export to ONNX.
@@ -181,7 +182,7 @@

     # Decide path for saving exported ONNX files.
     model_name = export_kvstyle_transformed_model_to_onnx(
-        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len
+        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, max_num_adapters
     )  # type: ignore

     # return the model path for automation.
@@ -195,6 +196,7 @@ def export_kvstyle_transformed_model_to_onnx(
     onnx_dir_path: str,
     seq_len: int,
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Disabling requires_grad on all parameters
     for _, p in enumerate(transformed_model.parameters()):
@@ -213,6 +215,7 @@
         prompt_len=Constants.PROMPT_LEN,
         ctx_len=seq_len,
         full_batch_size=full_batch_size,
+        max_num_adapters=max_num_adapters,
     )

     inputs = input_handler.prepare_pytorch_inputs()
@@ -318,6 +321,7 @@ def export_for_cloud(
     onnx_dir_path: str,
     seq_length: int = Constants.SEQ_LEN,
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Check if model architecture is supported for continuous batching.
     if full_batch_size and qeff_model.model.config.architectures[0] not in get_lists_of_cb_qeff_models.architectures:
@@ -326,19 +330,20 @@
         )

     # FIXME: move all this to class instead of here, and just call qeff_model.export here.
-    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:  # type: ignore
-        return export_lm_model_for_cloud(
-            model_name=model_name,
-            qeff_model=qeff_model,  # type: ignore
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            seq_length=seq_length,
-            full_batch_size=full_batch_size,
-        )
-    else:
-        raise NotImplementedError(
-            f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}"
-        )
+    # if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:  # type: ignore
+    return export_lm_model_for_cloud(
+        model_name=model_name,
+        qeff_model=qeff_model,  # type: ignore
+        tokenizer=tokenizer,
+        onnx_dir_path=onnx_dir_path,
+        seq_length=seq_length,
+        full_batch_size=full_batch_size,
+        max_num_adapters=max_num_adapters,
+    )
+    # else:
+    #     raise NotImplementedError(
+    #         f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}"
+    #     )


 def export_lm_model_for_cloud(
@@ -348,6 +353,7 @@ def export_lm_model_for_cloud(
     onnx_dir_path: str,
     seq_length: int,
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> str:
     if os.path.exists(onnx_dir_path):
         logger.warning(f"Overriding {onnx_dir_path}")
@@ -361,6 +367,7 @@
             onnx_dir_path=onnx_dir_path,
             seq_len=seq_length,
             full_batch_size=full_batch_size,
+            max_num_adapters=max_num_adapters,
         )  # type: ignore

     else:
@@ -386,6 +393,7 @@ def qualcomm_efficient_converter(
     kv: bool = True,
     form_factor: str = "cloud",
     full_batch_size: Optional[int] = None,
+    max_num_adapters: Optional[int] = None,
 ) -> Tuple[str, str]:
     """
     This method is an alias for ``QEfficient.export``.
@@ -466,6 +474,7 @@
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
+            max_num_adapters=max_num_adapters,
         )
         return onnx_dir_path, generated_onnx_model_path
     else:
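
Net effect: max_num_adapters now flows from the public converter API through export_for_cloud and export_lm_model_for_cloud down to the kv-style ONNX export. A hedged sketch of a caller passing it (the model name and adapter count are placeholders, and the model_name parameter is assumed from the surrounding signatures, not shown in these hunks):

    from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter

    # Placeholder values; max_num_adapters is the knob added by this commit.
    onnx_dir_path, onnx_model_path = qualcomm_efficient_converter(
        model_name="gpt2",   # assumed parameter name and placeholder model
        kv=True,
        form_factor="cloud",
        max_num_adapters=4,  # reserve slots for up to 4 finite LoRA adapters
    )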

QEfficient/exporter/export_utils.py

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,8 @@ def export_onnx(
             dynamic_axes[iname] = {0: dynamic_axis_past_key, 2: "ctx_len"}
         elif iname == "batch_index":
             dynamic_axes[iname] = {0: "batch_size"}
+        elif iname == "lora_ids":
+            dynamic_axes[iname] = {0: "batch_size"}

     if "past_key.0" in input_names and "attention_mask" in input_names:
         dynamic_axes["attention_mask"] = {0: "batch_size", 1: "ctx_len"}

QEfficient/generation/text_generation_inference.py

Lines changed: 25 additions & 0 deletions
@@ -230,6 +230,7 @@ def cloud_ai_100_exec_kv(
     write_io_dir: Optional[str] = None,
     automation=False,
     full_batch_size: Optional[int] = None,
+    prompt_to_lora_id_mapping: Optional[List[int]] = None,
 ):
     """
     This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
@@ -277,6 +278,7 @@
         stream=stream,
         write_io_dir=write_io_dir,
         full_batch_size=full_batch_size,
+        prompt_to_lora_id_mapping=prompt_to_lora_id_mapping,
     )
     if full_batch_size is None:
         exec_info = [
@@ -313,6 +315,7 @@ def __init__(
         qpc_path: str,
         prompt: List[str],
         full_batch_size: Optional[int] = None,
+        prompt_to_lora_id_mapping: Optional[List[int]] = None,
         ctx_len: Optional[int] = None,
         generation_len: Optional[int] = None,
         device_id: Optional[List[int]] = None,
@@ -342,6 +345,13 @@
             full_batch_size if full_batch_size else self._fetch_full_batch_size()
         )  # Check and fetch full batch size if CB is enabled

+        if prompt_to_lora_id_mapping:
+            self.prompt_to_lora_id_mapping_prefill = deque(prompt_to_lora_id_mapping)
+            self.prompt_to_lora_id_mapping_decode = prompt_to_lora_id_mapping
+        else:
+            self.prompt_to_lora_id_mapping_prefill = None
+            self.prompt_to_lora_id_mapping_decode = None
+
         self.set_tokenizer_params()  # set tokenizer params

         # Initialize the storage variables.
@@ -460,6 +470,10 @@ def prepare_decode_inputs(self):
         if self.batch_index is not None:
             decode_inputs["batch_index"] = self.batch_index

+        if self.prompt_to_lora_id_mapping_decode and self.full_batch_size is not None:
+            first_batch_lora_ids = [self.prompt_to_lora_id_mapping_decode[i] for i in range(self.full_batch_size)]
+            decode_inputs["lora_ids"] = np.array(first_batch_lora_ids, dtype=np.int64).reshape(self.full_batch_size, 1)
+
         return decode_inputs

     def _update_decode_input(self, outputs, position_ids, generation_len, decode_batch_id=None):
@@ -547,6 +561,11 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None):
         if decode_batch_id is not None:
             inputs["batch_index"] = decode_batch_id

+        if self.prompt_to_lora_id_mapping_prefill:
+            inputs["lora_ids"] = np.array(self.prompt_to_lora_id_mapping_prefill.popleft(), dtype=np.int64).reshape(
+                1, 1
+            )
+
         for i in range(num_chunks):
             chunk_inputs = inputs.copy()
             chunk_inputs["input_ids"] = inputs["input_ids"][
@@ -634,6 +653,12 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
                 )

                 generated_id_current_index[decode_batch_id] += 1
+
+                if self.prompt_to_lora_id_mapping_decode:
+                    decode_inputs["lora_ids"][decode_batch_id] = self.prompt_to_lora_id_mapping_decode[
+                        batch_id_map[decode_batch_id]
+                    ]
+
         return decode_pause_time

     def run_decode(self, decode_inputs, generation_len):
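
In short: prefill pops one adapter id per prompt from the deque and feeds it as a (1, 1) lora_ids array, while continuous-batching decode rewrites a slot's id whenever a new prompt is scheduled into that slot. A hedged end-to-end sketch (QPC path, model, prompts, and id values are placeholders; tokenizer, qpc_path, and prompt are assumed parameters of cloud_ai_100_exec_kv):

    from transformers import AutoTokenizer

    from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
    exec_info = cloud_ai_100_exec_kv(
        tokenizer=tokenizer,
        qpc_path="path/to/qpcs",  # placeholder compiled-QPC directory
        prompt=["summarize: ...", "translate to French: hello"],
        full_batch_size=2,
        prompt_to_lora_id_mapping=[0, 1],  # prompt i runs with adapter id i
    )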

QEfficient/lora/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM
+
+__all__ = [
+    "QEffAutoLoraModelForCausalLM",
+]
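
Because the subpackage re-exports the class, the short and the fully qualified import paths resolve to the same object:

    from QEfficient.lora import QEffAutoLoraModelForCausalLM
    from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM as _Direct

    assert QEffAutoLoraModelForCausalLM is _Direct  # same class object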
