14
14
from transformers import PreTrainedTokenizer , PreTrainedTokenizerFast
15
15
16
16
import QEfficient
17
- from QEfficient .base .common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP , QEFF_MODEL_TYPE , QEFFCommonLoader
17
+ from QEfficient .base .common import QEFFCommonLoader
18
18
from QEfficient .base .modeling_qeff import QEFFBaseModel
19
19
from QEfficient .exporter .export_utils import export_onnx , fix_onnx_fp16 , generate_input_files , run_model_on_ort
20
20
from QEfficient .transformers .modeling_utils import get_lists_of_cb_qeff_models
@@ -149,6 +149,7 @@ def convert_to_cloud_kvstyle(
149
149
tokenizer : Union [PreTrainedTokenizer , PreTrainedTokenizerFast ],
150
150
onnx_dir_path : str ,
151
151
seq_len : int ,
152
+ max_num_adapters : int ,
152
153
) -> str :
153
154
"""
154
155
API to convert model with kv retention and export to ONNX.
@@ -181,7 +182,7 @@ def convert_to_cloud_kvstyle(
181
182
182
183
# Decide path for saving exported ONNX files.
183
184
model_name = export_kvstyle_transformed_model_to_onnx (
184
- model_name , qeff_model .model , tokenizer , onnx_dir_path , seq_len
185
+ model_name , qeff_model .model , tokenizer , onnx_dir_path , seq_len , max_num_adapters
185
186
) # type: ignore
186
187
187
188
# return the model path for automation.
@@ -195,6 +196,7 @@ def export_kvstyle_transformed_model_to_onnx(
195
196
onnx_dir_path : str ,
196
197
seq_len : int ,
197
198
full_batch_size : Optional [int ] = None ,
199
+ max_num_adapters : Optional [int ] = None ,
198
200
) -> str :
199
201
# Disabling requires_grad on all parameters
200
202
for _ , p in enumerate (transformed_model .parameters ()):
@@ -213,6 +215,7 @@ def export_kvstyle_transformed_model_to_onnx(
213
215
prompt_len = Constants .PROMPT_LEN ,
214
216
ctx_len = seq_len ,
215
217
full_batch_size = full_batch_size ,
218
+ max_num_adapters = max_num_adapters ,
216
219
)
217
220
218
221
inputs = input_handler .prepare_pytorch_inputs ()
@@ -318,6 +321,7 @@ def export_for_cloud(
318
321
onnx_dir_path : str ,
319
322
seq_length : int = Constants .SEQ_LEN ,
320
323
full_batch_size : Optional [int ] = None ,
324
+ max_num_adapters : Optional [int ] = None ,
321
325
) -> str :
322
326
# Check if model architecture is supported for continuous batching.
323
327
if full_batch_size and qeff_model .model .config .architectures [0 ] not in get_lists_of_cb_qeff_models .architectures :
@@ -326,19 +330,20 @@ def export_for_cloud(
326
330
)
327
331
328
332
# FIXME: move all this to class instead of here, and just call qeff_model.export here.
329
- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP .get (qeff_model .__class__ , None ) == QEFF_MODEL_TYPE .CAUSALLM : # type: ignore
330
- return export_lm_model_for_cloud (
331
- model_name = model_name ,
332
- qeff_model = qeff_model , # type: ignore
333
- tokenizer = tokenizer ,
334
- onnx_dir_path = onnx_dir_path ,
335
- seq_length = seq_length ,
336
- full_batch_size = full_batch_size ,
337
- )
338
- else :
339
- raise NotImplementedError (
340
- f"Only model type { QEFFAutoModelForCausalLM .__class__ .__name__ } is supported for export, got { type (qeff_model )} "
341
- )
333
+ # if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore
334
+ return export_lm_model_for_cloud (
335
+ model_name = model_name ,
336
+ qeff_model = qeff_model , # type: ignore
337
+ tokenizer = tokenizer ,
338
+ onnx_dir_path = onnx_dir_path ,
339
+ seq_length = seq_length ,
340
+ full_batch_size = full_batch_size ,
341
+ max_num_adapters = max_num_adapters ,
342
+ )
343
+ # else:
344
+ # raise NotImplementedError(
345
+ # f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}"
346
+ # )
342
347
343
348
344
349
def export_lm_model_for_cloud (
@@ -348,6 +353,7 @@ def export_lm_model_for_cloud(
348
353
onnx_dir_path : str ,
349
354
seq_length : int ,
350
355
full_batch_size : Optional [int ] = None ,
356
+ max_num_adapters : Optional [int ] = None ,
351
357
) -> str :
352
358
if os .path .exists (onnx_dir_path ):
353
359
logger .warning (f"Overriding { onnx_dir_path } " )
@@ -361,6 +367,7 @@ def export_lm_model_for_cloud(
361
367
onnx_dir_path = onnx_dir_path ,
362
368
seq_len = seq_length ,
363
369
full_batch_size = full_batch_size ,
370
+ max_num_adapters = max_num_adapters ,
364
371
) # type: ignore
365
372
366
373
else :
@@ -386,6 +393,7 @@ def qualcomm_efficient_converter(
386
393
kv : bool = True ,
387
394
form_factor : str = "cloud" ,
388
395
full_batch_size : Optional [int ] = None ,
396
+ max_num_adapters : Optional [int ] = None ,
389
397
) -> Tuple [str , str ]:
390
398
"""
391
399
This method is an alias for ``QEfficient.export``.
@@ -466,6 +474,7 @@ def qualcomm_efficient_converter(
466
474
onnx_dir_path = onnx_dir_path ,
467
475
seq_length = seq_length ,
468
476
full_batch_size = full_batch_size ,
477
+ max_num_adapters = max_num_adapters ,
469
478
)
470
479
return onnx_dir_path , generated_onnx_model_path
471
480
else :
0 commit comments