1414from transformers import PreTrainedTokenizer , PreTrainedTokenizerFast
1515
1616import QEfficient
17- from QEfficient .base .common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP , QEFF_MODEL_TYPE , QEFFCommonLoader
17+ from QEfficient .base .common import QEFFCommonLoader
1818from QEfficient .base .modeling_qeff import QEFFBaseModel
1919from QEfficient .exporter .export_utils import export_onnx , fix_onnx_fp16 , generate_input_files , run_model_on_ort
2020from QEfficient .transformers .modeling_utils import get_lists_of_cb_qeff_models
@@ -149,6 +149,7 @@ def convert_to_cloud_kvstyle(
149149 tokenizer : Union [PreTrainedTokenizer , PreTrainedTokenizerFast ],
150150 onnx_dir_path : str ,
151151 seq_len : int ,
152+ max_num_adapters : int ,
152153) -> str :
153154 """
154155 API to convert model with kv retention and export to ONNX.
@@ -181,7 +182,7 @@ def convert_to_cloud_kvstyle(
181182
182183 # Decide path for saving exported ONNX files.
model_name = export_kvstyle_transformed_model_to_onnx(
    model_name,
    qeff_model.model,
    tokenizer,
    onnx_dir_path,
    seq_len,
    # Pass by keyword: in the callee's signature the positional slot right
    # after seq_len is full_batch_size, so a positional value here would be
    # bound to full_batch_size and max_num_adapters would stay None.
    max_num_adapters=max_num_adapters,
)  # type: ignore
186187
187188 # return the model path for automation.
@@ -195,6 +196,7 @@ def export_kvstyle_transformed_model_to_onnx(
195196 onnx_dir_path : str ,
196197 seq_len : int ,
197198 full_batch_size : Optional [int ] = None ,
199+ max_num_adapters : Optional [int ] = None ,
198200) -> str :
199201 # Disabling requires_grad on all parameters
200202 for _ , p in enumerate (transformed_model .parameters ()):
@@ -213,6 +215,7 @@ def export_kvstyle_transformed_model_to_onnx(
213215 prompt_len = Constants .PROMPT_LEN ,
214216 ctx_len = seq_len ,
215217 full_batch_size = full_batch_size ,
218+ max_num_adapters = max_num_adapters ,
216219 )
217220
218221 inputs = input_handler .prepare_pytorch_inputs ()
@@ -318,6 +321,7 @@ def export_for_cloud(
318321 onnx_dir_path : str ,
319322 seq_length : int = Constants .SEQ_LEN ,
320323 full_batch_size : Optional [int ] = None ,
324+ max_num_adapters : Optional [int ] = None ,
321325) -> str :
322326 # Check if model architecture is supported for continuous batching.
323327 if full_batch_size and qeff_model .model .config .architectures [0 ] not in get_lists_of_cb_qeff_models .architectures :
@@ -326,19 +330,20 @@ def export_for_cloud(
326330 )
327331
328332 # FIXME: move all this to class instead of here, and just call qeff_model.export here.
329- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP .get (qeff_model .__class__ , None ) == QEFF_MODEL_TYPE .CAUSALLM : # type: ignore
330- return export_lm_model_for_cloud (
331- model_name = model_name ,
332- qeff_model = qeff_model , # type: ignore
333- tokenizer = tokenizer ,
334- onnx_dir_path = onnx_dir_path ,
335- seq_length = seq_length ,
336- full_batch_size = full_batch_size ,
337- )
338- else :
339- raise NotImplementedError (
340- f"Only model type { QEFFAutoModelForCausalLM .__class__ .__name__ } is supported for export, got { type (qeff_model )} "
341- )
333+ # if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore
334+ return export_lm_model_for_cloud (
335+ model_name = model_name ,
336+ qeff_model = qeff_model , # type: ignore
337+ tokenizer = tokenizer ,
338+ onnx_dir_path = onnx_dir_path ,
339+ seq_length = seq_length ,
340+ full_batch_size = full_batch_size ,
341+ max_num_adapters = max_num_adapters ,
342+ )
343+ # else:
344+ # raise NotImplementedError(
345+ # f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}"
346+ # )
342347
343348
344349def export_lm_model_for_cloud (
@@ -348,6 +353,7 @@ def export_lm_model_for_cloud(
348353 onnx_dir_path : str ,
349354 seq_length : int ,
350355 full_batch_size : Optional [int ] = None ,
356+ max_num_adapters : Optional [int ] = None ,
351357) -> str :
352358 if os .path .exists (onnx_dir_path ):
353359 logger .warning (f"Overriding { onnx_dir_path } " )
@@ -361,6 +367,7 @@ def export_lm_model_for_cloud(
361367 onnx_dir_path = onnx_dir_path ,
362368 seq_len = seq_length ,
363369 full_batch_size = full_batch_size ,
370+ max_num_adapters = max_num_adapters ,
364371 ) # type: ignore
365372
366373 else :
@@ -386,6 +393,7 @@ def qualcomm_efficient_converter(
386393 kv : bool = True ,
387394 form_factor : str = "cloud" ,
388395 full_batch_size : Optional [int ] = None ,
396+ max_num_adapters : Optional [int ] = None ,
389397) -> Tuple [str , str ]:
390398 """
391399 This method is an alias for ``QEfficient.export``.
@@ -466,6 +474,7 @@ def qualcomm_efficient_converter(
466474 onnx_dir_path = onnx_dir_path ,
467475 seq_length = seq_length ,
468476 full_batch_size = full_batch_size ,
477+ max_num_adapters = max_num_adapters ,
469478 )
470479 return onnx_dir_path , generated_onnx_model_path
471480 else :
0 commit comments