quic
diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
Lines changed: 2 additions & 14 deletions
diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py
Lines changed: 0 additions & 2 deletions
diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py
Lines changed: 20 additions & 8 deletions
@@ -16,7 +16,6 @@
 from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
-from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM
 from QEfficient.transformers.modeling_utils import get_lists_of_cb_qeff_models
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
@@ -149,7 +148,6 @@ def convert_to_cloud_kvstyle(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     onnx_dir_path: str,
     seq_len: int,
-    max_num_adapters: int,
 ) -> str:
     """
     API to convert model with kv retention and export to ONNX.
@@ -178,7 +176,7 @@ def convert_to_cloud_kvstyle(
 
     # Decide path for saving exported ONNX files.
     model_name = export_kvstyle_transformed_model_to_onnx(
-        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, max_num_adapters
+        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len
     )  # type: ignore
 
     # return the model path for automation.
@@ -192,7 +190,6 @@ def export_kvstyle_transformed_model_to_onnx(
     onnx_dir_path: str,
     seq_len: int,
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Disabling requires_grad on all parameters
     for _, p in enumerate(transformed_model.parameters()):
@@ -211,7 +208,6 @@ def export_kvstyle_transformed_model_to_onnx(
         prompt_len=Constants.PROMPT_LEN,
         ctx_len=seq_len,
         full_batch_size=full_batch_size,
-        max_num_adapters=max_num_adapters,
     )
 
     inputs = input_handler.prepare_pytorch_inputs()
@@ -319,7 +315,6 @@ def export_for_cloud(
     onnx_dir_path: str,
     seq_length: int = Constants.SEQ_LEN,
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Check if model architecture is supported for continuous batching.
     if full_batch_size and qeff_model.model.config.architectures[0].lower() not in {
@@ -330,18 +325,14 @@ def export_for_cloud(
         )
 
     # FIXME: move all this to class instead of here, and just call qeff_model.export here.
-    if (
-        AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM
-        or qeff_model.__class__ == QEffAutoLoraModelForCausalLM
-    ):  # type: ignore
+    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:  # type: ignore
         return export_lm_model_for_cloud(
             model_name=model_name,
             qeff_model=qeff_model,  # type: ignore
             tokenizer=tokenizer,
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
-            max_num_adapters=max_num_adapters,
         )
     else:
         raise NotImplementedError(
@@ -356,7 +347,6 @@ def export_lm_model_for_cloud(
     onnx_dir_path: str,
     seq_length: int,
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> str:
     if os.path.exists(onnx_dir_path):
         logger.warning(f"Overriding {onnx_dir_path}")
@@ -385,7 +375,6 @@ def qualcomm_efficient_converter(
     kv: bool = True,
     form_factor: str = "cloud",
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> Tuple[str, str]:
     """
     This method is an alias for ``QEfficient.export``.
@@ -461,7 +450,6 @@ def qualcomm_efficient_converter(
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
-            max_num_adapters=max_num_adapters,
         )
         return onnx_dir_path, generated_onnx_model_path
     else:
 
@@ -83,8 +83,6 @@ def export_onnx(
             dynamic_axes[iname] = {0: dynamic_axis_past_key, 2: "ctx_len"}
         elif iname == "batch_index":
             dynamic_axes[iname] = {0: "batch_size"}
-        elif iname == "lora_ids":
-            dynamic_axes[iname] = {0: "batch_size"}
 
     if "past_key.0" in input_names and "attention_mask" in input_names:
         dynamic_axes["attention_mask"] = {0: "batch_size", 1: "ctx_len"}
 
@@ -230,7 +230,6 @@ def cloud_ai_100_exec_kv(
     stream: bool = True,
     write_io_dir: Optional[str] = None,
     automation=False,
-    full_batch_size: Optional[int] = None,
     prompt_to_lora_id_mapping: Optional[List[int]] = None,
 ):
     """
@@ -348,7 +347,10 @@ def __init__(
 
         if prompt_to_lora_id_mapping:
             self.prompt_to_lora_id_mapping_prefill = deque(prompt_to_lora_id_mapping)
-            self.prompt_to_lora_id_mapping_decode = prompt_to_lora_id_mapping
+            if self.full_batch_size:
+                self.prompt_to_lora_id_mapping_decode = prompt_to_lora_id_mapping
+            else:
+                self.prompt_to_lora_id_mapping_decode = deque(prompt_to_lora_id_mapping)
         else:
             self.prompt_to_lora_id_mapping_prefill = None
             self.prompt_to_lora_id_mapping_decode = None
@@ -472,9 +474,15 @@ def prepare_decode_inputs(self):
         if self.batch_index is not None:
             decode_inputs["batch_index"] = self.batch_index
 
-        if self.prompt_to_lora_id_mapping_decode and self.full_batch_size is not None:
-            first_batch_lora_ids = [self.prompt_to_lora_id_mapping_decode[i] for i in range(self.full_batch_size)]
-            decode_inputs["lora_ids"] = np.array(first_batch_lora_ids, dtype=np.int64).reshape(self.full_batch_size, 1)
+        if self.prompt_to_lora_id_mapping_decode:
+            if self.full_batch_size:
+                first_batch_lora_ids = [self.prompt_to_lora_id_mapping_decode[i] for i in range(self.full_batch_size)]
+                decode_inputs["lora_ids"] = np.array(first_batch_lora_ids, dtype=np.int64).reshape(
+                    self.full_batch_size, 1
+                )
+            else:
+                batch_lora_ids = [self.prompt_to_lora_id_mapping_decode.popleft() for i in range(self.batch_size)]
+                decode_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1)
 
         return decode_inputs
 
@@ -565,9 +573,13 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i
             inputs["batch_index"] = decode_batch_id
 
         if self.prompt_to_lora_id_mapping_prefill:
-            inputs["lora_ids"] = np.array(self.prompt_to_lora_id_mapping_prefill.popleft(), dtype=np.int64).reshape(
-                1, 1
-            )
+            if self.full_batch_size:
+                inputs["lora_ids"] = np.array(self.prompt_to_lora_id_mapping_prefill.popleft(), dtype=np.int64).reshape(
+                    1, 1
+                )
+            else:
+                batch_lora_ids = [self.prompt_to_lora_id_mapping_prefill.popleft() for i in range(self.batch_size)]
+                inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1)
 
         for i in range(num_chunks):
             chunk_inputs = inputs.copy()