@@ -176,6 +176,7 @@ def forward(
         image_idx,
         past_key_values,
         comp_ctx_lengths: Optional[List[int]] = None,
+        batch_index: Optional[torch.LongTensor] = None,
     ):
         inputs_embeds = self.model.get_input_embeddings()(input_ids)
         vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
@@ -190,6 +191,7 @@ def forward(
             position_ids=position_ids,
             past_key_values=past_key_values,
             comp_ctx_lengths=comp_ctx_lengths,
+            batch_index=batch_index,
         )

         # Cast to int32 to avoid ONNXRT issue
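The new batch_index input is the per-request row index used under continuous batching: each in-flight sequence owns one slot of a KV cache sized by the full batch, and its freshly computed keys and values are scattered into that slot. This forward pass simply threads the tensor through to the language model; the sketch below (plain PyTorch, hypothetical shapes and names, not this model's cache code) illustrates the kind of scatter the index enables.

import torch

# Hypothetical illustration of a batch_index-driven KV update, assuming a cache of
# shape (full_batch_size, num_kv_heads, ctx_len, head_dim) and a (batch_size, 1) index.
full_batch_size, num_kv_heads, ctx_len, head_dim = 4, 8, 128, 64
cache_k = torch.zeros(full_batch_size, num_kv_heads, ctx_len, head_dim)

batch_index = torch.tensor([[0], [2], [3]])    # three active requests, one cache row each
position_ids = torch.tensor([[5], [17], [9]])  # current write position per request
new_k = torch.randn(3, num_kv_heads, 1, head_dim)

rows, cols = batch_index.view(-1), position_ids.view(-1)
cache_k[rows, :, cols, :] = new_k.squeeze(2)   # each request writes only into its own row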
@@ -250,7 +252,7 @@ def forward(

         return logits, pixel_values, image_idx, outputs.past_key_values

-    def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs):
+    def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs):
         inputs_shapes = {}
         inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         height = self.config.vision_config.image_size
@@ -290,10 +292,14 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
         lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
+
+        bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
+        fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS
+
         # Add data for KV
         kv_cache_shape = get_padding_shape_from_config(
-            config=self.language_model.config,
-            batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
+            config=self.model.config.text_config,
+            batch_size=fbs if continuous_batching else bs,
             seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )

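The only change to the dummy KV tensors is their leading dimension: with continuous batching the cache placeholders are sized for ONNX_EXPORT_EXAMPLE_FBS slots rather than the export batch size. A rough sketch, under the assumption (not shown in this diff) that get_padding_shape_from_config yields a (batch, num_kv_heads, seq_len, head_dim) shape:

# Sketch only; the real shape comes from get_padding_shape_from_config, which this
# assumes to return (batch, num_kv_heads, seq_len, head_dim). Values are illustrative.
def dummy_kv_shape(num_kv_heads, head_dim, seq_len, bs, fbs, continuous_batching):
    batch = fbs if continuous_batching else bs
    return (batch, num_kv_heads, seq_len, head_dim)

print(dummy_kv_shape(8, 128, 32, bs=1, fbs=4, continuous_batching=True))   # (4, 8, 32, 128)
print(dummy_kv_shape(8, 128, 32, bs=1, fbs=4, continuous_batching=False))  # (1, 8, 32, 128)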
@@ -304,6 +310,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl

         if comp_ctx_lengths is not None:
             lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long)
+        if continuous_batching:
+            lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1)

         inputs = {}
         if kv_offload:
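With continuous_batching=True the only extra language input is the dummy batch_index, a (bs, 1) column of row indices. For a hypothetical export batch of 2 it would be:

import torch

bs = 2  # hypothetical export batch size
batch_index = torch.arange(bs).view(bs, 1)
print(batch_index)        # tensor([[0], [1]])
print(batch_index.shape)  # torch.Size([2, 1])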
@@ -324,6 +332,9 @@ def get_specializations(
         comp_ctx_lengths_prefill: Optional[List[int]] = None,
         comp_ctx_lengths_decode: Optional[List[int]] = None,
         kv_offload: bool = False,
+        continuous_batching: bool = False,
+        kv_cache_batch_size: Optional[int] = None,
+        full_batch_size: Optional[int] = None,
         **compiler_options,
     ):
         if img_size is None and hasattr(self.config.vision_config, "image_size"):
@@ -352,46 +363,65 @@ def get_specializations(
             lang = []

             for i in range(0, len(comp_ctx_lengths_prefill)):
-                lang.append(
-                    {
-                        "batch_size": batch_size,
-                        "seq_len": prefill_seq_len,
-                        "ctx_len": ctx_len,
-                        "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
-                        "image_size": img_size,
-                        "vision_size": vision_size,
-                    }
-                )
-
-            # Remaining elements use comp_ctx_lengths[1:] in a loop
-            for i in range(0, len(comp_ctx_lengths_decode)):
-                lang.append(
-                    {
-                        "batch_size": batch_size,
-                        "seq_len": "1",
-                        "ctx_len": ctx_len,
-                        "comp_ctx_lengths": comp_ctx_lengths_decode[i],
-                        "image_size": img_size,
-                        "vision_size": vision_size,
-                    }
-                )
-        else:
-            lang = [
-                {
-                    "batch_size": batch_size,
+                lang_prefill = {
+                    "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
+                    "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "image_size": img_size,
                     "vision_size": vision_size,
-                },
-                {
-                    "batch_size": batch_size,
+                }
+                if continuous_batching:
+                    lang_prefill["full_batch_size"] = kv_cache_batch_size
+                else:
+                    lang_prefill["batch_size"] = kv_cache_batch_size
+                if full_batch_size:
+                    lang_prefill["full_batch_exec_size"] = full_batch_size
+                lang.append(lang_prefill)
+
+            # Remaining elements use comp_ctx_lengths[1:] in a loop
+            for i in range(0, len(comp_ctx_lengths_decode)):
+                lang_decode = {
+                    "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
+                    "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "image_size": img_size,
                     "vision_size": vision_size,
-                },
-            ]
+                }
+
+                if continuous_batching:
+                    lang_decode["full_batch_size"] = kv_cache_batch_size
+                else:
+                    lang_decode["batch_size"] = kv_cache_batch_size
+                lang.append(lang_decode)
+        else:
+            lang_prefill = {
+                "batch_size": 1 if continuous_batching else batch_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+                "image_size": img_size,
+                "vision_size": vision_size,
+            }
+            if continuous_batching:
+                lang_prefill["full_batch_size"] = kv_cache_batch_size
+            else:
+                lang_prefill["batch_size"] = kv_cache_batch_size
+            if full_batch_size:
+                lang_prefill["full_batch_exec_size"] = full_batch_size
+
+            lang_decode = {
+                "batch_size": full_batch_size if continuous_batching else batch_size,
+                "seq_len": "1",
+                "ctx_len": ctx_len,
+                "image_size": img_size,
+                "vision_size": vision_size,
+            }
+
+            if continuous_batching:
+                lang_decode["full_batch_size"] = kv_cache_batch_size
+            else:
+                lang_decode["batch_size"] = kv_cache_batch_size

         specializations = {}

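To make the specialization changes concrete, the snippet below reproduces just the non-CCL branch for a hypothetical call (prefill_seq_len=128, ctx_len=4096, img_size=448, vision_size=1024, continuous_batching=True, kv_cache_batch_size=8, full_batch_size=8). It assumes the two dicts end up collected as lang = [lang_prefill, lang_decode], mirroring the lang.append(...) calls in the CCL branch.

def language_specializations(batch_size, prefill_seq_len, ctx_len, img_size, vision_size,
                             continuous_batching, kv_cache_batch_size, full_batch_size):
    # Mirrors the non-CCL branch in the diff above (sketch; list collection assumed).
    lang_prefill = {
        "batch_size": 1 if continuous_batching else batch_size,
        "seq_len": prefill_seq_len,
        "ctx_len": ctx_len,
        "image_size": img_size,
        "vision_size": vision_size,
    }
    if continuous_batching:
        lang_prefill["full_batch_size"] = kv_cache_batch_size
    else:
        lang_prefill["batch_size"] = kv_cache_batch_size
    if full_batch_size:
        lang_prefill["full_batch_exec_size"] = full_batch_size

    lang_decode = {
        "batch_size": full_batch_size if continuous_batching else batch_size,
        "seq_len": "1",
        "ctx_len": ctx_len,
        "image_size": img_size,
        "vision_size": vision_size,
    }
    if continuous_batching:
        lang_decode["full_batch_size"] = kv_cache_batch_size
    else:
        lang_decode["batch_size"] = kv_cache_batch_size
    return [lang_prefill, lang_decode]


for spec in language_specializations(
    batch_size=1, prefill_seq_len=128, ctx_len=4096, img_size=448, vision_size=1024,
    continuous_batching=True, kv_cache_batch_size=8, full_batch_size=8,
):
    print(spec)
# prefill: {'batch_size': 1, 'seq_len': 128, 'ctx_len': 4096, 'image_size': 448,
#           'vision_size': 1024, 'full_batch_size': 8, 'full_batch_exec_size': 8}
# decode:  {'batch_size': 8, 'seq_len': '1', 'ctx_len': 4096, 'image_size': 448,
#           'vision_size': 1024, 'full_batch_size': 8}

Note how prefill is pinned to a batch of 1 with full_batch_size taken from kv_cache_batch_size, while decode runs full_batch_size sequences per step against the same shared cache.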
@@ -404,7 +434,7 @@ def get_specializations(
             lang[1].pop("vision_size")
         return lang, compiler_options

-    def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False):
+    def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False):
         # Define dynamic axes
         num_layers = self.config.text_config.num_hidden_layers

@@ -417,9 +447,18 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv
             "vision_embeds": {0: "vision_size"},
         }

+        if continuous_batching:
+            lang_dynamic_axes["batch_index"] = {0: "batch_size"}
+
         for i in range(num_layers):
-            lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
-            lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_key.{i}"] = {
+                0: "full_batch_size" if continuous_batching else "batch_size",
+                2: "ctx_len",
+            }
+            lang_dynamic_axes[f"past_value.{i}"] = {
+                0: "full_batch_size" if continuous_batching else "batch_size",
+                2: "ctx_len",
+            }

         if comp_ctx_lengths is not None:
             lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"}
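The net effect on the exported graph is that the KV inputs switch their leading dynamic axis to a full_batch_size symbol under continuous batching, while batch_index itself keeps the regular batch_size axis. For a single layer the resulting entries look like this (sketch):

def kv_dynamic_axes(layer, continuous_batching):
    # Mirrors the per-layer loop above for one layer (sketch).
    kv_batch_axis = "full_batch_size" if continuous_batching else "batch_size"
    return {
        f"past_key.{layer}": {0: kv_batch_axis, 2: "ctx_len"},
        f"past_value.{layer}": {0: kv_batch_axis, 2: "ctx_len"},
    }

print(kv_dynamic_axes(0, continuous_batching=True))
# {'past_key.0': {0: 'full_batch_size', 2: 'ctx_len'},
#  'past_value.0': {0: 'full_batch_size', 2: 'ctx_len'}}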