Support for dual compilation on VLMs (#361)

quic-rishinr · web-flow · commit 574a6df2594e · 2025-04-11T23:33:11.000+05:30
Signed-off-by: Rishin Raj <quic_rishinr@quicinc.com>
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -600,6 +600,7 @@ def compile(
         num_speculative_tokens: Optional[int] = None,
         enable_qnn: bool = False,
         qnn_config: Optional[str] = None,
+        compile_for: Optional[str] = None,
         **compiler_options,
     ) -> str:
         if (
@@ -615,6 +616,9 @@ def compile(
                 f"enable_qnn={enable_qnn}, qnn_config={qnn_config}"
             )
 
+        if compile_for not in {"vision", "lang", None}:
+            raise ValueError(f"Expected 'compile_for' to be one of 'vision', 'lang', or None but got: {compile_for}")
+
         output_names = self.model.get_output_names(kv_offload=True)
 
         specializations, compiler_options = self.model.get_specializations(
@@ -642,41 +646,49 @@ def compile(
         ):
             self.export()
 
-        self.vision_model._compile(
-            compile_dir,
-            compile_only=True,
-            specializations=specializations["vision"],
-            convert_to_fp16=True,
-            mxfp6_matmul=mxfp6_matmul,
-            mdp_ts_num_devices=num_devices,
-            aic_num_cores=num_cores,
-            custom_io=custom_io_vision,
-            **compiler_options,
-        )
+        if compile_for is None or compile_for.lower() == "vision":
+            vision_qpc_path = self.vision_model._compile(
+                compile_dir,
+                compile_only=True,
+                specializations=specializations["vision"],
+                convert_to_fp16=True,
+                mxfp6_matmul=mxfp6_matmul,
+                mdp_ts_num_devices=num_devices,
+                aic_num_cores=num_cores,
+                custom_io=custom_io_vision,
+                **compiler_options,
+            )
 
-        custom_io_lang = {}
-        # Inputs
-        for output_name in output_names["lang"]:
-            if output_name.endswith("_RetainedState"):
-                custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype
+            if compile_for == "vision":
+                return vision_qpc_path
 
-        # outputs
-        for output_name in output_names["lang"]:
-            if output_name.endswith("_RetainedState"):
-                custom_io_lang[output_name] = kv_cache_dtype
+        if compile_for is None or compile_for.lower() == "lang":
+            custom_io_lang = {}
+            # Inputs
+            for output_name in output_names["lang"]:
+                if output_name.endswith("_RetainedState"):
+                    custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype
+
+            # outputs
+            for output_name in output_names["lang"]:
+                if output_name.endswith("_RetainedState"):
+                    custom_io_lang[output_name] = kv_cache_dtype
+
+            lang_qpc_path = self.lang_model._compile(
+                compile_dir,
+                compile_only=True,
+                retained_state=True,
+                specializations=specializations["lang"],
+                convert_to_fp16=True,
+                mxfp6_matmul=mxfp6_matmul,
+                mdp_ts_num_devices=num_devices,
+                aic_num_cores=num_cores,
+                custom_io=custom_io_lang,
+                **compiler_options,
+            )
+            if compile_for == "lang":
+                return lang_qpc_path
 
-        self.lang_model._compile(
-            compile_dir,
-            compile_only=True,
-            retained_state=True,
-            specializations=specializations["lang"],
-            convert_to_fp16=True,
-            mxfp6_matmul=mxfp6_matmul,
-            mdp_ts_num_devices=num_devices,
-            aic_num_cores=num_cores,
-            custom_io=custom_io_lang,
-            **compiler_options,
-        )
         return self.qpc_path
 
     def generate(
@@ -711,6 +723,15 @@ def kv_offload_generate(
         device_ids: List[int] = None,
         generation_len: int = None,
     ):
+        if not self.vision_model.qpc_path or not self.lang_model.qpc_path:
+            raise TypeError("Please run compile API for vision and language model first!")
+
+        if not self.vision_model.qpc_path:
+            raise TypeError("Please run compile API for vision model first!")
+
+        if not self.lang_model.qpc_path:
+            raise TypeError("Please run compile API for language model first!")
+
         lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_ids, activate=False)
 
         vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids)