@@ -605,6 +605,8 @@ def compile(
         num_speculative_tokens: Optional[int] = None,
         enable_qnn: bool = False,
         qnn_config: Optional[str] = None,
+        skip_vision: Optional[bool] = False,
+        skip_lang: Optional[bool] = False,
         **compiler_options,
     ) -> str:
         if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]):
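A usage sketch of the new `skip_vision` / `skip_lang` flags; the class, model card, and `kv_offload` argument below are illustrative assumptions, not taken from this diff:

```python
# Sketch only: assumes QEfficient's dual-QPC multimodal wrapper;
# the model card and from_pretrained arguments are illustrative.
from QEfficient import QEFFAutoModelForImageTextToText

model = QEFFAutoModelForImageTextToText.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",  # hypothetical model card
    kv_offload=True,  # assumption: selects the two-QPC (vision + lang) path
)

# Recompile only the language QPC; the vision encoder compile is skipped.
qpc_path = model.compile(num_cores=16, num_devices=1, skip_vision=True)
```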
@@ -646,17 +648,18 @@ def compile(
         ):
             self.export()

-        self.vision_model._compile(
-            compile_dir,
-            compile_only=True,
-            specializations=specializations["vision"],
-            convert_to_fp16=True,
-            mxfp6_matmul=mxfp6_matmul,
-            mdp_ts_num_devices=num_devices,
-            aic_num_cores=num_cores,
-            custom_io=custom_io_vision,
-            **compiler_options,
-        )
+        if not skip_vision:
+            self.vision_model._compile(
+                compile_dir,
+                compile_only=True,
+                specializations=specializations["vision"],
+                convert_to_fp16=True,
+                mxfp6_matmul=mxfp6_matmul,
+                mdp_ts_num_devices=num_devices,
+                aic_num_cores=num_cores,
+                custom_io=custom_io_vision,
+                **compiler_options,
+            )

         custom_io_lang = {}
         # Inputs
@@ -669,18 +672,18 @@ def compile(
             if output_name.endswith("_RetainedState"):
                 custom_io_lang[output_name] = kv_cache_dtype

-        self.lang_model._compile(
-            compile_dir,
-            compile_only=True,
-            retained_state=True,
-            specializations=specializations["lang"],
-            convert_to_fp16=True,
-            mxfp6_matmul=mxfp6_matmul,
-            mdp_ts_num_devices=num_devices,
-            aic_num_cores=num_cores,
-            custom_io=custom_io_lang,
-            **compiler_options,
-        )
+        self.lang_model._compile(
+            compile_dir,
+            compile_only=True,
+            retained_state=True,
+            specializations=specializations["lang"],
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            custom_io=custom_io_lang,
+            **compiler_options,
+        )
         return self.qpc_path

     def generate(
@@ -1539,6 +1542,7 @@ def compile(
         num_speculative_tokens: Optional[int] = None,
         enable_qnn: bool = False,
         qnn_config: Optional[str] = None,
+        prefill_only: Optional[bool] = None,
         **compiler_options,
     ) -> str:
         """
@@ -1562,6 +1566,8 @@ def compile(
         :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
         :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
         :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+        :prefill_only (bool, optional): If ``True``, compile for prefill only; if ``False``, compile for decode only. ``Defaults to None``, which compiles for both prefill and decode.
+        :compiler_options (dict, optional): Any other options that ``qaic-exec`` takes. ``Defaults to None``.

         Returns:
             :str: Path of the compiled ``qpc`` package.
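A usage sketch for the new `prefill_only` flag (class import, model card, and compile arguments are assumptions for illustration):

```python
# Sketch only: model card and argument values are illustrative.
from QEfficient import QEFFAutoModelForCausalLM

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# prefill_only=True -> only the prefill specialization is compiled.
prefill_qpc = model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, prefill_only=True)

# prefill_only=False -> only the decode specialization is compiled.
decode_qpc = model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, prefill_only=False)

# Default (None) keeps the old behavior: both specializations in one QPC.
qpc = model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16)
```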
@@ -1583,48 +1589,33 @@ def compile(
                 "enable `continuous_batching=True` in `from_pretrained`."
             )

-        kv_cache_batch_size = (
-            kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size)
-        )
-        # Define prefill specialization
-        prefill_specialization = {
-            # Prefill is always run with single BS for continuous batching.
-            "batch_size": 1 if self.continuous_batching else batch_size,
-            "seq_len": prefill_seq_len,
-            "ctx_len": ctx_len,
-            # TODO: should be renamed to kv_cache_batch_size in specialization too
-        }
-        prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ...
-        if self.continuous_batching:
-            prefill_specialization.update({"full_batch_size": kv_cache_batch_size})
-        else:
-            prefill_specialization.update({"batch_size": kv_cache_batch_size})
-        prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ...
-        specializations = [
-            prefill_specialization,
-        ]
+        # Infer kv_cache_batch_size if not provided
+        kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

-        # Skip decode specialization if we are not in continuous batching and prefill_seq_len=1 as this repeats prefill specialization
-        if prefill_seq_len != 1 or self.continuous_batching:
-            decode_specialization = {
-                "batch_size": full_batch_size if self.continuous_batching else batch_size,
-                "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1,
-                "ctx_len": ctx_len,
-            }
-            if self.continuous_batching:
-                decode_specialization.update({"full_batch_size": kv_cache_batch_size})
-            else:
-                decode_specialization.update({"batch_size": kv_cache_batch_size})
-            decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
-            specializations.append(decode_specialization)
+        # --- Specializations ---
+        specializations = []
+
+        if prefill_only is None or prefill_only or prefill_seq_len == 1:
+            specializations.append(
+                self.build_prefill_specialization(
+                    prefill_seq_len, ctx_len, batch_size, kv_cache_batch_size, full_batch_size
+                )
+            )
+        if prefill_only is None or not prefill_only:
+            decode_spec = self.build_decode_specialization(
+                prefill_seq_len, ctx_len, batch_size, kv_cache_batch_size, full_batch_size, num_speculative_tokens
+            )
+            if decode_spec:
+                specializations.append(decode_spec)

+        # --- Compilation ---
         if enable_qnn:
             if compiler_options:
-                logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only")
+                logger.warning("Extra arguments to QNN compilation are ignored. Use `qnn_config.json`.")

             qpc_path = self._qnn_compile(
-                onnx_path,
-                compile_dir,
+                onnx_path=onnx_path,
+                compile_dir=compile_dir,
                 specializations=specializations,
                 prefill_seq_len=prefill_seq_len,
                 ctx_len=ctx_len,
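The `build_prefill_specialization` / `build_decode_specialization` helpers are defined outside this hunk. A hypothetical reconstruction, inferred only from the inline logic this diff removes, might look like:

```python
# Sketch only: names match the calls above, but bodies are inferred
# from the removed inline specialization-building code.
def build_prefill_specialization(
    self, prefill_seq_len, ctx_len, batch_size, kv_cache_batch_size, full_batch_size
):
    spec = {
        # Prefill always runs with batch size 1 under continuous batching.
        "batch_size": 1 if self.continuous_batching else batch_size,
        "seq_len": prefill_seq_len,
        "ctx_len": ctx_len,
    }
    if self.is_tlm:
        spec["num_logits_to_keep"] = 1
    if self.continuous_batching:
        spec["full_batch_size"] = kv_cache_batch_size
    else:
        spec["batch_size"] = kv_cache_batch_size
    if full_batch_size:
        spec["full_batch_exec_size"] = full_batch_size
    return spec

def build_decode_specialization(
    self, prefill_seq_len, ctx_len, batch_size, kv_cache_batch_size, full_batch_size, num_speculative_tokens
):
    # A decode spec with seq_len 1 and no continuous batching would just
    # repeat the prefill spec, so signal the caller to skip it.
    if prefill_seq_len == 1 and not self.continuous_batching:
        return None
    spec = {
        "batch_size": full_batch_size if self.continuous_batching else batch_size,
        "seq_len": (num_speculative_tokens + 1) if self.is_tlm else 1,
        "ctx_len": ctx_len,
    }
    if self.continuous_batching:
        spec["full_batch_size"] = kv_cache_batch_size
    else:
        spec["batch_size"] = kv_cache_batch_size
    if self.is_tlm:
        spec["num_logits_to_keep"] = num_speculative_tokens + 1
    return spec
```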
@@ -1638,17 +1629,17 @@ def compile(
                 kv_cache_batch_size=kv_cache_batch_size,
             )
         else:
-            # Custom IO
-            custom_io = {}
             kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+            custom_io = {}
+
             for suffix in ["", "_RetainedState"]:
                 for i in range(self.num_layers):
                     for kv in ["key", "value"]:
                         custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype

             qpc_path = self._compile(
-                onnx_path,
-                compile_dir,
+                onnx_path=onnx_path,
+                compile_dir=compile_dir,
                 compile_only=True,
                 retained_state=True,
                 specializations=specializations,
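For reference, the custom-IO loop above produces a name-to-dtype map like the following (illustrative, for a hypothetical model with `num_layers == 2` and `mxint8_kv_cache=True`):

```python
# Illustrative output of the custom_io loop for a 2-layer model:
{
    "past_key.0": "mxint8",
    "past_value.0": "mxint8",
    "past_key.1": "mxint8",
    "past_value.1": "mxint8",
    "past_key.0_RetainedState": "mxint8",
    "past_value.0_RetainedState": "mxint8",
    "past_key.1_RetainedState": "mxint8",
    "past_value.1_RetainedState": "mxint8",
}
```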
@@ -1660,6 +1651,7 @@ def compile(
                 aic_num_cores=num_cores,
                 **compiler_options,
             )
+
         return qpc_path

     # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate
@@ -1867,22 +1859,8 @@ def compile(
         if num_speculative_tokens:
             logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq")

-        output_names = self.model.get_output_names()
-
-        kv_cache_dtype = "float16"
-        custom_io = {}
-
-        custom_io["input_features"] = kv_cache_dtype
-
-        # Slice output_names to get input names
-        for output_name in output_names:
-            if output_name.endswith("_RetainedState"):
-                custom_io[output_name[: -len("_RetainedState")]] = kv_cache_dtype
-
-        # Get output names
-        for output_name in output_names:
-            if output_name.endswith("_RetainedState"):
-                custom_io[output_name] = kv_cache_dtype
+        if enable_qnn or qnn_config:
+            logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq")

         return self._compile(
             onnx_path,
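Finally, a usage sketch for the speech path, where QNN compilation now only warns (import path and model card are assumptions):

```python
# Sketch only: the import path and model card are illustrative.
from QEfficient import QEFFAutoModelForSpeechSeq2Seq

model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")

# enable_qnn is accepted but currently logs a warning for this model class;
# compilation proceeds down the regular AIC path.
qpc_path = model.compile(num_cores=16, enable_qnn=True)
```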