Skip to content

QNN Compilation path Support in QEFFBaseModel class. #374

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 27 additions & 103 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.utils import constants, dump_qconfig
from QEfficient.utils._utils import load_json
from QEfficient.utils.cache import QEFF_HOME, to_hashable

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
:compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
:compiler_options: Pass any compiler option as input.
Following flag can be passed in compiler_options to enable QNN Compilation path.
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.``
For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16

Expand Down Expand Up @@ -217,10 +220,13 @@ def _compile(
onnx_path: Optional[str] = None,
compile_dir: Optional[str] = None,
*,
mxint8_kv_cache: bool = False,
specializations: Optional[List[Dict[str, int]]] = None,
custom_io: Optional[Dict[str, str]] = None,
mdp_ts_num_devices: int = 1,
num_speculative_tokens: Optional[int] = None,
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
**compiler_options,
) -> str:
"""
Expand All @@ -229,10 +235,13 @@ def _compile(
Args:
:onnx_path (str): Onnx file to compile
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
:specializations (list): List of specializations to compile for
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16
Expand All @@ -245,6 +254,22 @@ def _compile(
qpc_path = compile_dir / "qpc"
if not onnx_path.is_file():
raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")

if enable_qnn:
self.qpc_path = qnn_compile(
onnx_path=onnx_path,
qpc_base_path=compile_dir,
specializations=specializations,
custom_io=custom_io,
device_group=list(range(mdp_ts_num_devices)),
num_cores=compiler_options.get("aic_num_cores", 16),
mxfp6=compiler_options.get("mxfp6_matmul", False),
mxint8=mxint8_kv_cache,
qnn_config=qnn_config,
)

return self.qpc_path

command = constants.COMPILER + [f"-m={onnx_path}"]
if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None):
mdp_ts_num_devices = None
Expand Down Expand Up @@ -339,104 +364,3 @@ def _compile(
self.qpc_path = qpc_path

return qpc_path

@dump_qconfig
def _qnn_compile(
    self,
    onnx_path: Optional[str] = None,
    compile_dir: Optional[str] = None,
    *,
    specializations: Optional[List[Dict[str, int]]] = None,
    prefill_seq_len: int = 32,
    ctx_len: int = 128,
    batch_size: int = 1,
    full_batch_size: Optional[int] = None,
    mdp_ts_num_devices: int = 1,
    num_cores: int = 16,
    mxfp6_matmul: bool = False,
    mxint8_kv_cache: bool = False,
    qnn_config: Optional[str] = None,
    kv_cache_batch_size: Optional[int] = None,
) -> str:
    """
    Interface for QNN compiler.

    Exports the model to ONNX if no ONNX path is available, then compiles it
    through the QNN compiler into a qpc package. Results are cached: the qpc
    output directory name carries a short hash of the compilation parameters,
    and an existing directory already containing ``programqpc.bin`` is reused
    without recompiling.

    Args:
        :onnx_path (str): Onnx file to compile
        :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
        :specializations (list): List of specializations to compile for
        :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
        :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
        :batch_size (int, optional): Batch size. ``Defaults to 1``.
        :full_batch_size (int, optional): Continuous batching batch size.
        :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
        :num_cores (int): Number of cores used to compile the model.
        :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
        :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
        :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``

    Returns:
        :str: Path of the compiled qpc package (also stored on ``self.qpc_path``).

    Raises:
        :FileNotFoundError: If no ONNX file exists at the resolved path.
    """
    # Lazily export to ONNX when no explicit path was given and none is cached.
    if onnx_path is None and self.onnx_path is None:
        self.export()

    onnx_path = Path(onnx_path or self.onnx_path)
    compile_dir = Path(compile_dir or onnx_path.parent)
    qpc_path = compile_dir / "qpc"
    if not onnx_path.is_file():
        raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")

    # Fold every parameter that affects the compiled artifact into a hash so
    # that different configurations land in distinct qpc directories.
    compile_hash = hashlib.sha256(to_hashable("qnn"))

    if specializations is not None:
        compile_hash.update(to_hashable(specializations))

    if qnn_config is not None:
        # Hash the parsed config contents (not the file path) so that an
        # edited config file invalidates the cached qpc.
        qnn_config_values = load_json(qnn_config)
        compile_hash.update(to_hashable(qnn_config_values))

    if mdp_ts_num_devices > 1:
        compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))

    compile_hash.update(to_hashable({"num_cores": num_cores}))
    compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul}))
    compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache}))

    # Check if already compiled
    compile_hash = compile_hash.hexdigest()[:16]
    qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
    if qpc_path.is_dir():
        if (qpc_path / "programqpc.bin").is_file():
            # Cache hit: a finished compilation exists for this parameter set.
            self.qpc_path = qpc_path
            return qpc_path
        # Probably compilation failure last time, delete directory to start over
        shutil.rmtree(qpc_path)

    # Write specializations.json file
    if specializations is not None:
        specializations_json = compile_dir / "specializations.json"
        with open(specializations_json, "w") as fp:
            json.dump(
                {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
                fp,
                indent=4,
            )

    # Invoke the QNN compiler; output binaries are placed in qnn_binary_dir.
    qnn_compile(
        onnx_path=onnx_path,
        qpc_base_path=compile_dir,
        num_cores=num_cores,
        device_group=list(range(mdp_ts_num_devices)),
        batch_size=batch_size,
        prompt_len=prefill_seq_len,
        ctx_len=ctx_len,
        mxfp6=mxfp6_matmul,
        mxint8=mxint8_kv_cache,
        full_batch_size=full_batch_size,
        qnn_config=qnn_config,
        qnn_binary_dir=qpc_path,
        kv_cache_batch_size=kv_cache_batch_size,
    )

    self.qpc_path = qpc_path

    return qpc_path
30 changes: 15 additions & 15 deletions QEfficient/compile/compile_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import List, Optional, Tuple

from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.utils._utils import load_json, load_yaml
from QEfficient.utils.logging_utils import logger


Expand Down Expand Up @@ -180,36 +181,35 @@ def compile(
full_batch_size=full_batch_size,
)

# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"

if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
)

if enable_qnn:
qpc_path = qnn_compile(
onnx_path=onnx_path,
qpc_base_path=qpc_path,
qnn_binary_dir=os.path.join(qpc_path, "qpcs"),
num_cores=num_cores,
batch_size=batch_size,
prompt_len=prompt_len,
ctx_len=ctx_len,
mxfp6=mxfp6,
mxint8=mxint8,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
full_batch_size=full_batch_size,
qnn_config=qnn_config,
specializations=(load_json(specialization_json_path))["specializations"],
custom_io=load_yaml(custom_io_file_path),
)
logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")
else:
# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"

if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
)

_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
Expand Down
Loading
Loading