Skip to content

QNN Compilation path Support in QEFFBaseModel class. #374

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 27 additions & 103 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.utils import constants, dump_qconfig
from QEfficient.utils._utils import load_json
from QEfficient.utils.cache import QEFF_HOME, to_hashable

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
:compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
:compiler_options: Pass any compiler option as input.
Following flag can be passed in compiler_options to enable QNN Compilation path.
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.``
For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16

Expand Down Expand Up @@ -217,10 +220,13 @@ def _compile(
onnx_path: Optional[str] = None,
compile_dir: Optional[str] = None,
*,
mxint8_kv_cache: bool = False,
specializations: Optional[List[Dict[str, int]]] = None,
custom_io: Optional[Dict[str, str]] = None,
mdp_ts_num_devices: int = 1,
num_speculative_tokens: Optional[int] = None,
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
**compiler_options,
) -> str:
"""
Expand All @@ -229,10 +235,13 @@ def _compile(
Args:
:onnx_path (str): Onnx file to compile
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
:specializations (list): List of specializations to compile for
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16
Expand All @@ -245,6 +254,22 @@ def _compile(
qpc_path = compile_dir / "qpc"
if not onnx_path.is_file():
raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")

if enable_qnn:
self.qpc_path = qnn_compile(
onnx_path=onnx_path,
qpc_base_path=compile_dir,
specializations=specializations,
custom_io=custom_io,
device_group=list(range(mdp_ts_num_devices)),
num_cores=compiler_options.get("aic_num_cores", 16),
mxfp6=compiler_options.get("mxfp6_matmul", False),
mxint8=mxint8_kv_cache,
qnn_config=qnn_config,
)

return self.qpc_path

command = constants.COMPILER + [f"-m={onnx_path}"]
if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None):
mdp_ts_num_devices = None
Expand Down Expand Up @@ -339,104 +364,3 @@ def _compile(
self.qpc_path = qpc_path

return qpc_path

@dump_qconfig
def _qnn_compile(
    self,
    onnx_path: Optional[str] = None,
    compile_dir: Optional[str] = None,
    *,
    specializations: Optional[List[Dict[str, int]]] = None,
    prefill_seq_len: int = 32,
    ctx_len: int = 128,
    batch_size: int = 1,
    full_batch_size: Optional[int] = None,
    mdp_ts_num_devices: int = 1,
    num_cores: int = 16,
    mxfp6_matmul: bool = False,
    mxint8_kv_cache: bool = False,
    qnn_config: Optional[str] = None,
    kv_cache_batch_size: Optional[int] = None,
) -> str:
    """
    Interface for QNN compiler.

    Exports the model to ONNX if no ONNX path is available, then compiles it
    through the QNN compiler into a qpc package. Results are cached: the qpc
    output directory name carries a short hash of the compilation parameters,
    and an existing directory already containing ``programqpc.bin`` is reused
    without recompiling.

    Args:
        :onnx_path (str): Onnx file to compile
        :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
        :specializations (list): List of specializations to compile for
        :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
        :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
        :batch_size (int, optional): Batch size. ``Defaults to 1``.
        :full_batch_size (int, optional): Continuous batching batch size.
        :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
        :num_cores (int): Number of cores used to compile the model.
        :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
        :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
        :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``

    Returns:
        :str: Path of the compiled qpc package (also stored on ``self.qpc_path``).

    Raises:
        :FileNotFoundError: If no ONNX file exists at the resolved path.
    """
    # Lazily export to ONNX when no explicit path was given and none is cached.
    if onnx_path is None and self.onnx_path is None:
        self.export()

    onnx_path = Path(onnx_path or self.onnx_path)
    compile_dir = Path(compile_dir or onnx_path.parent)
    qpc_path = compile_dir / "qpc"
    if not onnx_path.is_file():
        raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")

    # Fold every parameter that affects the compiled artifact into a hash so
    # that different configurations land in distinct qpc directories.
    compile_hash = hashlib.sha256(to_hashable("qnn"))

    if specializations is not None:
        compile_hash.update(to_hashable(specializations))

    if qnn_config is not None:
        # Hash the parsed config contents (not the file path) so that an
        # edited config file invalidates the cached qpc.
        qnn_config_values = load_json(qnn_config)
        compile_hash.update(to_hashable(qnn_config_values))

    if mdp_ts_num_devices > 1:
        compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))

    compile_hash.update(to_hashable({"num_cores": num_cores}))
    compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul}))
    compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache}))

    # Check if already compiled
    compile_hash = compile_hash.hexdigest()[:16]
    qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
    if qpc_path.is_dir():
        if (qpc_path / "programqpc.bin").is_file():
            # Cache hit: a finished compilation exists for this parameter set.
            self.qpc_path = qpc_path
            return qpc_path
        # Probably compilation failure last time, delete directory to start over
        shutil.rmtree(qpc_path)

    # Write specializations.json file
    if specializations is not None:
        specializations_json = compile_dir / "specializations.json"
        with open(specializations_json, "w") as fp:
            json.dump(
                {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
                fp,
                indent=4,
            )

    # Invoke the QNN compiler; output binaries are placed in qnn_binary_dir.
    qnn_compile(
        onnx_path=onnx_path,
        qpc_base_path=compile_dir,
        num_cores=num_cores,
        device_group=list(range(mdp_ts_num_devices)),
        batch_size=batch_size,
        prompt_len=prefill_seq_len,
        ctx_len=ctx_len,
        mxfp6=mxfp6_matmul,
        mxint8=mxint8_kv_cache,
        full_batch_size=full_batch_size,
        qnn_config=qnn_config,
        qnn_binary_dir=qpc_path,
        kv_cache_batch_size=kv_cache_batch_size,
    )

    self.qpc_path = qpc_path

    return qpc_path
30 changes: 15 additions & 15 deletions QEfficient/compile/compile_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import List, Optional, Tuple

from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.utils._utils import load_json, load_yaml
from QEfficient.utils.logging_utils import logger


Expand Down Expand Up @@ -180,36 +181,35 @@ def compile(
full_batch_size=full_batch_size,
)

# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"

if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
)

if enable_qnn:
qpc_path = qnn_compile(
onnx_path=onnx_path,
qpc_base_path=qpc_path,
qnn_binary_dir=os.path.join(qpc_path, "qpcs"),
num_cores=num_cores,
batch_size=batch_size,
prompt_len=prompt_len,
ctx_len=ctx_len,
mxfp6=mxfp6,
mxint8=mxint8,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
full_batch_size=full_batch_size,
qnn_config=qnn_config,
specializations=(load_json(specialization_json_path))["specializations"],
custom_io=load_yaml(custom_io_file_path),
)
logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")
else:
# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"

if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
)

_, qpc_path = compile_kv_model_on_cloud_ai_100(
onnx_path=onnx_path,
specializations_json=specialization_json_path,
Expand Down
Loading
Loading