@@ -603,8 +603,8 @@ def compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         num_speculative_tokens: Optional[int] = None,
-        skip_vision: Optional[bool] = False,
-        skip_lang: Optional[bool] = False,
+        enable_qnn: bool = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]):
@@ -643,19 +643,17 @@ def compile(
         ):
             self.export()

-        if not skip_vision:
-            self.vision_model._compile(
-                compile_dir,
-                compile_only=True,
-                specializations=specializations["vision"],
-                convert_to_fp16=True,
-                mxfp6_matmul=mxfp6_matmul,
-                mdp_ts_num_devices=num_devices,
-                aic_num_cores=num_cores,
-                custom_io=custom_io_vision,
-                mxint8_kv_cache=mxint8_kv_cache,
-                **compiler_options,
-            )
+        self.vision_model._compile(
+            compile_dir,
+            compile_only=True,
+            specializations=specializations["vision"],
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            custom_io=custom_io_vision,
+            **compiler_options,
+        )

         if not skip_lang:
             custom_io_lang = {}
@@ -681,6 +679,7 @@ def compile(
                 custom_io=custom_io_lang,
                 **compiler_options,
             )
+        return self.qpc_path

     def generate(
         self,
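With this change the multimodal compile path always builds the vision QPC and now returns self.qpc_path after the language model is compiled. A rough usage sketch follows; the QEFFAutoModelForImageTextToText wrapper, the placeholder model card, and the kv_offload flag are assumptions drawn from the surrounding QEfficient API rather than from this diff:

    from QEfficient import QEFFAutoModelForImageTextToText

    # Placeholder model card; kv_offload=True is assumed to select the dual-QPC (vision + language) path.
    model = QEFFAutoModelForImageTextToText.from_pretrained("<image-text-model-card>", kv_offload=True)

    # compile() exports if needed, builds both QPCs, and returns the language-model QPC path.
    qpc_path = model.compile(num_cores=16, num_devices=1, mxfp6_matmul=True)
    print(qpc_path)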
@@ -1533,7 +1532,8 @@ def compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         num_speculative_tokens: Optional[int] = None,
-        prefill_only: Optional[bool] = None,
+        enable_qnn: bool = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -1555,14 +1555,8 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :mos (int, optional): Effort level to reduce on-chip memory. ``Defaults to -1``, meaning no effort.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
-            :prefill_only (bool): If ``True``, compile for prefill only; if ``False``, compile for decode only. Defaults to None, which compiles for both ``prefill`` and ``decode``.
-            :compiler_options (dict, optional): Pass any compiler option as input. ``Defaults to None``.
-                The following flags can be passed in compiler_options to enable the QNN compilation path:
-                    :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
-                    :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
-                For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
-                    - aic_num_cores=16 -> -aic-num-cores=16
-                    - convert_to_fp16=True -> -convert-to-fp16
+            :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False``.
+            :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None``.

         Returns:
             :str: Path of the compiled ``qpc`` package.
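The restored enable_qnn and qnn_config arguments route compilation through the QNN toolchain instead of the default qaic-exec path. A minimal sketch of such a call, assuming a QEFFAutoModelForCausalLM instance named model and a local qnn_config.json (both placeholders):

    # QNN path: extra **compiler_options are ignored here; QNN parameters belong in the config file.
    qpc_path = model.compile(
        prefill_seq_len=32,
        ctx_len=128,
        num_cores=16,
        mxfp6_matmul=True,
        enable_qnn=True,
        qnn_config="qnn_config.json",  # assumed to exist in the working directory
    )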
@@ -1572,9 +1566,19 @@ def compile(
             raise TypeError("`prefill_only` must be a boolean.")

         if self.is_tlm:
-            num_speculative_tokens: int = self.check_and_get_num_speculative_tokens(
-                num_speculative_tokens, prefill_seq_len
-            )
+            # assert num_speculative_tokens cfg is acceptable if defined
+            if num_speculative_tokens is None:
+                raise TypeError("missing required argument `num_speculative_tokens` as `is_tlm` is True.")
+            if not isinstance(num_speculative_tokens, int) or num_speculative_tokens < 2:
+                raise ValueError(
+                    f"`num_speculative_tokens` arg should be an integer greater than 1, got {num_speculative_tokens}"
+                )
+            num_logits_to_keep = num_speculative_tokens + 1
+            if prefill_seq_len < num_logits_to_keep:
+                raise ValueError(
+                    f"sequence length ({prefill_seq_len}) must be at least `num_speculative_tokens+1` ({num_logits_to_keep})"
+                )
+
         if self.continuous_batching and full_batch_size is None:
             raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")

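For target language models (is_tlm=True) the restored inline checks require num_speculative_tokens to be an integer of at least 2 and the prefill sequence length to accommodate num_speculative_tokens + 1 logits. A small worked example of the constraint (values are illustrative only):

    num_speculative_tokens = 3                          # accepted: integer greater than 1
    num_logits_to_keep = num_speculative_tokens + 1     # 4 logits kept per decode step
    prefill_seq_len = 32                                # valid, since 32 >= 4; a value of 2 would raise ValueError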
@@ -1584,50 +1588,83 @@ def compile(
15841588 "enable `continuous_batching=True` in `from_pretrained`."
15851589 )
15861590
1587- # Infer kv_cache_batch_size if not provided
1588- kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size
1589-
1590- # --- Specializations ---
1591- specializations = []
1591+ kv_cache_batch_size = (
1592+ kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size )
1593+ )
1594+ # Define prefill specialization
1595+ prefill_specialization = {
1596+ # Prefill is always run with single BS for continuous batching.
1597+ "batch_size" : 1 if self .continuous_batching else batch_size ,
1598+ "seq_len" : prefill_seq_len ,
1599+ "ctx_len" : ctx_len ,
1600+ # TODO: should be renamed to kv_cache_batch_size in specialization too
1601+ }
1602+ prefill_specialization .update ({"num_logits_to_keep" : 1 }) if self .is_tlm else ...
1603+ if self .continuous_batching :
1604+ prefill_specialization .update ({"full_batch_size" : kv_cache_batch_size })
1605+ else :
1606+ prefill_specialization .update ({"batch_size" : kv_cache_batch_size })
1607+ prefill_specialization .update ({"full_batch_exec_size" : full_batch_size }) if full_batch_size else ...
1608+ specializations = [
1609+ prefill_specialization ,
1610+ ]
15921611
1593- if prefill_only is None or prefill_only or prefill_seq_len == 1 :
1594- specializations .append (
1595- self .build_prefill_specialization (
1596- prefill_seq_len , ctx_len , batch_size , kv_cache_batch_size , full_batch_size
1597- )
+        # Skip the decode specialization when not continuous batching and prefill_seq_len == 1, as it would repeat the prefill specialization
+        if prefill_seq_len != 1 or self.continuous_batching:
+            decode_specialization = {
+                "batch_size": full_batch_size if self.continuous_batching else batch_size,
+                "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1,
+                "ctx_len": ctx_len,
+            }
+            if self.continuous_batching:
+                decode_specialization.update({"full_batch_size": kv_cache_batch_size})
+            else:
+                decode_specialization.update({"batch_size": kv_cache_batch_size})
+            decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ...
+            specializations.append(decode_specialization)
+
+        if enable_qnn:
+            if compiler_options:
+                logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only")
+
+            qpc_path = self._qnn_compile(
+                onnx_path,
+                compile_dir,
+                specializations=specializations,
+                prefill_seq_len=prefill_seq_len,
+                ctx_len=ctx_len,
+                batch_size=batch_size,
+                full_batch_size=full_batch_size,
+                mdp_ts_num_devices=num_devices,
+                num_cores=num_cores,
+                mxfp6_matmul=mxfp6_matmul,
+                mxint8_kv_cache=mxint8_kv_cache,
+                qnn_config=qnn_config,
+                kv_cache_batch_size=kv_cache_batch_size,
             )
-        if prefill_only is None or not prefill_only:
-            decode_spec = self.build_decode_specialization(
-                prefill_seq_len, ctx_len, batch_size, kv_cache_batch_size, full_batch_size, num_speculative_tokens
+        else:
+            # Custom IO
+            custom_io = {}
+            kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+            for suffix in ["", "_RetainedState"]:
+                for i in range(self.num_layers):
+                    for kv in ["key", "value"]:
+                        custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype
+
+            qpc_path = self._compile(
+                onnx_path,
+                compile_dir,
+                compile_only=True,
+                retained_state=True,
+                specializations=specializations,
+                convert_to_fp16=True,
+                mxfp6_matmul=mxfp6_matmul,
+                custom_io=custom_io,
+                mdp_ts_num_devices=num_devices,
+                num_speculative_tokens=num_speculative_tokens,
+                aic_num_cores=num_cores,
+                **compiler_options,
             )
-            if decode_spec:
-                specializations.append(decode_spec)
-
-        # --- Compilation ---
-        kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
-        custom_io = {}
-
-        for suffix in ["", "_RetainedState"]:
-            for i in range(self.num_layers):
-                for kv in ["key", "value"]:
-                    custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype
-
-        qpc_path = self._compile(
-            onnx_path=onnx_path,
-            compile_dir=compile_dir,
-            compile_only=True,
-            retained_state=True,
-            specializations=specializations,
-            convert_to_fp16=True,
-            mxfp6_matmul=mxfp6_matmul,
-            custom_io=custom_io,
-            mdp_ts_num_devices=num_devices,
-            num_speculative_tokens=num_speculative_tokens,
-            aic_num_cores=num_cores,
-            mxint8_kv_cache=mxint8_kv_cache,
-            **compiler_options,
-        )
-
         return qpc_path
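Taken together, the non-QNN branch hands _compile one or two specializations plus a custom_io map covering the retained KV cache. The sketch below mirrors that construction for a hypothetical 2-layer, non-TLM model compiled with batch_size=1, prefill_seq_len=32, ctx_len=128, continuous_batching=False, and mxint8_kv_cache=False; it only illustrates the resulting dictionaries, not the actual compiler invocation:

    specializations = [
        {"batch_size": 1, "seq_len": 32, "ctx_len": 128},  # prefill
        {"batch_size": 1, "seq_len": 1, "ctx_len": 128},   # decode
    ]

    custom_io = {}
    for suffix in ["", "_RetainedState"]:
        for i in range(2):  # num_layers assumed to be 2 for this sketch
            for kv in ["key", "value"]:
                custom_io[f"past_{kv}.{i}{suffix}"] = "float16"  # "mxint8" when mxint8_kv_cache=True

    # e.g. custom_io["past_key.0"] == "float16", and "past_value.1_RetainedState" is mapped the same way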

     # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate