# ----------------------------------------------------------------------------

import hashlib
-import logging
import sys
import warnings
from pathlib import Path
from QEfficient.base.modeling_qeff import QEFFBaseModel
from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.generation.text_generation_inference import get_compilation_dims
+from QEfficient.generation.text_generation_inference import CloudAI100ExecInfoNew, PerfMetrics, get_compilation_dims
from QEfficient.transformers.models.pytorch_transforms import (
    CustomOpsTransform,
    KVCacheModuleMethodMapperTransform,
@@ -820,7 +819,7 @@ def export(
        export_dir: Optional[str] = None,
        **kwargs,
    ) -> str:
-        inputs = self.model.generate_dummy_inputs()
+        inputs = self.model.get_dummy_inputs()
        dynamic_axes = self.model.get_onnx_dynamic_axes()
        output_names = self.model.get_output_names()
        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
@@ -843,6 +842,7 @@ def compile(
        output_names = self.model.get_output_names()

        # Get specializations from modelling file
+        # TODO: expose this via the auto class as well
        specializations = self.model.get_specializations(batch_size=batch_size, prefill_seq_len=prefill_seq_len,
                                                         ctx_len=ctx_len, img_size=img_size, **compiler_options)

@@ -873,7 +873,6 @@ def compile(
        )
        return self.qpc_path

-    @property
    def get_onnx_dynamic_axes(self):
        return self.model.get_onnx_dynamic_axes()

@@ -905,111 +904,121 @@ def generate(
        )

    def cloud_ai_100_generate(
-        self,
-        inputs: torch.Tensor,
-        device_ids: List[int],
-        enable_debug_logs: bool = False,
-        generation_len: int = None,
-        streamer: Optional[TextStreamer] = None,
-        generation_len: Optional[int] = None
-    ) -> np.ndarray:
-        qpc_session = QAICInferenceSession(
-            self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False
-        )
-
-        batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path)
-
-        # Skip inputs/outputs
-        qpc_session.skip_buffers(
-            [x for x in (qpc_session.input_names + qpc_session.output_names) if x.startswith("past_")] + ["pixel_values_RetainedState"]
-        )
-
-        # Read prompt and ctx len from session
-        batch_size = max(
-            [x[qpc_session.binding_index_map["input_ids"]][1][0] for x in qpc_session.allowed_shapes]
-            + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[0]]
-        )
-
-        prefill_seq_len = max(
-            [x[qpc_session.binding_index_map["input_ids"]][1][1] for x in qpc_session.allowed_shapes]
-            + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]]
-        )
+        self,
+        inputs: torch.Tensor,
+        device_ids: List[int],
+        enable_debug_logs: bool = False,
+        generation_len: int = None,
+        streamer: Optional[TextStreamer] = None,
+    ) -> np.ndarray:
+        qpc_session = QAICInferenceSession(
+            self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False
+        )

-        input_len = inputs["attention_mask"].sum(1, keepdims=True)
-        padded_len = inputs["input_ids"].shape[1]
-        num_chunks = -(padded_len // -prefill_seq_len)  # ceil divide without float
-        padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
+        batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path)

-        if generation_len is None:
-            generation_len = ctx_len - input_len.max()
+        pad_token_id = 1
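+        # Hard-coded pad id: used below both to pre-fill the generated_ids buffer and to pad the prompt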

-        assert generation_len > 0, "generation length should be greater than zero"
-        generated_ids = np.full((batch_size, generation_len + 1), -1)
+        # Skip inputs/outputs
+        qpc_session.skip_buffers(
+            [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_") or x.endswith("_RetainedState")]
+        )

-        # Prepare inputs for prefill
-        start = perf_counter()
+        # Read prompt and ctx len from session
+        batch_size = max(
+            [x[qpc_session.binding_index_map["input_ids"]][1][0] for x in qpc_session.allowed_shapes]
+            + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[0]]
+        )

-        inputs["position_ids"] = np.where(
-            inputs.pop("attention_mask"), np.arange(padded_len), -1
-        )  # Need to use -1 as position_ids for invalid tokens
-        inputs = dict(inputs)
+        prefill_seq_len = max(
+            [x[qpc_session.binding_index_map["input_ids"]][1][1] for x in qpc_session.allowed_shapes]
+            + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]]
+        )

-        # vision_session.deactivate()
-        qpc_session.activate()
+        input_len = inputs["attention_mask"].sum(1, keepdims=True)
+        input_ids_length = inputs["input_ids"].shape[1]

-        # Run prefill
-        for i in range(num_chunks):
-            chunk_inputs = inputs.copy()
-            chunk_inputs["input_ids"] = inputs["input_ids"].numpy()[:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
-            chunk_inputs['pixel_values'] = chunk_inputs['pixel_values'].numpy().astype(np.float16)
-            chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
-            outputs = qpc_session.run(chunk_inputs)
+        num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
+
+        padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
+        if generation_len is None:
+            generation_len = ctx_len - input_len.max()
+
+        assert generation_len > 0, "generation length should be greater than zero"
+        generated_ids = np.full((batch_size, generation_len + 1), pad_token_id)
+
+        # Prepare inputs for prefill
+        prefill_start = perf_counter()
+
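+        # Pad input_ids and attention_mask up to padded_len so prefill can run in fixed-size chunks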
+        input_ids = inputs["input_ids"]
+        input_ids_size = input_ids.shape[1]
+        inputs["input_ids"] = torch.nn.functional.pad(
+            inputs["input_ids"],
+            (0, padded_len - input_ids_size),
+            "constant",
+            1,
+        )
+        inputs["attention_mask"] = torch.nn.functional.pad(
+            inputs["attention_mask"],
+            (0, padded_len - input_ids_size), "constant", 0
+        )

-        # Skip inputs/outputs again
-        qpc_session.skip_buffers(
-            ["pixel_values"]
-        )
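+        # Convert the remaining torch tensors to numpy arrays for the QAIC session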
+        for k, v in inputs.items():
+            inputs[k] = np.array(v)

-        # Get first token
-        inputs = {}
-        inputs["input_ids"] = outputs["logits"].argmax(2)
-        inputs["position_ids"] = input_len.numpy().astype(np.int64)
-        # inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :]
-        generated_ids[:, 0] = inputs["input_ids"].squeeze(1)
-        # finished_sequences = inputs["input_ids"] == self.tokenizer.eos_token_id
-        if streamer:
-            streamer.put(inputs["input_ids"][0])
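+        # Cast pixel_values to fp16 for the compiled graph; padded positions get position_id -1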
+        inputs["pixel_values"] = inputs["pixel_values"].astype("float16")
+        inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1)

-        # Decode loop
-        loop_start = perf_counter()
-        for num_token in range(1, generation_len):
-            outputs = qpc_session.run(inputs)
+        qpc_session.activate()

-            # Prepare inputs for next iteration
+        # Run prefill
+
+        for i in range(num_chunks):
+            chunk_inputs = inputs.copy()
+            chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
+            chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
+            outputs = qpc_session.run(chunk_inputs)
+
+        prefill_time = perf_counter() - prefill_start
+        # Get first token
        inputs["input_ids"] = outputs["logits"].argmax(2)
-        inputs["position_ids"] += 1
-        generated_ids[:, num_token] = inputs["input_ids"].squeeze(1)
+        inputs["position_ids"] = input_len.numpy()
+        generated_ids[:, 0] = inputs["input_ids"].squeeze(1)
        if streamer:
            streamer.put(inputs["input_ids"][0])

-        end = perf_counter()
-        if streamer:
-            streamer.end()
-
-        prefill_perf = 1 / (loop_start - start)
-        decode_perf = (num_token - 1) / (end - loop_start)
-        total_perf = num_token / (end - start)
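+        # pixel_values is no longer needed after prefill; drop it before the decode loop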
+        qpc_session.skip_buffers(["pixel_values"])
+        inputs.pop("pixel_values")
+
+        # Decode loop
+        decode_start = perf_counter()
+        for num_token in range(1, generation_len):
+            outputs = qpc_session.run(inputs)
+            # Prepare inputs for next iteration
+            inputs["input_ids"] = outputs["logits"].argmax(2)
+            inputs["position_ids"] += 1
+            generated_ids[:, num_token] = inputs["input_ids"].squeeze(1)
+            if streamer:
+                streamer.put(inputs["input_ids"][0])
+
+        decode_end = perf_counter()
+        if streamer:
+            streamer.end()

-        print("TTFT:", round(loop_start - start, 2), "s", file=sys.stderr)
-        print("E2ET:", round(end - start, 2), "s", file=sys.stderr)
-        print("Prefill:", round(prefill_perf, 2), "tok/s", file=sys.stderr)
-        print("Decode:", round(decode_perf, 2), "tok/s", file=sys.stderr)
-        print("E2E:", round(total_perf, 2), "tok/s", file=sys.stderr)
-        if batch_size > 1:
-            print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr)
-            print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr)
-            print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr)
-        return generated_ids[:, :generation_len]
+        decode_perf = (num_token - 1) / (decode_end - decode_start)
+        total_time = decode_end - prefill_start
+        total_perf = num_token / total_time
+
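+        # Package generated ids and prefill/decode timing into the structured exec-info object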
+        return CloudAI100ExecInfoNew(
+            batch_size=batch_size,
+            generated_ids=generated_ids,
+            perf_metrics=PerfMetrics(
+                prefill_time=prefill_time,
+                decode_perf=decode_perf,
+                total_perf=total_perf,
+                total_time=total_time
+            )
+        )

    @property
    def model_hash(self) -> str:
@@ -1040,9 +1049,9 @@ class QEFFAutoModelForImageTextToText:
    @classmethod
    def from_pytorch_model(cls, model: nn.Module, kv_offload=False, **kwargs):
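+        # kv_offload=True selects the two-QPC variant, otherwise the single-QPC variant is used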
        if kv_offload:
-            return QEffAutoModelForImageTextToText2QPC(model, **kwargs)
+            return _QEffAutoModelForImageTextToText2QPC(model, **kwargs)
        else:
-            return QEFFAutoModelForImageTextToText1QPC(model, **kwargs)
+            return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs)


    @classmethod