     "ibm-granite/granite-3.1-1b-a400m-base",
 ]
 
-test_models_qnn = [
-    "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "meta-llama/Llama-3.2-1B",
-    "unsloth/gemma-2b",
-    "ibm-granite/granite-guardian-3.1-2b",
-]
-
 spd_test_models = [
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     "Qwen/Qwen2-0.5B",
@@ -122,7 +115,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     )
 
     pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
-
     is_tlm = False if num_speculative_tokens is None else True
     qaic_config = None
     if is_tlm:
@@ -156,22 +148,16 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         qnn_config=qnn_config,
     )
     exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
-    cloud_ai_100_tokens = exec_info.generated_ids[0][
-        :, :gen_len
-    ]  # Because we always run for single input and single batch size
-    if prefill_only:
-        assert (ort_tokens[0][0] == cloud_ai_100_tokens[0][0]).all(), (
-            "prefill run output tokens don't match for ONNXRT output and Cloud AI 100 output."
-        )
-    else:
-        assert (ort_tokens == cloud_ai_100_tokens).all(), (
-            "Tokens don't match for ONNXRT output and Cloud AI 100 output."
-        )
-    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
-    if prefill_only is not None:
-        return
+    cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
+    gen_len = ort_tokens.shape[-1]
+    assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
+        "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
+
     # testing for CB models
     model_hf, _ = load_causal_lm_model(model_config)
+    config = model_hf.config
     full_batch_size = 4
     fbs_prompts = Constants.INPUT_STR * 4
     api_runner = ApiRunner(
@@ -187,7 +173,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf)
     pytorch_hf_tokens = np.vstack(pytorch_hf_tokens)
 
-    qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=is_tlm)
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=False)
     onnx_model_path = qeff_model.export()
 
     if not get_available_device_id():
@@ -198,7 +184,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
-        mxfp6=False,
+        mxfp6_matmul=False,
         aic_enable_depth_first=False,
         full_batch_size=full_batch_size,
         num_speculative_tokens=num_speculative_tokens,
@@ -216,6 +202,103 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
 
 
+def check_non_hf_kv_vs_ort_vs_ai100(
+    model_name: str,
+    prompt_len: int = Constants.PROMPT_LEN,
+    ctx_len: int = Constants.CTX_LEN,
+    n_layer: int = 1,
+    num_speculative_tokens: Optional[int] = None,
+):
+    """
+    Validate the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+        :prompt_len (int): Prompt length for the model to compile.
+        :ctx_len (int): Maximum context length to compile the model.
+        :n_layer (int): Number of layers for the Model.
+    """
+    replace_transformers_quantizers()
+    model_config = {"model_name": model_name}
+    model_config["n_layer"] = n_layer
+
+    model_hf, _ = load_causal_lm_model(model_config)
+
+    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
+    config = model_hf.config
+    batch_size = len(Constants.INPUT_STR)
+    api_runner = ApiRunner(
+        batch_size,
+        tokenizer,
+        config,
+        Constants.INPUT_STR,
+        Constants.PROMPT_LEN,
+        Constants.CTX_LEN,
+    )
+
+    is_tlm = False if num_speculative_tokens is None else True
+
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm)
+    pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
+
+    onnx_model_path = qeff_model.export()
+    ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
+
+    assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output."
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    qpc_path = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=14,
+        mxfp6=False,
+        aic_enable_depth_first=False,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+
+    exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
+    cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
+    gen_len = ort_tokens.shape[-1]
+
+    assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
+        "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
+
+    # testing for CB models
+    model_hf, _ = load_causal_lm_model(model_config)
+    config = model_hf.config
+    full_batch_size = 4
+    fbs_prompts = Constants.INPUT_STR * 4
+
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=is_tlm)
+    onnx_model_path = qeff_model.export()
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    qpc_path = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=14,
+        mxfp6=False,
+        aic_enable_depth_first=False,
+        full_batch_size=full_batch_size,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+
+    exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
+
+    assert all(
+        [
+            all(pt_token[:24] == cloud_token[:24])
+            for pt_token, cloud_token in zip(ort_tokens, exec_info_fbs.generated_ids)
+        ]
+    ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output."
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
+
+
 # FIXME: there should be a CB test here
 @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
 def test_causal_lm_export_with_deprecated_api(model_name):
@@ -262,6 +345,22 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model_name", swiftkv_test_models)
+def test_non_hf_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+    """
+    Test function to validate the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+    if model_name == "Snowflake/Llama-3.1-SwiftKV-8B-Instruct":
+        n_layer = 32
+    else:
+        n_layer = 2
+
+    check_non_hf_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
+
+
 @pytest.mark.on_qaic
 @pytest.mark.qnn
 @pytest.mark.parametrize("model_name", test_models_qnn)