Add Qwen2.5 NPU Example #12110

Merged
merged 4 commits on Sep 25, 2024
Changes from 2 commits
19 changes: 15 additions & 4 deletions python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -10,6 +10,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
| Chatglm3 | [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) |
| Chatglm2 | [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) |
| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
| MiniCPM | [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
| Phi-3 | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |
| Stablelm | [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b) |
@@ -83,6 +84,7 @@ The examples below show how to run the **_optimized HuggingFace model implementa
- [Llama3-8B](./llama.py)
- [Qwen2-1.5B](./qwen2.py)
- [Qwen2-7B](./qwen2.py)
- [Qwen2.5-7B](./qwen2.5.py)
- [MiniCPM-1B](./minicpm.py)
- [MiniCPM-2B](./minicpm.py)
- [Baichuan2-7B](./baichuan2.py)
@@ -95,7 +97,7 @@ Supported models: Llama2-7B, Llama3-8B, Qwen2-1.5B, Qwen2-7B, MiniCPM-1B, MiniCP
#### 32.0.100.2625
Supported models: Llama2-7B, MiniCPM-1B, Baichuan2-7B
#### 32.0.101.2715
Supported models: Llama3-8B, MiniCPM-2B, Qwen2-7B, Qwen2-1.5B, Qwen2.5-7B

### Run
```cmd
@@ -105,12 +107,15 @@ python llama.py
:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct

:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py

:: to run Qwen2-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct

:: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.5.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct

:: to run MiniCPM-1B-sft-bf16
python minicpm.py

@@ -150,6 +155,9 @@ python qwen2.py --disable-transpose-value-cache
:: to run Qwen2-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct --disable-transpose-value-cache

:: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.5.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --disable-transpose-value-cache

:: to run MiniCPM-1B-sft-bf16
python minicpm.py --disable-transpose-value-cache

@@ -160,11 +168,14 @@ python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-
python baichuan2.py --disable-transpose-value-cache
```

For [Qwen2-7B](./qwen2.py) and [Qwen2.5-7B](./qwen2.5.py), you could also try to enable mixed precision optimization when encountering output problems:

```cmd
python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct --mixed-precision
```
```cmd
python qwen2.5.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --mixed-precision
```

#### Better Performance with High CPU Utilization
For better performance, you could enable this optimization by setting the environment variable `IPEX_LLM_CPU_LM_HEAD=1` (i.e. run `set IPEX_LLM_CPU_LM_HEAD=1`), but note that it will cause high CPU utilization.
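
A minimal sketch of how this could look for the Qwen2.5-7B example, reusing the command from the Run section above:

```cmd
:: enable the CPU LM-head optimization for the current cmd session
set IPEX_LLM_CPU_LM_HEAD=1

:: then run the example as usual
python qwen2.5.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct
```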
119 changes: 119 additions & 0 deletions python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.5.py
@@ -0,0 +1,119 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
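# This example runs Qwen2.5-Instruct on an Intel NPU with IPEX-LLM: it loads the
# model with low-bit (sym_int4) weights, optionally saves or reloads the converted
# low-bit model, and generates replies to a chat-style prompt via `generate()`.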

import os
import torch
import time
import argparse

from ipex_llm.transformers.npu_model import AutoModelForCausalLM
from transformers import AutoTokenizer

from transformers.utils import logging

logger = logging.get_logger(__name__)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Predict Tokens using `generate()` API for npu model"
)
parser.add_argument(
"--repo-id-or-model-path",
type=str,
default="Qwen/Qwen2.5-7B-Instruct",
help="The huggingface repo id for the Qwen2.5 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="AI是什么?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)
parser.add_argument("--inter-pp", type=int, default=None)
parser.add_argument("--mixed-precision", action='store_true')

args = parser.parse_args()
model_path = args.repo_id_or_model_path

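    # Convert the original checkpoint to a low-bit NPU model if no saved low-bit
    # copy exists at --lowbit-path; otherwise load the saved low-bit model directly.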
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
trust_remote_code=True,
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
transpose_value_cache=not args.disable_transpose_value_cache,
mixed_precision=args.mixed_precision
)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
transpose_value_cache=not args.disable_transpose_value_cache,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

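    # If --lowbit-path was given but the folder does not exist yet, save the
    # converted low-bit model there so later runs can load it directly.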
if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)

print("-" * 80)
print("done")
messages = [{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": args.prompt}]
text = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
with torch.inference_mode():
print("finish to load")
for i in range(3):
_input_ids = tokenizer([text], return_tensors="pt").input_ids
print("input length:", len(_input_ids[0]))
st = time.time()
output = model.generate(
_input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict
)
end = time.time()
print(f"Inference time: {end-st} s")
input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False)
print("-" * 20, "Input", "-" * 20)
print(input_str)
output_str = tokenizer.decode(output[0], skip_special_tokens=False)
print("-" * 20, "Output", "-" * 20)
print(output_str)

print("-" * 80)
print("done")
print("success shut down")