Added example to demonstrate data-aware LLM compression with NNCF API (
AlexKoff88 authored Jan 10, 2024
1 parent e533c12 commit 07028c5
Showing 6 changed files with 129 additions and 0 deletions.
26 changes: 26 additions & 0 deletions examples/llm_compression/openvino/README.md
@@ -0,0 +1,26 @@
# Large Language Models Weight Compression Example

This example demonstrates how to optimize Large Language Models (LLMs) using the NNCF weight compression API. It applies 4/8-bit mixed-precision quantization to the weights of the Linear (fully connected) layers of the [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model, which significantly reduces the model footprint and improves inference performance with OpenVINO.
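
For context, NNCF also offers a simpler data-free mode that compresses all Linear weights to 8 bits with a single call and no calibration data; this example uses the richer data-aware 4/8-bit flow implemented in `main.py`. A minimal data-free sketch (the IR paths are hypothetical):

```python
import nncf
import openvino as ov

# Hypothetical path to a model already exported to OpenVINO IR
ov_model = ov.Core().read_model("tinyllama/openvino_model.xml")

# Without a dataset or mode, compress_weights performs data-free 8-bit weight compression
compressed_model = nncf.compress_weights(ov_model)
ov.save_model(compressed_model, "tinyllama_int8/openvino_model.xml")
```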

## Prerequisites

To use this example:

- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
- Install dependencies:

```bash
pip install -U pip
pip install -r requirements.txt
pip install ../../../  # install NNCF from source (repository root)
```

## Run Example

To run the example:

```bash
python main.py
```

It will automatically download the calibration dataset and the baseline model, apply weight compression, and save the resulting model.
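
The compressed model is saved in OpenVINO IR format under the output directory and can be reloaded for inference with optimum-intel, for example (the directory name matches `OUTPUT_DIR` in `main.py`):

```python
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

# Load the compressed IR produced by main.py; the tokenizer still comes from the original model ID
model = OVModelForCausalLM.from_pretrained("tinyllama_compressed")
tokenizer = AutoTokenizer.from_pretrained("PY007/TinyLlama-1.1B-Chat-v0.3")

inputs = tokenizer("What is PyTorch?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output[0]))
```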
83 changes: 83 additions & 0 deletions examples/llm_compression/openvino/main.py
@@ -0,0 +1,83 @@
# Copyright (c) 2023 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from functools import partial

import datasets
import numpy as np
import openvino as ov
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

import nncf


def main():
    MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3"
    OUTPUT_DIR = "tinyllama_compressed"

    dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
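    # export=True converts the Hugging Face checkpoint to OpenVINO IR on the fly, load_in_8bit=False
    # disables optimum's default 8-bit weight compression so NNCF controls the optimization,
    # compile=False defers device compilation until after compression, and stateful=False keeps the
    # KV cache as explicit model inputs so the calibration transform can provide it.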
    model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False, stateful=False)

    def transform_fn(data, model, tokenizer):
        tokenized_text = tokenizer(data["text"], return_tensors="np")
        input_ids = tokenized_text["input_ids"]
        attention_mask = tokenized_text["attention_mask"]

        inputs = {}
        inputs["input_ids"] = input_ids
        inputs["attention_mask"] = attention_mask
        # Derive position_ids from the attention mask; padded positions get a dummy value of 1
        position_ids = np.cumsum(attention_mask, axis=1) - 1
        position_ids[attention_mask == 0] = 1

        # Add empty past key/value tensors so the stateless model receives a complete set of inputs;
        # the sequence-length dimension (2 or 1, depending on the cache layout) is set to zero
        batch_size = input_ids.shape[0]
        for input_name in model.key_value_input_names:
            model_inputs = model.model.input(input_name)
            shape = model_inputs.get_partial_shape()
            shape[0] = batch_size
            if shape[2].is_dynamic:
                shape[2] = 0
            else:
                shape[1] = 0
            inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

        inputs["position_ids"] = position_ids
        return inputs

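    # Wrap the raw dataset with the transform so NNCF receives ready-made model inputs during calibration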
    quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, model=model, tokenizer=tokenizer))

    # Comment out this call to skip optimization and measure the performance of the baseline model
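    # INT4_SYM applies symmetric 4-bit quantization to Linear weights; ratio=0.8 leaves roughly 20% of
    # the weights in 8-bit precision; the Hessian-based sensitivity metric ranks layers using activations
    # gathered from the calibration dataset, which is what makes this compression data-aware.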
    model.model = nncf.compress_weights(
        model.model,
        dataset=quantization_dataset,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        ratio=0.8,
        sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
    )
    model.save_pretrained(OUTPUT_DIR)

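    # Reload the compressed model and time a short generation as a basic sanity and performance check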
    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR)
    input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)

    start_t = time.time()
    output = model.generate(**input_ids, max_new_tokens=100)
    print("Elapsed time: ", time.time() - start_t)

    output_text = tokenizer.decode(output[0])
    print(output_text)
    return output_text


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions examples/llm_compression/openvino/requirements.txt
@@ -0,0 +1,3 @@
transformers
datasets==2.14.7
optimum-intel[openvino]
1 change: 1 addition & 0 deletions nncf/__init__.py
@@ -22,6 +22,7 @@
from nncf.parameters import DropType as DropType
from nncf.parameters import ModelType as ModelType
from nncf.parameters import QuantizationMode as QuantizationMode
from nncf.parameters import SensitivityMetric as SensitivityMetric
from nncf.parameters import TargetDevice as TargetDevice
from nncf.quantization import QuantizationPreset as QuantizationPreset
from nncf.quantization import compress_weights as compress_weights
8 changes: 8 additions & 0 deletions tests/cross_fw/examples/example_scope.json
@@ -143,5 +143,13 @@
"int8_model_size": 35.55147361755371,
"model_compression_rate": 3.8631822183889652
}
},
"llm_compression": {
"backend": "openvino",
"requirements": "examples/llm_compression/openvino/requirements.txt",
"cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
"accuracy_metrics": {
"word_count": 71
}
}
}
8 changes: 8 additions & 0 deletions tests/cross_fw/examples/run_example.py
@@ -130,6 +130,14 @@ def post_training_quantization_torch_ssd300_vgg16() -> Dict[str, float]:
    }


def llm_compression() -> Dict[str, float]:
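    # Runs the LLM compression example end-to-end and reports the word count of the generated text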
    from examples.llm_compression.openvino.main import main as llm_compression_main

    result = llm_compression_main()

    return {"word_count": len(result.split())}


def main(argv):
    parser = ArgumentParser()
    parser.add_argument("--name", help="Example name", required=True)
