Added example to demonstrate data-aware LLM compression with NNCF API (
AlexKoff88 authored Jan 10, 2024
1 parent e533c12 commit 07028c5
Showing 6 changed files with 129 additions and 0 deletions.
26 changes: 26 additions & 0 deletions examples/llm_compression/openvino/README.md
@@ -0,0 +1,26 @@
# Large Language Models Weight Compression Example

This example demonstrates how to optimize Large Language Models (LLMs) using the NNCF weight compression API. It applies 4/8-bit mixed-precision quantization to the weights of the Linear (fully connected) layers of the [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model, which significantly reduces the model footprint and improves inference performance with OpenVINO.
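
For context, NNCF also offers a simpler data-free mode that compresses all Linear weights to 8 bits with a single call and no calibration data; this example uses the richer data-aware 4/8-bit flow implemented in `main.py`. A minimal data-free sketch (the IR paths are hypothetical):

```python
import nncf
import openvino as ov

# Hypothetical path to a model already exported to OpenVINO IR
ov_model = ov.Core().read_model("tinyllama/openvino_model.xml")

# Without a dataset or mode, compress_weights performs data-free 8-bit weight compression
compressed_model = nncf.compress_weights(ov_model)
ov.save_model(compressed_model, "tinyllama_int8/openvino_model.xml")
```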

## Prerequisites

To use this example:

- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
- Install dependencies:

```bash
pip install -U pip
pip install -r requirements.txt
pip install ../../../  # install NNCF from source (repository root)
```

## Run Example

To run the example:

```bash
python main.py
```

It will automatically download the calibration dataset and the baseline model, apply weight compression, and save the resulting model.
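
The compressed model is saved in OpenVINO IR format under the output directory and can be reloaded for inference with optimum-intel, for example (the directory name matches `OUTPUT_DIR` in `main.py`):

```python
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

# Load the compressed IR produced by main.py; the tokenizer still comes from the original model ID
model = OVModelForCausalLM.from_pretrained("tinyllama_compressed")
tokenizer = AutoTokenizer.from_pretrained("PY007/TinyLlama-1.1B-Chat-v0.3")

inputs = tokenizer("What is PyTorch?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output[0]))
```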
83 changes: 83 additions & 0 deletions examples/llm_compression/openvino/main.py
@@ -0,0 +1,83 @@
# Copyright (c) 2023 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from functools import partial

import datasets
import numpy as np
import openvino as ov
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

import nncf


def main():
    MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3"
    OUTPUT_DIR = "tinyllama_compressed"

    dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
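    # export=True converts the Hugging Face checkpoint to OpenVINO IR on the fly, load_in_8bit=False
    # disables optimum's default 8-bit weight compression so NNCF controls the optimization,
    # compile=False defers device compilation until after compression, and stateful=False keeps the
    # KV cache as explicit model inputs so the calibration transform can provide it.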
    model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False, stateful=False)

    def transform_fn(data, model, tokenizer):
        tokenized_text = tokenizer(data["text"], return_tensors="np")
        input_ids = tokenized_text["input_ids"]
        attention_mask = tokenized_text["attention_mask"]

        inputs = {}
        inputs["input_ids"] = input_ids
        inputs["attention_mask"] = attention_mask
        # Derive position_ids from the attention mask; padded positions get a dummy value of 1
        position_ids = np.cumsum(attention_mask, axis=1) - 1
        position_ids[attention_mask == 0] = 1

        # Add empty past key/value tensors so the stateless model receives a complete set of inputs;
        # the sequence-length dimension (2 or 1, depending on the cache layout) is set to zero
        batch_size = input_ids.shape[0]
        for input_name in model.key_value_input_names:
            model_inputs = model.model.input(input_name)
            shape = model_inputs.get_partial_shape()
            shape[0] = batch_size
            if shape[2].is_dynamic:
                shape[2] = 0
            else:
                shape[1] = 0
            inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

        inputs["position_ids"] = position_ids
        return inputs

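    # Wrap the raw dataset with the transform so NNCF receives ready-made model inputs during calibration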
    quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, model=model, tokenizer=tokenizer))

    # Comment out this call to skip optimization and measure the performance of the baseline model
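    # INT4_SYM applies symmetric 4-bit quantization to Linear weights; ratio=0.8 leaves roughly 20% of
    # the weights in 8-bit precision; the Hessian-based sensitivity metric ranks layers using activations
    # gathered from the calibration dataset, which is what makes this compression data-aware.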
    model.model = nncf.compress_weights(
        model.model,
        dataset=quantization_dataset,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        ratio=0.8,
        sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
    )
    model.save_pretrained(OUTPUT_DIR)

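    # Reload the compressed model and time a short generation as a basic sanity and performance check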
    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR)
    input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)

    start_t = time.time()
    output = model.generate(**input_ids, max_new_tokens=100)
    print("Elapsed time: ", time.time() - start_t)

    output_text = tokenizer.decode(output[0])
    print(output_text)
    return output_text


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions examples/llm_compression/openvino/requirements.txt
@@ -0,0 +1,3 @@
transformers
datasets==2.14.7
optimum-intel[openvino]
1 change: 1 addition & 0 deletions nncf/__init__.py
@@ -22,6 +22,7 @@
from nncf.parameters import DropType as DropType
from nncf.parameters import ModelType as ModelType
from nncf.parameters import QuantizationMode as QuantizationMode
from nncf.parameters import SensitivityMetric as SensitivityMetric
from nncf.parameters import TargetDevice as TargetDevice
from nncf.quantization import QuantizationPreset as QuantizationPreset
from nncf.quantization import compress_weights as compress_weights
8 changes: 8 additions & 0 deletions tests/cross_fw/examples/example_scope.json
@@ -143,5 +143,13 @@
"int8_model_size": 35.55147361755371,
"model_compression_rate": 3.8631822183889652
}
},
"llm_compression": {
"backend": "openvino",
"requirements": "examples/llm_compression/openvino/requirements.txt",
"cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
"accuracy_metrics": {
"word_count": 71
}
}
}
8 changes: 8 additions & 0 deletions tests/cross_fw/examples/run_example.py
@@ -130,6 +130,14 @@ def post_training_quantization_torch_ssd300_vgg16() -> Dict[str, float]:
    }


def llm_compression() -> Dict[str, float]:
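    # Runs the LLM compression example end-to-end and reports the word count of the generated text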
    from examples.llm_compression.openvino.main import main as llm_compression_main

    result = llm_compression_main()

    return {"word_count": len(result.split())}


def main(argv):
    parser = ArgumentParser()
    parser.add_argument("--name", help="Example name", required=True)
