update docs and expand testing
Sara Adkins committed Sep 11, 2024
1 parent d8f7073 commit c4fbf70
Showing 3 changed files with 23 additions and 5 deletions.
4 changes: 4 additions & 0 deletions docs/source/en/quantization/compressed_tensors.md
@@ -38,6 +38,10 @@ pip install compressed-tensors
 
 
 ## Sample Model Load
+Quantized models can be easily loaded for inference as shown below. Only models that
+have already been quantized can be loaded. To quantize a model into the compressed-tensors
+format, see [llm-compressor](https://github.com/vllm-project/llm-compressor).
+
 ```python
 from transformers import AutoModelForCausalLM
 compressed_tensors_model = AutoModelForCausalLM.from_pretrained("nm-testing/tinyllama-oneshot-w4a16-group128-v3")
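The committed sample stops at the load call. A minimal sketch of actually running inference with the loaded checkpoint might look like the following; the prompt and generation settings are illustrative assumptions, not part of this commit:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "nm-testing/tinyllama-oneshot-w4a16-group128-v3"
# device_map="auto" places weights across available devices (needs the accelerate package)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Any prompt works; this one mirrors the prompt used in the repository's tests
inputs = tokenizer("Paris is the capital of which country?", return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_length=50)
print(tokenizer.batch_decode(generated_ids)[0])
```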
1 change: 1 addition & 0 deletions docs/source/en/quantization/overview.md
@@ -50,6 +50,7 @@ Use the table below to help you decide which quantization method to use.
 | [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
 | [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
 | [bitsandbytes](./bitsandbytes) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/TimDettmers/bitsandbytes |
+| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
 | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
 | GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
 | [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
23 changes: 18 additions & 5 deletions tests/quantization/compressed_tensor/test_compressed_tensors.py
@@ -13,7 +13,9 @@
 @require_compressed_tensors
 @require_torch
 class CompressedTensorsTest(unittest.TestCase):
-    tinyllama_w8a8 = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3"
+    tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer"
+    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
+    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
     llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"
 
     prompt = "Paris is the capital of which country?"
@@ -45,12 +47,22 @@ def test_config_to_from_dict(self):
         self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig)
 
     def test_tinyllama_w8a8(self):
-        self._test_quantized_model(self.tinyllama_w8a8)
+        expected_out = "<s> Paris is the capital of which country?\n\n**A) Paris**\n\n**Q** ** Paris is the capital of which country?\n\n**A) Paris**\n\n**Q** ** Paris is the capital of which country"
+        self._test_quantized_model(self.tinyllama_w8a8, expected_out)
+
+    def test_tinyllama_w4a16(self):
+        expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
+        self._test_quantized_model(self.tinyllama_w4a16, expected_out)
+
+    def test_tinyllama_w8a16(self):
+        expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
+        self._test_quantized_model(self.tinyllama_w8a16, expected_out)
 
     def test_llama_8b_fp8(self):
-        self._test_quantized_model(self.llama3_8b_fp8)
+        expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous art museum in Paris? The Louvre\nWhat is the name of the famous opera house in Paris? Palais Garnier\nWhat is the name of the"
+        self._test_quantized_model(self.llama3_8b_fp8, expected_out)
 
-    def _test_quantized_model(self, model_name: str):
+    def _test_quantized_model(self, model_name: str, expected_output: str):
         """Carry out generation"""
         quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -68,7 +80,8 @@ def _test_quantized_model(self, model_name: str):
             "quantized model should load a non-trivial scale into the state dict",
         )
         inputs = tokenizer(self.prompt, return_tensors="pt").to(device)
-        generated_ids = quantized_model.generate(**inputs, max_length=50)
+        generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
         outputs = tokenizer.batch_decode(generated_ids)
 
         self.assertIsNotNone(outputs)
+        self.assertEqual(outputs[0], expected_output)
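Passing `do_sample=False` is what makes the verbatim string comparison above sound: greedy decoding is deterministic for a fixed model and prompt on a given setup. A standalone sketch of the same check outside the unittest harness, assuming the checkpoints above are reachable on the Hub:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def check_expected_output(model_name: str, prompt: str, expected_output: str) -> None:
    # Greedy decoding (do_sample=False) is deterministic, so the full
    # transcript can be compared verbatim against a pinned string.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**inputs, max_length=50, do_sample=False)
    output = tokenizer.batch_decode(generated_ids)[0]
    assert output == expected_output, f"unexpected transcript: {output!r}"
```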
