update docs and expand testing
Sara Adkins committed Sep 11, 2024
1 parent d8f7073 commit c4fbf70
Showing 3 changed files with 23 additions and 5 deletions.
4 changes: 4 additions & 0 deletions docs/source/en/quantization/compressed_tensors.md
@@ -38,6 +38,10 @@ pip install compressed-tensors
 
 
 ## Sample Model Load
+Quantized models can be easily loaded for inference as shown below. Only models that
+have already been quantized can be loaded. To quantize a model into the compressed-tensors
+format, see [llm-compressor](https://github.com/vllm-project/llm-compressor).
+
 ```python
 from transformers import AutoModelForCausalLM
 compressed_tensors_model = AutoModelForCausalLM.from_pretrained("nm-testing/tinyllama-oneshot-w4a16-group128-v3")
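The committed sample stops at the load call. A minimal sketch of actually running inference with the loaded checkpoint might look like the following; the prompt and generation settings are illustrative assumptions, not part of this commit:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "nm-testing/tinyllama-oneshot-w4a16-group128-v3"
# device_map="auto" places weights across available devices (needs the accelerate package)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Any prompt works; this one mirrors the prompt used in the repository's tests
inputs = tokenizer("Paris is the capital of which country?", return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_length=50)
print(tokenizer.batch_decode(generated_ids)[0])
```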
1 change: 1 addition & 0 deletions docs/source/en/quantization/overview.md
@@ -50,6 +50,7 @@ Use the table below to help you decide which quantization method to use.
 | [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
 | [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
 | [bitsandbytes](./bitsandbytes) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/TimDettmers/bitsandbytes |
+| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
 | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
 | GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
 | [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
23 changes: 18 additions & 5 deletions tests/quantization/compressed_tensor/test_compressed_tensors.py
@@ -13,7 +13,9 @@
 @require_compressed_tensors
 @require_torch
 class CompressedTensorsTest(unittest.TestCase):
-    tinyllama_w8a8 = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3"
+    tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer"
+    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
+    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
     llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"
 
     prompt = "Paris is the capital of which country?"
@@ -45,12 +47,22 @@ def test_config_to_from_dict(self):
         self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig)
 
     def test_tinyllama_w8a8(self):
-        self._test_quantized_model(self.tinyllama_w8a8)
+        expected_out = "<s> Paris is the capital of which country?\n\n**A) Paris**\n\n**Q** ** Paris is the capital of which country?\n\n**A) Paris**\n\n**Q** ** Paris is the capital of which country"
+        self._test_quantized_model(self.tinyllama_w8a8, expected_out)
+
+    def test_tinyllama_w4a16(self):
+        expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
+        self._test_quantized_model(self.tinyllama_w4a16, expected_out)
+
+    def test_tinyllama_w8a16(self):
+        expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
+        self._test_quantized_model(self.tinyllama_w8a16, expected_out)
 
     def test_llama_8b_fp8(self):
-        self._test_quantized_model(self.llama3_8b_fp8)
+        expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous art museum in Paris? The Louvre\nWhat is the name of the famous opera house in Paris? Palais Garnier\nWhat is the name of the"
+        self._test_quantized_model(self.llama3_8b_fp8, expected_out)
 
-    def _test_quantized_model(self, model_name: str):
+    def _test_quantized_model(self, model_name: str, expected_output: str):
         """Carry out generation"""
         quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -68,7 +80,8 @@ def _test_quantized_model(self, model_name: str):
             "quantized model should load a non-trivial scale into the state dict",
         )
         inputs = tokenizer(self.prompt, return_tensors="pt").to(device)
-        generated_ids = quantized_model.generate(**inputs, max_length=50)
+        generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
         outputs = tokenizer.batch_decode(generated_ids)
 
         self.assertIsNotNone(outputs)
+        self.assertEqual(outputs[0], expected_output)
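Passing `do_sample=False` is what makes the verbatim string comparison above sound: greedy decoding is deterministic for a fixed model and prompt on a given setup. A standalone sketch of the same check outside the unittest harness, assuming the checkpoints above are reachable on the Hub:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def check_expected_output(model_name: str, prompt: str, expected_output: str) -> None:
    # Greedy decoding (do_sample=False) is deterministic, so the full
    # transcript can be compared verbatim against a pinned string.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**inputs, max_length=50, do_sample=False)
    output = tokenizer.batch_decode(generated_ids)[0]
    assert output == expected_output, f"unexpected transcript: {output!r}"
```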
