
Commit 7e99780: Fixes

Parent: 1e0e357

4 files changed (+26, -32 lines)

torchao/_models/llama/eval.py

Lines changed: 2 additions & 17 deletions
@@ -24,7 +24,9 @@
     float8_dynamic_activation_float8_weight,
     float8_static_activation_float8_weight,
 )
+from torchao.quantization.observer import PerRow, PerTensor
 from torchao._models._eval import TransformerEvalWrapper, InputRecorder
+from torchao._models.llama.model import prepare_inputs_for_model
 
 from tokenizer import get_tokenizer
 import time
@@ -56,33 +58,17 @@ def run_evaluation(
     tokenizer_path = checkpoint_path.parent / "tokenizer.model"
     assert tokenizer_path.is_file(), str(tokenizer_path)
     # Load Model and Tokenizer
-
     print("Loading model ...")
     t0 = time.time()
     model = _load_model(checkpoint_path, "cpu", precision)
 
     if max_length is None:
         max_length = model.config.block_size
-    print('Load model successfully')
     device_sync(device=device) # MKG
     print(f"Time to load model: {time.time() - t0:.02f} seconds")
     tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)
-    print('Run completed until tokenizer')
 
     if quantization:
-        from torchao.quantization.quant_api import (
-            quantize_,
-            int4_weight_only,
-            int8_weight_only,
-            int8_dynamic_activation_int8_weight,
-            fpx_weight_only,
-            uintx_weight_only,
-            unwrap_tensor_subclass,
-            float8_weight_only,
-            float8_dynamic_activation_float8_weight,
-        )
-        from torchao.quantization.observer import PerRow, PerTensor
-        print('Quantization imports completed')
         if "int8wo" in quantization:
             quantize_(model, int8_weight_only())
         if "int8dq" in quantization:
@@ -117,7 +103,6 @@
         # avoid circular imports
         from torchao._models._eval import InputRecorder
         from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer
-        from torchao._models.llama.model import prepare_inputs_for_model
         groupsize=int(quantization.split("-")[-2])
         assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
         assert precision==torch.bfloat16, f"{quantization} requires precision of bfloat16 but got {precision}"
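
With the float8 imports now at module level, the `float8wo` / `float8dq-*` flags exercised in `evals.sh` below presumably dispatch the same way as the `int8wo` / `int8dq` branches shown above. A minimal sketch of that mapping, assuming the branch bodies mirror the int8 pattern (the helper name `apply_float8` and the suffix-to-granularity rule are illustrative, not from this commit):

```python
# Hypothetical sketch: float8 dispatch assumed to mirror the int8wo/int8dq
# branches of run_evaluation above; not taken verbatim from this commit.
from torchao.quantization.quant_api import (
    quantize_,
    float8_weight_only,
    float8_dynamic_activation_float8_weight,
)
from torchao.quantization.observer import PerRow, PerTensor

def apply_float8(model, quantization: str):
    if "float8wo" in quantization:
        # Weight-only float8 quantization.
        quantize_(model, float8_weight_only())
    elif "float8dq" in quantization:
        # Dynamic float8 activation + weight quantization; the "-tensor" or
        # "-row" flag suffix is assumed to select the scaling granularity.
        granularity = PerRow() if "row" in quantization else PerTensor()
        quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity))
```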

torchao/_models/llama/evals.sh

Lines changed: 7 additions & 3 deletions
@@ -11,8 +11,12 @@ export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
 export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
 # python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head
 # python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8wo
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-tensor
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-row
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8dq
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64
+# python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8wo #7.60
+# python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-tensor #7.62
+# python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-row #7.62
 # --tasks 'mmlu' 'truthfulqa_mc2'
 # python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoquant --tasks 'winogrande' 'arc_challenge'

torchao/_models/llama/generate.py

Lines changed: 4 additions & 1 deletion
@@ -210,8 +210,11 @@ def main(
             fpx_weight_only,
             uintx_weight_only,
             autoquant,
-            unwrap_tensor_subclass
+            unwrap_tensor_subclass,
+            float8_weight_only,
+            float8_dynamic_activation_float8_weight,
         )
+        from torchao.quantization.observer import PerTensor, PerRow
         if "int8wo" in quantization:
            quantize_(model, int8_weight_only())
        if "int8dq" in quantization:

torchao/quantization/README.md

Lines changed: 13 additions & 11 deletions
@@ -20,16 +20,17 @@ Benchmarks and evaluation are run on a machine with a single NVIDIA-A100-80GB GPU
 |              | int4wo-64            | 8.316 | 180.80 | 763.33  | 6.88  | 4.22  |
 |              | int4wo-64-GPTQ       | 7.921 | 180.80 | 763.33  | 6.88  | 4.22  |
 |              | autoquant-int4hqq    | 8.110 | 188.41 | 800.58  | 7.14  | 4.25  |
-| Llama-3.1-8B | Base (bfloat16)      | 7.441 | 95.64  | 1435.54 | 16.43 | 15.01 |
-|              | int8dq               | 7.581 | 8.61   | 64.75   | 9.24  | 7.52  |
-|              | int8wo               | 7.447 | 153.03 | 1150.80 | 10.42 | 7.52  |
-|              | fp6                  | 7.661 | 161.58 | 910.02  | 7.72  | 5.63  |
-|              | int4wo-64            | 8.316 | 180.80 | 763.33  | 6.88  | 4.22  |
-|              | int4wo-64-GPTQ       | 7.921 | 180.80 | 763.33  | 6.88  | 4.22  |
-|              | autoquant-int4hqq    | 8.110 | 188.41 | 800.58  | 7.14  | 4.25  |
-|              | float8wo             | 8.316 | 180.80 | 763.33  | 6.88  | 4.22  |
-|              | float8dq (PerTensor) | 7.921 | 180.80 | 763.33  | 6.88  | 4.22  |
-|              | float8dq (Per Row)   | 8.110 | 188.41 | 800.58  | 7.14  | 4.25  |
+
+Benchmarks and evaluation for model meta-llama/Meta-Llama-3.1-8B are run on a machine with a single NVIDIA-H100 GPU using the scripts for [generation](../_models/llama/generate.py) and [eval](../_models/llama/eval.py). Evaluation was done using the lm_eval library for tasks/data.
+
+| Model        | Technique            | wikitext-perplexity | Tokens/Second | Memory Bandwidth (GB/s) | Peak Memory (GB) | Model Size (GB) |
+| ------------ | -------------------- | ------------------- | ------------- | ----------------------- | ---------------- | --------------- |
+| Llama-3.1-8B | Base (bfloat16)      | 7.54                | 126.90        | 1904.75                 | 16.75            | 15.01           |
+|              | int8wo               | 7.56                | 198.85        | 1495.41                 | 11.05            | 7.52            |
+|              | int4wo-64            | 8.44                | 241.39        | 1019.14                 | 7.08             | 4.22            |
+|              | float8wo             | 7.60                | 178.46        | 1339.93                 | 12.09            | 7.51            |
+|              | float8dq (PerTensor) | 7.62                | 116.40        | 873.58                  | 11.14            | 7.51            |
+|              | float8dq (Per Row)   | 7.62                | 154.63        | 1161.47                 | 11.14            | 7.51            |
 
 note: Int8 dynamic quantization works best on compute bound models like [SAM](https://github.com/pytorch-labs/segment-anything-fast) whereas Llama with batchsize=1 tends to be memory bound, thus the rather low performance.
 
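The Memory Bandwidth column appears to follow directly from decoding arithmetic: a memory-bound decoder streams every weight once per generated token, so achieved bandwidth is roughly tokens/second times model size. A quick sanity check against the table (values copied from it; the relation is an observation about the numbers, not a claim about how the scripts compute the column):

```python
# Check: bandwidth ~= tokens/s x model size for memory-bound decoding,
# using values copied from the benchmark table above.
rows = {  # technique: (tokens/s, model size in GB)
    "Base (bfloat16)": (126.90, 15.01),
    "int8wo": (198.85, 7.52),
    "float8wo": (178.46, 7.51),
}
for name, (tok_s, size_gb) in rows.items():
    print(f"{name}: {tok_s * size_gb:.2f} GB/s")
# -> 1904.77, 1495.35, 1340.23 GB/s, matching the table up to rounding.
```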
@@ -136,7 +137,8 @@ change_linear_weights_to_int8_dqtensors(model)
 ```python
 # for torch 2.4+
 from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
-quantize_(model, float8_dynamic_activation_float8_weight())
+from torchao.quantization.observer import PerTensor
+quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerTensor()))
 
 ```
 
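For the per-row variant benchmarked above, the same API presumably takes the `PerRow` granularity exported alongside `PerTensor`; a sketch by analogy with the snippet above (reusing its `model`, and not a line from this commit):

```python
# Per-row float8 dynamic quantization, by analogy with the PerTensor snippet.
from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
from torchao.quantization.observer import PerRow

quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerRow()))
```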