2 files changed, +14 −4 lines

```diff
@@ -128,9 +128,17 @@ def run_evaluation(
         if "float8wo" in quantization:
             quantize_(model, float8_weight_only())
         if "float8dq" in quantization:
-            quantize_(model, float8_dynamic_activation_float8_weight())
-        if "float8saq" in quantization:
-            quantize_(model, float8_static_activation_float8_weight())
+            granularity = str(quantization.split("-")[-1])
+            if granularity == "tensor":
+                granularity = PerTensor()
+            elif granularity == "row":
+                granularity = PerRow()
+            else:
+                if granularity == "float8dq":
+                    granularity = PerTensor()
+                else:
+                    raise ValueError(f"Unknown granularity {granularity}")
+            quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity))
         if "autoround" in quantization:
             from torchao.prototype.autoround.autoround_llm import quantize_model_with_autoround_
             from transformers import AutoTokenizer
```
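For context, the new branch above derives the scaling granularity from the suffix of the `quantization` flag: `float8dq-tensor` and `float8dq-row` select per-tensor and per-row scaling, a bare `float8dq` defaults to per-tensor, and anything else raises. A minimal standalone sketch of that parsing, using a hypothetical helper name and assuming `PerTensor`/`PerRow` are importable from `torchao.quantization.observer` as in the README diff below:

```python
from torchao.quantization.observer import PerRow, PerTensor  # import path as in the README below

def parse_float8dq_granularity(quantization: str):
    """Map a flag such as 'float8dq-row' to a scaling-granularity object.

    Hypothetical helper mirroring the inline logic added above.
    """
    suffix = quantization.split("-")[-1]
    if suffix == "tensor":
        return PerTensor()
    elif suffix == "row":
        return PerRow()
    elif suffix == "float8dq":  # bare flag, no suffix: default to per-tensor scaling
        return PerTensor()
    raise ValueError(f"Unknown granularity {suffix}")

assert isinstance(parse_float8dq_granularity("float8dq"), PerTensor)
assert isinstance(parse_float8dq_granularity("float8dq-row"), PerRow)
```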
````diff
@@ -30,7 +30,7 @@ Benchmarks and evaluation for model meta-llama/Meta-Llama-3.1-8B are run on a ma
 |   | int4wo-64 | 8.44 | 241.39 | 1019.14 | 7.08 | 4.22 |
 |   | float8wo | 7.60 | 178.46 | 1339.93 | 12.09 | 7.51 |
 |   | float8dq (PerTensor) | 7.62 | 116.40 | 873.58 | 11.14 | 7.51 |
-|   | float8dq (Per Row) | 7.62 | 154.63 | 1161.47 | 11.14 | 7.51 |
+|   | float8dq (Per Row) | 7.61 | 154.63 | 1161.47 | 11.14 | 7.51 |
 
 note: Int8 dynamic quantization works best on compute bound models like [SAM](https://github.com/pytorch-labs/segment-anything-fast) whereas Llama with batchsize=1 tends to be memory bound, thus the rather low performance.
@@ -139,6 +139,8 @@ change_linear_weights_to_int8_dqtensors(model)
 from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
 from torchao.quantization.observer import PerTensor
 quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerTensor()))
+from torchao.quantization.observer import PerRow
+quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerRow()))
 
 ```
````
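For completeness, a self-contained usage sketch of the two granularities documented above. It is a sketch under assumptions, not part of the PR: it assumes a CUDA device with float8 support (roughly compute capability 8.9 or newer) and the import paths shown in the README, and the small `nn.Sequential` is a stand-in for a real transformer:

```python
import torch
import torch.nn as nn

from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
from torchao.quantization.observer import PerRow, PerTensor  # paths as in the README above

# Toy stand-in for a transformer: float8 dynamic quantization converts the
# nn.Linear weights to float8 and quantizes activations on the fly.
model = nn.Sequential(
    nn.Linear(1024, 4096),
    nn.ReLU(),
    nn.Linear(4096, 1024),
).to(torch.bfloat16).to("cuda")

# One scale for the whole weight tensor (the "float8dq-tensor" eval flag) ...
quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerTensor()))

# ... or, on a fresh copy of the model, one scale per weight row
# (the "float8dq-row" flag), which the benchmark table above shows at
# slightly better perplexity for Llama-3.1-8B:
# quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerRow()))

out = model(torch.randn(1, 1024, dtype=torch.bfloat16, device="cuda"))
```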