File tree 3 files changed +2
-7
lines changed 3 files changed +2
-7
lines changed Original file line number Diff line number Diff line change @@ -21,6 +21,7 @@ export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
21
21
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt
22
22
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt
23
23
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
24
+ # The following float8 commands run on H100; float8 is not supported on GPUs with CUDA compute capability < 8.9
24
25
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8wo --write_result benchmark_results.txt
25
26
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-tensor --write_result benchmark_results.txt
26
27
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization float8dq-wo --write_result benchmark_results.txt
Original file line number Diff line number Diff line change @@ -255,10 +255,7 @@ def main(
255
255
elif granularity == "row" :
256
256
granularity = PerRow ()
257
257
else :
258
- if granularity == "float8dq" :
259
- granularity = PerTensor ()
260
- else :
261
- raise ValueError (f"Unknown granularity { granularity } " )
258
+ granularity = PerTensor ()
262
259
quantize_ (model , float8_dynamic_activation_float8_weight (granularity = granularity ))
263
260
if "autoquant" in quantization :
264
261
if "autoquant-int4" == quantization :
Original file line number Diff line number Diff line change @@ -139,9 +139,6 @@ change_linear_weights_to_int8_dqtensors(model)
139
139
from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight
140
140
from torchao.quantization.observer import PerTensor
141
141
quantize_(model, float8_dynamic_activation_float8_weight(granularity = PerTensor()))
142
- from torchao.quantization.observer import PerTensor
143
- quantize_(model, float8_dynamic_activation_float8_weight(granularity = PerTensor()))
144
-
145
142
```
146
143
147
144
#### A16W6 Floating Point WeightOnly Quantization
You can’t perform that action at this time.
0 commit comments