@@ -45,7 +45,7 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
     model = AutoModelForCausalLM.from_pretrained(repo_id).to(device="cpu", dtype=precision)
 
     if compile:
-        model = torch.compile(model, mode="max-autotune", fullgraph=True)
+        model = torch.compile(model, fullgraph=True)
 
     if quantization == "int8dq":
         change_linear_weights_to_int8_dqtensors(model)
@@ -57,16 +57,10 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
     elif quantization == "autoquant":
         model = autoquant(model.to(device=device))
     elif quantization == "fp8":
-        from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear
-        from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
+        from float8_experimental.inference import quantize_to_float8, ActivationCasting, QuantConfig, ScalingGranularity
         model.to(device)
-        swap_linear_with_float8_linear(
-            model,
-            Float8DynamicLinear,
-            from_float_kwargs={
-                "pre_quantize_weight": True,
-            },
-        )
+        quantize_to_float8(model, QuantConfig(ActivationCasting.DYNAMIC), scaling_granularity=ScalingGranularity.TensorWise)
+
     pass # no quantization applied, model is already on device and precision dtype.
 
     with torch.no_grad():
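
For context, a minimal, self-contained sketch of the new `float8_experimental.inference` path this diff switches to. The repo id, device, and dtype below are illustrative assumptions, not part of the patch; the `quantize_to_float8` call itself mirrors the added line above.

```python
import torch
from transformers import AutoModelForCausalLM
from float8_experimental.inference import (
    ActivationCasting,
    QuantConfig,
    ScalingGranularity,
    quantize_to_float8,
)

# Assumed checkpoint/device/dtype, for illustration only.
repo_id = "facebook/opt-125m"
device = "cuda"
precision = torch.bfloat16

# Load on CPU in the target dtype, then move to the accelerator
# before quantizing, mirroring the script above.
model = AutoModelForCausalLM.from_pretrained(repo_id).to(device="cpu", dtype=precision)
model.to(device)

# Swap eligible linear layers for float8 inference linears, using
# dynamic activation casting and tensor-wise scaling (same call as the diff).
quantize_to_float8(
    model,
    QuantConfig(ActivationCasting.DYNAMIC),
    scaling_granularity=ScalingGranularity.TensorWise,
)

with torch.no_grad():
    pass  # run the evaluation harness here
```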