-#
-# -*- coding: utf-8 -*-
-#
 # Copyright (c) 2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

 import copy
 import os
+from pathlib import Path
+from typing import List, Union

 import numpy as np
 import onnx

 from neural_compressor.common import Logger
 from neural_compressor.onnxrt.algorithms.smoother.calibrator import Calibrator
+from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader
 from neural_compressor.onnxrt.utils.onnx_model import ONNXModel
 from neural_compressor.onnxrt.utils.utility import (
     get_qrange_for_qType,

 logger = Logger().get_logger()

-dtype_map = {
+__all__ = ["Smoother"]
+
+_dtype_map = {
     np.dtype("float32"): 1,
     np.dtype("uint8"): 2,
     np.dtype("int8"): 3,
 }

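# Side note on the mapping above: the integer values in _dtype_map are the
# onnx.TensorProto data-type enums, so the entries visible in this hunk can be
# sanity-checked against the constants the onnx package ships:
#
#     assert onnx.TensorProto.FLOAT == 1  # np.dtype("float32")
#     assert onnx.TensorProto.UINT8 == 2  # np.dtype("uint8")
#     assert onnx.TensorProto.INT8 == 3   # np.dtype("int8")
#     # onnx >= 1.13 also exposes an equivalent lookup helper:
#     assert onnx.helper.np_dtype_to_tensor_dtype(np.dtype("float32")) == 1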

-def get_quant_dequant_output(model, input_data, output_data, providers):
+def _get_quant_dequant_output(model, input_data, output_data, providers):
     """Get loss between fp32 output and QDQ output.

     Args:
@@ -58,14 +60,14 @@ def get_quant_dequant_output(model, input_data, output_data, providers):
5860 """
5961 import onnxruntime as ort
6062
61- input_data = quant_dequant_data (input_data , 2 , "asym" )
63+ input_data = _quant_dequant_data (input_data , 2 , "asym" )
6264 sess = ort .InferenceSession (model .SerializeToString (), providers = providers )
6365 preds = sess .run (None , {model .graph .input [0 ].name : input_data })
6466 loss = np .sum (np .abs (output_data - preds ) ** 2 )
6567 return loss
6668
6769
-def make_sub_graph(node, inits, input_data, output_data, opset, ir_version):
+def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version):
     """Build a model with the specific node.

     Args:
@@ -78,15 +80,15 @@ def make_sub_graph(node, inits, input_data, output_data, opset, ir_version):
7880 """
7981 from onnx import helper
8082
81- input = helper .make_tensor_value_info (node .input [0 ], dtype_map [input_data .dtype ], input_data .shape )
82- output = helper .make_tensor_value_info (node .output [0 ], dtype_map [output_data .dtype ], output_data .shape )
83+ input = helper .make_tensor_value_info (node .input [0 ], _dtype_map [input_data .dtype ], input_data .shape )
84+ output = helper .make_tensor_value_info (node .output [0 ], _dtype_map [output_data .dtype ], output_data .shape )
8385 graph = helper .make_graph ([node ], "sub_graph" , [input ], [output ], inits )
8486 model = helper .make_model (graph , opset_imports = opset )
8587 model .ir_version = ir_version
8688 return model
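
# Illustration (a hypothetical sketch, not part of the change): _make_sub_graph
# wraps one node plus its initializers into a standalone, runnable model; the
# names and values below are invented for the example.
#
#     from onnx import helper, numpy_helper
#     x = np.ones((2, 4), dtype=np.float32)
#     w = np.full((2, 4), 0.5, dtype=np.float32)
#     node = helper.make_node("Mul", inputs=["x", "w"], outputs=["y"])
#     sub_model = _make_sub_graph(
#         node,
#         [numpy_helper.from_array(w, name="w")],
#         x,                              # sample input -> value_info for "x"
#         x * w,                          # sample output -> value_info for "y"
#         [helper.make_opsetid("", 17)],
#         ir_version=8,
#     )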


-def quant_dequant_data(data, qType=3, scheme="sym"):
+def _quant_dequant_data(data, qType=3, scheme="sym"):
     """Quantize and then dequantize data.

     Args:
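# The function body is collapsed in this hunk; as a rough reference only (an
# assumption, not the actual implementation, which goes through
# get_qrange_for_qType), symmetric int8 quant-dequant amounts to:
#
#     def _qdq_sym_int8_sketch(data):
#         scale = max(np.max(np.abs(data)), 1e-9) / 127.0
#         q = np.clip(np.round(data / scale), -127, 127)
#         return (q * scale).astype(np.float32)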
@@ -113,9 +115,9 @@ class Smoother:

     def __init__(
         self,
-        model,
-        dataloader,
-        providers=["CPUExecutionProvider"],
+        model: Union[onnx.ModelProto, ONNXModel, Path, str],
+        dataloader: CalibrationDataReader,
+        providers: List[str] = ["CPUExecutionProvider"],
     ):
         """Initialize the attributes of class."""
         self.model = model if isinstance(model, ONNXModel) else ONNXModel(model, load_external_data=True)
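
# Construction sketch under the new annotations (the reader and model path are
# hypothetical; CalibrationDataReader is assumed to follow the usual
# get_next()/rewind() reader interface):
#
#     class _ToyDataReader(CalibrationDataReader):
#         def __init__(self):
#             self.rewind()
#         def get_next(self):
#             return next(self._iter, None)
#         def rewind(self):
#             self._iter = iter([{"input": np.ones((1, 4), dtype=np.float32)}])
#
#     smoother = Smoother("model.onnx", _ToyDataReader())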
@@ -138,30 +140,37 @@ def __init__(

     def transform(
         self,
-        alpha=0.5,
-        folding=True,
-        percentile=99.999,
-        op_types=["Gemm", "Conv", "MatMul", "FusedConv"],
-        scales_per_op=True,
-        calib_iter=100,
-        auto_alpha_args={"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"},
+        alpha: Union[float, str] = 0.5,
+        folding: bool = True,
+        percentile: float = 99.999,
+        op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"],
+        scales_per_op: bool = True,
+        calib_iter: int = 100,
+        auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"},
         *args,
         **kwargs
     ):
151153 """The main entry of smooth quant.
152154
153155 Args:
154- alpha (float or str): alpha value to balance the quantization difficulty of activation and weight.
155- folding (bool): whether fold those foldable Mul which are inserted for smooth quant
156- percentile (float): percentile of calibration to remove outliers
157- op_types (list): the op type to be smooth quantized
158- scales_per_op (bool): True, each op will have an individual scale, mainlyfor accuracy
159- False, ops with the same input will share a scale, mainly for performance
160- calib_iter (int): iteration num for calibration
156+ alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight.
157+ Defaults to 0.5.
158+ folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant.
159+ Defaults to True.
160+ percentile (float, optional): percentile of calibration to remove outliers.
161+ Defaults to 99.999.
162+ op_types (list, optional): the op type to be smooth quantized.
163+ Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"].
164+ scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy
165+ False, ops with the same input will share a scale, mainly for performance.
166+ Defaults to True.
167+ calib_iter (int, optional): iteration num for calibration. Defaults to 100.
168+ auto_alpha_args (_type_, optional): alpha args for auto smooth.
169+ Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}.
161170
162171 Returns:
163- A FP32 model with the same architecture as the orig model but with different weight which will be
164- benefit to quantization
172+ onnx.ModelProto: A FP32 model with the same architecture as the orig model
173+ but with different weight which will be benefit to quantization
165174 """
         self.scales_per_op = scales_per_op
         self.clean()
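
# Call sketch for the annotated entry point (continuing the hypothetical
# smoother above; keyword values mirror the documented defaults):
#
#     smoothed = smoother.transform(
#         alpha=0.5,          # a float, or a string such as "auto" per the Union[float, str] hint
#         folding=True,
#         scales_per_op=True,
#         calib_iter=100,
#     )
#     onnx.save(smoothed, "model_smoothed.onnx")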
@@ -207,7 +216,6 @@ def _dump_op_info(self, percentile, op_types, iterations):
         calibrator = Calibrator(
             self.model,
             self.dataloader,
-            [],
             iterations=list(range(0, iterations)),
             backend=self.providers,
         )
@@ -388,7 +396,7 @@ def _get_output_loss(self, node_name, scale, calib_iter):
         )
         base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path)
         weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir)
-        weight_q = quant_dequant_data(weight)
+        weight_q = _quant_dequant_data(weight)

         self.model.set_initializer(node.input[1], weight_q)
         inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None]
@@ -404,15 +412,15 @@ def _get_output_loss(self, node_name, scale, calib_iter):

         outputs = session.run(added_tensors, inputs)
         if model is None:
-            model = make_sub_graph(
+            model = _make_sub_graph(
                 node,
                 inits,
                 outputs[0],
                 outputs[1],
                 self.model.model.opset_import,
                 self.model.model.ir_version,
             )
-        loss += get_quant_dequant_output(model, outputs[0] * scale, outputs[1], self.providers)
+        loss += _get_quant_dequant_output(model, outputs[0] * scale, outputs[1], self.providers)

         self.model.remove_tensors_from_outputs([i for i in added_tensors if i not in orig_outputs])
         self.model.set_initializer(node.input[1], weight)
@@ -431,7 +439,14 @@ def _reshape_scale_for_input(self, tensor, key):
             scale = np.reshape(self.tensor_scales_info[key], (1, self.tensor_scales_info[key].shape[0]))
         return scale

-    def _auto_tune_alpha(self, calib_iter, alpha_min=0.3, alpha_max=0.7, alpha_step=0.05, attn_method="min"):
+    def _auto_tune_alpha(
+        self,
+        calib_iter,
+        alpha_min: float = 0.3,
+        alpha_max: float = 0.7,
+        alpha_step: float = 0.05,
+        attn_method: str = "min",
+    ):
         """Perform alpha-tuning to obtain layer-wise optimal alpha values and adjust parameters accordingly.

         Args:
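
# The hunk ends before the tuning body; presumably the candidate grid is built
# from alpha_min/alpha_max/alpha_step (an assumption based on the signature):
#
#     alpha_space = np.arange(alpha_min, alpha_max + alpha_step, alpha_step)
#     # e.g. 0.30, 0.35, ..., 0.70 with the defaults; attn_method would then
#     # control how per-tensor losses are aggregated when picking each alpha.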