
Commit 81da403

Authored by xin3he, yiliu30, and Spycsh
Enhance SmoothQuant tuning structure. (#1109)
* enhance sq tuning
* Support the tuning of smooth quant's alpha in strategy layer (#1112)
* added more UTs
* fixed ut
* fixed ut
* enable sq tuning for both quant_level is auto or 1
* fix accuracy issue
* fix UT
* fix alpha=auto
* support sq tuning for both auto and O1
* fixed the typo
* rename func name in ut
* remove duplicate Linear if Linear is wrapped by Linear
* refactor tensorflow interface
* adjust the pre-optimization and sq order for ort
* updated ort ut
* fix pylint and docstyle
* add sketch for ort tune sq alpha
* correct the calib_iter
* fix tensorflow UT and int8 acc issue
* fix ut

Signed-off-by: Xin He <xin3.he@intel.com>
Signed-off-by: yiliu30 <yi4.liu@intel.com>
Co-authored-by: Yi30 <106061964+yiliu30@users.noreply.github.com>
Co-authored-by: yiliu30 <yi4.liu@intel.com>
Co-authored-by: spycsh <sihan.chen@intel.com>
1 parent 9ff7f01 · commit 81da403

File tree

19 files changed: +861 −577 lines


neural_compressor/adaptor/onnxrt.py

Lines changed: 43 additions & 6 deletions
@@ -37,6 +37,7 @@
 import math
 import sys
 import re
+from typing import Dict
 
 onnx = LazyImport("onnx")
 ort = LazyImport("onnxruntime")
@@ -157,21 +158,26 @@ def __init__(self, framework_specific_info):
 
         self.optype_statistics = None
 
-    def smooth_quant(self, model, dataloader, iterations, tune_cfg, alpha=0.5, folding=True,
-                     percentile=99.999, op_types=['MatMul', 'Gemm', 'Conv', 'FusedConv'], scales_per_op=True):
+        # sq algo and args
+        self.sq = None
+        self.cur_sq_args = None
+
+    def smooth_quant(self, model, dataloader, iterations, alpha=0.5, folding=True,
+                     percentile=99.999, op_types=['MatMul', 'Gemm', 'Conv', 'FusedConv'],
+                     scales_per_op=True, record_max_info=False):
         """Get augmented model with smooth quant.
 
         Args:
             model_wrapper (object): origin_model
             dataloader (object): dataloader
             iterations (int): iterations
-            tune_cfg (dict): quantization config
             alpha (float or str): smooth alpha in SmoothQuant, 1.0 will fallback to SPIQ
             folding (bool): whether fold those foldable Mul which are inserted for SmoothQuant
             percentile (float): percentile of calibration to remove outliers
             op_types (list): The op types whose input tensor will be dumped
             scales_per_op (bool): True, each op will have an individual scale, mainly for accuracy
                                   False, ops with the same input will share a scale, mainly for performance
+            record_max_info (bool): whether to record the scale information (default: False)
 
         Returns:
             model: A modified onnx model
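
For reference, a minimal usage sketch of the new signature (the adaptor, fp32_model, and calib_dataloader names are placeholders rather than objects defined in this diff; tune_cfg is no longer a parameter, and the argument values mirror the defaults shown above):

    smoothed_model = adaptor.smooth_quant(
        model=fp32_model,                 # placeholder fp32 ONNX model wrapper
        dataloader=calib_dataloader,      # placeholder calibration dataloader
        iterations=100,
        alpha=0.5,
        folding=True,
        percentile=99.999,
        op_types=['MatMul', 'Gemm', 'Conv', 'FusedConv'],
        scales_per_op=True,
        record_max_info=False,
    )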
@@ -180,11 +186,29 @@ def smooth_quant(self, model, dataloader, iterations, tune_cfg, alpha=0.5, foldi
             return self.smooth_quant_model
 
         from .ox_utils.smooth_quant import ORTSmoothQuant
-        quantize_config = self._cfg_to_quantize_config(tune_cfg) if tune_cfg is not None else None
-        sq = ORTSmoothQuant(self.pre_optimized_model, dataloader, self.reduce_range, self.backend)
-        self.smooth_quant_model = sq.transform(
+        # TODO: remove quantize_config as it has no consumer
+        quantize_config = None
+        # pre-optimization -> sq
+        self._pre_optimize(model)
+        # assign the algo to the adaptor, so the adaptor can call it later when needed
+        self.sq = ORTSmoothQuant(self.pre_optimized_model, dataloader, self.reduce_range, self.backend)
+        self.smooth_quant_model = self.sq.transform(
             alpha, folding, percentile, op_types, scales_per_op, iterations, quantize_config)
+        logger.info("Updated the pre-optimized model with smooth quant model.")
+        # TODO: double-check smooth_quant_model and pre_optimized_model to make sure there are no two fp32 model replicas
+        self.pre_optimized_model = self.smooth_quant_model
         return self.smooth_quant_model
+
+    def _need_smooth_quant(self, tune_cfg) -> bool:
+        # compare the alpha from tune_cfg with the current alpha to decide whether to re-smooth the model
+        # TODO
+        return False
+
+    def _parse_sq_args(self, tune_cfg, cur_sq_args) -> Dict:
+        # parse the sq args according to the tune cfg and the current sq args
+        # TODO
+        return {}
+
 
     @dump_elapsed_time("Pass quantize model")
     def quantize(self, tune_cfg, model, data_loader, q_func=None):
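
Both helper methods above are intentionally left as TODO stubs in this commit. A minimal standalone sketch of the intended logic, assuming (hypothetically) that the tuning config carries the smooth-quant arguments under recipe_cfgs / smooth_quant_args keys:

    from typing import Dict, Optional

    def need_smooth_quant(tune_cfg: Dict, cur_sq_args: Optional[Dict]) -> bool:
        """Re-smooth only when the alpha requested by the tuning config differs
        from the alpha the currently smoothed model was built with."""
        requested = tune_cfg.get("recipe_cfgs", {}).get("smooth_quant_args", {})  # hypothetical cfg layout
        new_alpha = requested.get("alpha")
        cur_alpha = (cur_sq_args or {}).get("alpha")
        return new_alpha is not None and new_alpha != cur_alpha

    def parse_sq_args(tune_cfg: Dict, cur_sq_args: Optional[Dict]) -> Dict:
        """Merge the smooth-quant args from the tuning config over the current ones."""
        merged = dict(cur_sq_args or {"alpha": 0.5, "folding": True,
                                      "scales_per_op": True, "iterations": 100})
        merged.update(tune_cfg.get("recipe_cfgs", {}).get("smooth_quant_args", {}))  # hypothetical cfg layout
        return merged

    cfg = {"recipe_cfgs": {"smooth_quant_args": {"alpha": 0.7}}}
    cur = {"alpha": 0.5, "folding": True, "scales_per_op": True, "iterations": 100}
    print(need_smooth_quant(cfg, cur))  # True: alpha changes from 0.5 to 0.7
    print(parse_sq_args(cfg, cur))      # current args with alpha overridden to 0.7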
@@ -201,6 +225,14 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
         Returns:
             (dict): quantized model
         """
+        # two steps to re-smooth the model if needed
+        if self._need_smooth_quant(tune_cfg):
+            # step 1: recover the sq to the original fp32 model
+            self.sq.recover()
+            new_sq_args = self._parse_sq_args(tune_cfg, self.cur_sq_args)
+            # step 2: re-smooth the model with the new alpha
+            model = self.smooth_quant(model=model, dataloader=data_loader, iterations=new_sq_args['iterations'],
+                                      alpha=new_sq_args['alpha'], folding=new_sq_args['folding'], scales_per_op=new_sq_args['scales_per_op'])
         assert q_func is None, "quantization aware training has not been supported on ONNXRUNTIME"
         if self.smooth_quant_model is not None:
             model = self.smooth_quant_model
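
The added block above is the two-step re-smooth path inside quantize(). A schematic (not the actual strategy-layer code) of how a caller could sweep candidate alpha values through that path, with placeholder adaptor, fp32_model, dataloader, and eval_func objects and the same hypothetical minimal tune_cfg layout:

    from typing import Callable, Sequence

    def sweep_sq_alpha(adaptor, fp32_model, dataloader,
                       eval_func: Callable, alphas: Sequence[float] = (0.3, 0.5, 0.7)):
        """For each candidate alpha, quantize() checks _need_smooth_quant(tune_cfg);
        when the alpha changed it recovers the fp32 model and re-smooths it before
        quantizing, so every iteration sees a freshly smoothed graph."""
        best_acc, best_model = float("-inf"), None
        for alpha in alphas:
            # hypothetical minimal cfg; a real tune_cfg also carries per-op quantization settings
            tune_cfg = {"recipe_cfgs": {"smooth_quant_args": {"alpha": alpha}}}
            q_model = adaptor.quantize(tune_cfg, fp32_model, dataloader)
            acc = eval_func(q_model)
            if acc > best_acc:
                best_acc, best_model = acc, q_model
        return best_model, best_acc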
@@ -630,6 +662,11 @@ def _detect_domain(self, model):
         return is_nlp
 
     def _pre_optimize(self, model, level=1):
+        # pre-optimization may have already been done during the smoothing process:
+        # pre_optimize -> sq -> update the pre_optimized_model
+        if self.pre_optimized_model:
+            logger.info("Pre-optimization already done, return it directly.")
+            return self.pre_optimized_model
         from neural_compressor import options
         from neural_compressor.adaptor.ox_utils.util import \
             remove_init_from_model_input, split_shared_bias
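
The guard added to _pre_optimize above makes pre-optimization effectively run once per model: smooth_quant() triggers it first and then overwrites pre_optimized_model with the smoothed graph, so a later call from quantize() returns that cached model. A minimal, self-contained sketch of this caching pattern (class and helper names are illustrative only, not the adaptor's real API):

    class PreOptimizeCacheSketch:
        """Illustrates the run-once guard: pre_optimize -> sq -> cache updated."""

        def __init__(self):
            self.pre_optimized_model = None

        def _pre_optimize(self, model):
            # short-circuit: pre-optimization may already have happened during smoothing
            if self.pre_optimized_model is not None:
                return self.pre_optimized_model
            self.pre_optimized_model = self._optimize_graph(model)
            return self.pre_optimized_model

        def smooth_quant(self, model, alpha=0.5):
            self._pre_optimize(model)
            smoothed = f"{self.pre_optimized_model}+sq(alpha={alpha})"  # stands in for ORTSmoothQuant.transform
            self.pre_optimized_model = smoothed
            return smoothed

        def _optimize_graph(self, model):
            return f"optimized({model})"  # stands in for the ORT graph optimizations

    sketch = PreOptimizeCacheSketch()
    print(sketch.smooth_quant("fp32_model"))   # optimized(fp32_model)+sq(alpha=0.5)
    print(sketch._pre_optimize("fp32_model"))  # returns the cached, smoothed model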
