
Commit 81da403

Authored by xin3he, yiliu30, and Spycsh
Enhance SmoothQuant tuning structure. (#1109)
* enhance sq tuning
* Support the tuning of smooth quant's alpha in strategy layer (#1112)
* added more UTs
* fixed ut
* fixed ut
* enable sq tuning for both quant_level is auto or 1
* fix accuracy issue
* fix UT
* fix alpha=auto
* support sq tuning for both auto and O1
* fixed the typo
* rename func name in ut
* remove duplicate Linear if Linear is wrapped by Linear
* refactor tensorflow interface
* adjust the pre-optimization and sq order for ort
* updated ort ut
* fix pylint and docstyle
* add sketch for ort tune sq alpha
* correct the calib_iter
* fix tensorflow UT and int8 acc issue
* fix ut

Signed-off-by: Xin He <xin3.he@intel.com>
Signed-off-by: yiliu30 <yi4.liu@intel.com>
Co-authored-by: Yi30 <106061964+yiliu30@users.noreply.github.com>
Co-authored-by: yiliu30 <yi4.liu@intel.com>
Co-authored-by: spycsh <sihan.chen@intel.com>
1 parent 9ff7f01 · commit 81da403

File tree

19 files changed: +861 −577 lines


neural_compressor/adaptor/onnxrt.py

Lines changed: 43 additions & 6 deletions
@@ -37,6 +37,7 @@
 import math
 import sys
 import re
+from typing import Dict
 
 onnx = LazyImport("onnx")
 ort = LazyImport("onnxruntime")
@@ -157,21 +158,26 @@ def __init__(self, framework_specific_info):
 
         self.optype_statistics = None
 
-    def smooth_quant(self, model, dataloader, iterations, tune_cfg, alpha=0.5, folding=True,
-                     percentile=99.999, op_types=['MatMul', 'Gemm', 'Conv', 'FusedConv'], scales_per_op=True):
+        # sq algo and args
+        self.sq = None
+        self.cur_sq_args = None
+
+    def smooth_quant(self, model, dataloader, iterations, alpha=0.5, folding=True,
+                     percentile=99.999, op_types=['MatMul', 'Gemm', 'Conv', 'FusedConv'],
+                     scales_per_op=True, record_max_info=False):
         """Get augmented model with smooth quant.
 
         Args:
             model_wrapper (object): origin_model
             dataloader (object): dataloader
             iterations (int): iterations
-            tune_cfg (dict): quantization config
             alpha (float or str): smooth alpha in SmoothQuant, 1.0 will fallback to SPIQ
             folding (bool): whether fold those foldable Mul which are inserted for SmoothQuant
             percentile (float): percentile of calibration to remove outliers
             op_types (list): The op types whose input tensor will be dumped
             scales_per_op (bool): True, each op will have an individual scale, mainly for accuracy
                                   False, ops with the same input will share a scale, mainly for performance
+            record_max_info (bool): whether to record the scale information (default: False)
 
         Returns:
             model: A modified onnx model
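
For reference, a minimal usage sketch of the new signature (the adaptor, fp32_model, and calib_dataloader names are placeholders rather than objects defined in this diff; tune_cfg is no longer a parameter, and the argument values mirror the defaults shown above):

    smoothed_model = adaptor.smooth_quant(
        model=fp32_model,                 # placeholder fp32 ONNX model wrapper
        dataloader=calib_dataloader,      # placeholder calibration dataloader
        iterations=100,
        alpha=0.5,
        folding=True,
        percentile=99.999,
        op_types=['MatMul', 'Gemm', 'Conv', 'FusedConv'],
        scales_per_op=True,
        record_max_info=False,
    )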
@@ -180,11 +186,29 @@ def smooth_quant(self, model, dataloader, iterations, tune_cfg, alpha=0.5, foldi
             return self.smooth_quant_model
 
         from .ox_utils.smooth_quant import ORTSmoothQuant
-        quantize_config = self._cfg_to_quantize_config(tune_cfg) if tune_cfg is not None else None
-        sq = ORTSmoothQuant(self.pre_optimized_model, dataloader, self.reduce_range, self.backend)
-        self.smooth_quant_model = sq.transform(
+        # TODO: remove quantize_config as it has no consumer
+        quantize_config = None
+        # pre-optimization -> sq
+        self._pre_optimize(model)
+        # assign the algo to the adaptor, so the adaptor can call it later when needed
+        self.sq = ORTSmoothQuant(self.pre_optimized_model, dataloader, self.reduce_range, self.backend)
+        self.smooth_quant_model = self.sq.transform(
             alpha, folding, percentile, op_types, scales_per_op, iterations, quantize_config)
+        logger.info("Updated the pre-optimized model with smooth quant model.")
+        # TODO: double-check smooth_quant_model and pre_optimized_model to make sure there are no two fp32 model replicas
+        self.pre_optimized_model = self.smooth_quant_model
         return self.smooth_quant_model
+
+    def _need_smooth_quant(self, tune_cfg) -> bool:
+        # compare the alpha from tune_cfg with the current alpha to decide whether to re-smooth the model
+        # TODO
+        return False
+
+    def _parse_sq_args(self, tune_cfg, cur_sq_args) -> Dict:
+        # parse the sq args according to the tune cfg and the current sq args
+        # TODO
+        return {}
+
 
     @dump_elapsed_time("Pass quantize model")
     def quantize(self, tune_cfg, model, data_loader, q_func=None):
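
Both helper methods above are intentionally left as TODO stubs in this commit. A minimal standalone sketch of the intended logic, assuming (hypothetically) that the tuning config carries the smooth-quant arguments under recipe_cfgs / smooth_quant_args keys:

    from typing import Dict, Optional

    def need_smooth_quant(tune_cfg: Dict, cur_sq_args: Optional[Dict]) -> bool:
        """Re-smooth only when the alpha requested by the tuning config differs
        from the alpha the currently smoothed model was built with."""
        requested = tune_cfg.get("recipe_cfgs", {}).get("smooth_quant_args", {})  # hypothetical cfg layout
        new_alpha = requested.get("alpha")
        cur_alpha = (cur_sq_args or {}).get("alpha")
        return new_alpha is not None and new_alpha != cur_alpha

    def parse_sq_args(tune_cfg: Dict, cur_sq_args: Optional[Dict]) -> Dict:
        """Merge the smooth-quant args from the tuning config over the current ones."""
        merged = dict(cur_sq_args or {"alpha": 0.5, "folding": True,
                                      "scales_per_op": True, "iterations": 100})
        merged.update(tune_cfg.get("recipe_cfgs", {}).get("smooth_quant_args", {}))  # hypothetical cfg layout
        return merged

    cfg = {"recipe_cfgs": {"smooth_quant_args": {"alpha": 0.7}}}
    cur = {"alpha": 0.5, "folding": True, "scales_per_op": True, "iterations": 100}
    print(need_smooth_quant(cfg, cur))  # True: alpha changes from 0.5 to 0.7
    print(parse_sq_args(cfg, cur))      # current args with alpha overridden to 0.7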
@@ -201,6 +225,14 @@ def quantize(self, tune_cfg, model, data_loader, q_func=None):
         Returns:
             (dict): quantized model
         """
+        # two steps to re-smooth the model if needed
+        if self._need_smooth_quant(tune_cfg):
+            # step 1: recover the sq to the original fp32 model
+            self.sq.recover()
+            new_sq_args = self._parse_sq_args(tune_cfg, self.cur_sq_args)
+            # step 2: re-smooth the model with the new alpha
+            model = self.smooth_quant(model=model, dataloader=data_loader, iterations=new_sq_args['iterations'],
+                                      alpha=new_sq_args['alpha'], folding=new_sq_args['folding'], scales_per_op=new_sq_args['scales_per_op'])
         assert q_func is None, "quantization aware training has not been supported on ONNXRUNTIME"
         if self.smooth_quant_model is not None:
             model = self.smooth_quant_model
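
The added block above is the two-step re-smooth path inside quantize(). A schematic (not the actual strategy-layer code) of how a caller could sweep candidate alpha values through that path, with placeholder adaptor, fp32_model, dataloader, and eval_func objects and the same hypothetical minimal tune_cfg layout:

    from typing import Callable, Sequence

    def sweep_sq_alpha(adaptor, fp32_model, dataloader,
                       eval_func: Callable, alphas: Sequence[float] = (0.3, 0.5, 0.7)):
        """For each candidate alpha, quantize() checks _need_smooth_quant(tune_cfg);
        when the alpha changed it recovers the fp32 model and re-smooths it before
        quantizing, so every iteration sees a freshly smoothed graph."""
        best_acc, best_model = float("-inf"), None
        for alpha in alphas:
            # hypothetical minimal cfg; a real tune_cfg also carries per-op quantization settings
            tune_cfg = {"recipe_cfgs": {"smooth_quant_args": {"alpha": alpha}}}
            q_model = adaptor.quantize(tune_cfg, fp32_model, dataloader)
            acc = eval_func(q_model)
            if acc > best_acc:
                best_acc, best_model = acc, q_model
        return best_model, best_acc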
@@ -630,6 +662,11 @@ def _detect_domain(self, model):
         return is_nlp
 
     def _pre_optimize(self, model, level=1):
+        # pre-optimization may have already been done during the smoothing process:
+        # pre_optimize -> sq -> update the pre_optimized_model
+        if self.pre_optimized_model:
+            logger.info("Pre-optimization already done, return it directly.")
+            return self.pre_optimized_model
         from neural_compressor import options
         from neural_compressor.adaptor.ox_utils.util import \
             remove_init_from_model_input, split_shared_bias
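
The guard added to _pre_optimize above makes pre-optimization effectively run once per model: smooth_quant() triggers it first and then overwrites pre_optimized_model with the smoothed graph, so a later call from quantize() returns that cached model. A minimal, self-contained sketch of this caching pattern (class and helper names are illustrative only, not the adaptor's real API):

    class PreOptimizeCacheSketch:
        """Illustrates the run-once guard: pre_optimize -> sq -> cache updated."""

        def __init__(self):
            self.pre_optimized_model = None

        def _pre_optimize(self, model):
            # short-circuit: pre-optimization may already have happened during smoothing
            if self.pre_optimized_model is not None:
                return self.pre_optimized_model
            self.pre_optimized_model = self._optimize_graph(model)
            return self.pre_optimized_model

        def smooth_quant(self, model, alpha=0.5):
            self._pre_optimize(model)
            smoothed = f"{self.pre_optimized_model}+sq(alpha={alpha})"  # stands in for ORTSmoothQuant.transform
            self.pre_optimized_model = smoothed
            return smoothed

        def _optimize_graph(self, model):
            return f"optimized({model})"  # stands in for the ORT graph optimizations

    sketch = PreOptimizeCacheSketch()
    print(sketch.smooth_quant("fp32_model"))   # optimized(fp32_model)+sq(alpha=0.5)
    print(sketch._pre_optimize("fp32_model"))  # returns the cached, smoothed model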
