
Commit d9907aa

speedup quant and evaluation, fix recompile issue
Signed-off-by: He, Xin3 <xin3.he@intel.com>
1 parent f2b9aef commit d9907aa

4 files changed: +17 -10 lines

auto_round/compressors/base.py

Lines changed: 2 additions & 10 deletions
@@ -361,6 +361,7 @@ def __init__(
         self.infer_bs_coeff = 1
         self.enable_torch_compile = enable_torch_compile
         self._adjust_torch_compile(enable_torch_compile)
+        os.environ["AR_TORCH_COMPILE"] = "1" if self.enable_torch_compile else "0"
         self._check_configs()
         torch.set_printoptions(precision=3, sci_mode=True)

@@ -1911,10 +1912,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:

         self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage)
         clear_memory()
-        if self.enable_torch_compile:
-            quant_layer = compile_func(self._quantize_layer, self.device)
-        else:
-            quant_layer = self._quantize_layer
+        quant_layer = self._quantize_layer
         for layer_name in layer_names:
             layer_input = layer_inputs[layer_name]
             layer_input = to_device(layer_input, self.cache_device)

@@ -3008,14 +3006,8 @@ def _quantize_blocks(
             logger.info("using algorithm extension for quantization.")
         except (ImportError, ModuleNotFoundError):
             quantize_block = self._quantize_block
-            if self.enable_torch_compile:
-                quantize_block = compile_func(quantize_block, device)
-            else:
-                quantize_block = quantize_block
         else:
             quantize_block = self._quantize_block
-            if self.enable_torch_compile:
-                quantize_block = compile_func(quantize_block, device)

         if pbar is None:
             pbar = tqdm(range(0, len(block_names), nblocks))
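
The net effect in this file is that the outer `_quantize_layer` / `_quantize_block` drivers are no longer wrapped with `compile_func`; instead, `__init__` records the choice in the `AR_TORCH_COMPILE` environment variable so that the smaller inner functions (see `utils.py` and `wrapper.py` below) can be compiled once where they are defined. A minimal standalone sketch of that pattern, using hypothetical names (`hot_kernel`, `driver`) and assuming PyTorch 2.x:

```python
import os

import torch


def hot_kernel(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical inner function: small and shape-stable, a good compile target.
    return torch.round(x * 16) / 16


# Gate compilation behind the same kind of env flag the commit introduces,
# and apply it once at definition time rather than inside the driver.
if os.getenv("AR_TORCH_COMPILE", "0") == "1":
    hot_kernel = torch.compile(hot_kernel)


def driver(batches):
    # The driver itself stays un-compiled, so re-entering it never re-wraps anything.
    return [hot_kernel(b) for b in batches]


print(driver([torch.randn(4, 4) for _ in range(3)])[0].shape)
```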

auto_round/envs.py

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@
 environment_variables: dict[str, Callable[[], Any]] = {
     # this is used for configuring the default logging level
     "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
+    "AR_TORCH_COMPILE": lambda: os.getenv("AR_TORCH_COMPILE", "0"),
 }

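For context, each entry in `environment_variables` is a zero-argument callable, so the new `AR_TORCH_COMPILE` entry is resolved lazily at lookup time rather than at import time. A minimal sketch of how such a registry behaves (standalone illustration, not auto_round's actual accessor):

```python
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
    "AR_TORCH_COMPILE": lambda: os.getenv("AR_TORCH_COMPILE", "0"),
}

# Because the value is read only when the callable is invoked, a variable
# set later in the process (as base.py's __init__ now does) is still seen.
os.environ["AR_TORCH_COMPILE"] = "1"
print(environment_variables["AR_TORCH_COMPILE"]())  # -> "1"
```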

auto_round/utils.py

Lines changed: 9 additions & 0 deletions
@@ -171,6 +171,12 @@ def is_hpex_available():
     return _hpex_available


+@torch._dynamo.disable()
+@lru_cache(None)
+def is_torch_compile_enabled():
+    return os.getenv("AR_TORCH_COMPILE", "0") in ("1", "true", "True")
+
+
 def get_module(module, key):
     """Get module from model by key name.

@@ -506,6 +512,9 @@ def block_forward(
         output = output[output_return_id]
     return output

+if is_torch_compile_enabled():
+    block_forward = torch.compile(block_forward)
+

 def check_to_quantized(config):
     """Checks if the configuration is valid for quantization.

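A note on the two decorators on `is_torch_compile_enabled`: `lru_cache(None)` means the environment variable is read at most once per process, and `torch._dynamo.disable()` keeps that lookup out of any compiled graph, which fits with base.py setting the flag early in `__init__` and is presumably part of the recompile fix. A small sketch demonstrating the cached behavior (assuming PyTorch 2.x; `torch._dynamo.disable` is a private API and may change):

```python
import os
from functools import lru_cache

import torch


@torch._dynamo.disable()  # keep the env lookup out of dynamo graphs/guards
@lru_cache(None)          # evaluate the flag at most once per process
def is_torch_compile_enabled() -> bool:
    return os.getenv("AR_TORCH_COMPILE", "0") in ("1", "true", "True")


os.environ["AR_TORCH_COMPILE"] = "1"
print(is_torch_compile_enabled())  # True

os.environ["AR_TORCH_COMPILE"] = "0"
print(is_torch_compile_enabled())  # still True: the first result is cached
```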
auto_round/wrapper.py

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,7 @@
     is_mx_fp,
     is_nv_fp,
     set_module,
+    is_torch_compile_enabled,
 )

 if deepspeed_exists:

@@ -143,11 +144,15 @@ def _init_tuning_params_and_quant_func(self):
         self._init_params("max_scale", p_dtype, shape, 1.0, (self.enable_minmax_tuning and self.orig_layer.bits < 16))

         self.weight_quant_func, self.data_type = get_quant_func(orig_layer.data_type, orig_layer.bits, orig_layer.sym)
+        if is_torch_compile_enabled():
+            self.weight_quant_func = torch.compile(self.weight_quant_func)

         if self.enable_act_quant:
             self.act_quant_func, self.act_data_type = get_quant_func(
                 orig_layer.act_data_type, orig_layer.act_bits, orig_layer.act_sym
             )
+            if is_torch_compile_enabled():
+                self.act_quant_func = torch.compile(self.act_quant_func)
             self._init_params("act_max_scale", p_dtype, (1), 1.0, not orig_layer.act_dynamic)

         ## bias tuning
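
Here the weight and activation quant functions are compiled once when the wrapper is constructed, so every tuning step reuses the same compiled callables. A condensed, hypothetical sketch of that shape (`fake_quant` and `WrappedLinearSketch` are illustrative names, not auto_round's classes):

```python
import torch


def fake_quant(x: torch.Tensor, bits: int = 4) -> torch.Tensor:
    # Illustrative stand-in for a weight/activation quant function.
    qmax = 2 ** (bits - 1) - 1
    scale = x.abs().amax().clamp(min=1e-8) / qmax
    return torch.clamp(torch.round(x / scale), -qmax - 1, qmax) * scale


class WrappedLinearSketch(torch.nn.Module):
    def __init__(self, layer: torch.nn.Linear, compile_enabled: bool):
        super().__init__()
        self.layer = layer
        self.weight_quant_func = fake_quant
        if compile_enabled:
            # Compile once at init; forward() then reuses the compiled callable.
            self.weight_quant_func = torch.compile(self.weight_quant_func)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w_q = self.weight_quant_func(self.layer.weight)
        return torch.nn.functional.linear(x, w_q, self.layer.bias)


m = WrappedLinearSketch(torch.nn.Linear(8, 8), compile_enabled=False)
print(m(torch.randn(2, 8)).shape)  # torch.Size([2, 8])
```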
