
Commit 29df357

use low_gpu_mem_usage to cache best params
Signed-off-by: He, Xin3 <xin3.he@intel.com>
1 parent 0055d30 commit 29df357

File tree: 4 files changed (+16 -10 lines changed)

auto_round/compressors/base.py
auto_round/compressors/utils.py
auto_round/utils/device.py
auto_round/wrapper.py


auto_round/compressors/base.py

Lines changed: 9 additions & 6 deletions
@@ -2321,10 +2321,10 @@ def _quantize_layer(
             if total_loss < best_loss:
                 best_loss = total_loss
                 if not self.not_use_best_mse:
-                    best_params = collect_best_params(wrapper_linear)
+                    best_params = collect_best_params(wrapper_linear, self.low_gpu_mem_usage)
                     last_best_iter = i
             if self.not_use_best_mse and i == self.iters - 1:
-                best_params = collect_best_params(wrapper_linear)
+                best_params = collect_best_params(wrapper_linear, self.low_gpu_mem_usage)

             if not self.not_use_best_mse:
                 if 0 < self.dynamic_max_gap <= i - last_best_iter:
@@ -2603,21 +2603,23 @@ def _quantize_block(
                 )

                 total_loss += loss.item() / num_elm
+                # Sometimes the cached memory is not released during training and cause OOM
+                if self.low_gpu_mem_usage:
+                    clear_memory_if_reached_threshold(threshold=0.85)
                 self._scale_loss_and_backward(scaler, loss)
-                clear_memory_if_reached_threshold(threshold=0.85)

             if i == 0:
                 init_loss = total_loss

             if total_loss < best_loss:
                 best_loss = total_loss
                 if not self.not_use_best_mse:
-                    best_params = collect_best_params(block)
+                    best_params = collect_best_params(block, self.low_gpu_mem_usage)
                     # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True)

                     last_best_iter = i
             if self.not_use_best_mse and i == self.iters - 1:
-                best_params = collect_best_params(block)
+                best_params = collect_best_params(block, self.low_gpu_mem_usage)

             if not self.not_use_best_mse:
                 if 0 < self.dynamic_max_gap <= i - last_best_iter:
@@ -2634,6 +2636,8 @@ def _quantize_block(
             f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}"
         )
         logger.info(dump_info)
+        if self.low_gpu_mem_usage:
+            clear_memory()  # clear cached memory during training
         if len(unquantized_layer_names) != 0:
             logger.info(f"{unquantized_layer_names} have not been quantized")
         with torch.no_grad():
@@ -2644,7 +2648,6 @@ def _quantize_block(
             set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max")

         if self.enable_quanted_input:
-            clear_memory()
             q_outputs = self._get_block_outputs(
                 block,
                 input_ids,
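
To make the intent of these hunks concrete, here is a minimal sketch of the pattern they implement, not the auto_round code itself: when low_gpu_mem_usage is enabled, cached accelerator memory may be released before each backward pass, the best parameter snapshot is kept in host RAM rather than on the GPU, and the cache is cleared once the block finishes tuning. The block, data, optimizer, and helper names below are illustrative assumptions.

import torch


def tune_block(block: torch.nn.Module, x: torch.Tensor, y: torch.Tensor,
               iters: int = 8, low_gpu_mem_usage: bool = True):
    opt = torch.optim.SGD(block.parameters(), lr=1e-3)
    best_loss, best_params = float("inf"), None
    for _ in range(iters):
        loss = torch.nn.functional.mse_loss(block(x), y)
        if low_gpu_mem_usage and torch.cuda.is_available():
            torch.cuda.empty_cache()  # stand-in for clear_memory_if_reached_threshold(threshold=0.85)
        loss.backward()
        opt.step()
        opt.zero_grad()
        if loss.item() < best_loss:
            best_loss = loss.item()
            # keep the snapshot on the CPU, analogous to collect_best_params(block, self.low_gpu_mem_usage)
            best_params = {n: p.detach().cpu().clone() for n, p in block.named_parameters()}
    if low_gpu_mem_usage and torch.cuda.is_available():
        torch.cuda.empty_cache()  # stand-in for the final clear_memory() after the block
    return best_loss, best_params

Keeping the snapshot in host RAM trades a small host-to-device copy when the best parameters are restored for a lower peak GPU footprint while tuning.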

auto_round/compressors/utils.py

Lines changed: 3 additions & 1 deletion
@@ -199,13 +199,15 @@ def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=Non
     return True, ""


-def collect_best_params(block):
+def collect_best_params(block, low_gpu_mem_usage: bool = False):
     params = {}
     for n, m in block.named_modules():
         if hasattr(m, "orig_layer"):
             params[n] = {}
             for key in m.params.keys():
                 params[n][key] = copy.deepcopy(m.params[key].data)
+                if low_gpu_mem_usage:
+                    params[n][key] = params[n][key].cpu()
     return params

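
The offload added here is small but easy to miss: copy.deepcopy of a CUDA tensor produces another CUDA tensor, so each cached best parameter would otherwise keep occupying device memory. A standalone sketch of the effect, using a hypothetical snapshot helper rather than collect_best_params:

import copy

import torch


def snapshot(param: torch.Tensor, low_gpu_mem_usage: bool = False) -> torch.Tensor:
    snap = copy.deepcopy(param.data)  # the copy stays on the parameter's device
    if low_gpu_mem_usage:
        snap = snap.cpu()  # move it to host RAM so it does not compete with training tensors
    return snap


weight = torch.randn(4, 4, device="cuda" if torch.cuda.is_available() else "cpu")
cached = snapshot(weight, low_gpu_mem_usage=True)
print(cached.device)  # cpu whenever the flag is set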

auto_round/utils/device.py

Lines changed: 2 additions & 1 deletion
@@ -431,7 +431,7 @@ def clear_memory_if_reached_threshold(threshold=0.85):
     elif hasattr(torch, "xpu") and torch.xpu.is_available():
         name, device_api = "XPU", torch.xpu
     else:
-        return
+        return False

     num_devices = device_api.device_count()
     for i in range(num_devices):
@@ -452,6 +452,7 @@ def clear_memory_if_reached_threshold(threshold=0.85):
                 return True
         except Exception as e:
             logger.warning_once(f"Failed to check memory for {name} device {i}: {e}")
+    return False


 def check_memory_availability(device, inputs, weight, org_seqlen, org_bs):
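
With this change clear_memory_if_reached_threshold returns a bool on every path: False when no supported accelerator is present or no device crossed the threshold, True when a cache was actually cleared. A rough sketch of that contract follows; it is an assumption-laden approximation, not the real helper, which also catches and logs per-device failures, and it assumes the XPU backend mirrors the CUDA memory APIs in recent PyTorch builds.

import torch


def clear_cache_if_reached(threshold: float = 0.85) -> bool:
    """Free cached accelerator memory once reserved memory crosses `threshold`."""
    if torch.cuda.is_available():
        device_api = torch.cuda
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        device_api = torch.xpu  # assumes torch.xpu exposes the same memory API as torch.cuda
    else:
        return False  # no supported accelerator: nothing to clear
    for i in range(device_api.device_count()):
        total = device_api.get_device_properties(i).total_memory
        if device_api.memory_reserved(i) / total >= threshold:
            device_api.empty_cache()
            return True  # cleared at least one device
    return False  # no device crossed the threshold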

auto_round/wrapper.py

Lines changed: 2 additions & 2 deletions
@@ -550,7 +550,7 @@ def __init__(self, orig_layer, bit=4, group_size=-1, device="cpu"):
     def unwrapper(self, best_params):
         if best_params is None:
             return self.orig_layer
-        v = best_params["v"]
+        v = best_params["v"].to(self.device)
         weight_q, _, _ = self.quant_func(
             self.orig_layer.weight, self.bits, self.group_size, v, q_scale_thresh=self.q_scale_thresh
         )
@@ -601,7 +601,7 @@ def __init__(self, orig_layer, bit=4, group_size=-1, device="cpu"):
     def unwrapper(self, best_params):
         if best_params is None:
             return self.orig_layer
-        v = best_params["v"]
+        v = best_params["v"].to(self.device)
         weight_q, _, _ = self.quant_func(
             self.orig_layer.weight, self.bits, self.group_size, v, q_scale_thresh=self.q_scale_thresh
         )
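
The unwrapper change is the restore half of the round trip: best_params may now hold CPU tensors, so v is moved back to the layer's device before it is used. A tiny sketch of that step, where apply_cached_v and the weight + v placeholder are illustrative and the real code passes v to quant_func:

from typing import Optional

import torch


def apply_cached_v(weight: torch.Tensor, best_params: Optional[dict]) -> torch.Tensor:
    if best_params is None:
        return weight  # nothing cached, keep the original weight
    # move the possibly CPU-cached rounding tensor back to the weight's device,
    # mirroring best_params["v"].to(self.device) in unwrapper()
    v = best_params["v"].to(weight.device)
    return weight + v  # placeholder for the real quant_func call


w = torch.randn(4, 4)
cached = {"v": torch.zeros(4, 4)}  # snapshot that may have been offloaded to CPU
print(apply_cached_v(w, cached).shape)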
