40 changes: 34 additions & 6 deletions auto_round/compressors/base.py
@@ -865,9 +865,9 @@ def remove_duplicates(lst):
elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type):
format = f"auto_round:{self.data_type}"
elif is_static_wfp8afp8(self): # static wfp8afp8
format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"
format = f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8
format = "auto_round:fp8"
format = f"auto_round:{AutoRoundFormat.FP8.value}"
elif self.act_bits < 16:
raise ValueError(
"AutoRound format does not support exporting "
@@ -882,6 +882,20 @@ def remove_duplicates(lst):
check_compressed_tensors_supported()
format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
formats[index] = format
if is_static_wfp8afp8(self):
format = f"llm_compressor:{AutoRoundFormat.FP8_STATIC.value}"
formats[index] = format
if self.act_group_size != 0:
logger.warning(
f"scheme FP8_STATIC export to llm_compressor format only support for act_group_size 0,"
f" ,but got act_group_size={self.act_group_size}, reset = 0"
)
self.act_group_size = 0
if self.group_size > 0:
logger.warning(
f"please note that group_size={self.group_size}"
" may not be supported for llm_compressor format, and cannot be loaded in llm_compressor"
)
elif not is_wfp8afp8(self):
logger.error(
"Currently, the llm_compressor format only supports MXFP/NVFP/FP8. "
@@ -971,13 +985,25 @@ def _check_supported_format(self, format: str) -> bool:
)
format = "fake"
else:
if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"):
if format not in [
"auto_round",
f"auto_round:{AutoRoundFormat.FP8_STATIC.value}",
f"llm_compressor:{AutoRoundFormat.FP8_STATIC.value}",
"auto_round:llm_compressor",
]:
logger.warning(
f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model,"
f" change format {format} to auto_round"
)
format = "auto_round"
if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8":
if is_static_wfp8afp8(self):
format = f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
else:
format = f"auto_round:{AutoRoundFormat.FP8.value}"
if (
self.act_group_size != 0
and not self.act_dynamic
and format == f"auto_round:{AutoRoundFormat.FP8.value}"
):
logger.warning(
f"Please note that quantize activation with act_group_size={self.act_group_size}"
" may result in failure to export or import normally."
@@ -1198,7 +1224,7 @@ def register_act_hook(model):
def get_imatrix_hook(module, input, output):
input = input[0] if isinstance(input, (tuple, list)) else input
flattened = input.reshape(-1, input.shape[-1]).to(torch.float32)
squared = torch.sum(flattened**2, dim=0).to(torch.float32)
squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32)

if not hasattr(module, "imatrix"):
module.imatrix = squared
@@ -3094,6 +3120,8 @@ def save_quantized(
)
if format == "llm_compressor" and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
if format == "llm_compressor" and is_static_wfp8afp8(self):
format = format.replace("llm_compressor", "llm_compressor:{AutoRoundFormat.FP8_STATIC.value}")

from auto_round.export import EXPORT_FORMAT

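For context, the hunks above map static W8A8-FP8 schemes onto the renamed `AutoRoundFormat` members when an export format is chosen. Below is a minimal sketch of that mapping; `resolve_export_format` is a hypothetical stand-in for the checks done in `BaseCompressor`, and `is_static_w8a8_fp8` approximates `is_static_wfp8afp8()` from the scheme fields:

```python
from enum import Enum


class AutoRoundFormat(str, Enum):
    # Mirrors the renamed enum in auto_round/export/export_to_autoround/export.py
    FP8_STATIC = "fp8_static"
    FP8 = "fp8"


def resolve_export_format(requested, data_type, bits, act_bits, act_dynamic):
    """Illustrative only: map a quantization scheme onto an export-format string."""
    # Approximation of is_static_wfp8afp8(): FP8 weights plus static (non-dynamic) FP8 activations.
    is_static_w8a8_fp8 = data_type == "fp" and bits == 8 and act_bits == 8 and not act_dynamic
    if requested == "llm_compressor" and is_static_w8a8_fp8:
        return f"llm_compressor:{AutoRoundFormat.FP8_STATIC.value}"
    if requested == "auto_round":
        if is_static_w8a8_fp8:
            return f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
        if data_type == "fp" and bits == 8 and act_bits >= 16:  # weight-only FP8
            return f"auto_round:{AutoRoundFormat.FP8.value}"
    return requested


# Static W8A8-FP8 exported through llm_compressor picks up the fp8_static suffix.
print(resolve_export_format("llm_compressor", "fp", 8, 8, act_dynamic=False))
# -> llm_compressor:fp8_static
```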
18 changes: 10 additions & 8 deletions auto_round/data_type/gguf.py
@@ -337,7 +337,7 @@ def quant_tensor_gguf_asym_dq(
if bits == 2:
quant_weights = torch.abs(tensor)
elif bits == 4 or bits == 5:
sigma2 = torch.sum(tensor**2, dim=-1, keepdim=True) / 32 ##Note 32 is different from QK_K
sigma2 = torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / 32 ##Note 32 is different from QK_K
av_x = torch.sqrt(sigma2)
quant_weights = torch.abs(tensor) + av_x
params = search_kwargs[bits]
@@ -384,7 +384,9 @@ def quant_tensor_gguf_asym_dq(
if bits == 2:
tmp_quant_weights = torch.abs(tensor)
elif bits == 4 or bits == 5:
sigma2 = torch.sum(tensor**2, dim=-1, keepdim=True) / 32 ## Note 32 is different from QK_K
sigma2 = (
torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / 32
) ## Note 32 is different from QK_K
av_x = torch.sqrt(sigma2)
tmp_quant_weights = torch.abs(tensor) + av_x
quant_weights[replace_index, :] = tmp_quant_weights[replace_index, :]
@@ -395,7 +397,7 @@ def quant_tensor_gguf_asym_dq(
tmp_quant_weights = tmp_quant_weights.view(-1, 1).expand(-1, quant_weights.shape[1])
quant_weights[mean_replace_index, :] = tmp_quant_weights[mean_replace_index, :]

# sigma2 = torch.sum(tensor ** 2, dim=-1, keepdim=True) / QK_K
# sigma2 = torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / QK_K
# if imatrix is None:
# av_x = torch.sqrt(sigma2)
# quant_weights = torch.abs(av_x + tensor * tensor)
@@ -470,7 +472,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u
quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq)
diff = scale * quant_data + rmin - data

best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * diff**2, dim=1, keepdim=True)
best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True)

for is_ in range(nstep):
factor = rrmin + rdelta * is_ + maxq - minq
@@ -484,7 +486,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u
sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True)
sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True)

D = sum_w * sum_l2 - sum_l**2
D = sum_w * sum_l2 - torch.pow(sum_l, 2)
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D
this_min[this_min > 0] = 0
@@ -494,7 +496,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u
quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq)
diff = this_scale * quant_data + this_min - data
# diff = this_scale * quant_data_new + this_min - data
mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * diff**2, dim=-1, keepdim=True)
mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True)

idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0]
best_mad[idx_to_replace] = mad[idx_to_replace]
@@ -566,7 +568,7 @@ def quant_tensor_gguf_sym_dq(
imatrix = imatrix.to(tensor.device)

# if bits == 3:
# # sigma2 = 2 * torch.sum(tensor ** 2, dim=-1, keepdim=True) / QK_K
# # sigma2 = 2 * torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / QK_K
# # imatrix = imatrix.reshape(1, -1).expand(tensor.numel() // imatrix.numel(), -1).reshape(tensor.shape)
# # quant_weights = imatrix * torch.sqrt(sigma2 + tensor * tensor)
# # scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=quant_weights)
@@ -588,7 +590,7 @@ def quant_tensor_gguf_sym_dq(
if bits == 6:
quant_weights[replace_index] = tensor[replace_index] * tensor[replace_index]
else:
sigma2 = 2 * torch.sum(tensor**2, dim=-1, keepdim=True) / QK_K
sigma2 = 2 * torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / QK_K
tmp_quant_weights = torch.sqrt(sigma2 + tensor * tensor)
quant_weights[replace_index] = tmp_quant_weights[replace_index]
mean_replace_index = (zero_cnt > 0) & (zero_cnt <= group_size // 2)
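The hunks above only swap `**2` for `torch.pow(..., 2)`, but the surrounding routine is easier to follow with the underlying math spelled out: `iterative_wls_quant_search` fits a per-row scale and minimum by solving the weighted least-squares normal equations in closed form, which is where `D = sum_w * sum_l2 - torch.pow(sum_l, 2)` comes from. A standalone sketch of that step, with variable names mirroring the diff (the helper itself is illustrative, not part of the codebase):

```python
import torch


def wls_scale_min(data, quant_data, weights):
    """Closed-form weighted least squares: per-row (scale, rmin) minimizing
    sum(w * (scale * q + rmin - x) ** 2) over each row."""
    sum_w = torch.sum(weights, dim=-1, keepdim=True)
    sum_x = torch.sum(weights * data, dim=-1, keepdim=True)
    wq = weights * quant_data
    sum_l = torch.sum(wq, dim=-1, keepdim=True)
    sum_l2 = torch.sum(wq * quant_data, dim=-1, keepdim=True)
    sum_xl = torch.sum(wq * data, dim=-1, keepdim=True)

    # Determinant of the 2x2 normal-equation matrix; the search only accepts candidates with D > 0.
    D = sum_w * sum_l2 - torch.pow(sum_l, 2)
    scale = (sum_w * sum_xl - sum_x * sum_l) / D
    rmin = (sum_l2 * sum_x - sum_l * sum_xl) / D
    return scale, rmin


# Tiny usage example: one row of 8 values quantized to 4-bit codes with uniform weights.
x = torch.randn(1, 8)
q = torch.randint(0, 16, (1, 8)).float()
w = torch.ones(1, 8)
scale, rmin = wls_scale_min(x, q, w)
```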
54 changes: 4 additions & 50 deletions auto_round/export/export_to_autogptq/export.py
@@ -47,6 +47,7 @@
from tqdm import tqdm

import auto_round.export.export_to_autogptq.qlinear_triton
from auto_round.export.utils import save_model
from auto_round.logger import logger
from auto_round.utils import (
SUPPORTED_LAYER_TYPES,
@@ -214,54 +215,7 @@ def wrapper(name):
model.config.quantization_config = quantization_config

dtype = torch.float16 ##force dtype to fp16
save(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)
save_model(
model, output_dir, safe_serialization=safe_serialization, dtype=dtype, config_file="quantize_config.json"
)
return model


def save(
model: torch.nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True, dtype=None
):
"""Save model state dict and configs.

Args:
model (`nn.Module`):
Model to be saved. The model can be wrapped or unwrapped.
save_dir (`str`):
Directory to which to save. Will be created if it doesn't exist.
max_shard_size (`str`, defaults to `"10GB"`):
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
<Tip warning={true}>

If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
which will be bigger than `max_shard_size`.

</Tip>
safe_serialization (`bool`, defaults to `True`):
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
"""
##max_shard_size = "10000GB" ## API of auto-gptq with marlin does not support shard size
os.makedirs(save_dir, exist_ok=True)
try:
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
except ValueError as e:
if hasattr(model, "generation_config"):
setattr(model.generation_config, "do_sample", True)
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
config_path = os.path.join(save_dir, "config.json")
if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")):
with open(config_path, "r") as file:
data = json.load(file)
data["torch_dtype"] = str(dtype).split(".")[-1]
with open(config_path, "w") as file:
json.dump(data, file, indent=2)

config_file = "quantize_config.json"
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
json.dump(model.config.quantization_config, f, indent=2)

try:
copy_python_files_from_model_cache(model, save_dir)
except Exception as e:
logger.warning("Skipping source model Python file copy due to error: %s", e)
69 changes: 19 additions & 50 deletions auto_round/export/export_to_autoround/export.py
@@ -27,6 +27,7 @@
from tqdm import tqdm

from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
from auto_round.export.utils import save_model
from auto_round.logger import logger
from auto_round.utils import (
SUPPORTED_FORMATS,
@@ -47,7 +48,8 @@
class AutoRoundFormat(str, Enum):
# Weight: FP8, per-channel, may be extended to per-tensor in future
# Activation: FP8, per-tensor
TORCH_FP8_STATIC = "fp8_static"
FP8_STATIC = "fp8_static"
FP8 = "fp8"


def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
@@ -159,11 +161,19 @@ def pack_layer(layer_name, model, backend, device=None):

return pack_layer(layer_name, model, backend, device)

if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}":
if (
backend == f"auto_round:{AutoRoundFormat.FP8.value}"
or backend == f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
):
from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer

return pack_layer(layer_name, model, backend, device)

if backend == "auto_round:llm_compressor":
from auto_round.export.export_to_llmcompressor.export_to_static_fp import pack_layer

return pack_layer(layer_name, model, backend, device)

layer = get_module(model, layer_name)
if hasattr(layer, "orig_layer"):
layer = layer.orig_layer
@@ -271,6 +281,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex

return save_quantized_as_fp(output_dir, inplace=inplace, backend="auto_round:llm_compressor", **kwargs)

if backend == "auto_round:llm_compressor":
from auto_round.export.export_to_llmcompressor.export_to_static_fp import save_quantized_as_static_fp

return save_quantized_as_static_fp(output_dir, inplace=inplace, backend="auto_round:llm_compressor", **kwargs)

if kwargs.get("data_type", "int") == "fp" and kwargs.get("bits", 16) == 8 and kwargs.get("act_bits", 16) >= 16:
from auto_round.export.export_to_autoround.export_to_fp8 import save_quantized_as_autoround

@@ -280,7 +295,7 @@
if (
(kwargs.get("sym") is None or kwargs.get("sym"))
and ("gptq" not in backend and "awq" not in backend)
and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend)
and (AutoRoundFormat.FP8_STATIC.value not in backend)
):
backend = backend.replace("auto_round", "auto_round:auto_gptq")

@@ -367,52 +382,6 @@ def wrapper(name):
dtype = torch.float16 ## awq kernel only supports float16 on cuda
else:
dtype = None
save(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)
save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)

return model


def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True, dtype=None):
"""Save model state dict and configs.

Args:
model (`nn.Module`):
Model to be saved. The model can be wrapped or unwrapped.
save_dir (`str`):
Directory to which to save. Will be created if it doesn't exist.
max_shard_size (`str`, defaults to `"10GB"`):
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
<Tip warning={true}>

If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
which will be bigger than `max_shard_size`.

</Tip>
safe_serialization (`bool`, defaults to `True`):
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
"""
os.makedirs(save_dir, exist_ok=True)
try:
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
except ValueError as e:
if hasattr(model, "generation_config"):
setattr(model.generation_config, "do_sample", True)
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)

config_path = os.path.join(save_dir, "config.json")
if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")):
with open(config_path, "r") as file:
data = json.load(file)
data["torch_dtype"] = str(dtype).split(".")[-1]
with open(config_path, "w") as file:
json.dump(data, file, indent=2)
config_file = "quantization_config.json"
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
json.dump(model.config.quantization_config, f, indent=2)

try:
copy_python_files_from_model_cache(model, save_dir)
except Exception as e:
logger.warning("Skipping source model Python file copy due to error: %s", e)