40 changes: 34 additions & 6 deletions auto_round/compressors/base.py
@@ -865,9 +865,9 @@ def remove_duplicates(lst):
elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type):
format = f"auto_round:{self.data_type}"
elif is_static_wfp8afp8(self): # static wfp8afp8
format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"
format = f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8
format = "auto_round:fp8"
format = f"auto_round:{AutoRoundFormat.FP8.value}"
elif self.act_bits < 16:
raise ValueError(
"AutoRound format does not support exporting "
@@ -882,6 +882,20 @@ def remove_duplicates(lst):
check_compressed_tensors_supported()
format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
formats[index] = format
if is_static_wfp8afp8(self):
format = f"llm_compressor:{AutoRoundFormat.FP8_STATIC.value}"
formats[index] = format
if self.act_group_size != 0:
logger.warning(
f"scheme FP8_STATIC export to llm_compressor format only support for act_group_size 0,"
f" ,but got act_group_size={self.act_group_size}, reset = 0"
)
self.act_group_size = 0
if self.group_size > 0:
logger.warning(
f"please note that group_size={self.group_size}"
" may not be supported for llm_compressor format, and cannot be loaded in llm_compressor"
)
elif not is_wfp8afp8(self):
logger.error(
"Currently, the llm_compressor format only supports MXFP/NVFP/FP8. "
@@ -971,13 +985,25 @@ def _check_supported_format(self, format: str) -> bool:
)
format = "fake"
else:
if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"):
if format not in [
"auto_round",
f"auto_round:{AutoRoundFormat.FP8_STATIC.value}",
f"llm_compressor:{AutoRoundFormat.FP8_STATIC.value}",
"auto_round:llm_compressor",
]:
logger.warning(
f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model,"
f" change format {format} to auto_round"
)
format = "auto_round"
if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8":
if is_static_wfp8afp8(self):
format = f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
else:
format = f"auto_round:{AutoRoundFormat.FP8.value}"
if (
self.act_group_size != 0
and not self.act_dynamic
and format == f"auto_round:{AutoRoundFormat.FP8.value}"
):
logger.warning(
f"Please note that quantize activation with act_group_size={self.act_group_size}"
" may result in failure to export or import normally."
@@ -1198,7 +1224,7 @@ def register_act_hook(model):
def get_imatrix_hook(module, input, output):
input = input[0] if isinstance(input, (tuple, list)) else input
flattened = input.reshape(-1, input.shape[-1]).to(torch.float32)
squared = torch.sum(flattened**2, dim=0).to(torch.float32)
squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32)

if not hasattr(module, "imatrix"):
module.imatrix = squared
@@ -3094,6 +3120,8 @@ def save_quantized(
)
if format == "llm_compressor" and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
if format == "llm_compressor" and is_static_wfp8afp8(self):
format = format.replace("llm_compressor", "llm_compressor:{AutoRoundFormat.FP8_STATIC.value}")

from auto_round.export import EXPORT_FORMAT

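For context, the hunks above map static W8A8-FP8 schemes onto the renamed `AutoRoundFormat` members when an export format is chosen. Below is a minimal sketch of that mapping; `resolve_export_format` is a hypothetical stand-in for the checks done in `BaseCompressor`, and `is_static_w8a8_fp8` approximates `is_static_wfp8afp8()` from the scheme fields:

```python
from enum import Enum


class AutoRoundFormat(str, Enum):
    # Mirrors the renamed enum in auto_round/export/export_to_autoround/export.py
    FP8_STATIC = "fp8_static"
    FP8 = "fp8"


def resolve_export_format(requested, data_type, bits, act_bits, act_dynamic):
    """Illustrative only: map a quantization scheme onto an export-format string."""
    # Approximation of is_static_wfp8afp8(): FP8 weights plus static (non-dynamic) FP8 activations.
    is_static_w8a8_fp8 = data_type == "fp" and bits == 8 and act_bits == 8 and not act_dynamic
    if requested == "llm_compressor" and is_static_w8a8_fp8:
        return f"llm_compressor:{AutoRoundFormat.FP8_STATIC.value}"
    if requested == "auto_round":
        if is_static_w8a8_fp8:
            return f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
        if data_type == "fp" and bits == 8 and act_bits >= 16:  # weight-only FP8
            return f"auto_round:{AutoRoundFormat.FP8.value}"
    return requested


# Static W8A8-FP8 exported through llm_compressor picks up the fp8_static suffix.
print(resolve_export_format("llm_compressor", "fp", 8, 8, act_dynamic=False))
# -> llm_compressor:fp8_static
```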
18 changes: 10 additions & 8 deletions auto_round/data_type/gguf.py
@@ -337,7 +337,7 @@ def quant_tensor_gguf_asym_dq(
if bits == 2:
quant_weights = torch.abs(tensor)
elif bits == 4 or bits == 5:
sigma2 = torch.sum(tensor**2, dim=-1, keepdim=True) / 32 ##Note 32 is different from QK_K
sigma2 = torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / 32 ##Note 32 is different from QK_K
av_x = torch.sqrt(sigma2)
quant_weights = torch.abs(tensor) + av_x
params = search_kwargs[bits]
@@ -384,7 +384,9 @@ def quant_tensor_gguf_asym_dq(
if bits == 2:
tmp_quant_weights = torch.abs(tensor)
elif bits == 4 or bits == 5:
sigma2 = torch.sum(tensor**2, dim=-1, keepdim=True) / 32 ## Note 32 is different from QK_K
sigma2 = (
torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / 32
) ## Note 32 is different from QK_K
av_x = torch.sqrt(sigma2)
tmp_quant_weights = torch.abs(tensor) + av_x
quant_weights[replace_index, :] = tmp_quant_weights[replace_index, :]
@@ -395,7 +397,7 @@ def quant_tensor_gguf_asym_dq(
tmp_quant_weights = tmp_quant_weights.view(-1, 1).expand(-1, quant_weights.shape[1])
quant_weights[mean_replace_index, :] = tmp_quant_weights[mean_replace_index, :]

# sigma2 = torch.sum(tensor ** 2, dim=-1, keepdim=True) / QK_K
# sigma2 = torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / QK_K
# if imatrix is None:
# av_x = torch.sqrt(sigma2)
# quant_weights = torch.abs(av_x + tensor * tensor)
@@ -470,7 +472,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u
quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq)
diff = scale * quant_data + rmin - data

best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * diff**2, dim=1, keepdim=True)
best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True)

for is_ in range(nstep):
factor = rrmin + rdelta * is_ + maxq - minq
@@ -484,7 +486,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u
sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True)
sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True)

D = sum_w * sum_l2 - sum_l**2
D = sum_w * sum_l2 - torch.pow(sum_l, 2)
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D
this_min[this_min > 0] = 0
@@ -494,7 +496,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u
quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq)
diff = this_scale * quant_data + this_min - data
# diff = this_scale * quant_data_new + this_min - data
mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * diff**2, dim=-1, keepdim=True)
mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True)

idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0]
best_mad[idx_to_replace] = mad[idx_to_replace]
@@ -566,7 +568,7 @@ def quant_tensor_gguf_sym_dq(
imatrix = imatrix.to(tensor.device)

# if bits == 3:
# # sigma2 = 2 * torch.sum(tensor ** 2, dim=-1, keepdim=True) / QK_K
# # sigma2 = 2 * torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / QK_K
# # imatrix = imatrix.reshape(1, -1).expand(tensor.numel() // imatrix.numel(), -1).reshape(tensor.shape)
# # quant_weights = imatrix * torch.sqrt(sigma2 + tensor * tensor)
# # scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=quant_weights)
@@ -588,7 +590,7 @@ def quant_tensor_gguf_sym_dq(
if bits == 6:
quant_weights[replace_index] = tensor[replace_index] * tensor[replace_index]
else:
sigma2 = 2 * torch.sum(tensor**2, dim=-1, keepdim=True) / QK_K
sigma2 = 2 * torch.sum(torch.pow(tensor, 2), dim=-1, keepdim=True) / QK_K
tmp_quant_weights = torch.sqrt(sigma2 + tensor * tensor)
quant_weights[replace_index] = tmp_quant_weights[replace_index]
mean_replace_index = (zero_cnt > 0) & (zero_cnt <= group_size // 2)
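The hunks above only swap `**2` for `torch.pow(..., 2)`, but the surrounding routine is easier to follow with the underlying math spelled out: `iterative_wls_quant_search` fits a per-row scale and minimum by solving the weighted least-squares normal equations in closed form, which is where `D = sum_w * sum_l2 - torch.pow(sum_l, 2)` comes from. A standalone sketch of that step, with variable names mirroring the diff (the helper itself is illustrative, not part of the codebase):

```python
import torch


def wls_scale_min(data, quant_data, weights):
    """Closed-form weighted least squares: per-row (scale, rmin) minimizing
    sum(w * (scale * q + rmin - x) ** 2) over each row."""
    sum_w = torch.sum(weights, dim=-1, keepdim=True)
    sum_x = torch.sum(weights * data, dim=-1, keepdim=True)
    wq = weights * quant_data
    sum_l = torch.sum(wq, dim=-1, keepdim=True)
    sum_l2 = torch.sum(wq * quant_data, dim=-1, keepdim=True)
    sum_xl = torch.sum(wq * data, dim=-1, keepdim=True)

    # Determinant of the 2x2 normal-equation matrix; the search only accepts candidates with D > 0.
    D = sum_w * sum_l2 - torch.pow(sum_l, 2)
    scale = (sum_w * sum_xl - sum_x * sum_l) / D
    rmin = (sum_l2 * sum_x - sum_l * sum_xl) / D
    return scale, rmin


# Tiny usage example: one row of 8 values quantized to 4-bit codes with uniform weights.
x = torch.randn(1, 8)
q = torch.randint(0, 16, (1, 8)).float()
w = torch.ones(1, 8)
scale, rmin = wls_scale_min(x, q, w)
```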
54 changes: 4 additions & 50 deletions auto_round/export/export_to_autogptq/export.py
@@ -47,6 +47,7 @@
from tqdm import tqdm

import auto_round.export.export_to_autogptq.qlinear_triton
from auto_round.export.utils import save_model
from auto_round.logger import logger
from auto_round.utils import (
SUPPORTED_LAYER_TYPES,
@@ -214,54 +215,7 @@ def wrapper(name):
model.config.quantization_config = quantization_config

dtype = torch.float16 ##force dtype to fp16
save(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)
save_model(
model, output_dir, safe_serialization=safe_serialization, dtype=dtype, config_file="quantize_config.json"
)
return model


def save(
model: torch.nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True, dtype=None
):
"""Save model state dict and configs.

Args:
model (`nn.Module`):
Model to be saved. The model can be wrapped or unwrapped.
save_dir (`str`):
Directory to which to save. Will be created if it doesn't exist.
max_shard_size (`str`, defaults to `"10GB"`):
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
<Tip warning={true}>

If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
which will be bigger than `max_shard_size`.

</Tip>
safe_serialization (`bool`, defaults to `True`):
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
"""
##max_shard_size = "10000GB" ## API of auto-gptq with marlin does not support shard size
os.makedirs(save_dir, exist_ok=True)
try:
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
except ValueError as e:
if hasattr(model, "generation_config"):
setattr(model.generation_config, "do_sample", True)
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
config_path = os.path.join(save_dir, "config.json")
if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")):
with open(config_path, "r") as file:
data = json.load(file)
data["torch_dtype"] = str(dtype).split(".")[-1]
with open(config_path, "w") as file:
json.dump(data, file, indent=2)

config_file = "quantize_config.json"
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
json.dump(model.config.quantization_config, f, indent=2)

try:
copy_python_files_from_model_cache(model, save_dir)
except Exception as e:
logger.warning("Skipping source model Python file copy due to error: %s", e)
69 changes: 19 additions & 50 deletions auto_round/export/export_to_autoround/export.py
@@ -27,6 +27,7 @@
from tqdm import tqdm

from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
from auto_round.export.utils import save_model
from auto_round.logger import logger
from auto_round.utils import (
SUPPORTED_FORMATS,
@@ -47,7 +48,8 @@
class AutoRoundFormat(str, Enum):
# Weight: FP8, per-channel, may be extended to per-tensor in future
# Activation: FP8, per-tensor
TORCH_FP8_STATIC = "fp8_static"
FP8_STATIC = "fp8_static"
FP8 = "fp8"


def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
@@ -159,11 +161,19 @@ def pack_layer(layer_name, model, backend, device=None):

return pack_layer(layer_name, model, backend, device)

if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}":
if (
backend == f"auto_round:{AutoRoundFormat.FP8.value}"
or backend == f"auto_round:{AutoRoundFormat.FP8_STATIC.value}"
):
from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer

return pack_layer(layer_name, model, backend, device)

if backend == "auto_round:llm_compressor":
from auto_round.export.export_to_llmcompressor.export_to_static_fp import pack_layer

return pack_layer(layer_name, model, backend, device)

layer = get_module(model, layer_name)
if hasattr(layer, "orig_layer"):
layer = layer.orig_layer
@@ -271,6 +281,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex

return save_quantized_as_fp(output_dir, inplace=inplace, backend="auto_round:llm_compressor", **kwargs)

if backend == "auto_round:llm_compressor":
from auto_round.export.export_to_llmcompressor.export_to_static_fp import save_quantized_as_static_fp

return save_quantized_as_static_fp(output_dir, inplace=inplace, backend="auto_round:llm_compressor", **kwargs)

if kwargs.get("data_type", "int") == "fp" and kwargs.get("bits", 16) == 8 and kwargs.get("act_bits", 16) >= 16:
from auto_round.export.export_to_autoround.export_to_fp8 import save_quantized_as_autoround

@@ -280,7 +295,7 @@
if (
(kwargs.get("sym") is None or kwargs.get("sym"))
and ("gptq" not in backend and "awq" not in backend)
and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend)
and (AutoRoundFormat.FP8_STATIC.value not in backend)
):
backend = backend.replace("auto_round", "auto_round:auto_gptq")

@@ -367,52 +382,6 @@ def wrapper(name):
dtype = torch.float16 ## awq kernel only supports float16 on cuda
else:
dtype = None
save(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)
save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)

return model


def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True, dtype=None):
"""Save model state dict and configs.

Args:
model (`nn.Module`):
Model to be saved. The model can be wrapped or unwrapped.
save_dir (`str`):
Directory to which to save. Will be created if it doesn't exist.
max_shard_size (`str`, defaults to `"10GB"`):
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
<Tip warning={true}>

If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
which will be bigger than `max_shard_size`.

</Tip>
safe_serialization (`bool`, defaults to `True`):
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
"""
os.makedirs(save_dir, exist_ok=True)
try:
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
except ValueError as e:
if hasattr(model, "generation_config"):
setattr(model.generation_config, "do_sample", True)
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)

config_path = os.path.join(save_dir, "config.json")
if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")):
with open(config_path, "r") as file:
data = json.load(file)
data["torch_dtype"] = str(dtype).split(".")[-1]
with open(config_path, "w") as file:
json.dump(data, file, indent=2)
config_file = "quantization_config.json"
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
json.dump(model.config.quantization_config, f, indent=2)

try:
copy_python_files_from_model_cache(model, save_dir)
except Exception as e:
logger.warning("Skipping source model Python file copy due to error: %s", e)