22 changes: 15 additions & 7 deletions auto_round/export/export_to_autoround/qlinear_fp.py
@@ -38,7 +38,7 @@
from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL
from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal
from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad
from auto_round.utils import _get_packing_device, is_mx_fp, is_nv_fp
from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp

# from auto_round.utils import get_weight_compress_dtype
logger = getLogger(__name__)
@@ -72,14 +72,22 @@ def __init__(
super().__init__()
if bits not in [4, 8]:
raise NotImplementedError("Only 4,8 bits are supported.")
if infeatures % 32 != 0 or outfeatures % 32 != 0:
raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
self.is_mx = is_mx_fp(data_type)
self.is_nv = is_nv_fp(data_type)
if self.is_mx and group_size != 32:
raise NotImplementedError("Only group_size 32 are supported for mxfp.")
if self.is_nv and group_size not in [16, 32]:
raise NotImplementedError("Only group_size 16 are supported for nvfp.")
if self.is_mx:
if group_size != 32:
raise NotImplementedError(f"Only group_size 32 are supported for {BackendDataType.MX_FP} data type.")
if infeatures % group_size != 0:
raise NotImplementedError(
f"in_feature must be divisible by {group_size} for {BackendDataType.MX_FP} data type."
)
if self.is_nv:
if group_size % 16 != 0:
raise NotImplementedError(f"Only group_size 16 are supported for {BackendDataType.NV_FP} data type.")
if infeatures % group_size != 0:
raise NotImplementedError(
f"in_feature must be divisible by {group_size} for {BackendDataType.NV_FP} data type."
)
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
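To make the new constructor constraint concrete, here is a minimal sketch that mirrors the checks added above. The helper name validate_fp_shapes is hypothetical and not part of auto_round; it only reproduces the logic of the diff under that assumption.

def validate_fp_shapes(infeatures: int, group_size: int, is_mx: bool, is_nv: bool) -> None:
    # Mirrors the added __init__ checks: MX_FP requires group_size == 32,
    # NV_FP requires group_size to be a multiple of 16, and both require
    # in_feature to be divisible by group_size.
    if is_mx:
        if group_size != 32:
            raise NotImplementedError("Only group_size 32 is supported for MX_FP data type.")
        if infeatures % group_size != 0:
            raise NotImplementedError(f"in_feature must be divisible by {group_size} for MX_FP data type.")
    if is_nv:
        if group_size % 16 != 0:
            raise NotImplementedError("group_size must be divisible by 16 for NV_FP data type.")
        if infeatures % group_size != 0:
            raise NotImplementedError(f"in_feature must be divisible by {group_size} for NV_FP data type.")

validate_fp_shapes(4096, 16, is_mx=False, is_nv=True)    # passes
# validate_fp_shapes(4100, 16, is_mx=False, is_nv=True)  # would raise NotImplementedError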
17 changes: 13 additions & 4 deletions auto_round/inference/backend.py
@@ -127,12 +127,19 @@ def feature_multiply_checker_group_size(
)


def in_feature_checker_group_size(in_feature, out_feature, config):
group_size = config["group_size"]
return in_feature % group_size == 0


feature_multiply_checker_32 = functools.partial(feature_multiply_checker, in_feature_multiplier=32)
feature_multiply_checker_16 = functools.partial(feature_multiply_checker, in_feature_multiplier=16)
in_output_feature_multiply_checker_32 = functools.partial(
feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
)

in_feature_multiply_checker_32 = functools.partial(
feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=None
)
exllamav2_feature_checker = functools.partial(
feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
)
@@ -141,6 +148,8 @@ def feature_multiply_checker_group_size(
feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
)

mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size)


def fp8_static_scheme_checker(
in_feature: int,
@@ -239,7 +248,7 @@ def fp8_static_scheme_checker(
act_data_type=["mx_fp_rceil"],
act_dynamic=[True],
priority=0,
checkers=[feature_multiply_checker_32],
checkers=[mxfp_nvfp_feature_checker],
alias=["auto_round", "torch"],
requirements=["auto-round>0.7.0"],
)
@@ -259,7 +268,7 @@ def fp8_static_scheme_checker(
act_data_type=["mx_fp_rceil"],
act_dynamic=[True],
priority=0,
checkers=[feature_multiply_checker_32],
checkers=[mxfp_nvfp_feature_checker],
alias=["auto_round", "torch"],
requirements=["auto-round>0.7.0"],
)
@@ -280,7 +289,7 @@ def fp8_static_scheme_checker(
act_data_type=["nv_fp4_with_static_gs"],
act_dynamic=[True],
priority=0,
checkers=[feature_multiply_checker_16],
checkers=[mxfp_nvfp_feature_checker],
alias=["auto_round", "torch"],
requirements=["auto-round>0.7.0"],
)
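The backend change can be exercised in isolation with a hedged standalone copy of the new checker; the logic below matches the diff, but running it outside auto_round is only an illustration.

import functools

def in_feature_checker_group_size(in_feature, out_feature, config):
    # A layer is eligible for the mxfp/nvfp backend only if its in_feature
    # is a multiple of the scheme's group_size from the quantization config.
    group_size = config["group_size"]
    return in_feature % group_size == 0

# The partial binds no extra arguments; it just gives the checker a
# scheme-specific name, matching how the other checkers are registered.
mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size)

print(mxfp_nvfp_feature_checker(768, 768, {"group_size": 32}))  # True
print(mxfp_nvfp_feature_checker(700, 768, {"group_size": 32}))  # False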
12 changes: 12 additions & 0 deletions auto_round/utils.py
@@ -2963,6 +2963,18 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
layer_config.setdefault(n, copy.deepcopy(default_dict))
layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True})
logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).")
# enforce shape divisibility for mxfp/nvfp
if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name:
for n, m in model.named_modules():
if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
if m.weight.shape[1] % default_dict["group_size"]:
layer_config.setdefault(n, copy.deepcopy(default_dict))
layer_config[n].update(
{"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
Contributor review comment on this line: could you fix this issue later? When setting bits and act_bits to 16, no other attributes need to be set anymore.
)
logger.warning_once(
f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})."
)

# 9. block layers: mark as in_blocks=True
for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types):
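The fallback added to utils.py can be sketched in isolation as follows. skip_incompatible_layers is a hypothetical helper that mirrors the loop in the diff, and the toy nn.Sequential stands in for a real model; both are assumptions for illustration only.

import copy
import torch

def skip_incompatible_layers(model, default_dict, layer_config):
    # Any Linear whose in_feature is not divisible by the scheme's group_size
    # falls back to 16-bit float for both weights and activations.
    group_size = default_dict["group_size"]
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) and module.weight.shape[1] % group_size:
            layer_config.setdefault(name, copy.deepcopy(default_dict))
            layer_config[name].update(
                {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
            )
    return layer_config

model = torch.nn.Sequential(torch.nn.Linear(700, 768), torch.nn.Linear(768, 768))
cfg = skip_incompatible_layers(model, {"group_size": 32, "bits": 4}, {})
print(list(cfg.keys()))  # ['0'] -- only the layer with in_feature 700 is forced back to 16-bit fp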
269 changes: 0 additions & 269 deletions test/test_cpu/test_export.py
@@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype):
self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn)
shutil.rmtree(quantized_model_path, ignore_errors=True)

def test_mxfp4_llmcompressor_format(self):
model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
from transformers import AutoConfig

scheme = "MXFP4"
layer_config = {}
fp_layers_str = "k_proj"
from auto_round.utils import get_fp_layer_names

not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
for name in not_quantize_layer_names:
layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
autoround = AutoRound(
model,
self.tokenizer,
scheme=scheme,
iters=2,
seqlen=2,
layer_config=layer_config,
dataset=self.llm_dataloader,
)
quantized_model_path = self.save_dir
autoround.quantize()
compressed_model = autoround.save_quantized(
output_dir=quantized_model_path, inplace=True, format="llm_compressor"
)
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
assert (
hasattr(tmp_layer, "weight_scale")
and hasattr(tmp_layer, "weight_packed")
and tmp_layer.weight_scale.dtype is torch.uint8
and tmp_layer.weight_scale.shape[0] == 768
), "Illegal MXFP4 packing name or data_type or shape"
assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers
skip_layer, "weight_packed"
), "Illegal MXFP4 quantization for fp_layers"
quantization_config = AutoConfig.from_pretrained(
quantized_model_path, trust_remote_code=True
).quantization_config
assert (
quantization_config["format"] == "float-quantized"
and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
), f"Invalid MXFP4 quantization configuration: {quantization_config}"

shutil.rmtree("./saved", ignore_errors=True)

def test_rtn_mxfp4_llmcompressor_format(self):
model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
from transformers import AutoConfig

scheme = "MXFP4"
layer_config = {}
fp_layers_str = "k_proj"
from auto_round.utils import get_fp_layer_names

not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
for name in not_quantize_layer_names:
layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
autoround = AutoRound(
model,
self.tokenizer,
scheme=scheme,
iters=0,
seqlen=2,
layer_config=layer_config,
dataset=self.llm_dataloader,
)
quantized_model_path = self.save_dir
autoround.quantize()
compressed_model = autoround.save_quantized(
output_dir=quantized_model_path, inplace=True, format="llm_compressor"
)
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
assert (
hasattr(tmp_layer, "weight_scale")
and hasattr(tmp_layer, "weight_packed")
and tmp_layer.weight_scale.dtype is torch.uint8
and tmp_layer.weight_scale.shape[0] == 768
), "Illegal MXFP4 packing name or data_type or shape"
assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers
skip_layer, "weight_packed"
), "Illegal MXFP4 quantization for fp_layers"
quantization_config = AutoConfig.from_pretrained(
quantized_model_path, trust_remote_code=True
).quantization_config
assert (
quantization_config["format"] == "float-quantized"
and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
), f"Invalid MXFP4 quantization configuration: {quantization_config}"
shutil.rmtree("./saved", ignore_errors=True)

def test_mxfp8_llmcompressor_format(self):
model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
from transformers import AutoConfig

scheme = "MXFP8"
autoround = AutoRound(
model,
self.tokenizer,
scheme=scheme,
iters=2,
seqlen=2,
dataset=self.llm_dataloader,
)
quantized_model_path = self.save_dir
compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
assert (
hasattr(tmp_layer, "weight_scale")
and hasattr(tmp_layer, "weight")
and tmp_layer.weight.dtype is torch.float8_e4m3fn
and tmp_layer.weight_scale.dtype is torch.uint8
and tmp_layer.weight_scale.shape[0] == 768
), "Illegal MXFP8 packing name or data_type or shape"
quantization_config = AutoConfig.from_pretrained(
quantized_model_path, trust_remote_code=True
).quantization_config
assert (
quantization_config["format"] == "float-quantized"
and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8
), f"Invalid MXFP8 quantization configuration: {quantization_config}"
folder_size_gb = _get_folder_size(quantized_model_path)
# Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty
assert (
0.15 < folder_size_gb < 0.2
), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)"
shutil.rmtree("./saved", ignore_errors=True)

def test_nvfp4_llmcompressor_format(self):
model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
from transformers import AutoConfig

scheme = "NVFP4"
autoround = AutoRound(
model,
self.tokenizer,
scheme=scheme,
iters=2,
seqlen=2,
dataset=self.llm_dataloader,
)
quantized_model_path = self.save_dir
compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
assert (
hasattr(tmp_layer, "weight_scale")
and hasattr(tmp_layer, "weight_global_scale")
and hasattr(tmp_layer, "input_global_scale")
and tmp_layer.weight_packed.dtype is torch.uint8
and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
and tmp_layer.weight_scale.shape[0] == 768
), "Illegal NVFP4 packing name or data_type or shape"
quantization_config = AutoConfig.from_pretrained(
quantized_model_path, trust_remote_code=True
).quantization_config
assert (
quantization_config["format"] == "nvfp4-pack-quantized"
and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4
), f"Invalid NVFP4 quantization configuration: {quantization_config}"
folder_size_gb = _get_folder_size(quantized_model_path)
# Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty
assert (
0.1 < folder_size_gb < 0.15
), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)"
shutil.rmtree("./saved", ignore_errors=True)

def test_nvfp4_autoround_format(self):
model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
from transformers import AutoConfig

scheme = "NVFP4"
autoround = AutoRound(
model,
self.tokenizer,
scheme="NVFP4",
iters=2,
seqlen=2,
dataset=self.llm_dataloader,
)
quantized_model_path = self.save_dir
compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
assert (
hasattr(tmp_layer, "weight_scale")
and hasattr(tmp_layer, "weight_global_scale")
and hasattr(tmp_layer, "input_global_scale")
and tmp_layer.weight_packed.dtype is torch.uint8
and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
and tmp_layer.weight_scale.shape[0] == 768
), "Illegal NVFP4 packing name or data_type or shape"
shutil.rmtree("./saved", ignore_errors=True)

def test_nvfp4_autoround_save_quantized(self):
model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
from transformers import AutoConfig

scheme = "NVFP4"
autoround = AutoRound(
model,
self.tokenizer,
scheme="NVFP4",
iters=2,
seqlen=2,
dataset=self.llm_dataloader,
)
quantized_model_path = self.save_dir
autoround.quantize()
compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
assert (
hasattr(tmp_layer, "weight_scale")
and hasattr(tmp_layer, "weight_global_scale")
and hasattr(tmp_layer, "input_global_scale")
and tmp_layer.weight_packed.dtype is torch.uint8
and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
and tmp_layer.weight_scale.shape[0] == 768
), "Illegal NVFP4 packing name or data_type or shape"
shutil.rmtree("./saved", ignore_errors=True)

def test_nvfp4_moe_actmax_rtn(self):
model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
layer_config = {
"self_attn": {"bits": 16, "act_bits": 16},
"mlp.shared_experts": {"bits": 16, "act_bits": 16},
}
scheme = "nvfp4"
autoround = AutoRound(
model_name,
scheme=scheme,
iters=0,
seqlen=2,
nsamples=2,
dataset=self.llm_dataloader,
layer_config=layer_config,
)
compressed_model, _ = autoround.quantize()
assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max")

def test_nvfp4_moe_actmax_ar(self):
model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
layer_config = {
"q_proj": {"bits": 16, "act_bits": 16},
"mlp.shared_experts": {"bits": 16, "act_bits": 16},
"experts.*2": {"bits": 16, "act_bits": 16},
"experts.*5": {"bits": 16, "act_bits": 16},
}
scheme = "nvfp4"
autoround = AutoRound(
model_name,
scheme=scheme,
iters=1,
seqlen=2,
nsamples=2,
dataset=self.llm_dataloader,
layer_config=layer_config,
)
autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round")


if __name__ == "__main__":
unittest.main()