
Commit fbc41c8

fix nvfp act quantization bug (#891)
* fix nvfp act quantization bug
* add cuda ut for moe nvfp quantize
* add cpu UT, refine cuda UT
* fix ut typo
* fix cpu ut
* enhance experts amax match, refine UT
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent (9d62109) · commit fbc41c8

File tree

4 files changed (+73, −10 lines)


auto_round/compressors/base.py

Lines changed: 2 additions & 8 deletions

@@ -1651,9 +1651,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
             if self.device_map is not None:
                 accelerate.hooks.remove_hook_from_submodules(block)

-        if (
-            is_nv_fp(self.act_data_type) and any("nv_fp" in format_ for format_ in self.formats)
-        ) or is_static_wfp8afp8(self):
+        if is_nv_fp(self.act_data_type) or is_static_wfp8afp8(self):
             # enable moe experts act_max automatic generation for Linear
             set_amax_for_all_moe_layers(block, attr_name="act_max")
         # Normalize imatrix and quantize layers

@@ -2911,11 +2909,7 @@ def _quantize_block(
         with torch.no_grad():
             unwrapper_block(block, best_params)

-        if (
-            is_nv_fp(self.act_data_type)
-            and hasattr(self, "formats")
-            and any("nv_fp" in format_ for format_ in self.formats)
-        ):
+        if is_nv_fp(self.act_data_type):
             # enable moe experts act_max automatic generation for WrapperWALayer
             set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max")
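The behavioral change in both hunks: act_max generation for MoE experts previously ran only when an nv_fp export format was also requested (and, in _quantize_block, only when self.formats existed at all), so NVFP activation quantization could end up without expert act_max values. After the fix it is keyed on the activation data type alone. A minimal runnable sketch of the old vs. new gate, using toy stand-ins rather than auto_round's real helpers:

# Toy stand-in for auto_round.utils.is_nv_fp; the values below are illustrative.
def is_nv_fp(data_type: str) -> bool:
    return "nv_fp" in data_type

act_data_type = "nv_fp4"      # activations quantized to NVFP4
formats = ["llm_compressor"]  # exporting to a non-nv_fp format

# Old gate: also required an nv_fp export format, skipping act_max generation
# here even though activations are NVFP4 (the bug this commit fixes).
old_gate = is_nv_fp(act_data_type) and any("nv_fp" in f for f in formats)

# New gate: keyed only on the activation data type.
new_gate = is_nv_fp(act_data_type)

print(old_gate, new_gate)  # False True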

auto_round/utils.py

Lines changed: 2 additions & 2 deletions
@@ -2481,7 +2481,7 @@ def set_nested_attr(module, attr_name: str, value):
     attrs = attr_name.split(".")
     for attr in attrs[:-1]:
         if not hasattr(module, attr):
-            raise AttributeError(f"{module} has no attribute '{attr}'")
+            return None  # No need to set act_max for fp layers
         module = getattr(module, attr)
     setattr(module, attrs[-1], value)
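Read as a whole, the fixed helper now degrades to a no-op when any segment of the dotted path is missing. A runnable reconstruction from the hunk above, with toy classes (illustrative stand-ins, not auto_round types) showing both outcomes:

def set_nested_attr(module, attr_name: str, value):
    attrs = attr_name.split(".")
    for attr in attrs[:-1]:
        if not hasattr(module, attr):
            return None  # No need to set act_max for fp layers
        module = getattr(module, attr)
    setattr(module, attrs[-1], value)

class _Leaf:  # toy stand-in, not an auto_round class
    pass

class _Layer:
    def __init__(self):
        self.orig_layer = _Leaf()

layer = _Layer()
set_nested_attr(layer, "orig_layer.act_max", 1.0)  # sets layer.orig_layer.act_max
set_nested_attr(layer, "missing.act_max", 1.0)     # now a silent no-op, not an AttributeError
print(layer.orig_layer.act_max)  # 1.0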

@@ -2546,7 +2546,7 @@ def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_name
         # For other MoE models (like Mixtral) with iterable experts
         try:
             set_amax_for_uncalibrated_experts(
-                [getattr(expert, linear_name) for expert in sub_module.experts], attr_name=attr_name
+                [getattr(expert, linear_name, None) for expert in sub_module.experts], attr_name=attr_name
             )
         except AttributeError as e:
             # Provide more helpful debugging information
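The companion change passes a default to getattr, so an expert whose linear stayed in full precision (for example one excluded via layer_config) contributes None to the list instead of aborting the whole comprehension. A minimal sketch with a toy expert class, not auto_round's; the None handling in the callee is assumed from the surrounding change:

class _Expert:  # toy stand-in for an expert left in full precision
    pass

experts = [_Expert()]

# Old form: getattr(e, "gate_proj") raised AttributeError for such experts.
# New form: the None default yields a placeholder the callee can skip.
print([getattr(e, "gate_proj", None) for e in experts])  # [None]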

test/test_cpu/test_export.py

Lines changed: 39 additions & 0 deletions

@@ -532,6 +532,45 @@ def test_nvfp4_autoround_save_quantized(self):
         ), "Illegal NVFP4 packing name or data_type or shape"
         shutil.rmtree("./saved", ignore_errors=True)

+    def test_nvfp4_moe_actmax_rtn(self):
+        model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
+        layer_config = {
+            "self_attn": {"bits": 16, "act_bits": 16},
+            "mlp.shared_experts": {"bits": 16, "act_bits": 16},
+        }
+        scheme = "nvfp4"
+        autoround = AutoRound(
+            model_name,
+            scheme=scheme,
+            iters=0,
+            seqlen=2,
+            nsamples=2,
+            dataset=self.llm_dataloader,
+            layer_config=layer_config,
+        )
+        compressed_model, _ = autoround.quantize()
+        assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max")
+
+    def test_nvfp4_moe_actmax_ar(self):
+        model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
+        layer_config = {
+            "q_proj": {"bits": 16, "act_bits": 16},
+            "mlp.shared_experts": {"bits": 16, "act_bits": 16},
+            "experts.*2": {"bits": 16, "act_bits": 16},
+            "experts.*5": {"bits": 16, "act_bits": 16},
+        }
+        scheme = "nvfp4"
+        autoround = AutoRound(
+            model_name,
+            scheme=scheme,
+            iters=1,
+            seqlen=2,
+            nsamples=2,
+            dataset=self.llm_dataloader,
+            layer_config=layer_config,
+        )
+        autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round")
+

 if __name__ == "__main__":
     unittest.main()

test/test_cuda/test_export.py

Lines changed: 30 additions & 0 deletions

@@ -402,6 +402,36 @@ def test_nvfp4_llmcompressor_format(self):
         # if "France" in prompt:
         #     assert "Paris" in generated_text

+    def test_nvfp4_moe_actmax_rtn(self):
+        model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite"
+        scheme = "nvfp4"
+        autoround = AutoRound(
+            model_name,
+            scheme=scheme,
+            iters=0,
+            seqlen=2,
+            nsamples=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = self.save_dir
+        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round")
+
+    def test_nvfp4_moe_actmax_ar(self):
+        model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite"
+        scheme = "nvfp4"
+        autoround = AutoRound(
+            model_name,
+            scheme=scheme,
+            iters=1,
+            seqlen=2,
+            nsamples=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = self.save_dir
+        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round")
+

 if __name__ == "__main__":
     unittest.main()
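Outside the test harness, the fixed RTN path can be exercised end to end roughly as below; the checkpoint id and output directory are illustrative, and the tiny seqlen/nsamples mirror the tests above rather than a realistic calibration setup:

from auto_round import AutoRound

autoround = AutoRound(
    "deepseek-ai/DeepSeek-V2-Lite",  # any MoE checkpoint; path is illustrative
    scheme="nvfp4",
    iters=0,      # iters=0 takes the RTN blockwise path patched above
    seqlen=2,     # toy calibration sizes, copied from the tests
    nsamples=2,
)
autoround.quantize()
autoround.save_quantized(output_dir="./saved", inplace=False, format="auto_round")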
