Commit dcfacef
Loosen the packing restrictions for mxfp&nvfp (#911)
* Loosen the packing restrictions for mxfp&nvfp, enable Qwen1.5-MoE-A2.7B quantize
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* fix UT
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* refine mxfp&nvfp layer checker
* fix pylint

---------

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 41986b5 commit dcfacef

7 files changed: +533 −390 lines

auto_round/export/export_to_autoround/qlinear_fp.py

Lines changed: 15 additions & 7 deletions
@@ -38,7 +38,7 @@
 from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL
 from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad
-from auto_round.utils import _get_packing_device, is_mx_fp, is_nv_fp
+from auto_round.utils import BackendDataType, _get_packing_device, is_mx_fp, is_nv_fp

 # from auto_round.utils import get_weight_compress_dtype
 logger = getLogger(__name__)
@@ -72,14 +72,22 @@ def __init__(
         super().__init__()
         if bits not in [4, 8]:
             raise NotImplementedError("Only 4,8 bits are supported.")
-        if infeatures % 32 != 0 or outfeatures % 32 != 0:
-            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
         self.is_mx = is_mx_fp(data_type)
         self.is_nv = is_nv_fp(data_type)
-        if self.is_mx and group_size != 32:
-            raise NotImplementedError("Only group_size 32 are supported for mxfp.")
-        if self.is_nv and group_size not in [16, 32]:
-            raise NotImplementedError("Only group_size 16 are supported for nvfp.")
+        if self.is_mx:
+            if group_size != 32:
+                raise NotImplementedError(f"Only group_size 32 are supported for {BackendDataType.MX_FP} data type.")
+            if infeatures % group_size != 0:
+                raise NotImplementedError(
+                    f"in_feature must be divisible by {group_size} for {BackendDataType.MX_FP} data type."
+                )
+        if self.is_nv:
+            if group_size % 16 != 0:
+                raise NotImplementedError(f"Only group_size 16 are supported for {BackendDataType.NV_FP} data type.")
+            if infeatures % group_size != 0:
+                raise NotImplementedError(
+                    f"in_feature must be divisible by {group_size} for {BackendDataType.NV_FP} data type."
+                )
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
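For reference, a minimal standalone sketch of the loosened shape rule encoded by the constructor above: out_features no longer needs to be divisible by 32, only in_features must align with the group size. The helper name `is_packable` and the example shapes are hypothetical, chosen only to illustrate the change.

```python
# Hypothetical sketch of the new packing rule; not the project's API.
def is_packable(in_features: int, out_features: int, group_size: int, is_mx: bool, is_nv: bool) -> bool:
    if is_mx and group_size != 32:
        return False                      # mxfp still requires group_size == 32
    if is_nv and group_size % 16 != 0:
        return False                      # nvfp requires a multiple of 16
    return in_features % group_size == 0  # out_features is no longer checked


# Example: an MoE routing gate whose out_features equals the expert count (here 60,
# not divisible by 32) would have been rejected by the old check but passes the new one.
print(is_packable(in_features=2048, out_features=60, group_size=32, is_mx=True, is_nv=False))  # True
```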

auto_round/inference/backend.py

Lines changed: 13 additions & 4 deletions
@@ -127,12 +127,19 @@ def feature_multiply_checker_group_size(
 )


+def in_feature_checker_group_size(in_feature, out_feature, config):
+    group_size = config["group_size"]
+    return in_feature % group_size == 0
+
+
 feature_multiply_checker_32 = functools.partial(feature_multiply_checker, in_feature_multiplier=32)
 feature_multiply_checker_16 = functools.partial(feature_multiply_checker, in_feature_multiplier=16)
 in_output_feature_multiply_checker_32 = functools.partial(
     feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
 )
-
+in_feature_multiply_checker_32 = functools.partial(
+    feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=None
+)
 exllamav2_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
 )
@@ -141,6 +148,8 @@ def feature_multiply_checker_group_size(
     feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
 )

+mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size)
+

 def fp8_static_scheme_checker(
     in_feature: int,
@@ -239,7 +248,7 @@ def fp8_static_scheme_checker(
     act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_32],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
@@ -259,7 +268,7 @@ def fp8_static_scheme_checker(
     act_data_type=["mx_fp_rceil"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_32],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
@@ -280,7 +289,7 @@ def fp8_static_scheme_checker(
     act_data_type=["nv_fp4_with_static_gs"],
     act_dynamic=[True],
     priority=0,
-    checkers=[feature_multiply_checker_16],
+    checkers=[mxfp_nvfp_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.7.0"],
 )
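As a rough illustration of how the new checker behaves (a standalone re-implementation mirroring the diff above, with made-up feature sizes): the mxfp/nvfp backend entries now accept any layer whose in_feature is a multiple of the configured group_size, regardless of out_feature.

```python
import functools


# Re-implementation of the checker from the diff above, for illustration only.
def in_feature_checker_group_size(in_feature, out_feature, config):
    group_size = config["group_size"]
    return in_feature % group_size == 0


mxfp_nvfp_feature_checker = functools.partial(in_feature_checker_group_size)

# out_feature (60) is ignored; only in_feature alignment with group_size matters.
print(mxfp_nvfp_feature_checker(2048, 60, {"group_size": 32}))  # True
print(mxfp_nvfp_feature_checker(2050, 60, {"group_size": 32}))  # False
```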

auto_round/utils.py

Lines changed: 12 additions & 0 deletions
@@ -2963,6 +2963,18 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
                     layer_config.setdefault(n, copy.deepcopy(default_dict))
                     layer_config[n].update({"bits": 16, "data_type": "fp", "fixed_by_user": True})
                     logger.warning_once(f"{n} skipped quantization (shape not divisible by 32).")
+    # enforce shape divisibility for mxfp/nvfp
+    if (is_nv_fp(default_dict["data_type"]) or is_mx_fp(default_dict["data_type"])) and not gguf_name:
+        for n, m in model.named_modules():
+            if type(m) in supported_types or m.__class__.__name__ in inner_supported_types:
+                if m.weight.shape[1] % default_dict["group_size"]:
+                    layer_config.setdefault(n, copy.deepcopy(default_dict))
+                    layer_config[n].update(
+                        {"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True}
+                    )
+                    logger.warning_once(
+                        f"{n} skipped quantization (shape not divisible by {default_dict['group_size']})."
+                    )

     # 9. block layers: mark as in_blocks=True
     for name in get_layer_names_in_block(model, supported_types, quant_block_list, inner_supported_types):
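In rough terms, the added block pins any layer whose weight's second dimension (in_features) is not divisible by the scheme's group_size back to 16-bit float for both weights and activations, instead of failing later during packing. A small sketch of that decision, assuming a hypothetical mx_fp default config (the dict below is illustrative, not the library's actual defaults):

```python
# Hypothetical default scheme dict, mirroring the fields used in the diff above.
default_dict = {"bits": 4, "data_type": "mx_fp", "act_bits": 4, "act_data_type": "mx_fp", "group_size": 32}


def layer_entry(in_features: int) -> dict:
    """Return the per-layer config the fallback above would produce for a given in_features."""
    cfg = dict(default_dict)
    if in_features % default_dict["group_size"]:
        # Shape cannot be grouped: keep the layer in 16-bit float and mark it fixed.
        cfg.update({"bits": 16, "data_type": "fp", "act_bits": 16, "act_data_type": "fp", "fixed_by_user": True})
    return cfg


print(layer_entry(2048)["data_type"])  # 'mx_fp' -> quantized as usual
print(layer_entry(2050)["data_type"])  # 'fp'    -> skipped, kept in 16-bit
```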

test/test_cpu/test_export.py

Lines changed: 0 additions & 269 deletions
@@ -302,275 +302,6 @@ def test_static_afp8_export(self, static_kv_dtype):
             self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn)
         shutil.rmtree(quantized_model_path, ignore_errors=True)

-    def test_mxfp4_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "MXFP4"
-        layer_config = {}
-        fp_layers_str = "k_proj"
-        from auto_round.utils import get_fp_layer_names
-
-        not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
-        for name in not_quantize_layer_names:
-            layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=2,
-            seqlen=2,
-            layer_config=layer_config,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        autoround.quantize()
-        compressed_model = autoround.save_quantized(
-            output_dir=quantized_model_path, inplace=True, format="llm_compressor"
-        )
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_packed")
-            and tmp_layer.weight_scale.dtype is torch.uint8
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal MXFP4 packing name or data_type or shape"
-        assert not hasattr(skip_layer, "weight_scale") and not hasattr(  ## check skipped layers
-            skip_layer, "weight_packed"
-        ), "Illegal MXFP4 quantization for fp_layers"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "float-quantized"
-            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
-            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
-        ), f"Invalid MXFP4 quantization configuration: {quantization_config}"
-
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_rtn_mxfp4_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "MXFP4"
-        layer_config = {}
-        fp_layers_str = "k_proj"
-        from auto_round.utils import get_fp_layer_names
-
-        not_quantize_layer_names = get_fp_layer_names(model, fp_layers_str)
-        for name in not_quantize_layer_names:
-            layer_config[name] = {"bits": 16, "act_bits": 16, "data_type": "float"}
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=0,
-            seqlen=2,
-            layer_config=layer_config,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        autoround.quantize()
-        compressed_model = autoround.save_quantized(
-            output_dir=quantized_model_path, inplace=True, format="llm_compressor"
-        )
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_packed")
-            and tmp_layer.weight_scale.dtype is torch.uint8
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal MXFP4 packing name or data_type or shape"
-        assert not hasattr(skip_layer, "weight_scale") and not hasattr(  ## check skipped layers
-            skip_layer, "weight_packed"
-        ), "Illegal MXFP4 quantization for fp_layers"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "float-quantized"
-            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
-            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 4
-        ), f"Invalid MXFP4 quantization configuration: {quantization_config}"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_mxfp8_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "MXFP8"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight")
-            and tmp_layer.weight.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.dtype is torch.uint8
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal MXFP8 packing name or data_type or shape"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "float-quantized"
-            and quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] is True
-            and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8
-        ), f"Invalid MXFP8 quantization configuration: {quantization_config}"
-        folder_size_gb = _get_folder_size(quantized_model_path)
-        # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty
-        assert (
-            0.15 < folder_size_gb < 0.2
-        ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_llmcompressor_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "NVFP4"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme=scheme,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_global_scale")
-            and hasattr(tmp_layer, "input_global_scale")
-            and tmp_layer.weight_packed.dtype is torch.uint8
-            and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal NVFP4 packing name or data_type or shape"
-        quantization_config = AutoConfig.from_pretrained(
-            quantized_model_path, trust_remote_code=True
-        ).quantization_config
-        assert (
-            quantization_config["format"] == "nvfp4-pack-quantized"
-            and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4
-        ), f"Invalid NVFP4 quantization configuration: {quantization_config}"
-        folder_size_gb = _get_folder_size(quantized_model_path)
-        # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty
-        assert (
-            0.1 < folder_size_gb < 0.15
-        ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_autoround_format(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "NVFP4"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme="NVFP4",
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_global_scale")
-            and hasattr(tmp_layer, "input_global_scale")
-            and tmp_layer.weight_packed.dtype is torch.uint8
-            and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal NVFP4 packing name or data_type or shape"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_autoround_save_quantized(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
-        from transformers import AutoConfig
-
-        scheme = "NVFP4"
-        autoround = AutoRound(
-            model,
-            self.tokenizer,
-            scheme="NVFP4",
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        quantized_model_path = self.save_dir
-        autoround.quantize()
-        compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
-        tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj
-        assert (
-            hasattr(tmp_layer, "weight_scale")
-            and hasattr(tmp_layer, "weight_global_scale")
-            and hasattr(tmp_layer, "input_global_scale")
-            and tmp_layer.weight_packed.dtype is torch.uint8
-            and tmp_layer.weight_scale.dtype is torch.float8_e4m3fn
-            and tmp_layer.weight_scale.shape[0] == 768
-        ), "Illegal NVFP4 packing name or data_type or shape"
-        shutil.rmtree("./saved", ignore_errors=True)
-
-    def test_nvfp4_moe_actmax_rtn(self):
-        model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
-        layer_config = {
-            "self_attn": {"bits": 16, "act_bits": 16},
-            "mlp.shared_experts": {"bits": 16, "act_bits": 16},
-        }
-        scheme = "nvfp4"
-        autoround = AutoRound(
-            model_name,
-            scheme=scheme,
-            iters=0,
-            seqlen=2,
-            nsamples=2,
-            dataset=self.llm_dataloader,
-            layer_config=layer_config,
-        )
-        compressed_model, _ = autoround.quantize()
-        assert hasattr(compressed_model.model.layers[1].mlp.experts[0].gate_proj.orig_layer, "act_max")
-
-    def test_nvfp4_moe_actmax_ar(self):
-        model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite"
-        layer_config = {
-            "q_proj": {"bits": 16, "act_bits": 16},
-            "mlp.shared_experts": {"bits": 16, "act_bits": 16},
-            "experts.*2": {"bits": 16, "act_bits": 16},
-            "experts.*5": {"bits": 16, "act_bits": 16},
-        }
-        scheme = "nvfp4"
-        autoround = AutoRound(
-            model_name,
-            scheme=scheme,
-            iters=1,
-            seqlen=2,
-            nsamples=2,
-            dataset=self.llm_dataloader,
-            layer_config=layer_config,
-        )
-        autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round")
-

 if __name__ == "__main__":
     unittest.main()
