
Commit 635eded

fix cuda CI backend issue, fix typo (#974)
1 parent 9ecfaa5 commit 635eded

3 files changed (+33, -14 lines)


auto_round/export/export_to_autogptq/export.py

Lines changed: 11 additions & 11 deletions
@@ -258,17 +258,17 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll

     all_to_quantized = True
     modules_in_block_to_quantize = []
-    if not dynamic:  # Only uniform precision
-        for block_names in all_blocks:
-            first_block = get_module(model, block_names[0])
-            for n, m in first_block.named_modules():
-                if m.tmp_name not in layer_config:
-                    continue
-                if not check_to_quantized(layer_config[m.tmp_name]):
-                    all_to_quantized = False
-                else:
-                    modules_in_block_to_quantize.append(n)
-        modules_in_block_to_quantize = [modules_in_block_to_quantize]
+    # for backward compatibility
+    for block_names in all_blocks:
+        first_block = get_module(model, block_names[0])
+        for n, m in first_block.named_modules():
+            if m.tmp_name not in layer_config:
+                continue
+            if not check_to_quantized(layer_config[m.tmp_name]):
+                all_to_quantized = False
+            else:
+                modules_in_block_to_quantize.append(n)
+    modules_in_block_to_quantize = [modules_in_block_to_quantize]

     if all_to_quantized:
         modules_in_block_to_quantize = None
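With the `if not dynamic:` guard removed, the per-block module list is collected for mixed-precision configs as well and only dropped (set to None) when every layer is quantized, as the context lines after the hunk show. A standalone sketch of that collection logic follows; the module names and `layer_config` contents are made up, and `check_to_quantized` is reduced to a simple bits check purely for illustration:

# Sketch of the hunk above in isolation; hypothetical names, not the real API.
def collect_modules_in_block(module_names, layer_config):
    all_to_quantized = True
    modules_in_block_to_quantize = []
    for name in module_names:                      # e.g. names from first_block.named_modules()
        cfg = layer_config.get(name)
        if cfg is None:
            continue
        if cfg.get("bits", 16) > 8:                # stand-in for check_to_quantized()
            all_to_quantized = False
        else:
            modules_in_block_to_quantize.append(name)
    # wrapped in an outer list, mirroring [modules_in_block_to_quantize] in the diff
    modules_in_block_to_quantize = [modules_in_block_to_quantize]
    # reset to None when nothing is excluded, as in the lines right after the hunk
    return None if all_to_quantized else modules_in_block_to_quantize

# Mixed-precision example: one layer stays in 16 bits, so the nested list is kept.
layer_config = {"self_attn.q_proj": {"bits": 4}, "mlp.gate_proj": {"bits": 16}}
print(collect_modules_in_block(["self_attn.q_proj", "mlp.gate_proj"], layer_config))
# [['self_attn.q_proj']]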

auto_round/inference/backend.py

Lines changed: 19 additions & 2 deletions
@@ -132,6 +132,20 @@ def feature_multiply_checker_group_size(
     )


+def feature_compatible_multiply_checker(
+    in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None
+):
+    group_size = config["group_size"]
+    if out_feature_multiplier is None:
+        out_feature_multiplier = in_feature_multiplier
+    compatible_flag = in_feature < group_size and (in_feature * out_feature) % group_size == 0
+    return (
+        in_feature % in_feature_multiplier == 0
+        and out_feature % out_feature_multiplier == 0
+        and (in_feature % group_size == 0 or compatible_flag)
+    )
+
+
 def in_feature_checker_group_size(in_feature, out_feature, config):
     group_size = config["group_size"]
     return in_feature % group_size == 0
@@ -148,6 +162,9 @@ def in_feature_checker_group_size(in_feature, out_feature, config):
 exllamav2_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32
 )
+compatible_exllamav2_feature_checker = functools.partial(
+    feature_compatible_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32
+)

 gptqmodel_marlin_feature_checker = functools.partial(
     feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64
@@ -185,9 +202,9 @@ def fp8_static_scheme_checker(
     act_bits=WOQ_DEFAULT_ACT_BITS,
     # 16, 384,768 accuracy issue
     group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
-    checkers=[exllamav2_feature_checker],
+    checkers=[compatible_exllamav2_feature_checker],
     alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
-    requirements=["auto-gptq>=0.7.1"],
+    requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
 )

 BackendInfos["auto_gptq:tritonv2"] = BackendInfo(
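The relaxed rule is easiest to see with concrete numbers. The snippet below copies the new checker verbatim and probes it with a hypothetical group size of 128 and a few made-up layer shapes: an in_feature smaller than group_size now passes as long as in_feature * out_feature is still divisible by group_size, while the 32-alignment from the partial is still enforced.

# Checker body copied from the diff above; the driver config and shapes are made up.
def feature_compatible_multiply_checker(
    in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None
):
    group_size = config["group_size"]
    if out_feature_multiplier is None:
        out_feature_multiplier = in_feature_multiplier
    compatible_flag = in_feature < group_size and (in_feature * out_feature) % group_size == 0
    return (
        in_feature % in_feature_multiplier == 0
        and out_feature % out_feature_multiplier == 0
        and (in_feature % group_size == 0 or compatible_flag)
    )

cfg = {"group_size": 128}  # hypothetical quantization config
print(feature_compatible_multiply_checker(64, 256, cfg, 32))   # True: 64 < 128 and (64 * 256) % 128 == 0
print(feature_compatible_multiply_checker(512, 256, cfg, 32))  # True: the usual case, 512 % 128 == 0
print(feature_compatible_multiply_checker(48, 256, cfg, 32))   # False: 48 % 32 != 0, alignment still required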

auto_round/special_model_handler.py

Lines changed: 3 additions & 1 deletion
@@ -85,7 +85,9 @@ def _handle_moe_model(model, formats=None):
         parent = model.get_submodule(parent)
         setattr(parent, child, new_module)

-    logger.warning("Llama4 experts are converted, the quantized model can not run on transformers.")
+    logger.warning(
+        f"{model.config.model_type} experts are converted, the quantized model can not run on transformers."
+    )
     return model
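The warning now interpolates model.config.model_type instead of hard-coding Llama4, so whichever MoE architecture goes through _handle_moe_model is named in the message. A trivial illustration with a stand-in config object; the model type shown is only an example:

# Stand-in for model.config; "qwen3_moe" is a hypothetical example value.
class _Cfg:
    model_type = "qwen3_moe"

print(
    f"{_Cfg.model_type} experts are converted, the quantized model can not run on transformers."
)
# qwen3_moe experts are converted, the quantized model can not run on transformers.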
