
Commit d00cc85

fix multi cuda ut bug (#1014)

* fix multi cuda ut bug

Signed-off-by: n1ck-guo <heng.guo@intel.com>

1 parent 5c2f1bd commit d00cc85

File tree

10 files changed, +117 -50 lines changed

auto_round/compressors/utils.py

Lines changed: 50 additions & 12 deletions

@@ -323,9 +323,20 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
             cfg.setdefault(key, copy.deepcopy(default_dict.get(key)))
 
     # 5. collect supported modules
+    embedding_types = (torch.nn.Embedding,)
     gguf_name = get_gguf_scheme(default_scheme)
-    if gguf_name and torch.nn.Embedding not in supported_types:
-        supported_types = (*supported_types, torch.nn.Embedding)
+    if gguf_name:
+        if torch.nn.Embedding not in supported_types:
+            supported_types = (*supported_types, torch.nn.Embedding)
+
+        # for some Embedding which type() is not torch.nn.Embedding
+        # for example: transformers.models.gemma3.modeling_gemma3.Gemma3TextScaledWordEmbedding
+        model_module_name = model.__class__.__module__
+        module_cls = sys.modules[model_module_name]
+        for name in module_cls.__dict__:
+            if name.endswith("Embedding") and not name.endswith("RotaryEmbedding"):
+                embedding_types = (*embedding_types, getattr(module_cls, name))
+        supported_types = (*supported_types, *embedding_types)
 
     all_supported_layer_names, embedding_layer_names = [], []
     all_module_names = []
@@ -338,7 +349,7 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
         if type(m) not in supported_types and m.__class__.__name__ not in inner_supported_types:
             continue
         all_supported_layer_names.append(n)
-        if isinstance(m, torch.nn.Embedding):
+        if isinstance(m, embedding_types) or m.__class__.__name__.endswith("Embedding"):
             embedding_layer_names.append(n)
 
     # 6. expand regex configs
@@ -650,7 +661,7 @@ def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model
 
     import gguf  # pylint: disable=E0401
 
-    from auto_round.utils.common import LazyImport
+    from auto_round.utils.common import MM_KEYS, LazyImport
     from auto_round.utils.model import get_lm_head_name, get_module
 
     # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture
@@ -660,24 +671,41 @@ def get_layer_config_by_gguf_format(layer_config, target_gguf_format: str, model
         hparams=model.config.to_dict(), model_type=model_type
     )
     try:
-        model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(model_architecture, model_type=model_type)
+        if model_type != ModelType.TEXT:
+            model_class_vision = convert_hf_to_gguf.ModelBase.from_model_architecture(
+                model_architecture, model_type=model_type
+            )
+        model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(
+            model_architecture, model_type=ModelType.TEXT
+        )
+
     except NotImplementedError:
         return layer_config, {}
 
     n_layer = None
-    for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]:
-        sub_attr = "text_config" if model_type == ModelType.TEXT else "vision_config"
+    if model_type != ModelType.TEXT:
+        n_layer_vision = None
+    for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]:
         if hasattr(model.config, name):
             n_layer = getattr(model.config, name)
-            break
-        if hasattr(model.config, sub_attr):
-            if hasattr(getattr(model.config, sub_attr), name):
-                n_layer = getattr(getattr(model.config, sub_attr), name)
+        if model_type != ModelType.TEXT:
+            if n_layer is not None and hasattr(model.config, "text_config"):
+                if hasattr(getattr(model.config, "text_config"), name):
+                    n_layer = getattr(getattr(model.config, "text_config"), name)
+            for config_name in ["vision_config", "vision_encoder"]:
+                if hasattr(model.config, config_name):
+                    if hasattr(getattr(model.config, config_name), name):
+                        n_layer_vision = getattr(getattr(model.config, config_name), name)
+                        break
+        if n_layer and n_layer_vision:
             break
+
     if n_layer is None:
         return layer_config, {}
 
     tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer)
+    if model_type != ModelType.TEXT:
+        tensor_map_vision = gguf.get_tensor_name_map(model_class_vision.model_arch, n_layer_vision)
 
     def _set_config(config, target_config):
         for k, v in target_config.items():
@@ -733,7 +761,17 @@ def _set_config(config, target_config):
                 re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1)
             )
 
-        gguf_name = tensor_map.get_name(layer_name)
+        if model_type != ModelType.TEXT and any([key in layer_name for key in MM_KEYS]):
+            gguf_name = tensor_map_vision.get_name(layer_name)
+            if gguf_name is None:
+                for key in MM_KEYS:
+                    gguf_name = tensor_map_vision.get_name(layer_name.replace(f".{key}", ""))
+                    if gguf_name is not None:
+                        break
+        else:
+            gguf_name = tensor_map.get_name(layer_name)
+            if gguf_name is None:
+                gguf_name = tensor_map.get_name(layer_name.replace(".language_model", ""))
         bits_index = 6
         if config.get("fixed_by_user", False):
             if "bits" not in config:

auto_round/eval/eval_cli.py

Lines changed: 6 additions & 2 deletions

@@ -229,7 +229,6 @@ def eval_task_by_task(
     import traceback
 
     from lm_eval import simple_evaluate as lm_simple_evaluate  # pylint: disable=E0611
-    from lm_eval.models.hf_vlms import HFMultimodalLM
     from lm_eval.models.huggingface import HFLM
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -269,6 +268,8 @@ def eval_task_by_task(
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
             batch_size = 16
+        from lm_eval.models.hf_vlms import HFMultimodalLM
+
         hflm = HFMultimodalLM(
             pretrained=model,
             tokenizer=tokenizer,
@@ -333,7 +334,10 @@ def eval_task_by_task(
             res_all = res
         else:
             for key in res_keys:
-                res_all[key].update(res[key])
+                if key not in res_all:
+                    continue
+                else:
+                    res_all[key].update(res[key])
         print(make_table(res_all))
 
     print("total eval time:", time.time() - st)

auto_round/eval/evaluation.py

Lines changed: 2 additions & 1 deletion

@@ -21,7 +21,6 @@
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-from lm_eval.models.hf_vlms import HFMultimodalLM
 from lm_eval.models.huggingface import HFLM
 
 
@@ -37,6 +36,8 @@ def simple_evaluate_user_model(
     **kwargs
 ):
     if mllm:
+        from lm_eval.models.hf_vlms import HFMultimodalLM
+
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
             batch_size = 16
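
Both eval files apply the same fix: the HFMultimodalLM import moves from module scope into the multimodal branch, so text-only evaluation never imports lm_eval.models.hf_vlms (and its heavier vision dependencies). A small sketch of the pattern; the build_lm helper name is hypothetical and not part of the commit:

from lm_eval.models.huggingface import HFLM  # always needed


def build_lm(model, tokenizer, mllm: bool = False, batch_size: int = 16):
    """Return an lm-eval wrapper, importing the multimodal class only when needed."""
    if mllm:
        # Deferred import: only multimodal runs touch lm_eval.models.hf_vlms.
        from lm_eval.models.hf_vlms import HFMultimodalLM

        return HFMultimodalLM(pretrained=model, tokenizer=tokenizer, batch_size=batch_size)
    return HFLM(pretrained=model, tokenizer=tokenizer, batch_size=batch_size)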

auto_round/export/export_to_gguf/export.py

Lines changed: 2 additions & 2 deletions

@@ -133,8 +133,8 @@ def pack_gguf_layer(
 ):
     """Export the model to gguf format."""
     global gguf_model_instance_global
-    if output_dir is not None and os.path.exists(output_dir):
-        logger.warning_once(f"{output_dir} already exists, this may cause model conflict")
+    # if output_dir is not None and os.path.exists(output_dir):
+    #     logger.warning_once(f"{output_dir} already exists, this may cause model conflict")
     if "gguf_model_instance_global" not in globals():
         config = model.config

auto_round/utils/common.py

Lines changed: 18 additions & 0 deletions

@@ -124,6 +124,24 @@ def __getitem__(self, key):
 
     SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce)
 
+MM_KEYS = [
+    "multi_modal_projector",
+    "vision_tower",
+    "multimodal_projector",
+    "thinker",
+    "visual",
+    "audio",
+    "talker",
+    "token2wav",
+    "vision_model",
+    "audio_tower",
+    "vision_encoder",
+    "vision_language_adapter",
+    "patch_merger",
+    "pre_mm_projector_norm",
+    "vision",
+]
+
 
 def is_debug_mode():
     """Checks if the Python interpreter is running in debug mode.

auto_round/utils/device.py

Lines changed: 16 additions & 3 deletions

@@ -193,6 +193,8 @@ def detect_device_count():
     """
     if torch.cuda.is_available():
         return torch.cuda.device_count()
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():
+        return torch.xpu.device_count()
     else:
         try:
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401
@@ -1144,11 +1146,13 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map):
     device_list = parse_available_devices(device_map)
     gpu_devices = []
     for device in device_list:
+        if device.startswith("hpu") and len(device_list) > 1:
+            logger.warning_once("Auto-scheme does not support multiple HPUs.")
         if device.startswith("cpu") or device.startswith("hpu"):
             continue
         gpu_devices.append(device)
     num_devices = len(gpu_devices)
-    if num_devices < 1:
+    if num_devices <= 1:
         return
 
     for block_names in block_name_list:
@@ -1272,7 +1276,16 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None
         device_map = device_map.strip()
         if device_map.lower() == "cpu":
             return ["cpu"]
-
+        if device_map.lower() == "auto":
+            device_count = detect_device_count()
+            if "cuda" in device_types:
+                return [f"cuda:{i}" for i in range(device_count)]
+            elif "xpu" in device_types:
+                return [f"xpu:{i}" for i in range(device_count)]
+            elif "hpu" in device_types:
+                return [f"hpu:{i}" for i in range(device_count)]
+            else:
+                return ["cpu"]
         # Split by commas
         parts = [x.strip() for x in device_map.split(",") if x.strip()]
         parsed = []
@@ -1283,7 +1296,7 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None
                 parsed.append(f"{device_type}:{p}" if device_type != "cpu" else "cpu")
             else:
                 parsed.append(p)
-        return parsed
+        return list(set(parsed))
 
     if isinstance(device_map, dict):
         # Extract all devices recursively from dict values
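
The parse_available_devices change expands device_map="auto" into one entry per detected accelerator and falls back to CPU, and detect_device_count now also counts XPUs. A self-contained sketch of that expansion logic using only torch; the expand_auto_device_map name is hypothetical, and HPU handling (via habana_frameworks) is omitted here:

import torch


def expand_auto_device_map() -> list:
    """Mirror the new "auto" handling: prefer CUDA, then XPU, otherwise CPU."""
    if torch.cuda.is_available():
        return [f"cuda:{i}" for i in range(torch.cuda.device_count())]
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return [f"xpu:{i}" for i in range(torch.xpu.device_count())]
    return ["cpu"]


print(expand_auto_device_map())  # e.g. ["cuda:0", "cuda:1"] on a two-GPU box, ["cpu"] otherwise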

auto_round/utils/model.py

Lines changed: 2 additions & 17 deletions

@@ -497,23 +497,8 @@ def is_pure_text_model(model):
 
 
 def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = None):
-    MM_KEYS = [
-        "multi_modal_projector",
-        "vision_tower",
-        "multimodal_projector",
-        "thinker",
-        "visual",
-        "audio",
-        "talker",
-        "token2wav",
-        "vision_model",
-        "audio_tower",
-        "vision_encoder",
-        "vision_language_adapter",
-        "patch_merger",
-        "pre_mm_projector_norm",
-        "vision",
-    ]
+    from auto_round.utils.common import MM_KEYS
+
     model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path
     if not os.path.isdir(model_path):
         model_path = download_or_get_path(model_path, platform=platform)

test/test_cpu/test_scheme.py

Lines changed: 11 additions & 0 deletions

@@ -118,6 +118,17 @@ def test_scheme_in_layer_config(self):
             if n == "model.decoder.layers.4.self_attn.k_proj":
                 self.assertEqual(m.group_size, 64)
 
+    def test_parse_available_devices(self):
+        from auto_round.utils.device import parse_available_devices
+
+        device_list = parse_available_devices("auto")
+        self.assertTrue(len(device_list) == 1 and "cpu" in device_list)
+        device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu")
+        self.assertTrue(len(device_list) == 3)
+        self.assertEqual(device_list, ["cuda:0", "cuda:1", "cpu"])
+        device_list = parse_available_devices("0,1")
+        self.assertTrue(len(device_list) == 1 and "cpu" in device_list)
+
 
 if __name__ == "__main__":
     unittest.main()

test/test_cuda/test_gguf.py

Lines changed: 10 additions & 12 deletions

@@ -88,7 +88,7 @@ def test_q2_k_export(self):
         quantized_model_path = "./saved"
 
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q2_k_s")
-        gguf_file = "Qwen2.5-1.5B-Instruct-1.5B-Q2_K_S.gguf"
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device)
@@ -123,7 +123,7 @@ def test_q4_0(self):
         quantized_model_path = "./saved"
 
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
-        gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_0.gguf"
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device)
@@ -144,7 +144,7 @@ def test_q4_1(self):
         quantized_model_path = "./saved"
 
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1")
-        gguf_file = "Qwen2.5-0.5B-Instruct-494M-Q4_1.gguf"
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device)
@@ -198,15 +198,13 @@ def test_vlm_gguf(self):
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
         self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
-        file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-7.6B-Q4_0.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 4226, delta=1.0)
+        file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2
+        self.assertAlmostEqual(file_size, 4226, delta=5.0)
         file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 2578, delta=1.0)
+        self.assertAlmostEqual(file_size, 2580, delta=5.0)
         shutil.rmtree("./saved", ignore_errors=True)
 
         model_name = "/models/gemma-3-12b-it"
-        from auto_round import AutoRoundMLLM
-        from auto_round.utils import mllm_load_model
 
         model, processor, tokenizer, image_processor = mllm_load_model(model_name)
         autoround = AutoRoundMLLM(
@@ -216,15 +214,15 @@ def test_vlm_gguf(self):
             image_processor=image_processor,
             device="auto",
             nsamples=32,
-            iters=1,
+            iters=0,
         )
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
         self.assertTrue("mmproj-model.gguf" in os.listdir("./saved"))
-        file_size = os.path.getsize("./saved/gemma-3-12b-it-12B-Q4_K_M.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 6568, delta=1.0)
+        file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
+        self.assertAlmostEqual(file_size, 6962, delta=5.0)
         file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
-        self.assertAlmostEqual(file_size, 1599, delta=1.0)
+        self.assertAlmostEqual(file_size, 1599, delta=5.0)
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     # @require_gguf
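
The CUDA GGUF tests now locate the exported file with os.listdir(quantized_model_path)[0] instead of hard-coding the size-suffixed filename. If the output directory ever holds files other than the single .gguf artifact, a slightly more defensive variant (a sketch, not part of this commit) would filter by extension first:

import os


def find_gguf_file(output_dir: str) -> str:
    """Return the first .gguf file in output_dir, ignoring any other files."""
    gguf_files = sorted(f for f in os.listdir(output_dir) if f.endswith(".gguf"))
    if not gguf_files:
        raise FileNotFoundError(f"no .gguf file found in {output_dir}")
    return gguf_files[0]


# gguf_file = find_gguf_file("./saved")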

test/test_cuda/test_vlms.py

Lines changed: 0 additions & 1 deletion

@@ -140,7 +140,6 @@ def test_mllm_detect(self):
             "/models/Phi-3.5-vision-instruct",
             "/models/Qwen2-VL-2B-Instruct",
             "/models/SmolVLM-256M-Instruct",
-            "/models/Llama-4-Maverick-17B-128E-Instruct",
             "/models/Mistral-Small-3.1-24B-Instruct-2503",
             "/models/InternVL3-1B",
             "/models/pixtral-12b",
