@@ -28,6 +28,7 @@
     LazyImport,
     check_to_quantized,
     clear_memory,
+    download_hf_model,
     flatten_list,
     get_block_names,
     get_gguf_architecture,
@@ -73,7 +74,10 @@ def create_model_class(
     low_cpu_mem_usage=False,
     model_type=convert_hf_to_gguf.ModelType.TEXT,
 ):
-    tmp_work_dir = Path(os.path.join(output_dir, TMP_DIR_NAME))
+    tmp_work_dir = model.name_or_path
+    os.makedirs(output_dir, exist_ok=True)
+    if not os.path.isdir(tmp_work_dir):
+        tmp_work_dir = download_hf_model(tmp_work_dir)
     with torch.inference_mode():
         model_architecture = get_gguf_architecture(tmp_work_dir, model_type=model_type)
         try:
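With this change, `create_model_class` no longer stages a temporary copy of the checkpoint under `output_dir`; it reads directly from `model.name_or_path`, fetching a local snapshot first when that value is a Hub repo id rather than a directory on disk. A minimal sketch of what the newly imported `download_hf_model` helper might do, assuming it wraps `huggingface_hub.snapshot_download` (the real helper lives in the project's utils and may differ):

```python
# Hypothetical stand-in for the download_hf_model helper imported above.
from huggingface_hub import snapshot_download

def download_hf_model(repo_id: str) -> str:
    # Download (or reuse from the local cache) the files of the repo and
    # return the path of the directory that holds them.
    return snapshot_download(repo_id=repo_id)
```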
@@ -95,7 +99,7 @@ def create_model_class(
     output_type = FTYPE_MAP.get(output_type.lower())

     model_instance = model_class(
-        dir_model=tmp_work_dir,
+        dir_model=Path(tmp_work_dir),
         ftype=output_type,
         fname_out=Path(output_dir),
         is_big_endian=False,
@@ -126,19 +130,10 @@ def pack_gguf_layer(
 ):
     """Export the model to gguf format."""
     global gguf_model_instance_global
-    tmp_work_dir = Path(os.path.join(output_dir, TMP_DIR_NAME))
-    if output_dir is not None and os.path.exists(output_dir) and not os.path.exists(tmp_work_dir):
+    if output_dir is not None and os.path.exists(output_dir):
         logger.warning_once(f"{output_dir} already exists, this may cause model conflict")
-    tmp_work_dir = Path(os.path.join(output_dir, TMP_DIR_NAME))
     if "gguf_model_instance_global" not in globals():
         config = model.config
-        config.save_pretrained(tmp_work_dir)
-        if tokenizer is not None and hasattr(tokenizer, "save_pretrained"):
-            tokenizer.save_pretrained(tmp_work_dir)
-        if processor is not None:
-            processor.save_pretrained(tmp_work_dir)
-        if image_processor is not None:
-            image_processor.save_pretrained(tmp_work_dir)

         gguf_model_instance_global = [
             create_model_class(
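Because the converter now reads metadata straight from the source checkpoint, the `save_pretrained` round-trips for the config, tokenizer, processor, and image processor are dropped here (and again in `save_quantized_as_gguf` below). The path resolution this relies on, restated as a small standalone sketch with a hypothetical helper name (the logic is copied from the `create_model_class` hunk above; `download_hf_model` is the helper imported in this patch):

```python
import os

def resolve_source_dir(model) -> str:
    # model.name_or_path is either a local directory or a Hub repo id.
    src = model.name_or_path
    if not os.path.isdir(src):
        # Not a directory on disk: treat it as a repo id and download it.
        src = download_hf_model(src)
    return src
```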
@@ -201,27 +196,11 @@ def pack_gguf_layer(
 @torch.inference_mode()
 def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", layer_config=None, vlm=False, **kwargs):
     """Export the model to gguf format."""
-    tmp_work_dir = Path(os.path.join(output_dir, TMP_DIR_NAME))
-    if output_dir is not None and os.path.exists(output_dir) and not os.path.exists(tmp_work_dir):
-        logger.warning(f"{output_dir} already exists, this may cause model conflict")
-
     st = time.time()
     global gguf_model_instance_global

     model = kwargs["model"]
     if "gguf_model_instance_global" not in globals():
-        config = model.config
-        config.save_pretrained(tmp_work_dir)
-        tokenizer = kwargs.get("tokenizer", None)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(tmp_work_dir)
-        processor = kwargs.get("processor", None)
-        if processor is not None:
-            processor.save_pretrained(tmp_work_dir)
-        image_processor = kwargs.get("image_processor", None)
-        if image_processor is not None:
-            image_processor.save_pretrained(tmp_work_dir)
-
         gguf_model_instance_global = [
             create_model_class(output_dir, model, layer_config, backend, model_type=convert_hf_to_gguf.ModelType.TEXT)
         ]
@@ -237,6 +216,5 @@ def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", layer_config=None, vlm=False, **kwargs):
     rt = time.time() - st
     logger.info(f"Model successfully exported to {gguf_model.fname_out}, running time={rt}")
     del gguf_model_instance_global
-    shutil.rmtree(tmp_work_dir, ignore_errors=True)

     return model
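For reference, a hypothetical call site: the only kwarg this diff shows being read is `model`, and since nothing is re-serialized into a temporary directory anymore, `tokenizer` and `processor` are no longer needed for the export itself. The sketch assumes `model.name_or_path` still points at the original checkpoint directory or repo id:

```python
# Hypothetical usage of the updated export entry point.
model = save_quantized_as_gguf("./gguf_out", backend="gguf:q4_0", model=model)
```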