Commit 2448dcf: support for model scope (#957)

Signed-off-by: n1ck-guo <heng.guo@intel.com>

1 parent 3c1a678
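In brief: this commit threads a new platform argument ("hf" or "model_scope") from the CLI through AutoRound and every compressor, adds an AR_USE_MODELSCOPE environment variable plus a set_config helper that force ModelScope loading, and replaces download_hf_model with the platform-aware download_or_get_path at the GGUF export call sites. Ten of the 13 changed files are shown below.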

File tree

13 files changed: +218 -32 lines changed

auto_round/__main__.py

Lines changed: 7 additions & 0 deletions

@@ -45,6 +45,12 @@ def __init__(self, *args, **kwargs):
             help="Path to the pre-trained model or model identifier from huggingface.co/models. "
             "Examples: 'facebook/opt-125m', 'bert-base-uncased', or local path like '/path/to/model'",
         )
+        basic.add_argument(
+            "--platform",
+            default="hf",
+            help="Platform to load the pre-trained model. Options: [hf, model_scope]."
+            " hf stands for huggingface and model_scope stands for model scope.",
+        )
         basic.add_argument(
             "--scheme",
             default="W4A16",
@@ -566,6 +572,7 @@ def tune(args):

     autoround: BaseCompressor = AutoRound(
         model=model_name,
+        platform=args.platform,
         scheme=scheme,
         dataset=args.dataset,
         iters=args.iters,
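With the new flag, a ModelScope-hosted checkpoint can be quantized straight from the command line, e.g. python -m auto_round --model <model_id> --platform model_scope --scheme W4A16 (a sketch: the --model flag name and module entry point are assumed from context, and the model id is left as a placeholder).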

auto_round/autoround.py

Lines changed: 13 additions & 1 deletion

@@ -43,6 +43,7 @@ class AutoRound:
     Attributes:
         model (torch.nn.Module): The loaded PyTorch model in eval mode.
        tokenizer: Tokenizer used to prepare input text for calibration/tuning.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         bits (int): Weight quantization bits.
         group_size (int): Per-group size for weight quantization.
         sym (bool): Whether to use symmetric weight quantization.
@@ -67,6 +68,7 @@ def __new__(
         cls,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform: str = "hf",
         scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16",
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -146,7 +148,7 @@ def __new__(
         """
         model_cls = []

-        if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model):
+        if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform):
             logger.info("using MLLM mode for multimodal model.")
             model_cls.append(MLLMCompressor)
         if extra_config:
@@ -170,6 +172,7 @@ def __new__(
         ar = dynamic_compressor(
             model=model,
             tokenizer=tokenizer,
+            platform=platform,
             scheme=scheme,
             layer_config=layer_config,
             dataset=dataset,
@@ -314,6 +317,7 @@ def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform: str = "hf",
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -331,6 +335,7 @@ def __init__(
         super().__init__(
             model=model,
             tokenizer=tokenizer,
+            platform=platform,
             scheme=scheme,
             layer_config=layer_config,
             dataset=dataset,
@@ -354,6 +359,7 @@ class AutoRoundAdam(AdamCompressor):
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations
         bits (int): Number of bits for quantization (default is 4).
         group_size (int): Size of the quantization group (default is 128).
@@ -413,6 +419,7 @@ def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform: str = "hf",
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -431,6 +438,7 @@ def __init__(
         super().__init__(
             model=model,
             tokenizer=tokenizer,
+            platform=platform,
             scheme=scheme,
             layer_config=layer_config,
             batch_size=batch_size,
@@ -455,6 +463,7 @@ class AutoRoundMLLM(MLLMCompressor):
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         processor: Any multi-modal model will require an object to encode or
             decode the data that groups several modalities (among text, vision and audio).
         image_processor: Image processor for special model like llava.
@@ -513,6 +522,7 @@ def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform: str = "hf",
         processor=None,
         image_processor=None,
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
@@ -533,6 +543,7 @@ def __init__(
         super().__init__(
             model=model,
             tokenizer=tokenizer,
+            platform=platform,
             processor=processor,
             image_processor=image_processor,
             scheme=scheme,
@@ -559,6 +570,7 @@ class AutoRoundDiffusion(DiffusionCompressor):
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data, is not used for diffusion models.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         guidance_scale (float): Control how much the image generation process follows the text prompt.
             The more it is, the more closely it follows the prompt (default is 7.5).
         num_inference_steps (int): The reference number of denoising steps (default is 50).
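In practice, selecting ModelScope becomes a one-keyword change. A minimal sketch, assuming a ModelScope-hosted model id (placeholder below) and the quantize_and_save entry point the project already exposes:

    from auto_round import AutoRound

    ar = AutoRound(
        model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder id, resolved on ModelScope rather than the HF Hub
        platform="model_scope",              # new in this commit; the default "hf" keeps the old behavior
        scheme="W4A16",
    )
    ar.quantize_and_save("./qmodel-w4a16")   # existing entry point, not part of this diff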

auto_round/compressors/adam.py

Lines changed: 3 additions & 0 deletions

@@ -27,6 +27,7 @@ class AdamCompressor(BaseCompressor):
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations
         bits (int): Number of bits for quantization (default is 4).
         group_size (int): Size of the quantization group (default is 128).
@@ -86,6 +87,7 @@ def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform="hf",
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -104,6 +106,7 @@ def __init__(
         super(AdamCompressor, self).__init__(
             model=model,
             tokenizer=tokenizer,
+            platform=platform,
             scheme=scheme,
             layer_config=layer_config,
             batch_size=batch_size,

auto_round/compressors/base.py

Lines changed: 8 additions & 0 deletions

@@ -30,6 +30,7 @@
 from tqdm import tqdm
 from transformers import set_seed

+from auto_round import envs
 from auto_round.auto_scheme.gen_auto_scheme import AutoScheme
 from auto_round.compressors.utils import (
     block_forward,
@@ -105,6 +106,7 @@ class BaseCompressor(object):
     Attributes:
         model (torch.nn.Module): The loaded PyTorch model in eval mode.
         tokenizer: Tokenizer used to prepare input text for calibration/tuning.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         bits (int): Weight quantization bits.
         group_size (int): Per-group size for weight quantization.
         sym (bool): Whether to use symmetric weight quantization.
@@ -129,6 +131,7 @@ def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform="hf",
         scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16",
         layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -228,6 +231,10 @@ def __init__(
         device = kwargs.pop("device", None)
         # Scale factor for RAM usage per parameter.
         mem_per_param_scale = kwargs.pop("mem_per_param_scale", None)
+
+        if envs.AR_USE_MODELSCOPE:
+            platform = "model_scope"
+        self.platform = platform
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
         self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False
         self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False
@@ -259,6 +266,7 @@ def __init__(
         if isinstance(model, str):
             model, tokenizer = llm_load_model(
                 model,
+                platform=platform,
                 device="cpu",  # always load cpu first
             )
         elif tokenizer is None and not self.diffusion and iters > 0:
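Because the AR_USE_MODELSCOPE check runs before self.platform is assigned, the environment variable overrides whatever platform the caller passed. A minimal sketch of that precedence, using only what the hunks above show (the envs module presumably resolves attributes through its lambda table, so values are read from os.environ at access time):

    import os

    # Must be set before the compressor is constructed.
    os.environ["AR_USE_MODELSCOPE"] = "1"

    from auto_round import envs
    assert envs.AR_USE_MODELSCOPE  # "1".lower() in ["1", "true"] -> True

    # Any BaseCompressor built from here on uses platform="model_scope",
    # even if it was constructed with platform="hf".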

auto_round/compressors/diffusion/compressor.py

Lines changed: 4 additions & 1 deletion

@@ -47,6 +47,7 @@ class DiffusionCompressor(BaseCompressor):
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data, is not used for diffusion models.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         guidance_scale (float): Control how much the image generation process follows the text prompt.
             The more it is, the more closely it follows the prompt (default is 7.5).
         num_inference_steps (int): The reference number of denoising steps (default is 50).
@@ -81,6 +82,7 @@ def __init__(
         self,
         model: Union[object, str],
         tokenizer=None,
+        platform: str = "hf",
         guidance_scale: float = 7.5,
         num_inference_steps: int = 50,
         generator_seed: int = None,
@@ -110,7 +112,7 @@ def __init__(
         self._set_device(device_map)

         if isinstance(model, str):
-            pipe, model = diffusion_load_model(model, device=self.device)
+            pipe, model = diffusion_load_model(model, platform=platform, device=self.device)
         elif isinstance(model, pipeline_utils.DiffusionPipeline):
             pipe = model
             model = pipe.transformer
@@ -145,6 +147,7 @@ def __init__(
         super(DiffusionCompressor, self).__init__(
             model=model,
             tokenizer=None,
+            platform=platform,
             scheme=scheme,
             layer_config=layer_config,
             dataset=dataset,
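The same keyword now reaches the diffusion path. A sketch, assuming a ModelScope-hosted pipeline id (the id below is a placeholder; the import path follows auto_round/autoround.py shown above):

    from auto_round.autoround import AutoRoundDiffusion

    # Hypothetical usage; every keyword other than platform keeps its
    # documented default (guidance_scale=7.5, num_inference_steps=50, ...).
    compressor = AutoRoundDiffusion(
        model="stabilityai/stable-diffusion-xl-base-1.0",  # placeholder id
        platform="model_scope",
    )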

auto_round/compressors/mllm/compressor.py

Lines changed: 5 additions & 1 deletion

@@ -87,6 +87,7 @@ class MLLMCompressor(BaseCompressor):
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data.
+        platform (str): The platform to load the pretrained model, options: ["hf", "model_scope"]
         processor: Any multi-modal model will require an object to encode or
             decode the data that groups several modalities (among text, vision and audio).
         image_processor: Image processor for special model like llava.
@@ -145,6 +146,7 @@ def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
+        platform: str = "hf",
         processor=None,
         image_processor=None,
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
@@ -171,7 +173,7 @@ def __init__(
         self._set_device(device_map)

         if isinstance(model, str):
-            model, processor, tokenizer, image_processor = mllm_load_model(model, device=self.device)
+            model, processor, tokenizer, image_processor = mllm_load_model(model, platform=platform, device=self.device)

         self.model = model
         quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module)
@@ -258,6 +260,7 @@ def __init__(
         super(MLLMCompressor, self).__init__(
             model=model,
             tokenizer=tokenizer,
+            platform=platform,
             scheme=scheme,
             layer_config=layer_config,
             dataset=dataset,
@@ -374,6 +377,7 @@ def calib(self, nsamples, bs):
                 continue
             try:
                 if isinstance(data_new, torch.Tensor):
+                    data_new = data_new.to(self.model.device)
                     self.model(data_new)
                 elif isinstance(data_new, tuple) or isinstance(data_new, list):
                     self.model(*data_new)
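Besides threading platform through mllm_load_model, the final hunk also moves tensor-type calibration batches onto the model's device before the forward call, which prevents a device mismatch when the model has been dispatched to an accelerator.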

auto_round/compressors/utils.py

Lines changed: 3 additions & 3 deletions

@@ -480,7 +480,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.

     from auto_round.export.export_to_gguf.convert import download_convert_file
     from auto_round.logger import logger
-    from auto_round.utils.model import download_hf_model, get_gguf_architecture
+    from auto_round.utils.model import download_or_get_path, get_gguf_architecture

     formats = sorted(formats, key=lambda x: len(x))
     export_gguf = False
@@ -505,7 +505,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.
     else:
         model_path = args_or_ar.model.name_or_path
         if not os.path.isdir(model_path):
-            model_path = download_hf_model(model_path)
+            model_path = download_or_get_path(model_path, args_or_ar.platform)
         model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
         if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
             logger.warning(
@@ -539,7 +539,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.
     else:
         model_path = args_or_ar.model.name_or_path
         if not os.path.isdir(model_path):
-            model_path = download_hf_model(model_path)
+            model_path = download_or_get_path(model_path, args_or_ar.platform)
         model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
         if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
             logger.error(f"Model {model_architecture} is not supported to export gguf format.")
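The helper behind the rename lives in auto_round/utils/model.py, which is among the 13 changed files but not shown in this excerpt. A hypothetical sketch of its shape, assuming it dispatches on platform (the body below is inferred from the call sites, not taken from the commit):

    import os

    def download_or_get_path(model_name_or_path: str, platform: str = "hf") -> str:
        """Return a local directory for the model, downloading it when given a hub id."""
        if os.path.isdir(model_name_or_path):
            return model_name_or_path  # already a local checkout
        if platform == "model_scope":
            from modelscope import snapshot_download  # ModelScope SDK
        else:
            from huggingface_hub import snapshot_download
        return snapshot_download(model_name_or_path)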

auto_round/envs.py

Lines changed: 29 additions & 0 deletions

@@ -18,10 +18,12 @@

 if TYPE_CHECKING:
     AR_LOG_LEVEL: str = "INFO"
+    AR_USE_MODELSCOPE: bool = False

 environment_variables: dict[str, Callable[[], Any]] = {
     # this is used for configuring the default logging level
     "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
+    "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"],
 }

@@ -41,3 +43,30 @@ def is_set(name: str):
     if name in environment_variables:
         return name in os.environ
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def set_config(**kwargs):
+    """
+    Set configuration values for environment variables.
+
+    Args:
+        **kwargs: Keyword arguments where keys are environment variable names
+            and values are the desired values to set.
+
+    Example:
+        set_config(AR_LOG_LEVEL="DEBUG", AR_USE_MODELSCOPE=True)
+    """
+    for key, value in kwargs.items():
+        if key in environment_variables:
+            # Convert the value to an appropriate string format
+            if key == "AR_USE_MODELSCOPE":
+                # Handle boolean values for AR_USE_MODELSCOPE
+                str_value = "true" if value in [True, "True", "true", "1", 1] else "false"
+            else:
+                # For other variables, convert to string
+                str_value = str(value)
+
+            # Set the environment variable
+            os.environ[key] = str_value
+        else:
+            raise AttributeError(f"module {__name__!r} has no attribute {key!r}")
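A short usage sketch of the new helper: it normalizes values to strings because os.environ only stores str, and the getter lambdas re-read the variable on each access.

    from auto_round import envs

    envs.set_config(AR_USE_MODELSCOPE=True)  # stores os.environ["AR_USE_MODELSCOPE"] = "true"
    print(envs.AR_USE_MODELSCOPE)            # True

    envs.set_config(AR_LOG_LEVEL="debug")    # stored verbatim as "debug"
    print(envs.AR_LOG_LEVEL)                 # "DEBUG"; the AR_LOG_LEVEL getter upper-cases it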

auto_round/export/export_to_gguf/convert.py

Lines changed: 2 additions & 2 deletions

@@ -167,11 +167,11 @@ def is_extra_tensor(tensor_name):
     from safetensors import safe_open

     from auto_round.export.export_to_gguf.special_handle import get_tensor_from_file
-    from auto_round.utils import download_hf_model
+    from auto_round.utils import download_or_get_path

     dir_path = cls.model.name_or_path
     if not os.path.isdir(dir_path):
-        dir_path = download_hf_model(dir_path)
+        dir_path = download_or_get_path(dir_path)
     INDEX_FILE = "model.safetensors.index.json"
     if INDEX_FILE in os.listdir(dir_path):
         with open(os.path.join(dir_path, INDEX_FILE)) as f:

auto_round/export/export_to_gguf/export.py

Lines changed: 2 additions & 2 deletions

@@ -28,7 +28,7 @@
     LazyImport,
     check_to_quantized,
     clear_memory,
-    download_hf_model,
+    download_or_get_path,
     flatten_list,
     get_block_names,
     get_gguf_architecture,
@@ -77,7 +77,7 @@ def create_model_class(
     tmp_work_dir = model.name_or_path
     os.makedirs(output_dir, exist_ok=True)
     if not os.path.isdir(tmp_work_dir):
-        tmp_work_dir = download_hf_model(tmp_work_dir)
+        tmp_work_dir = download_or_get_path(tmp_work_dir)
     with torch.inference_mode():
         model_architecture = get_gguf_architecture(tmp_work_dir, model_type=model_type)
     try:
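Note that both GGUF export call sites invoke download_or_get_path without an explicit platform argument, so they rely on the helper's default; presumably that default is "hf" unless the AR_USE_MODELSCOPE override applies inside the helper, whose body is not part of this excerpt.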
