7 changes: 7 additions & 0 deletions auto_round/__main__.py
@@ -45,6 +45,12 @@ def __init__(self, *args, **kwargs):
help="Path to the pre-trained model or model identifier from huggingface.co/models. "
"Examples: 'facebook/opt-125m', 'bert-base-uncased', or local path like '/path/to/model'",
)
basic.add_argument(
"--platform",
default="hf",
help="Platform to load the pre-trained model. Options: [hf, model_scope]."
" hf stands for huggingface and model_scope stands for model scope.",
)
basic.add_argument(
"--scheme",
default="W4A16",
@@ -566,6 +572,7 @@ def tune(args):

autoround: BaseCompressor = AutoRound(
model=model_name,
platform=args.platform,
scheme=scheme,
dataset=args.dataset,
iters=args.iters,
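With this flag in place, the CLI can pull the checkpoint from ModelScope instead of the Hugging Face Hub. A minimal invocation, shown as a sketch (the model id is illustrative, not taken from this PR):

    # Fetch the model from ModelScope and quantize with the default W4A16 scheme.
    # Omitting --platform keeps the previous behavior (hf).
    python -m auto_round --model Qwen/Qwen2-0.5B-Instruct --platform model_scope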
14 changes: 13 additions & 1 deletion auto_round/autoround.py
@@ -43,6 +43,7 @@ class AutoRound:
Attributes:
model (torch.nn.Module): The loaded PyTorch model in eval mode.
tokenizer: Tokenizer used to prepare input text for calibration/tuning.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
bits (int): Weight quantization bits.
group_size (int): Per-group size for weight quantization.
sym (bool): Whether to use symmetric weight quantization.
@@ -67,6 +68,7 @@ def __new__(
cls,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform: str = "hf",
scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16",
layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -146,7 +148,7 @@ def __new__(
"""
model_cls = []

if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model):
if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform):
logger.info("using MLLM mode for multimodal model.")
model_cls.append(MLLMCompressor)
if extra_config:
@@ -170,6 +172,7 @@
ar = dynamic_compressor(
model=model,
tokenizer=tokenizer,
platform=platform,
scheme=scheme,
layer_config=layer_config,
dataset=dataset,
@@ -314,6 +317,7 @@ def __init__(
self,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform: str = "hf",
scheme: Union[str, dict, QuantizationScheme] = "W4A16",
layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -331,6 +335,7 @@ def __init__(
super().__init__(
model=model,
tokenizer=tokenizer,
platform=platform,
scheme=scheme,
layer_config=layer_config,
dataset=dataset,
@@ -354,6 +359,7 @@ class AutoRoundAdam(AdamCompressor):
Args:
model: The PyTorch model to be quantized.
tokenizer: An optional tokenizer for processing input data.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations
bits (int): Number of bits for quantization (default is 4).
group_size (int): Size of the quantization group (default is 128).
@@ -413,6 +419,7 @@ def __init__(
self,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform: str = "hf",
scheme: Union[str, dict, QuantizationScheme] = "W4A16",
layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -431,6 +438,7 @@ def __init__(
super().__init__(
model=model,
tokenizer=tokenizer,
platform=platform,
scheme=scheme,
layer_config=layer_config,
batch_size=batch_size,
@@ -455,6 +463,7 @@ class AutoRoundMLLM(MLLMCompressor):
Args:
model: The PyTorch model to be quantized.
tokenizer: An optional tokenizer for processing input data.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
processor: Any multi-modal model will require an object to encode or
decode the data that groups several modalities (among text, vision and audio).
image_processor: Image processor for special models like LLaVA.
@@ -513,6 +522,7 @@ def __init__(
self,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform: str = "hf",
processor=None,
image_processor=None,
scheme: Union[str, dict, QuantizationScheme] = "W4A16",
@@ -533,6 +543,7 @@ def __init__(
super().__init__(
model=model,
tokenizer=tokenizer,
platform=platform,
processor=processor,
image_processor=image_processor,
scheme=scheme,
@@ -559,6 +570,7 @@ class AutoRoundDiffusion(DiffusionCompressor):
Args:
model: The PyTorch model to be quantized.
tokenizer: An optional tokenizer for processing input data; not used for diffusion models.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
guidance_scale (float): Controls how closely the image generation process follows the text prompt.
The higher the value, the more closely the output follows the prompt (default is 7.5).
num_inference_steps (int): The reference number of denoising steps (default is 50).
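The same parameter is exposed through the Python API, since `platform` is threaded from `AutoRound.__new__` into each compressor's `__init__`. A minimal sketch, assuming the usual `quantize_and_save` entry point and an illustrative model id:

    from auto_round import AutoRound

    # Load the checkpoint from ModelScope rather than the Hugging Face Hub.
    ar = AutoRound(
        model="Qwen/Qwen2-0.5B-Instruct",  # illustrative model id
        platform="model_scope",
        scheme="W4A16",
    )
    ar.quantize_and_save("./tmp_autoround")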
3 changes: 3 additions & 0 deletions auto_round/compressors/adam.py
@@ -27,6 +27,7 @@ class AdamCompressor(BaseCompressor):
Args:
model: The PyTorch model to be quantized.
tokenizer: An optional tokenizer for processing input data.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations
bits (int): Number of bits for quantization (default is 4).
group_size (int): Size of the quantization group (default is 128).
@@ -86,6 +87,7 @@ def __init__(
self,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform="hf",
scheme: Union[str, dict, QuantizationScheme] = "W4A16",
layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -104,6 +106,7 @@ def __init__(
super(AdamCompressor, self).__init__(
model=model,
tokenizer=tokenizer,
platform=platform,
scheme=scheme,
layer_config=layer_config,
batch_size=batch_size,
8 changes: 8 additions & 0 deletions auto_round/compressors/base.py
@@ -30,6 +30,7 @@
from tqdm import tqdm
from transformers import set_seed

from auto_round import envs
from auto_round.auto_scheme.gen_auto_scheme import AutoScheme
from auto_round.compressors.utils import (
block_forward,
@@ -105,6 +106,7 @@ class BaseCompressor(object):
Attributes:
model (torch.nn.Module): The loaded PyTorch model in eval mode.
tokenizer: Tokenizer used to prepare input text for calibration/tuning.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
bits (int): Weight quantization bits.
group_size (int): Per-group size for weight quantization.
sym (bool): Whether to use symmetric weight quantization.
@@ -129,6 +131,7 @@ def __init__(
self,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform="hf",
scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16",
layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
@@ -228,6 +231,10 @@ def __init__(
device = kwargs.pop("device", None)
# Scale factor for RAM usage per parameter.
mem_per_param_scale = kwargs.pop("mem_per_param_scale", None)

if envs.AR_USE_MODELSCOPE:
platform = "model_scope"
self.platform = platform
self.quant_lm_head = kwargs.pop("quant_lm_head", False)
self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False
self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False
@@ -259,6 +266,7 @@ def __init__(
if isinstance(model, str):
model, tokenizer = llm_load_model(
model,
platform=platform,
device="cpu", # always load cpu first
)
elif tokenizer is None and not self.diffusion and iters > 0:
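Note the precedence this introduces: when `AR_USE_MODELSCOPE` is set in the environment, `BaseCompressor.__init__` overwrites whatever `platform` the caller passed. A sketch, assuming the envs module reads the variable lazily at construction time (as its lambda-based table suggests) and using a model id that exists on both hubs:

    import os

    # Setting the switch before construction wins over the explicit argument:
    # BaseCompressor.__init__ replaces platform="hf" with "model_scope".
    os.environ["AR_USE_MODELSCOPE"] = "1"

    from auto_round import AutoRound

    ar = AutoRound(model="Qwen/Qwen2-0.5B-Instruct", platform="hf")
    print(ar.platform)  # "model_scope"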
5 changes: 4 additions & 1 deletion auto_round/compressors/diffusion/compressor.py
@@ -47,6 +47,7 @@ class DiffusionCompressor(BaseCompressor):
Args:
model: The PyTorch model to be quantized.
tokenizer: An optional tokenizer for processing input data; not used for diffusion models.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
guidance_scale (float): Controls how closely the image generation process follows the text prompt.
The higher the value, the more closely the output follows the prompt (default is 7.5).
num_inference_steps (int): The reference number of denoising steps (default is 50).
@@ -81,6 +82,7 @@ def __init__(
self,
model: Union[object, str],
tokenizer=None,
platform: str = "hf",
guidance_scale: float = 7.5,
num_inference_steps: int = 50,
generator_seed: int = None,
@@ -110,7 +112,7 @@ def __init__(
self._set_device(device_map)

if isinstance(model, str):
pipe, model = diffusion_load_model(model, device=self.device)
pipe, model = diffusion_load_model(model, platform=platform, device=self.device)
elif isinstance(model, pipeline_utils.DiffusionPipeline):
pipe = model
model = pipe.transformer
@@ -145,6 +147,7 @@ def __init__(
super(DiffusionCompressor, self).__init__(
model=model,
tokenizer=None,
platform=platform,
scheme=scheme,
layer_config=layer_config,
dataset=dataset,
6 changes: 5 additions & 1 deletion auto_round/compressors/mllm/compressor.py
@@ -87,6 +87,7 @@ class MLLMCompressor(BaseCompressor):
Args:
model: The PyTorch model to be quantized.
tokenizer: An optional tokenizer for processing input data.
platform (str): The platform to load the pretrained model from, options: ["hf", "model_scope"].
processor: Any multi-modal model will require an object to encode or
decode the data that groups several modalities (among text, vision and audio).
image_processor: Image processor for special models like LLaVA.
@@ -145,6 +146,7 @@ def __init__(
self,
model: Union[torch.nn.Module, str],
tokenizer=None,
platform: str = "hf",
processor=None,
image_processor=None,
scheme: Union[str, dict, QuantizationScheme] = "W4A16",
@@ -171,7 +173,7 @@ def __init__(
self._set_device(device_map)

if isinstance(model, str):
model, processor, tokenizer, image_processor = mllm_load_model(model, device=self.device)
model, processor, tokenizer, image_processor = mllm_load_model(model, platform=platform, device=self.device)

self.model = model
quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module)
Expand Down Expand Up @@ -258,6 +260,7 @@ def __init__(
super(MLLMCompressor, self).__init__(
model=model,
tokenizer=tokenizer,
platform=platform,
scheme=scheme,
layer_config=layer_config,
dataset=dataset,
@@ -374,6 +377,7 @@ def calib(self, nsamples, bs):
continue
try:
if isinstance(data_new, torch.Tensor):
data_new = data_new.to(self.model.device)
self.model(data_new)
elif isinstance(data_new, tuple) or isinstance(data_new, list):
self.model(*data_new)
6 changes: 3 additions & 3 deletions auto_round/compressors/utils.py
@@ -480,7 +480,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.

from auto_round.export.export_to_gguf.convert import download_convert_file
from auto_round.logger import logger
from auto_round.utils.model import download_hf_model, get_gguf_architecture
from auto_round.utils.model import download_or_get_path, get_gguf_architecture

formats = sorted(formats, key=lambda x: len(x))
export_gguf = False
@@ -505,7 +505,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.
else:
model_path = args_or_ar.model.name_or_path
if not os.path.isdir(model_path):
model_path = download_hf_model(model_path)
model_path = download_or_get_path(model_path, args_or_ar.platform)
model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
logger.warning(
@@ -539,7 +539,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.
else:
model_path = args_or_ar.model.name_or_path
if not os.path.isdir(model_path):
model_path = download_hf_model(model_path)
model_path = download_or_get_path(model_path, args_or_ar.platform)
model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT)
if model_architecture not in ModelBase._model_classes[ModelType.TEXT]:
logger.error(f"Model {model_architecture} is not supported to export gguf format.")
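Both call sites now route through `download_or_get_path`, which takes the platform into account when the model is not already a local directory. The PR does not show its body (it lives in `auto_round/utils/model.py`); under that caveat, a plausible sketch of the dispatch:

    import os

    def download_or_get_path(model_name_or_path: str, platform: str = "hf") -> str:
        """Return a local directory for the model, downloading it if necessary.

        Hypothetical reconstruction; the real helper is in
        auto_round/utils/model.py and may differ in caching and error handling.
        """
        if os.path.isdir(model_name_or_path):
            return model_name_or_path
        if platform == "model_scope":
            # ModelScope's downloader, mirroring huggingface_hub's API.
            from modelscope import snapshot_download
            return snapshot_download(model_name_or_path)
        from huggingface_hub import snapshot_download
        return snapshot_download(model_name_or_path)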
29 changes: 29 additions & 0 deletions auto_round/envs.py
@@ -18,10 +18,12 @@

if TYPE_CHECKING:
AR_LOG_LEVEL: str = "INFO"
AR_USE_MODELSCOPE: bool = False

environment_variables: dict[str, Callable[[], Any]] = {
# this is used for configuring the default logging level
"AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(),
"AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"],
}


@@ -41,3 +43,30 @@ def is_set(name: str):
if name in environment_variables:
return name in os.environ
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def set_config(**kwargs):
"""
Set configuration values for environment variables.

Args:
**kwargs: Keyword arguments where keys are environment variable names
and values are the desired values to set.

Example:
set_config(AR_LOG_LEVEL="DEBUG", AR_USE_MODELSCOPE=True)
"""
for key, value in kwargs.items():
if key in environment_variables:
# Convert value to appropriate string format
if key == "AR_USE_MODELSCOPE":
# Handle boolean values for AR_USE_MODELSCOPE
str_value = "true" if value in [True, "True", "true", "1", 1] else "false"
else:
# For other variables, convert to string
str_value = str(value)

# Set the environment variable
os.environ[key] = str_value
else:
raise AttributeError(f"module {__name__!r} has no attribute {key!r}")
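Because `set_config` writes through to `os.environ` and the entries in `environment_variables` are callables evaluated on each read, a change made at runtime is picked up on the next attribute access. A short usage sketch, assuming the module exposes the variables via a lazy `__getattr__` (as the `is_set` helper implies):

    from auto_round import envs

    envs.set_config(AR_USE_MODELSCOPE=True)  # stores "true" in os.environ
    print(envs.AR_USE_MODELSCOPE)            # True on the next lazy read

    envs.set_config(AR_LOG_LEVEL="debug")    # stored as given
    print(envs.AR_LOG_LEVEL)                 # "DEBUG" (the getter upper-cases)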
4 changes: 2 additions & 2 deletions auto_round/export/export_to_gguf/convert.py
@@ -167,11 +167,11 @@ def is_extra_tensor(tensor_name):
from safetensors import safe_open

from auto_round.export.export_to_gguf.special_handle import get_tensor_from_file
from auto_round.utils import download_hf_model
from auto_round.utils import download_or_get_path

dir_path = cls.model.name_or_path
if not os.path.isdir(dir_path):
dir_path = download_hf_model(dir_path)
dir_path = download_or_get_path(dir_path)
INDEX_FILE = "model.safetensors.index.json"
if INDEX_FILE in os.listdir(dir_path):
with open(os.path.join(dir_path, INDEX_FILE)) as f:
4 changes: 2 additions & 2 deletions auto_round/export/export_to_gguf/export.py
@@ -28,7 +28,7 @@
LazyImport,
check_to_quantized,
clear_memory,
download_hf_model,
download_or_get_path,
flatten_list,
get_block_names,
get_gguf_architecture,
@@ -77,7 +77,7 @@ def create_model_class(
tmp_work_dir = model.name_or_path
os.makedirs(output_dir, exist_ok=True)
if not os.path.isdir(tmp_work_dir):
tmp_work_dir = download_hf_model(tmp_work_dir)
tmp_work_dir = download_or_get_path(tmp_work_dir)
with torch.inference_mode():
model_architecture = get_gguf_architecture(tmp_work_dir, model_type=model_type)
try:
4 changes: 2 additions & 2 deletions auto_round/export/export_to_gguf/special_handle.py
@@ -20,7 +20,7 @@
from safetensors import safe_open
from torch import Tensor

from auto_round.utils import download_hf_model
from auto_round.utils import download_or_get_path


def handle_special_model(cls, model_architecture):
@@ -32,7 +32,7 @@ def handle_special_model(cls, model_architecture):

def get_tensor_from_file(dir_path, tensor_name):
if not os.path.isdir(dir_path):
dir_path = download_hf_model(dir_path)
dir_path = download_or_get_path(dir_path)
INDEX_FILE = "model.safetensors.index.json"
# get filename
if INDEX_FILE in os.listdir(dir_path):