Binary file modified auto_round/alg_ext.abi3.so
Binary file not shown.
5 changes: 3 additions & 2 deletions auto_round/auto_scheme/gen_auto_scheme.py
@@ -20,9 +20,10 @@
from auto_round import AutoScheme
from auto_round.auto_scheme import AUTO_SCHEME_METHODS
from auto_round.auto_scheme.utils import compute_avg_bits_for_scheme
+from auto_round.compressors.utils import gguf_type_fallback
from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG
from auto_round.logger import logger
-from auto_round.utils import _gguf_type_fallback, get_layer_features, get_module
+from auto_round.utils import get_layer_features, get_module


class GenScheme:
@@ -128,7 +129,7 @@ def fallback_gguf_layer_config(self, layer_config: dict[str, dict]) -> dict[str,
new_type = f"gguf:q{bits}_" + f"{1 - prefix_idx}"
if new_type not in GGUF_INNER_CONFIG:
current_type = f"gguf:q{bits}_k"
-new_type = _gguf_type_fallback(current_type)
+new_type = gguf_type_fallback(current_type)

# Apply fallback configuration
target_config = GGUF_INNER_CONFIG[new_type]
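For code outside this file that used the GGUF fallback helper, the change is a rename plus a new home: the leading underscore is dropped and the import moves from auto_round.utils to auto_round.compressors.utils. A minimal usage sketch under those assumptions (the "gguf:q2_k" value is a hypothetical example of the f"gguf:q{bits}_k" strings built above, not a value taken from this diff):

# Post-refactor import; previously: from auto_round.utils import _gguf_type_fallback
from auto_round.compressors.utils import gguf_type_fallback

current_type = "gguf:q2_k"  # hypothetical k-quant type string (bits=2)
new_type = gguf_type_fallback(current_type)  # pick a supported GGUF type when current_type has no inner config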
65 changes: 34 additions & 31 deletions auto_round/compressors/base.py
@@ -31,6 +31,25 @@
from tqdm import tqdm
from transformers import set_seed

+from auto_round.compressors.utils import (
+block_forward,
+check_need_act_calibration,
+check_skippable_keywords,
+collect_best_params,
+get_fp_layer_names,
+get_layer_config_by_gguf_format,
+get_shared_keys,
+gguf_args_check,
+infer_bits_by_data_type,
+init_cache,
+is_mx_fp,
+is_nv_fp,
+is_standard_fp,
+is_static_wfp8afp8,
+is_wfp8afp8,
+reset_params,
+set_layer_config,
+)
from auto_round.data_type import QUANT_FUNC_WITH_DTYPE
from auto_round.data_type.utils import reshape_pad_tensor_by_group_size
from auto_round.export.export_to_autoround import AutoRoundFormat
@@ -46,18 +65,11 @@
SUPPORTED_LAYER_TYPES,
TORCH_VERSION_AT_LEAST_2_6,
CpuInfo,
-_gguf_args_check,
-_is_fp8_linear,
-_is_fp8_model,
-block_forward,
check_and_mark_fp8_model,
check_is_cpu,
-check_need_act_calibration,
check_seqlen_compatible,
-check_skippable_keywords,
check_to_quantized,
clear_memory,
-collect_best_params,
compile_func,
convert_dtype_str2torch,
convert_fp8_layer_to_linear,
@@ -69,29 +81,19 @@
flatten_list,
get_block_names,
get_device_memory,
-get_fp_layer_names,
-get_layer_config_by_gguf_format,
get_layer_features,
get_layer_names_in_block,
get_lm_head_name,
get_max_vram,
get_module,
-get_shared_keys,
htcore,
-infer_bits_by_data_type,
-init_cache,
is_debug_mode,
+is_fp8_linear,
+is_fp8_model,
is_hpex_available,
-is_mx_fp,
-is_nv_fp,
-is_standard_fp,
-is_static_wfp8afp8,
-is_wfp8afp8,
llm_load_model,
mv_module_from_gpu,
-reset_params,
set_amax_for_all_moe_layers,
-set_layer_config,
set_module,
to_device,
to_dtype,
Expand Down Expand Up @@ -869,9 +871,9 @@ def remove_duplicates(lst):
)
formats[i] = gguf_format_name.lower()

-_gguf_args_check(self, formats, model_type=ModelType.TEXT)
+gguf_args_check(self, formats, model_type=ModelType.TEXT)
if self.mllm:
-_gguf_args_check(self, formats, model_type=ModelType.MMPROJ)
+gguf_args_check(self, formats, model_type=ModelType.MMPROJ)

for f in formats:
if f.startswith("gguf"):
@@ -946,7 +948,7 @@ def remove_duplicates(lst):
"Please change format to fake or auto_round etc."
)
elif "auto_awq" in format:
-from auto_round.utils import check_awq_gemm_compatibility
+from auto_round.compressors.utils import check_awq_gemm_compatibility

awq_supported, info = check_awq_gemm_compatibility(
self.model, self.bits, self.group_size, self.sym, self.layer_config
@@ -1330,7 +1332,7 @@ def _quantize_layer_via_rtn(self, name: str) -> None:
"""
m = get_module(self.model, name)

-if _is_fp8_linear(m):
+if is_fp8_linear(m):
m = convert_fp8_layer_to_linear(m, self.amp_dtype)
set_module(self.model, name, m)
#
@@ -1490,7 +1492,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
cnt = 1
cnt += 1
# Convert remaining fp8
-if _is_fp8_model(self.model):
+if is_fp8_model(self.model):
convert_fp8_model_to_16b_model(self.model, self.amp_dtype)
self.quantized = True
return self.model, self.layer_config
@@ -1558,7 +1560,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
pbar.set_description(f"Quantizing {block_name}")
block = get_module(self.model, block_name)
block = block.to(self.device)
-if _is_fp8_model(self.model):
+if is_fp8_model(self.model):
convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype)

if self.device_map == "auto":
@@ -1755,9 +1757,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:

self._quantize_layers(layer_names, all_inputs) ##TODO pack layer immediately

-if _is_fp8_model(self.model):
+if is_fp8_model(self.model):
for n, m in self.model.named_modules():
-if _is_fp8_linear(m):
+if is_fp8_linear(m):
new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to("cpu")
set_module(self.model, n, new_layer)

@@ -1806,7 +1808,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:

layer = get_module(self.model, layer_name)
layer = layer.to(self.device)
-if _is_fp8_model(self.model):
+if is_fp8_model(self.model):
new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype).to(self.device)
set_module(self.model, layer_name, new_layer)
layer = new_layer
@@ -2050,7 +2052,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
Raises:
Exception: If caching on GPU fails, switches to CPU and caches there.
"""
-if _is_fp8_model(self.model):
+if is_fp8_model(self.model):
layer_names = []
if layer_names is None:
layer_names = []
@@ -2471,6 +2473,7 @@ def _quantize_layer(
logger.info(dump_info)

def _register_act_max_hook(self, model):

def get_act_max_hook(module, input, output):
if isinstance(input, (tuple, list)):
input = input[0]
@@ -2569,9 +2572,9 @@ def _quantize_block(
Returns:
Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output)
"""
-if _is_fp8_model(self.model):
+if is_fp8_model(self.model):
for n, m in block.named_modules():
-if _is_fp8_linear(m):
+if is_fp8_linear(m):
new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device)
set_module(block, n, new_layer)

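Most of the base.py hunks after the import section are one-line renames: the fp8 helpers stay in auto_round.utils but drop their leading underscore (is_fp8_model, is_fp8_linear), _gguf_args_check becomes gguf_args_check in auto_round.compressors.utils, and the calibration/layer-config helpers are now imported from auto_round.compressors.utils. The fp8-conversion pattern these hunks touch is the same each time; a minimal sketch of it, assuming a post-refactor auto_round install (convert_fp8_modules is a hypothetical wrapper name, not a function in this diff):

from auto_round.utils import (
    convert_fp8_layer_to_linear,
    is_fp8_linear,
    is_fp8_model,
    set_module,
)

def convert_fp8_modules(model, amp_dtype):
    # Mirrors the loops in quantize() / _quantize_block(): swap every fp8
    # linear for a standard 16-bit linear so later steps see plain weights.
    if is_fp8_model(model):
        for name, module in model.named_modules():
            if is_fp8_linear(module):
                new_layer = convert_fp8_layer_to_linear(module, amp_dtype).to("cpu")
                set_module(model, name, new_layer)
    return model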
2 changes: 1 addition & 1 deletion auto_round/compressors/diffusion/compressor.py
@@ -21,11 +21,11 @@

from auto_round.compressors.base import BaseCompressor
from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader
+from auto_round.compressors.utils import block_forward
from auto_round.logger import logger
from auto_round.schemes import QuantizationScheme
from auto_round.utils import (
LazyImport,
-block_forward,
clear_memory,
diffusion_load_model,
extract_block_names_to_str,
2 changes: 1 addition & 1 deletion auto_round/compressors/mllm/compressor.py
@@ -32,14 +32,14 @@
_handle_special_model,
)
from auto_round.utils import (
-_is_fp8_model,
check_to_quantized,
clear_memory,
detect_device,
extract_block_names_to_str,
find_matching_blocks,
get_block_names,
get_max_vram,
+is_fp8_model,
mllm_load_model,
mv_module_from_gpu,
to_device,
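Taken together, the import edits across these four files follow two rules; a compact summary inferred from this diff alone (symbols not listed here are untouched):

# Renamed in place, still imported from auto_round.utils:
#   _is_fp8_model  -> is_fp8_model
#   _is_fp8_linear -> is_fp8_linear
# Now imported from auto_round.compressors.utils (renamed where noted):
#   _gguf_args_check -> gguf_args_check, _gguf_type_fallback -> gguf_type_fallback
#   block_forward, check_awq_gemm_compatibility, check_need_act_calibration,
#   check_skippable_keywords, collect_best_params, get_fp_layer_names,
#   get_layer_config_by_gguf_format, get_shared_keys, infer_bits_by_data_type,
#   init_cache, is_mx_fp, is_nv_fp, is_standard_fp, is_static_wfp8afp8,
#   is_wfp8afp8, reset_params, set_layer_config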