
Commit f1b5c72

refactor utils file (#943)
* refactor utils

Signed-off-by: n1ck-guo <heng.guo@intel.com>
1 parent 959ac67 · commit f1b5c72
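At a glance, the refactor moves a group of helpers out of auto_round.utils into a new auto_round.compressors.utils module and drops their leading underscores. A rough before/after sketch of the import change, assembled only from the hunks below (it is not an exhaustive list of the new module's contents):

# before this commit
from auto_round.utils import (
    _gguf_args_check,
    _gguf_type_fallback,
    _is_fp8_linear,
    _is_fp8_model,
    block_forward,
)

# after this commit
from auto_round.compressors.utils import (
    block_forward,
    gguf_args_check,
    gguf_type_fallback,
)
from auto_round.utils import is_fp8_linear, is_fp8_model  # renamed in place, still in auto_round.utils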

File tree

31 files changed (+3372, -3235 lines)


auto_round/alg_ext.abi3.so

0 Bytes
Binary file not shown.

auto_round/auto_scheme/gen_auto_scheme.py

Lines changed: 3 additions & 2 deletions
@@ -20,9 +20,10 @@
 from auto_round import AutoScheme
 from auto_round.auto_scheme import AUTO_SCHEME_METHODS
 from auto_round.auto_scheme.utils import compute_avg_bits_for_scheme
+from auto_round.compressors.utils import gguf_type_fallback
 from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG
 from auto_round.logger import logger
-from auto_round.utils import _gguf_type_fallback, get_layer_features, get_module
+from auto_round.utils import get_layer_features, get_module
 
 
 class GenScheme:
@@ -128,7 +129,7 @@ def fallback_gguf_layer_config(self, layer_config: dict[str, dict]) -> dict[str,
             new_type = f"gguf:q{bits}_" + f"{1 - prefix_idx}"
             if new_type not in GGUF_INNER_CONFIG:
                 current_type = f"gguf:q{bits}_k"
-                new_type = _gguf_type_fallback(current_type)
+                new_type = gguf_type_fallback(current_type)
 
         # Apply fallback configuration
         target_config = GGUF_INNER_CONFIG[new_type]
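The change above is purely a rename at the call site; the fallback logic itself is untouched. A minimal usage sketch of the renamed helper, mirroring the hunk (bits and prefix_idx carry hypothetical example values, not taken from the commit):

from auto_round.compressors.utils import gguf_type_fallback
from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG

bits, prefix_idx = 5, 0  # hypothetical example values for illustration
new_type = f"gguf:q{bits}_" + f"{1 - prefix_idx}"
if new_type not in GGUF_INNER_CONFIG:
    # fall back to a GGUF quantization type that exists in the config table
    current_type = f"gguf:q{bits}_k"
    new_type = gguf_type_fallback(current_type)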

auto_round/compressors/base.py

Lines changed: 34 additions & 31 deletions
@@ -31,6 +31,25 @@
 from tqdm import tqdm
 from transformers import set_seed
 
+from auto_round.compressors.utils import (
+    block_forward,
+    check_need_act_calibration,
+    check_skippable_keywords,
+    collect_best_params,
+    get_fp_layer_names,
+    get_layer_config_by_gguf_format,
+    get_shared_keys,
+    gguf_args_check,
+    infer_bits_by_data_type,
+    init_cache,
+    is_mx_fp,
+    is_nv_fp,
+    is_standard_fp,
+    is_static_wfp8afp8,
+    is_wfp8afp8,
+    reset_params,
+    set_layer_config,
+)
 from auto_round.data_type import QUANT_FUNC_WITH_DTYPE
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size
 from auto_round.export.export_to_autoround import AutoRoundFormat
@@ -46,18 +65,11 @@
     SUPPORTED_LAYER_TYPES,
     TORCH_VERSION_AT_LEAST_2_6,
     CpuInfo,
-    _gguf_args_check,
-    _is_fp8_linear,
-    _is_fp8_model,
-    block_forward,
     check_and_mark_fp8_model,
     check_is_cpu,
-    check_need_act_calibration,
     check_seqlen_compatible,
-    check_skippable_keywords,
     check_to_quantized,
     clear_memory,
-    collect_best_params,
     compile_func,
     convert_dtype_str2torch,
     convert_fp8_layer_to_linear,
@@ -69,29 +81,19 @@
     flatten_list,
     get_block_names,
     get_device_memory,
-    get_fp_layer_names,
-    get_layer_config_by_gguf_format,
     get_layer_features,
     get_layer_names_in_block,
     get_lm_head_name,
     get_max_vram,
     get_module,
-    get_shared_keys,
     htcore,
-    infer_bits_by_data_type,
-    init_cache,
     is_debug_mode,
+    is_fp8_linear,
+    is_fp8_model,
     is_hpex_available,
-    is_mx_fp,
-    is_nv_fp,
-    is_standard_fp,
-    is_static_wfp8afp8,
-    is_wfp8afp8,
     llm_load_model,
     mv_module_from_gpu,
-    reset_params,
     set_amax_for_all_moe_layers,
-    set_layer_config,
     set_module,
     to_device,
     to_dtype,
@@ -869,9 +871,9 @@ def remove_duplicates(lst):
             )
             formats[i] = gguf_format_name.lower()
 
-        _gguf_args_check(self, formats, model_type=ModelType.TEXT)
+        gguf_args_check(self, formats, model_type=ModelType.TEXT)
         if self.mllm:
-            _gguf_args_check(self, formats, model_type=ModelType.MMPROJ)
+            gguf_args_check(self, formats, model_type=ModelType.MMPROJ)
 
         for f in formats:
             if f.startswith("gguf"):
@@ -946,7 +948,7 @@ def remove_duplicates(lst):
                     "Please change format to fake or auto_round etc."
                 )
             elif "auto_awq" in format:
-                from auto_round.utils import check_awq_gemm_compatibility
+                from auto_round.compressors.utils import check_awq_gemm_compatibility
 
                 awq_supported, info = check_awq_gemm_compatibility(
                     self.model, self.bits, self.group_size, self.sym, self.layer_config
@@ -1330,7 +1332,7 @@ def _quantize_layer_via_rtn(self, name: str) -> None:
         """
         m = get_module(self.model, name)
 
-        if _is_fp8_linear(m):
+        if is_fp8_linear(m):
             m = convert_fp8_layer_to_linear(m, self.amp_dtype)
             set_module(self.model, name, m)
         #
@@ -1490,7 +1492,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
                 cnt = 1
             cnt += 1
         # Convert remaining fp8
-        if _is_fp8_model(self.model):
+        if is_fp8_model(self.model):
             convert_fp8_model_to_16b_model(self.model, self.amp_dtype)
         self.quantized = True
         return self.model, self.layer_config
@@ -1558,7 +1560,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
             pbar.set_description(f"Quantizing {block_name}")
             block = get_module(self.model, block_name)
             block = block.to(self.device)
-            if _is_fp8_model(self.model):
+            if is_fp8_model(self.model):
                 convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype)
 
             if self.device_map == "auto":
@@ -1755,9 +1757,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
 
         self._quantize_layers(layer_names, all_inputs)  ##TODO pack layer immediately
 
-        if _is_fp8_model(self.model):
+        if is_fp8_model(self.model):
             for n, m in self.model.named_modules():
-                if _is_fp8_linear(m):
+                if is_fp8_linear(m):
                     new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to("cpu")
                     set_module(self.model, n, new_layer)
 
@@ -1806,7 +1808,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
 
             layer = get_module(self.model, layer_name)
             layer = layer.to(self.device)
-            if _is_fp8_model(self.model):
+            if is_fp8_model(self.model):
                 new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype).to(self.device)
                 set_module(self.model, layer_name, new_layer)
                 layer = new_layer
@@ -2050,7 +2052,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
         Raises:
             Exception: If caching on GPU fails, switches to CPU and caches there.
         """
-        if _is_fp8_model(self.model):
+        if is_fp8_model(self.model):
             layer_names = []
         if layer_names is None:
             layer_names = []
@@ -2471,6 +2473,7 @@ def _quantize_layer(
         logger.info(dump_info)
 
     def _register_act_max_hook(self, model):
+
        def get_act_max_hook(module, input, output):
            if isinstance(input, (tuple, list)):
                input = input[0]
@@ -2569,9 +2572,9 @@ def _quantize_block(
         Returns:
             Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output)
         """
-        if _is_fp8_model(self.model):
+        if is_fp8_model(self.model):
             for n, m in block.named_modules():
-                if _is_fp8_linear(m):
+                if is_fp8_linear(m):
                     new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device)
                     set_module(block, n, new_layer)
 
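Every change in base.py above is mechanical: _gguf_args_check, _is_fp8_model and _is_fp8_linear lose their underscore prefixes (the latter two staying in auto_round.utils), and the block-level helpers now come from auto_round.compressors.utils. The recurring fp8-conversion pattern, condensed from the hunks above into a standalone sketch (the wrapper name dequantize_fp8_layers is hypothetical; model and amp_dtype stand in for self.model and self.amp_dtype on the compressor):

from auto_round.utils import (
    convert_fp8_layer_to_linear,
    is_fp8_linear,
    is_fp8_model,
    set_module,
)

def dequantize_fp8_layers(model, amp_dtype):
    # Replace every fp8 linear in the model with a plain 16-bit linear,
    # mirroring the quantize() hunk above (hypothetical wrapper for illustration).
    if is_fp8_model(model):
        for name, module in model.named_modules():
            if is_fp8_linear(module):
                new_layer = convert_fp8_layer_to_linear(module, amp_dtype).to("cpu")
                set_module(model, name, new_layer)
    return model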

auto_round/compressors/diffusion/compressor.py

Lines changed: 1 addition & 1 deletion
@@ -21,11 +21,11 @@
 
 from auto_round.compressors.base import BaseCompressor
 from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader
+from auto_round.compressors.utils import block_forward
 from auto_round.logger import logger
 from auto_round.schemes import QuantizationScheme
 from auto_round.utils import (
     LazyImport,
-    block_forward,
     clear_memory,
     diffusion_load_model,
     extract_block_names_to_str,

auto_round/compressors/mllm/compressor.py

Lines changed: 1 addition & 1 deletion
@@ -32,14 +32,14 @@
     _handle_special_model,
 )
 from auto_round.utils import (
-    _is_fp8_model,
     check_to_quantized,
     clear_memory,
     detect_device,
     extract_block_names_to_str,
     find_matching_blocks,
     get_block_names,
     get_max_vram,
+    is_fp8_model,
     mllm_load_model,
     mv_module_from_gpu,
     to_device,
