31 | 31 | from tqdm import tqdm |
32 | 32 | from transformers import set_seed |
33 | 33 |
| 34 | +from auto_round.compressors.utils import ( |
| 35 | + block_forward, |
| 36 | + check_need_act_calibration, |
| 37 | + check_skippable_keywords, |
| 38 | + collect_best_params, |
| 39 | + get_fp_layer_names, |
| 40 | + get_layer_config_by_gguf_format, |
| 41 | + get_shared_keys, |
| 42 | + gguf_args_check, |
| 43 | + infer_bits_by_data_type, |
| 44 | + init_cache, |
| 45 | + is_mx_fp, |
| 46 | + is_nv_fp, |
| 47 | + is_standard_fp, |
| 48 | + is_static_wfp8afp8, |
| 49 | + is_wfp8afp8, |
| 50 | + reset_params, |
| 51 | + set_layer_config, |
| 52 | +) |
34 | 53 | from auto_round.data_type import QUANT_FUNC_WITH_DTYPE |
35 | 54 | from auto_round.data_type.utils import reshape_pad_tensor_by_group_size |
36 | 55 | from auto_round.export.export_to_autoround import AutoRoundFormat |
46 | 65 | SUPPORTED_LAYER_TYPES, |
47 | 66 | TORCH_VERSION_AT_LEAST_2_6, |
48 | 67 | CpuInfo, |
49 | | - _gguf_args_check, |
50 | | - _is_fp8_linear, |
51 | | - _is_fp8_model, |
52 | | - block_forward, |
53 | 68 | check_and_mark_fp8_model, |
54 | 69 | check_is_cpu, |
55 | | - check_need_act_calibration, |
56 | 70 | check_seqlen_compatible, |
57 | | - check_skippable_keywords, |
58 | 71 | check_to_quantized, |
59 | 72 | clear_memory, |
60 | | - collect_best_params, |
61 | 73 | compile_func, |
62 | 74 | convert_dtype_str2torch, |
63 | 75 | convert_fp8_layer_to_linear, |
69 | 81 | flatten_list, |
70 | 82 | get_block_names, |
71 | 83 | get_device_memory, |
72 | | - get_fp_layer_names, |
73 | | - get_layer_config_by_gguf_format, |
74 | 84 | get_layer_features, |
75 | 85 | get_layer_names_in_block, |
76 | 86 | get_lm_head_name, |
77 | 87 | get_max_vram, |
78 | 88 | get_module, |
79 | | - get_shared_keys, |
80 | 89 | htcore, |
81 | | - infer_bits_by_data_type, |
82 | | - init_cache, |
83 | 90 | is_debug_mode, |
| 91 | + is_fp8_linear, |
| 92 | + is_fp8_model, |
84 | 93 | is_hpex_available, |
85 | | - is_mx_fp, |
86 | | - is_nv_fp, |
87 | | - is_standard_fp, |
88 | | - is_static_wfp8afp8, |
89 | | - is_wfp8afp8, |
90 | 94 | llm_load_model, |
91 | 95 | mv_module_from_gpu, |
92 | | - reset_params, |
93 | 96 | set_amax_for_all_moe_layers, |
94 | | - set_layer_config, |
95 | 97 | set_module, |
96 | 98 | to_device, |
97 | 99 | to_dtype, |
@@ -869,9 +871,9 @@ def remove_duplicates(lst): |
869 | 871 | ) |
870 | 872 | formats[i] = gguf_format_name.lower() |
871 | 873 |
872 | | - _gguf_args_check(self, formats, model_type=ModelType.TEXT) |
| 874 | + gguf_args_check(self, formats, model_type=ModelType.TEXT) |
873 | 875 | if self.mllm: |
874 | | - _gguf_args_check(self, formats, model_type=ModelType.MMPROJ) |
| 876 | + gguf_args_check(self, formats, model_type=ModelType.MMPROJ) |
875 | 877 |
876 | 878 | for f in formats: |
877 | 879 | if f.startswith("gguf"): |
@@ -946,7 +948,7 @@ def remove_duplicates(lst): |
946 | 948 | "Please change format to fake or auto_round etc." |
947 | 949 | ) |
948 | 950 | elif "auto_awq" in format: |
949 | | - from auto_round.utils import check_awq_gemm_compatibility |
| 951 | + from auto_round.compressors.utils import check_awq_gemm_compatibility |
950 | 952 |
951 | 953 | awq_supported, info = check_awq_gemm_compatibility( |
952 | 954 | self.model, self.bits, self.group_size, self.sym, self.layer_config |
@@ -1330,7 +1332,7 @@ def _quantize_layer_via_rtn(self, name: str) -> None: |
1330 | 1332 | """ |
1331 | 1333 | m = get_module(self.model, name) |
1332 | 1334 |
1333 | | - if _is_fp8_linear(m): |
| 1335 | + if is_fp8_linear(m): |
1334 | 1336 | m = convert_fp8_layer_to_linear(m, self.amp_dtype) |
1335 | 1337 | set_module(self.model, name, m) |
1336 | 1338 | # |
@@ -1490,7 +1492,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: |
1490 | 1492 | cnt = 1 |
1491 | 1493 | cnt += 1 |
1492 | 1494 | # Convert remaining fp8 |
1493 | | - if _is_fp8_model(self.model): |
| 1495 | + if is_fp8_model(self.model): |
1494 | 1496 | convert_fp8_model_to_16b_model(self.model, self.amp_dtype) |
1495 | 1497 | self.quantized = True |
1496 | 1498 | return self.model, self.layer_config |
@@ -1558,7 +1560,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) |
1558 | 1560 | pbar.set_description(f"Quantizing {block_name}") |
1559 | 1561 | block = get_module(self.model, block_name) |
1560 | 1562 | block = block.to(self.device) |
1561 | | - if _is_fp8_model(self.model): |
| 1563 | + if is_fp8_model(self.model): |
1562 | 1564 | convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype) |
1563 | 1565 |
1564 | 1566 | if self.device_map == "auto": |
@@ -1755,9 +1757,9 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: |
1755 | 1757 |
1756 | 1758 | self._quantize_layers(layer_names, all_inputs) ##TODO pack layer immediately |
1757 | 1759 |
1758 | | - if _is_fp8_model(self.model): |
| 1760 | + if is_fp8_model(self.model): |
1759 | 1761 | for n, m in self.model.named_modules(): |
1760 | | - if _is_fp8_linear(m): |
| 1762 | + if is_fp8_linear(m): |
1761 | 1763 | new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to("cpu") |
1762 | 1764 | set_module(self.model, n, new_layer) |
1763 | 1765 |
@@ -1806,7 +1808,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: |
1806 | 1808 |
1807 | 1809 | layer = get_module(self.model, layer_name) |
1808 | 1810 | layer = layer.to(self.device) |
1809 | | - if _is_fp8_model(self.model): |
| 1811 | + if is_fp8_model(self.model): |
1810 | 1812 | new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype).to(self.device) |
1811 | 1813 | set_module(self.model, layer_name, new_layer) |
1812 | 1814 | layer = new_layer |
@@ -2050,7 +2052,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l |
2050 | 2052 | Raises: |
2051 | 2053 | Exception: If caching on GPU fails, switches to CPU and caches there. |
2052 | 2054 | """ |
2053 | | - if _is_fp8_model(self.model): |
| 2055 | + if is_fp8_model(self.model): |
2054 | 2056 | layer_names = [] |
2055 | 2057 | if layer_names is None: |
2056 | 2058 | layer_names = [] |
@@ -2471,6 +2473,7 @@ def _quantize_layer( |
2471 | 2473 | logger.info(dump_info) |
2472 | 2474 |
2473 | 2475 | def _register_act_max_hook(self, model): |
| 2476 | + |
2474 | 2477 | def get_act_max_hook(module, input, output): |
2475 | 2478 | if isinstance(input, (tuple, list)): |
2476 | 2479 | input = input[0] |
@@ -2569,9 +2572,9 @@ def _quantize_block( |
2569 | 2572 | Returns: |
2570 | 2573 | Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) |
2571 | 2574 | """ |
2572 | | - if _is_fp8_model(self.model): |
| 2575 | + if is_fp8_model(self.model): |
2573 | 2576 | for n, m in block.named_modules(): |
2574 | | - if _is_fp8_linear(m): |
| 2577 | + if is_fp8_linear(m): |
2575 | 2578 | new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device) |
2576 | 2579 | set_module(block, n, new_layer) |
2577 | 2580 |