20
20
21
21
import math
22
22
from copy import deepcopy
23
- from typing import OrderedDict
23
+ from typing import Optional , OrderedDict , Union
24
24
25
25
from ...utils import logger
26
26
from ...utils .utility import LazyImport
@@ -679,7 +679,7 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1):
679
679
680
680
def autoround_quantize (
681
681
model ,
682
- tokenizer ,
682
+ tokenizer = None ,
683
683
bits : int = 4 ,
684
684
group_size : int = 128 ,
685
685
sym : bool = False ,
@@ -689,10 +689,8 @@ def autoround_quantize(
689
689
amp : bool = True ,
690
690
device = None ,
691
691
lr_scheduler = None ,
692
- dataloader = None , ## to support later
693
- dataset_name : str = "NeelNanda/pile-10k" ,
694
- dataset_split : str = "train" ,
695
- use_quant_input : bool = True ,
692
+ dataset : Union [str , list , tuple , torch .utils .data .DataLoader ] = "NeelNanda/pile-10k" ,
693
+ enable_quanted_input : bool = True ,
696
694
enable_minmax_tuning : bool = True ,
697
695
lr : float = None ,
698
696
minmax_lr : float = None ,
@@ -706,52 +704,52 @@ def autoround_quantize(
706
704
gradient_accumulate_steps : int = 1 ,
707
705
not_use_best_mse : bool = False ,
708
706
dynamic_max_gap : int = - 1 ,
709
- data_type : str = "int" , ##only support data_type
710
- scale_dtype = "fp16" ,
707
+ data_type : str = "int" , ##only support int for now
708
+ scale_dtype : str = "fp16" ,
711
709
** kwargs ,
712
710
):
713
711
"""Run autoround weight-only quantization.
714
712
Args:
715
- model: The PyTorch model to be quantized.
716
- tokenizer: Tokenizer for processing input data. Temporarily set as a mandatory parameter .
717
- bits (int): Number of bits for quantization (default is 4).
718
- group_size (int): Size of the quantization group (default is 128).
719
- sym (bool): Whether the symmetric quantization is to be used.
720
- weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
721
- weight_config={
722
- 'layer1':##layer_name
723
- {
724
- 'data_type': 'int',
725
- 'bits': 4,
726
- 'group_size': 32,
727
- 'scheme': "asym", ## or sym
728
- }
729
- ...
730
- }
731
- enable_full_range (bool): Whether to enable full range quantization (default is False).
732
- bs (int): Batch size for training (default is 8).
733
- amp (bool): Whether to use automatic mixed precision (default is True). Automatically detect and set .
734
- device: The device to be used for tuning (default is None). Automatically detect and set .
735
- lr_scheduler: The learning rate scheduler to be used.
736
- dataloader: The dataloader for input data (to be supported in future ).
737
- dataset_name (str ): The default dataset name (default is "NeelNanda/pile-10k").
738
- dataset_split (str): The split of the dataset to be used (default is "train" ).
739
- use_quant_input (bool): Whether to use quantized input data (default is True).
740
- enable_minmax_tuning (bool ): Whether to enable min-max tuning (default is True ).
741
- lr (float): The learning rate (default is 0.005 ).
742
- minmax_lr (float ): The learning rate for min-max tuning (default is None ).
743
- low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True ).
744
- iters (int): Number of iterations (default is 200 ).
745
- seqlen (int): Length of the sequence .
746
- n_samples (int ): Number of samples (default is 512 ).
747
- sampler (str ): The sampling method (default is "rand" ).
748
- seed (int): The random seed (default is 42 ).
749
- n_blocks (int): Number of blocks (default is 1).
750
- gradient_accumulate_steps (int ): Number of gradient accumulation steps (default is 1 ).
751
- not_use_best_mse (bool ): Whether to use mean squared error (default is False ).
752
- dynamic_max_gap (int ): The dynamic maximum gap (default is -1 ).
753
- data_type (str): The data type to be used (default is "int").
754
- **kwargs: Additional keyword arguments .
713
+ model: The PyTorch model to be quantized.
714
+ tokenizer: An optional tokenizer for processing input data. If None is provided, a dataloader must be supplied through `dataset`.
715
+ bits (int): Number of bits for quantization (default is 4).
716
+ group_size (int): Size of the quantization group (default is 128).
717
+ sym (bool): Whether symmetric quantization is to be used (default is False) .
718
+ weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
719
+ weight_config={
720
+ 'layer1':##layer_name
721
+ {
722
+ 'data_type': 'int',
723
+ 'bits': 4,
724
+ 'group_size': 32,
725
+ 'sym': False
726
+ }
727
+ ...
728
+ }
729
+ enable_full_range (bool): Whether to enable full range quantization (default is False).
730
+ batch_size (int): Batch size for training (default is 8).
731
+ amp (bool): Whether to use automatic mixed precision (default is True).
732
+ device: The device to be used for tuning (default is None, in which case it is detected and set automatically).
733
+ lr_scheduler: The learning rate scheduler to be used.
734
+ dataset (Union[str, list, tuple, torch.utils.data.DataLoader]): The dataset name, or the data itself as a list, tuple, or DataLoader (default is "NeelNanda/pile-10k").
735
+ enable_quanted_input (bool ): Whether to use the output of the previous quantized block as
736
+ the input for the current block (default is True ).
737
+ enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True).
738
+ lr (float ): The learning rate (default is None, will be set to 1.0/iters ).
739
+ minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically ).
740
+ low_gpu_mem_usage (bool ): Whether to use low GPU memory (default is True ).
741
+ iters (int): Number of iterations (default is 200 ).
742
+ seqlen (int): Data length of the sequence for tuning (default is 2048 ).
743
+ n_samples (int): Number of samples (default is 512) .
744
+ sampler (str ): The sampling method (default is "rand" ).
745
+ seed (int ): The random seed (default is 42 ).
746
+ n_blocks (int): Number of blocks (default is 1 ).
747
+ gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
748
+ not_use_best_mse (bool): Whether to disable use of the best mean squared error result (default is False).
749
+ dynamic_max_gap (int ): The dynamic maximum gap (default is -1 ).
750
+ data_type (str ): The data type to be used (default is "int" ).
751
+ scale_dtype (str): The data type of quantization scale to be used (default is "fp16"); different kernels
752
+ have different choices .
755
753
756
754
Returns:
757
755
The quantized model.
@@ -770,10 +768,8 @@ def autoround_quantize(
770
768
amp = amp ,
771
769
device = device ,
772
770
lr_scheduler = lr_scheduler ,
773
- dataloader = dataloader , ## to support later
774
- dataset_name = dataset_name ,
775
- dataset_split = dataset_split ,
776
- use_quant_input = use_quant_input ,
771
+ dataset = dataset ,
772
+ enable_quanted_input = enable_quanted_input ,
777
773
enable_minmax_tuning = enable_minmax_tuning ,
778
774
lr = lr ,
779
775
minmax_lr = minmax_lr ,
0 commit comments