From af54864a8a91d44ddf766821cdc25668430803c7 Mon Sep 17 00:00:00 2001
From: Chang Xu
Date: Wed, 2 Aug 2023 17:32:12 +0800
Subject: [PATCH] Add WeightOnlyPTQ and GPTQ (#6572)

* Add WeightOnlyPTQ and GPTQ

* update

* update
---
 llm/causallm/argument.py            |  3 +++
 llm/causallm/finetune_generation.py | 13 +++++++++++--
 llm/causallm/quant.py               | 30 +++++++++++++++++++++++++----
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/llm/causallm/argument.py b/llm/causallm/argument.py
index 06fef64c3de2..7bc1a4e6f133 100644
--- a/llm/causallm/argument.py
+++ b/llm/causallm/argument.py
@@ -52,6 +52,9 @@ class QuantArgument:
     # PTQ related parameters
     do_ptq: bool = field(default=False, metadata={"help": "Whether to use PTQ"})
     ptq_step: int = field(default=8, metadata={"help": "Step for PTQ"})
+    ptq_weight_only: bool = field(default=False, metadata={"help": "Whether to use weight-only PTQ"})
+    quant_bits: int = field(default=8, metadata={"help": "Quantization bit size"})
+    do_gptq: bool = field(default=False, metadata={"help": "Whether to use GPTQ"})
 
     fused_qkv: bool = field(default=False, metadata={"help": "Whether to use Fused Quantized QKV"})
     parallel_ffn: bool = field(default=False, metadata={"help": "Whether to use Parallel FFN"})
diff --git a/llm/causallm/finetune_generation.py b/llm/causallm/finetune_generation.py
index 8c6429564f19..1abb79cce6e6 100644
--- a/llm/causallm/finetune_generation.py
+++ b/llm/causallm/finetune_generation.py
@@ -48,9 +48,9 @@ def main():
         training_args.print_config(quant_args, "Quant")
         training_args.print_config(gen_args, "Generation")
 
-    if sum([quant_args.do_ptq, quant_args.do_qat, training_args.do_train]) > 1:
+    if sum([quant_args.do_ptq, quant_args.do_qat, quant_args.do_gptq, training_args.do_train]) > 1:
         raise ValueError(
-            "--do_train, --do_ptq and --do_qat cannot work at the same time. Please choose only one at a time"
+            "--do_train, --do_ptq, --do_gptq and --do_qat cannot work at the same time. Please choose only one at a time"
         )
 
     # Setup GPU & distributed training
@@ -257,6 +257,15 @@ def compute_metrics_do_generation(eval_preds):
 
         apply_ptq(quant_args, trainer, ptq_dataloader)
 
+    if quant_args.do_gptq:
+        if isinstance(model, LoRAModel):
+            raise NotImplementedError(
+                "GPTQ strategy not supported for LoRA model. Please merge LoRA parameters into the pretrained model first."
+            )
+        from quant import apply_gptq
+
+        apply_gptq(quant_args, trainer, ptq_dataloader)
+
     # Evaluation dev set
     if training_args.do_eval:
         eval_result = trainer.evaluate(dev_ds)
diff --git a/llm/causallm/quant.py b/llm/causallm/quant.py
index 9b0d55b1a2ed..dde88a22c803 100644
--- a/llm/causallm/quant.py
+++ b/llm/causallm/quant.py
@@ -20,17 +20,19 @@
 from paddle.quantization import PTQ, QAT, QuantConfig
 from paddle.quantization.quanters.abs_max import FakeQuanterWithAbsMaxObserverLayer
 from paddleslim.quant.advanced import (
+    GPTQ,
     EMASampler,
     MultiStepSampler,
     PieceWiseSearch,
     Shift,
     Smooth,
 )
+from paddleslim.quant.advanced.utils import find_parent_layer_and_sub_name
 from paddleslim.quant.layers import (
     QuantizedColumnParallelLinear,
     QuantizedRowParallelLinear,
 )
-from paddleslim.quant.observers import AbsMaxChannelWiseWeightObserver, AbsmaxObserver
+from paddleslim.quant.observers import AbsMaxChannelWiseWeightObserver, AVGObserver
 from paddleslim.quant.quanters import PACTQuanter
 
 from paddlenlp.peft import PrefixModelForCausalLM
@@ -93,7 +95,7 @@ def apply_smooth(quant_args, trainer, ptq_dataloader, ptq_model_config):
             search_scale_min=1.0,
             search_scale_max=5.0,
             weight_quant_method="abs_max_channel_wise",
-            act_quant_method="abs_max",
+            act_quant_method="avg",
         )
     else:
         search_func = None
@@ -117,8 +119,8 @@
 
 def apply_ptq(quant_args, trainer, ptq_dataloader):
     q_config = QuantConfig(activation=None, weight=None)
-    act_quanter = AbsmaxObserver()
-    weight_quanter = AbsMaxChannelWiseWeightObserver()
+    act_quanter = AVGObserver() if not quant_args.ptq_weight_only else None
+    weight_quanter = AbsMaxChannelWiseWeightObserver(quant_bits=quant_args.quant_bits)
     q_config.add_qat_layer_mapping(ColumnParallelLinear, QuantizedColumnParallelLinear)
     q_config.add_qat_layer_mapping(RowParallelLinear, QuantizedRowParallelLinear)
     q_config.add_type_config(
@@ -137,6 +139,26 @@ def apply_ptq(quant_args, trainer, ptq_dataloader):
     trainer.model = ptq.convert(trainer.model, inplace=True)
 
 
+def apply_gptq(quant_args, trainer, ptq_dataloader):
+    num_layer = 0
+    model = trainer.model
+    for cur_name, cur_layer in model.named_sublayers():
+        if type(cur_layer) in [paddle.nn.Linear, ColumnParallelLinear, RowParallelLinear]:
+            num_layer += 1
+            print("GPTQ layer", num_layer, cur_name)
+            parent_layer, sub_name = find_parent_layer_and_sub_name(model, cur_name)
+            cur_quant_layer = GPTQ(cur_layer)
+            setattr(parent_layer, sub_name, cur_quant_layer)
+            trainer.ptq_loop(
+                ptq_dataloader,
+                description="GPTQ",
+                max_eval_iters=quant_args.ptq_step,
+            )
+            cur_quant_layer.fasterquant(percdamp=0.1, groupsize=-1, actorder=True)
+            del cur_quant_layer
+            setattr(parent_layer, sub_name, cur_layer)
+
+
 def get_ptq_model_config(model):
     if isinstance(model, PrefixModelForCausalLM):
         base_model_prefix = model.model.base_model_prefix
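
Note on apply_gptq (reviewer sketch, not part of the patch): the function quantizes one Linear (or ColumnParallelLinear/RowParallelLinear) at a time. Each layer is swapped for a paddleslim GPTQ wrapper, quant_args.ptq_step calibration batches are run forward so the wrapper can accumulate Hessian statistics, fasterquant() rewrites the weights in place, and the original layer object is put back. Handling layers one by one keeps at most one Hessian in memory. A minimal self-contained sketch of that swap-calibrate-restore pattern, using only the paddleslim calls already present in the patch; gptq_one_layer and calibration_batches are illustrative names:

    import paddle
    from paddleslim.quant.advanced import GPTQ
    from paddleslim.quant.advanced.utils import find_parent_layer_and_sub_name

    def gptq_one_layer(model, layer_name, calibration_batches):
        # Locate the parent module so the sublayer can be swapped by name.
        parent, sub_name = find_parent_layer_and_sub_name(model, layer_name)
        original = getattr(parent, sub_name)
        wrapper = GPTQ(original)            # collects Hessian statistics in forward
        setattr(parent, sub_name, wrapper)  # route activations through the wrapper
        with paddle.no_grad():              # calibration needs forward passes only
            for batch in calibration_batches:
                model(**batch)
        # percdamp dampens the Hessian diagonal for stability; groupsize=-1 means
        # per-channel scales; actorder quantizes columns in order of importance.
        wrapper.fasterquant(percdamp=0.1, groupsize=-1, actorder=True)
        # fasterquant updates the wrapped weights in place, so restoring the
        # original layer leaves it holding the quantized weights.
        setattr(parent, sub_name, original)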
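
Usage note (illustrative, not part of the patch): QuantArgument is parsed with PaddleNLP's PdArgumentParser, so the new fields surface as command-line flags of finetune_generation.py (--ptq_weight_only, --quant_bits, --do_gptq). A small sketch of selecting the two new paths, assuming it runs from llm/causallm/ and that PdArgumentParser parses bool flags from "true"/"false" strings the way HfArgumentParser does:

    from paddlenlp.trainer import PdArgumentParser

    from argument import QuantArgument  # llm/causallm/argument.py

    parser = PdArgumentParser(QuantArgument)

    # Weight-only INT4 PTQ: the activation observer is dropped (None) and the
    # channel-wise abs-max weight observer is built with quant_bits=4.
    (wo_args,) = parser.parse_args_into_dataclasses(
        args=["--do_ptq", "true", "--ptq_weight_only", "true", "--quant_bits", "4"]
    )

    # GPTQ path: mutually exclusive with --do_train/--do_ptq/--do_qat; each
    # Linear is calibrated for ptq_step batches before fasterquant() runs.
    (gptq_args,) = parser.parse_args_into_dataclasses(
        args=["--do_gptq", "true"]
    )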