Add WeightOnlyPTQ and GPTQ (#6572)
* Add WeightOnlyPTQ and GPTQ

* update

* update
RachelXu7 authored Aug 2, 2023
1 parent f91716f commit af54864
Showing 3 changed files with 39 additions and 6 deletions.
2 changes: 2 additions & 0 deletions llm/causallm/argument.py
@@ -52,6 +52,8 @@ class QuantArgument:
     # PTQ related parameters
     do_ptq: bool = field(default=False, metadata={"help": "Whether to use PTQ"})
     ptq_step: int = field(default=8, metadata={"help": "Step for PTQ"})
+    ptq_weight_only: bool = field(default=False, metadata={"help": "Whether to use PTQ weight only"})
+    quant_bits: int = field(default=8, metadata={"help": "Quantization bit size"})
 
     fused_qkv: bool = field(default=False, metadata={"help": "Whether to use Fused Quantized QKV"})
     parallel_ffn: bool = field(default=False, metadata={"help": "Whether to use Parallel FFN"})
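Note: the two new fields follow the file's existing dataclass-plus-field(metadata=...) pattern, so they surface on the command line as --ptq_weight_only and --quant_bits. A minimal self-contained sketch of that pattern (plain stdlib dataclasses stand in for PaddleNLP's actual argument parser, which this sketch does not import; the class name is illustrative):

    from dataclasses import dataclass, field, fields

    @dataclass
    class QuantArgumentSketch:
        # Mirrors the two fields added in this commit.
        ptq_weight_only: bool = field(default=False, metadata={"help": "Whether to use PTQ weight only"})
        quant_bits: int = field(default=8, metadata={"help": "Quantization bit size"})

    # e.g. request INT4 weight-only PTQ
    args = QuantArgumentSketch(ptq_weight_only=True, quant_bits=4)
    for f in fields(args):
        print(f"--{f.name} = {getattr(args, f.name)}  # {f.metadata['help']}")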
13 changes: 11 additions & 2 deletions llm/causallm/finetune_generation.py
@@ -48,9 +48,9 @@ def main():
     training_args.print_config(quant_args, "Quant")
     training_args.print_config(gen_args, "Generation")
 
-    if sum([quant_args.do_ptq, quant_args.do_qat, training_args.do_train]) > 1:
+    if sum([quant_args.do_ptq, quant_args.do_qat, quant_args.do_gptq, training_args.do_train]) > 1:
         raise ValueError(
-            "--do_train, --do_ptq and --do_qat cannot work at the same time. Please choose only one at a time"
+            "--do_train, --do_ptq, --do_gptq and --do_qat cannot work at the same time. Please choose only one at a time"
         )
 
     # Setup GPU & distributed training
@@ -257,6 +257,15 @@ def compute_metrics_do_generation(eval_preds):
         apply_ptq(quant_args, trainer, ptq_dataloader)
 
+    if quant_args.do_gptq:
+        if isinstance(model, LoRAModel):
+            raise NotImplementedError(
+                "PTQ strategy not supported for LoRA model. Please merge lora parameters to pretrain model first."
+            )
+        from quant import apply_gptq
+
+        apply_gptq(quant_args, trainer, ptq_dataloader)
+
     # Evaluation dev set
     if training_args.do_eval:
         eval_result = trainer.evaluate(dev_ds)
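Note: the guard above counts requested modes by summing booleans, which extends cleanly to any number of mutually exclusive flags. A standalone sketch of the same idiom (the helper name and flag set are illustrative, not part of the commit):

    def check_exclusive(**modes: bool) -> None:
        # Summing booleans counts how many modes were switched on.
        if sum(modes.values()) > 1:
            chosen = ", ".join(f"--{name}" for name, on in modes.items() if on)
            raise ValueError(f"Choose only one at a time, got: {chosen}")

    check_exclusive(do_train=False, do_ptq=True, do_gptq=False, do_qat=False)   # passes
    # check_exclusive(do_train=True, do_ptq=True, do_gptq=False, do_qat=False)  # raises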
30 changes: 26 additions & 4 deletions llm/causallm/quant.py
@@ -20,17 +20,19 @@
 from paddle.quantization import PTQ, QAT, QuantConfig
 from paddle.quantization.quanters.abs_max import FakeQuanterWithAbsMaxObserverLayer
 from paddleslim.quant.advanced import (
+    GPTQ,
     EMASampler,
     MultiStepSampler,
     PieceWiseSearch,
     Shift,
     Smooth,
 )
+from paddleslim.quant.advanced.utils import find_parent_layer_and_sub_name
 from paddleslim.quant.layers import (
     QuantizedColumnParallelLinear,
     QuantizedRowParallelLinear,
 )
-from paddleslim.quant.observers import AbsMaxChannelWiseWeightObserver, AbsmaxObserver
+from paddleslim.quant.observers import AbsMaxChannelWiseWeightObserver, AVGObserver
 from paddleslim.quant.quanters import PACTQuanter
 
 from paddlenlp.peft import PrefixModelForCausalLM
@@ -93,7 +95,7 @@ def apply_smooth(quant_args, trainer, ptq_dataloader, ptq_model_config):
             search_scale_min=1.0,
             search_scale_max=5.0,
             weight_quant_method="abs_max_channel_wise",
-            act_quant_method="abs_max",
+            act_quant_method="avg",
         )
     else:
         search_func = None
@@ -117,8 +119,8 @@ def apply_smooth(quant_args, trainer, ptq_dataloader, ptq_model_config):
 
 def apply_ptq(quant_args, trainer, ptq_dataloader):
     q_config = QuantConfig(activation=None, weight=None)
-    act_quanter = AbsmaxObserver()
-    weight_quanter = AbsMaxChannelWiseWeightObserver()
+    act_quanter = AVGObserver() if not quant_args.ptq_weight_only else None
+    weight_quanter = AbsMaxChannelWiseWeightObserver(quant_bits=quant_args.quant_bits)
     q_config.add_qat_layer_mapping(ColumnParallelLinear, QuantizedColumnParallelLinear)
     q_config.add_qat_layer_mapping(RowParallelLinear, QuantizedRowParallelLinear)
     q_config.add_type_config(
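Note on the weight-only branch above: when ptq_weight_only is set, the activation observer becomes None, so only the weights of matmul layers are observed and quantized. A minimal sketch of that configuration on a toy model (the toy model and calibration loop are assumptions of this sketch; the observer wiring mirrors apply_ptq):

    import paddle
    from paddle.quantization import PTQ, QuantConfig
    from paddleslim.quant.observers import AbsMaxChannelWiseWeightObserver

    model = paddle.nn.Sequential(paddle.nn.Linear(16, 16), paddle.nn.ReLU())

    q_config = QuantConfig(activation=None, weight=None)
    q_config.add_type_config(
        paddle.nn.Linear,
        activation=None,  # weight-only: activations are left unquantized
        weight=AbsMaxChannelWiseWeightObserver(quant_bits=8),
    )

    ptq = PTQ(q_config)
    model = ptq.quantize(model, inplace=True)
    for _ in range(8):  # calibration forward passes, mirroring ptq_step=8
        model(paddle.randn([2, 16]))
    model = ptq.convert(model, inplace=True)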
@@ -137,6 +139,26 @@ def apply_ptq(quant_args, trainer, ptq_dataloader):
     trainer.model = ptq.convert(trainer.model, inplace=True)
 
 
+def apply_gptq(quant_args, trainer, ptq_dataloader):
+    num_layer = 0
+    model = trainer.model
+    for cur_name, cur_layer in model.named_sublayers():
+        if type(cur_layer) in [paddle.nn.Linear, ColumnParallelLinear, RowParallelLinear]:
+            num_layer += 1
+            print("GPTQ layer", num_layer, cur_name)
+            parent_layer, sub_name = find_parent_layer_and_sub_name(model, cur_name)
+            cur_quant_layer = GPTQ(cur_layer)
+            setattr(parent_layer, sub_name, cur_quant_layer)
+            trainer.ptq_loop(
+                ptq_dataloader,
+                description="PTQ",
+                max_eval_iters=quant_args.ptq_step,
+            )
+            cur_quant_layer.fasterquant(percdamp=0.1, groupsize=-1, actorder=True)
+            del cur_quant_layer
+            setattr(parent_layer, sub_name, cur_layer)
+
+
 def get_ptq_model_config(model):
     if isinstance(model, PrefixModelForCausalLM):
         base_model_prefix = model.model.base_model_prefix
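Note: apply_gptq quantizes one matmul layer at a time — wrap the layer with GPTQ, run calibration batches so the wrapper accumulates the second-order statistics GPTQ needs, call fasterquant to solve for quantized weights, then swap the original layer back in. A standalone sketch of that cycle (a toy Linear and a random-batch loop stand in for trainer.model and trainer.ptq_loop; driving the wrapper directly is an assumption of this sketch):

    import paddle
    from paddleslim.quant.advanced import GPTQ

    layer = paddle.nn.Linear(64, 64)
    quant_layer = GPTQ(layer)  # wrap the target layer

    # Calibration: forward a few batches through the wrapper so it can
    # collect the statistics GPTQ needs.
    for _ in range(8):  # mirrors ptq_step=8
        quant_layer(paddle.randn([4, 64]))

    # Solve for the quantized weights; percdamp/groupsize/actorder match
    # the values hard-coded in apply_gptq above.
    quant_layer.fasterquant(percdamp=0.1, groupsize=-1, actorder=True)
    # fasterquant rewrites the wrapped layer's weights in place, which is
    # why apply_gptq can restore cur_layer afterwards and drop the wrapper.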
