[Inference] Add a8w8(fp8) a8w8c8(int8) quant_type support (#9032)
* 1. add a8w8(fp8) a8w8c8(int8) quant_type support
2. add llama3.1 and qwen2 ptq config
3. update quantization.md

* fix load_quant_model bug

* fix load quant bug

* update llm/README.md
lixcli authored Aug 28, 2024
1 parent 3e7c5ca commit 19927ba
Showing 38 changed files with 2,559 additions and 79 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -124,3 +124,6 @@ FETCH_HEAD
# vscode
.vscode
./ppdiffusers/ppdiffusers/version.py

dataset/
output/
16 changes: 15 additions & 1 deletion llm/README.md
@@ -210,7 +210,15 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./alignment/dpo
</div>
<div align="center">
<font size ="1">
Results of PaddlePaddle quantization algorithms
Results of PaddlePaddle W4 and W8A8 quantization algorithms
</font>
</div>
<div align="center">
<img width="300" alt="llm" src="https://github.com/user-attachments/assets/ab8d04ba-d589-4f54-acf1-b00c0fd9159e">
</div>
<div align="center">
<font size ="1">
Results of PaddlePaddle W8A8C8 and FP8 quantization
</font>
</div>

@@ -220,6 +228,12 @@ python run_finetune.py ./config/llama/ptq_argument.json

# GPTQ quantization launch command reference
python run_finetune.py ./config/llama/gptq_argument.json

# W8A8C8 (INT) quantization launch command reference
python run_finetune.py ./config/llama/ptq_c8_argument.json

# W8A8 (FP8) quantization launch command reference
python run_finetune.py ./config/llama/fp8_ptq_argument.json
```

For more technical details and model quantization usage, see the [quantization documentation](./docs/quantization.md).
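The two new recipes differ in the numeric format of the 8-bit tensors: W8A8C8 (INT) keeps weights, activations, and the KV cache in INT8, while W8A8 (FP8) uses the float8 E4M3 format. As a rough illustration of that difference under abs_max scaling (the method named in the configs below), here is a NumPy sketch using the ml_dtypes package for float8; it is not PaddleNLP's actual kernel code:

```python
# Illustrative contrast between INT8 and FP8-E4M3 abs_max quantization.
# NumPy + ml_dtypes stand in for PaddleNLP's real fused kernels.
import numpy as np
import ml_dtypes  # supplies float8 dtypes for NumPy

def quantize_int8_absmax(x: np.ndarray):
    """Symmetric INT8: map the tensor's abs max onto 127."""
    scale = np.abs(x).max() / 127.0
    q = np.clip(np.round(x / scale), -127, 127).astype(np.int8)
    return q, scale

def quantize_fp8_e4m3_absmax(x: np.ndarray):
    """FP8: rescale so the abs max lands on E4M3's max finite value (448)."""
    scale = np.abs(x).max() / 448.0
    q = (x / scale).astype(ml_dtypes.float8_e4m3fn)
    return q, scale

w = np.random.randn(256, 256).astype(np.float32)
for fn in (quantize_int8_absmax, quantize_fp8_e4m3_absmax):
    q, s = fn(w)
    err = np.abs(q.astype(np.float32) * s - w).max()
    print(fn.__name__, "max reconstruction error:", err)
```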
26 changes: 26 additions & 0 deletions llm/config/llama/AdvertiseGen/w8a8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
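Note on the smooth_* options above: they enable SmoothQuant-style outlier migration before quantization. A minimal sketch of the idea, assuming the usual per-channel formula s_j = max|X_j|^alpha / max|W_j|^(1-alpha); the search implied by "smooth_piecewise_search" with "smooth_k_piece": 3 would tune alpha per channel group rather than use one global value as here:

```python
# Sketch of SmoothQuant-style smoothing behind the "smooth_*" options.
# Assumed semantics: scale activations down and weights up per input channel
# so the matmul result is unchanged but activations are easier to quantize.
import numpy as np

def smooth(x: np.ndarray, w: np.ndarray, alpha: float = 0.5):
    x_max = np.abs(x).max(axis=0) + 1e-8   # per input channel of x
    w_max = np.abs(w).max(axis=1) + 1e-8   # per input channel of w [in, out]
    s = (x_max ** alpha) / (w_max ** (1.0 - alpha))
    return x / s, w * s[:, None]           # (x/s) @ (s*w) == x @ w

x = np.random.randn(64, 512).astype(np.float32)
w = np.random.randn(512, 512).astype(np.float32)
x_s, w_s = smooth(x, w)
print(np.allclose(x_s @ w_s, x @ w, atol=1e-3))  # product preserved
```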
26 changes: 26 additions & 0 deletions llm/config/llama/AdvertiseGen/w8a8c8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8c8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
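Relative to the a8w8 config above, quant_type "a8w8c8" additionally quantizes the KV cache (C8), and the *_headwise methods compute one scale per attention head instead of per tensor. A hedged sketch of what "avg_headwise" plausibly computes (per-step abs max per head, averaged over calibration steps; the real PaddleSlim observer may differ):

```python
# Hedged sketch of "avg_headwise" cache-KV calibration: one INT8 scale per
# attention head, from per-step abs-max values averaged over PTQ steps.
import numpy as np

def headwise_absmax(kv: np.ndarray) -> np.ndarray:
    # kv: [batch, num_heads, seq_len, head_dim] -> abs max per head
    return np.abs(kv).max(axis=(0, 2, 3))

calib_steps = [np.random.randn(2, 32, 128, 128).astype(np.float32)
               for _ in range(16)]  # mirrors "ptq_step": 16
scales = np.mean([headwise_absmax(kv) for kv in calib_steps], axis=0) / 127.0
print(scales.shape)  # (32,): one scale per head, vs. a single per-tensor scale
```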
24 changes: 24 additions & 0 deletions llm/config/llama/AdvertiseGen/wfp8afp8_ptq_argument.json
@@ -0,0 +1,24 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"use_fp8": "WA",
"fp8_type": ["e4m3", "e4m3"],
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max"
}
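In this FP8 variant, "use_fp8": "WA" presumably selects float8 for both Weights and Activations, and "fp8_type" names the format for each (E4M3 for both here). E4M3 trades dynamic range for precision relative to E5M2; a quick comparison of the two common FP8 formats via the ml_dtypes package (illustration only, not part of the config pipeline):

```python
# Range vs. precision of the two common FP8 formats, via ml_dtypes.
# E4M3 (chosen in "fp8_type") has finer precision; E5M2 has more range.
import ml_dtypes

for dt in (ml_dtypes.float8_e4m3fn, ml_dtypes.float8_e5m2):
    info = ml_dtypes.finfo(dt)
    print(dt.__name__, "max:", float(info.max), "eps:", float(info.eps))
```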
37 changes: 37 additions & 0 deletions llm/config/llama/ceval/ceval_w8a8_ptq_argument.json
@@ -0,0 +1,37 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": false,
"ptq_step": 1,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 8,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 1,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise",
"load_quant_model": true,
"do_ceval": true,
"cot": false,
"few_shot": true,
"ntrain": 5,
"with_prompt": false,
"constrained_decoding": true,
"temperature": 0.2,
"n_times": 1,
"do_save_csv": false,
"do_test": false,
"ceval_data_path": "../dataset/ceval"
}
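This config skips calibration ("do_ptq": false), reloads an already-quantized checkpoint via "load_quant_model", and runs 5-shot C-Eval. With "constrained_decoding" enabled, the evaluator presumably scores only the multiple-choice option letters rather than free-form generations; a hypothetical sketch of that selection step (the option token ids below are invented placeholders):

```python
# Hypothetical sketch of constrained decoding for C-Eval multiple choice:
# compare next-token logits only over the option letters and take the best.
import numpy as np

def pick_option(next_token_logits: np.ndarray, option_ids: dict) -> str:
    return max(option_ids, key=lambda opt: next_token_logits[option_ids[opt]])

logits = np.random.randn(128_256).astype(np.float32)  # assumed vocab size
print(pick_option(logits, {"A": 32, "B": 33, "C": 34, "D": 35}))  # placeholder ids
```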
25 changes: 25 additions & 0 deletions llm/config/llama/ceval/cevel_wfp8afp8_ptq_argument.json
@@ -0,0 +1,25 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"use_fp8": "WA",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": false,
"ptq_step": 1,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max",
"load_quant_model": true,
"do_ceval": true
}
26 changes: 26 additions & 0 deletions llm/config/llama/ceval/w8a8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
26 changes: 26 additions & 0 deletions llm/config/llama/ceval/w8a8c8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8c8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/w8a8c8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max_headwise"
}
23 changes: 23 additions & 0 deletions llm/config/llama/ceval/wfp8afp8_ptq_argument.json
@@ -0,0 +1,23 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"use_fp8": "WA",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max"
}
26 changes: 26 additions & 0 deletions llm/config/llama/ceval_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_ckpts",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": false,
"ptq_step": 1,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 8,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 1,
"smooth_search_piece": true,
"load_quant_model": true,
"do_ceval": true,
"ceval_data_path": "../dataset/ceval"
}
24 changes: 24 additions & 0 deletions llm/config/llama/fp8_ptq_argument.json
@@ -0,0 +1,24 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"quant_type": "W8A8",
"use_fp8": "WA",
"fp8_type": ["e4m3", "e4m3"],
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_ckpts",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max"
}
26 changes: 26 additions & 0 deletions llm/config/llama/ptq_c8_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"quant_type": "a8w8c8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_c8_ckpts",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
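This is the default W8A8C8 recipe referenced by the llm/README.md command above. Its "act_quant_method": "avg" suggests activation scales are per-step abs-max values averaged over the calibration batches; a minimal sketch under that assumption:

```python
# Minimal sketch of an "avg" activation observer (assumed behavior: average
# the per-step abs max over "ptq_step" calibration batches).
import numpy as np

class AvgActObserver:
    def __init__(self):
        self.step_maxes: list[float] = []

    def observe(self, act: np.ndarray) -> None:
        self.step_maxes.append(float(np.abs(act).max()))

    def int8_scale(self) -> float:
        return float(np.mean(self.step_maxes)) / 127.0

obs = AvgActObserver()
for _ in range(16):  # mirrors "ptq_step": 16
    obs.observe(np.random.randn(8, 2048).astype(np.float32))
print(obs.int8_scale())
```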
26 changes: 26 additions & 0 deletions llm/config/qwen/AdvertiseGen/w8a8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "Qwen/Qwen2-7B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/qwen2/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max_headwise"
}
