[Inference] Add a8w8(fp8) a8w8c8(int8) quant_type support (#9032)
* 1. add a8w8(fp8) a8w8c8(int8) quant_type support
2. add llama3.1 and qwen2 ptq config
3. update quantization.md

* fix load_quant_model bug

* fix load quant bug

* update llm/README.md
lixcli authored Aug 28, 2024
1 parent 3e7c5ca commit 19927ba
Showing 38 changed files with 2,559 additions and 79 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -124,3 +124,6 @@ FETCH_HEAD
# vscode
.vscode
./ppdiffusers/ppdiffusers/version.py

dataset/
output/
16 changes: 15 additions & 1 deletion llm/README.md
@@ -210,7 +210,15 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./alignment/dpo
</div>
<div align="center">
<font size ="1">
Results of PaddlePaddle quantization algorithms
Results of PaddlePaddle W4 and W8A8 quantization algorithms
</font>
</div>
<div align="center">
<img width="300" alt="llm" src="https://github.com/user-attachments/assets/ab8d04ba-d589-4f54-acf1-b00c0fd9159e">
</div>
<div align="center">
<font size ="1">
Results of PaddlePaddle W8A8C8 and FP8 quantization
</font>
</div>

@@ -220,6 +228,12 @@ python run_finetune.py ./config/llama/ptq_argument.json

# GPTQ quantization launch command reference
python run_finetune.py ./config/llama/gptq_argument.json

# W8A8C8 (INT) quantization launch command reference
python run_finetune.py ./config/llama/ptq_c8_argument.json

# W8A8 (FP8) quantization launch command reference
python run_finetune.py ./config/llama/fp8_ptq_argument.json
```

For more technical details and model quantization usage, see the [quantization documentation](./docs/quantization.md).
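The two new recipes differ in the numeric format of the 8-bit tensors: W8A8C8 (INT) keeps weights, activations, and the KV cache in INT8, while W8A8 (FP8) uses the float8 E4M3 format. As a rough illustration of that difference under abs_max scaling (the method named in the configs below), here is a NumPy sketch using the ml_dtypes package for float8; it is not PaddleNLP's actual kernel code:

```python
# Illustrative contrast between INT8 and FP8-E4M3 abs_max quantization.
# NumPy + ml_dtypes stand in for PaddleNLP's real fused kernels.
import numpy as np
import ml_dtypes  # supplies float8 dtypes for NumPy

def quantize_int8_absmax(x: np.ndarray):
    """Symmetric INT8: map the tensor's abs max onto 127."""
    scale = np.abs(x).max() / 127.0
    q = np.clip(np.round(x / scale), -127, 127).astype(np.int8)
    return q, scale

def quantize_fp8_e4m3_absmax(x: np.ndarray):
    """FP8: rescale so the abs max lands on E4M3's max finite value (448)."""
    scale = np.abs(x).max() / 448.0
    q = (x / scale).astype(ml_dtypes.float8_e4m3fn)
    return q, scale

w = np.random.randn(256, 256).astype(np.float32)
for fn in (quantize_int8_absmax, quantize_fp8_e4m3_absmax):
    q, s = fn(w)
    err = np.abs(q.astype(np.float32) * s - w).max()
    print(fn.__name__, "max reconstruction error:", err)
```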
26 changes: 26 additions & 0 deletions llm/config/llama/AdvertiseGen/w8a8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
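Note on the smooth_* options above: they enable SmoothQuant-style outlier migration before quantization. A minimal sketch of the idea, assuming the usual per-channel formula s_j = max|X_j|^alpha / max|W_j|^(1-alpha); the search implied by "smooth_piecewise_search" with "smooth_k_piece": 3 would tune alpha per channel group rather than use one global value as here:

```python
# Sketch of SmoothQuant-style smoothing behind the "smooth_*" options.
# Assumed semantics: scale activations down and weights up per input channel
# so the matmul result is unchanged but activations are easier to quantize.
import numpy as np

def smooth(x: np.ndarray, w: np.ndarray, alpha: float = 0.5):
    x_max = np.abs(x).max(axis=0) + 1e-8   # per input channel of x
    w_max = np.abs(w).max(axis=1) + 1e-8   # per input channel of w [in, out]
    s = (x_max ** alpha) / (w_max ** (1.0 - alpha))
    return x / s, w * s[:, None]           # (x/s) @ (s*w) == x @ w

x = np.random.randn(64, 512).astype(np.float32)
w = np.random.randn(512, 512).astype(np.float32)
x_s, w_s = smooth(x, w)
print(np.allclose(x_s @ w_s, x @ w, atol=1e-3))  # product preserved
```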
26 changes: 26 additions & 0 deletions llm/config/llama/AdvertiseGen/w8a8c8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8c8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
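Relative to the a8w8 config above, quant_type "a8w8c8" additionally quantizes the KV cache (C8), and the *_headwise methods compute one scale per attention head instead of per tensor. A hedged sketch of what "avg_headwise" plausibly computes (per-step abs max per head, averaged over calibration steps; the real PaddleSlim observer may differ):

```python
# Hedged sketch of "avg_headwise" cache-KV calibration: one INT8 scale per
# attention head, from per-step abs-max values averaged over PTQ steps.
import numpy as np

def headwise_absmax(kv: np.ndarray) -> np.ndarray:
    # kv: [batch, num_heads, seq_len, head_dim] -> abs max per head
    return np.abs(kv).max(axis=(0, 2, 3))

calib_steps = [np.random.randn(2, 32, 128, 128).astype(np.float32)
               for _ in range(16)]  # mirrors "ptq_step": 16
scales = np.mean([headwise_absmax(kv) for kv in calib_steps], axis=0) / 127.0
print(scales.shape)  # (32,): one scale per head, vs. a single per-tensor scale
```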
24 changes: 24 additions & 0 deletions llm/config/llama/AdvertiseGen/wfp8afp8_ptq_argument.json
@@ -0,0 +1,24 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"use_fp8": "WA",
"fp8_type": ["e4m3", "e4m3"],
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max"
}
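In this FP8 variant, "use_fp8": "WA" presumably selects float8 for both Weights and Activations, and "fp8_type" names the format for each (E4M3 for both here). E4M3 trades dynamic range for precision relative to E5M2; a quick comparison of the two common FP8 formats via the ml_dtypes package (illustration only, not part of the config pipeline):

```python
# Range vs. precision of the two common FP8 formats, via ml_dtypes.
# E4M3 (chosen in "fp8_type") has finer precision; E5M2 has more range.
import ml_dtypes

for dt in (ml_dtypes.float8_e4m3fn, ml_dtypes.float8_e5m2):
    info = ml_dtypes.finfo(dt)
    print(dt.__name__, "max:", float(info.max), "eps:", float(info.eps))
```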
37 changes: 37 additions & 0 deletions llm/config/llama/ceval/ceval_w8a8_ptq_argument.json
@@ -0,0 +1,37 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": false,
"ptq_step": 1,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 8,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 1,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise",
"load_quant_model": true,
"do_ceval": true,
"cot": false,
"few_shot": true,
"ntrain": 5,
"with_prompt": false,
"constrained_decoding": true,
"temperature": 0.2,
"n_times": 1,
"do_save_csv": false,
"do_test": false,
"ceval_data_path": "../dataset/ceval"
}
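This config skips calibration ("do_ptq": false), reloads an already-quantized checkpoint via "load_quant_model", and runs 5-shot C-Eval. With "constrained_decoding" enabled, the evaluator presumably scores only the multiple-choice option letters rather than free-form generations; a hypothetical sketch of that selection step (the option token ids below are invented placeholders):

```python
# Hypothetical sketch of constrained decoding for C-Eval multiple choice:
# compare next-token logits only over the option letters and take the best.
import numpy as np

def pick_option(next_token_logits: np.ndarray, option_ids: dict) -> str:
    return max(option_ids, key=lambda opt: next_token_logits[option_ids[opt]])

logits = np.random.randn(128_256).astype(np.float32)  # assumed vocab size
print(pick_option(logits, {"A": 32, "B": 33, "C": 34, "D": 35}))  # placeholder ids
```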
25 changes: 25 additions & 0 deletions llm/config/llama/ceval/cevel_wfp8afp8_ptq_argument.json
@@ -0,0 +1,25 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"use_fp8": "WA",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": false,
"ptq_step": 1,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max",
"load_quant_model": true,
"do_ceval": true
}
26 changes: 26 additions & 0 deletions llm/config/llama/ceval/w8a8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
26 changes: 26 additions & 0 deletions llm/config/llama/ceval/w8a8c8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8c8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/w8a8c8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max_headwise"
}
23 changes: 23 additions & 0 deletions llm/config/llama/ceval/wfp8afp8_ptq_argument.json
@@ -0,0 +1,23 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"quant_type": "a8w8",
"use_fp8": "WA",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/ceval_ptq",
"output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_ceval",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max"
}
26 changes: 26 additions & 0 deletions llm/config/llama/ceval_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_ckpts",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": false,
"ptq_step": 1,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 8,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 1,
"smooth_search_piece": true,
"load_quant_model": true,
"do_ceval": true,
"ceval_data_path": "../dataset/ceval"
}
24 changes: 24 additions & 0 deletions llm/config/llama/fp8_ptq_argument.json
@@ -0,0 +1,24 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"quant_type": "W8A8",
"use_fp8": "WA",
"fp8_type": ["e4m3", "e4m3"],
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_ckpts",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": false,
"weight_quant_method": "abs_max",
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max"
}
26 changes: 26 additions & 0 deletions llm/config/llama/ptq_c8_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"quant_type": "a8w8c8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_c8_ckpts",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "avg",
"cachekv_quant_method": "avg_headwise"
}
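This is the default W8A8C8 recipe referenced by the llm/README.md command above. Its "act_quant_method": "avg" suggests activation scales are per-step abs-max values averaged over the calibration batches; a minimal sketch under that assumption:

```python
# Minimal sketch of an "avg" activation observer (assumed behavior: average
# the per-step abs max over "ptq_step" calibration batches).
import numpy as np

class AvgActObserver:
    def __init__(self):
        self.step_maxes: list[float] = []

    def observe(self, act: np.ndarray) -> None:
        self.step_maxes.append(float(np.abs(act).max()))

    def int8_scale(self) -> float:
        return float(np.mean(self.step_maxes)) / 127.0

obs = AvgActObserver()
for _ in range(16):  # mirrors "ptq_step": 16
    obs.observe(np.random.randn(8, 2048).astype(np.float32))
print(obs.int8_scale())
```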
26 changes: 26 additions & 0 deletions llm/config/qwen/AdvertiseGen/w8a8_ptq_argument.json
@@ -0,0 +1,26 @@
{
"model_name_or_path": "Qwen/Qwen2-7B-Instruct",
"quant_type": "a8w8",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"src_length": 1024,
"max_length": 2048,
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "../dataset/AdvertiseGen",
"output_dir": "../output/qwen2/w8a8_ptq_ckpts_AdvertiseGen",
"do_eval": true,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"unified_checkpoint": false,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true,
"act_quant_method": "abs_max",
"cachekv_quant_method": "abs_max_headwise"
}
