From ed30fe0c4db6b85f017c78727fcbef87e338b86e Mon Sep 17 00:00:00 2001
From: shibing624
Date: Mon, 23 Oct 2023 15:40:32 +0800
Subject: [PATCH] update readme.

---
 README.md                |  8 ++++++--
 inference.py             |  4 ++--
 supervised_finetuning.py | 21 +++++++++++++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index a9aac92..f8caf22 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,8 @@ Supervised Finetuning, RLHF(Reward Modeling and Reinforcement Learning) and DPO(
 - The DPO method comes from the paper [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/pdf/2305.18290.pdf)
 
 ## 🔥 News
+[2023/10/23] v1.6: added RoPE interpolation to extend the context length of GPT models; added support for [FlashAttention-2](https://github.com/Dao-AILab/flash-attention) and the **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for LLaMA models; added support for [NEFTune](https://github.com/neelsjain/NEFTune), which trains with noise added to the embeddings. See [Release-v1.6](https://github.com/shibing624/MedicalGPT/releases/tag/1.6.0)
+
 [2023/08/28] v1.5: added the [DPO (Direct Preference Optimization)](https://arxiv.org/pdf/2305.18290.pdf) method. DPO achieves precise control over a language model's behavior by optimizing it directly, and can learn human preferences effectively. See [Release-v1.5](https://github.com/shibing624/MedicalGPT/releases/tag/1.5.0)
 
 [2023/08/08] v1.4: released the Chinese-English Vicuna-13B model [shibing624/vicuna-baichuan-13b-chat](https://huggingface.co/shibing624/vicuna-baichuan-13b-chat), fine-tuned on the ShareGPT4 dataset, and the corresponding LoRA model [shibing624/vicuna-baichuan-13b-chat-lora](https://huggingface.co/shibing624/vicuna-baichuan-13b-chat-lora). See [Release-v1.4](https://github.com/shibing624/MedicalGPT/releases/tag/1.4.0)
@@ -196,8 +198,8 @@ CUDA_VISIBLE_DEVICES=0 python inference.py \
 - `--tokenizer_path {tokenizer_path}`: directory holding the matching tokenizer. If omitted, defaults to the value of --base_model
 - `--template_name`: prompt template name, e.g. `vicuna`, `alpaca`. If omitted, defaults to vicuna
 - `--interactive`: start interactive multi-turn Q&A with streaming inference
-- `--data_file {file_name}`: in non-interactive mode, read file_name line by line and predict each line
-- `--predictions_file {file_name}`: in non-interactive mode, write predictions to file_name in JSON format
+- `--data_file {file_name}`: in non-interactive mode, read the contents of file_name and run batch prediction
+- `--output_file {file_name}`: in non-interactive mode, write predictions to file_name in JSONL format
 - `--resize_emb`: whether to resize the embedding size; if not set, the pretrained model's embedding size is kept (default: not resized)
 - `--only_cpu`: run inference on CPU only
 - `--gpus {gpu_ids}`: GPU device IDs to use, default 0. Separate multiple GPUs with commas, e.g. 0,1,2
@@ -330,5 +332,7 @@ CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 inference_multigpu_demo.py
 - [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/pdf/2305.18290.pdf)
 - [tloen/alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/finetune.py)
 - [ymcui/Chinese-LLaMA-Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [hiyouga/LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
+- [dvlab-research/LongLoRA](https://github.com/dvlab-research/LongLoRA)
 
 Thanks for their great work!
diff --git a/inference.py b/inference.py
index ff97379..4dd6cc1 100644
--- a/inference.py
+++ b/inference.py
@@ -229,7 +229,7 @@ def main():
             history[-1][-1] = response.strip()
     else:
         print("Start inference.")
-        counts = []
+        counts = 0
         if os.path.exists(args.output_file):
             os.remove(args.output_file)
         eval_batch_size = args.eval_batch_size
@@ -251,7 +251,7 @@
                 repetition_penalty=args.repetition_penalty,
             )
             results = []
-            for example, response in enumerate(batch, responses):
+            for example, response in zip(batch, responses):
                 print(f"===")
                 print(f"Input: {example}")
                 print(f"Output: {response}\n")
diff --git a/supervised_finetuning.py b/supervised_finetuning.py
index cfff745..247903e 100644
--- a/supervised_finetuning.py
+++ b/supervised_finetuning.py
@@ -21,6 +21,7 @@
 import os
 from dataclasses import dataclass, field
 from glob import glob
+from types import MethodType
 from typing import Literal, Optional, Tuple, List, Dict, Sequence
 
 import torch
@@ -133,6 +134,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."}
     )
+    neft_alpha: Optional[float] = field(
+        default=0,
+        metadata={"help": "The alpha parameter controlling the noise magnitude in NEFTune. A typical value is 5."}
+    )
 
     def __post_init__(self):
         if self.model_type is None:
@@ -1167,6 +1172,22 @@ def filter_empty_labels(example):
                 bnb_4bit_compute_dtype=torch_dtype,
             ) if training_args.qlora else None,
         )
+
+        # Apply the NEFTune trick: add uniform noise to the input embeddings for fine-tuning
+        if model_args.neft_alpha > 0:
+            input_embed = model.get_input_embeddings()
+            if isinstance(input_embed, torch.nn.Embedding):
+                def noisy_forward(self: torch.nn.Embedding, x: torch.Tensor) -> torch.Tensor:
+                    embeddings = input_embed.__class__.forward(self, x)  # original embedding lookup
+                    dims = self.num_embeddings * self.embedding_dim
+                    mag_norm = model_args.neft_alpha / (dims ** 0.5)  # noise scale: alpha / sqrt(dims)
+                    embeddings += torch.zeros_like(embeddings).uniform_(-mag_norm, mag_norm)
+                    return embeddings
+
+                input_embed.forward = MethodType(noisy_forward, input_embed)
+                logger.info("Using noisy embedding with alpha={:.2f}".format(model_args.neft_alpha))
+            else:
+                logger.warning("Input embeddings are not a standard nn.Embedding; cannot wrap them with NEFTune noise.")
     else:
         raise ValueError(f"Error, model_name_or_path is None, SFT must be loaded from a pre-trained model")
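
The v1.6 news entry mentions RoPE interpolation for extending context length, but that code path is not shown in this patch. As a rough illustration of the idea only: recent Hugging Face transformers releases (4.31+) expose linear RoPE scaling on LLaMA-style configs through a `rope_scaling` dict. The model name and factor below are assumptions for the sketch, not values from this repository.

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Linear RoPE interpolation: position indices are divided by `factor`,
# so a model pre-trained with a 4k context window can attend over
# roughly 4k * factor tokens after fine-tuning at the longer length.
name = "meta-llama/Llama-2-7b-hf"  # illustrative base model
config = AutoConfig.from_pretrained(name)
config.rope_scaling = {"type": "linear", "factor": 2.0}  # ~8k effective context
model = AutoModelForCausalLM.from_pretrained(name, config=config)
```

Interpolation compresses position resolution at short range in exchange for a proportionally longer usable window; fine-tuning at the stretched length typically recovers most of the lost quality.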
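The NEFTune hunk in supervised_finetuning.py is easier to follow outside the diff. Below is a self-contained sketch of the same monkey-patching trick on a bare `nn.Embedding`; the helper name and alpha value are illustrative. Two caveats: the sketch gates the noise on `self.training` (the NEFTune paper injects noise only while training, whereas the hunk above adds it on every forward pass), and the `num_embeddings * embedding_dim` scale mirrors the patch, while the paper scales by sequence length times embedding dimension.

```python
import torch
from types import MethodType

def add_neftune_noise(embed: torch.nn.Embedding, alpha: float) -> None:
    """Patch embed.forward to add uniform NEFTune-style noise while training."""
    def noisy_forward(self: torch.nn.Embedding, x: torch.Tensor) -> torch.Tensor:
        out = torch.nn.Embedding.forward(self, x)  # original lookup
        if self.training:  # inject noise only in training mode
            mag_norm = alpha / (self.num_embeddings * self.embedding_dim) ** 0.5
            out = out + torch.zeros_like(out).uniform_(-mag_norm, mag_norm)
        return out
    embed.forward = MethodType(noisy_forward, embed)  # bind to this instance

# Illustrative usage on a toy embedding (alpha=5 matches the help text's suggestion)
emb = torch.nn.Embedding(32000, 4096)
add_neftune_noise(emb, alpha=5.0)
tokens = torch.randint(0, 32000, (2, 16))
emb.train()
noisy = emb(tokens)   # noisy lookup
emb.eval()
clean = emb(tokens)   # plain lookup
```

Because `nn.Module.__call__` resolves `self.forward` through the instance, assigning a bound method on the instance is enough to override the lookup without subclassing, which is exactly why the patch imports `MethodType`.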