From caec4eaf55dced581a5605525f67e7d41c68b726 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Fri, 4 Aug 2023 20:22:42 +0800
Subject: [PATCH 01/29] fix tokenizer load bug in latest transformers

---
 src/lmflow/models/hf_decoder_model.py | 45 ++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
index 76cd353b9..aed7f751d 100644
--- a/src/lmflow/models/hf_decoder_model.py
+++ b/src/lmflow/models/hf_decoder_model.py
@@ -140,18 +140,41 @@ def __init__(
             "revision": model_args.model_revision,
             "use_auth_token": True if model_args.use_auth_token else None,
         }
-        if model_args.tokenizer_name:
-            tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
-        elif model_args.model_name_or_path:
-            tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
-        else:
-            raise ValueError(
-                "You are instantiating a new tokenizer from scratch. This is"
-                " not supported by this script. You can do it from another"
-                " script, save it, and load it from here, using"
-                " --tokenizer_name."
-            )
+
+        try:
+            if model_args.tokenizer_name:
+                tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
+            elif model_args.model_name_or_path:
+                tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
+            else:
+                raise ValueError(
+                    "You are instantiating a new tokenizer from scratch. This is"
+                    " not supported by this script. You can do it from another"
+                    " script, save it, and load it from here, using"
+                    " --tokenizer_name."
+                )
+        except RecursionError:
+            if model_args.tokenizer_name:
+                tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, unk_token="<unk>",
+                                                          bos_token="<s>",
+                                                          eos_token="</s>",
+                                                          **tokenizer_kwargs)
+            elif model_args.model_name_or_path:
+                tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, unk_token="<unk>",
+                                                          bos_token="<s>",
+                                                          eos_token="</s>",
+                                                          **tokenizer_kwargs)
+            else:
+                raise ValueError(
+                    "You are instantiating a new tokenizer from scratch. This is"
+                    " not supported by this script. You can do it from another"
+                    " script, save it, and load it from here, using"
+                    " --tokenizer_name."
+                )
+
+            logger.warning("The tokenizer_config.json file doesn't set the special tokens. Using default values: <unk>, <s>, </s>")
+
         self.tokenizer = tokenizer
 
         torch_dtype = (
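Stripped of LMFlow's argument plumbing, the fallback this patch introduces behaves like the sketch below (illustrative only; the model name is just a placeholder):

```python
from transformers import AutoTokenizer

model_name = "pinkmanlove/llama-7b-hf"  # placeholder
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except RecursionError:
    # A tokenizer_config.json that omits the special-token definitions can
    # send recent transformers releases into infinite recursion while those
    # tokens are resolved; passing LLaMA's conventional defaults explicitly
    # breaks the cycle.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, unk_token="<unk>", bos_token="<s>", eos_token="</s>"
    )
```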
From eab5c21497505765003c126a3a92bb87bfa98f9f Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sat, 5 Aug 2023 23:59:16 +0800
Subject: [PATCH 02/29] add position interpolation readme

---
 Position_Interpolation_README.md | 40 ++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 Position_Interpolation_README.md

diff --git a/Position_Interpolation_README.md b/Position_Interpolation_README.md
new file mode 100644
index 000000000..df0d469f0
--- /dev/null
+++ b/Position_Interpolation_README.md
@@ -0,0 +1,40 @@
+# Position Interpolation
+Now LMFlow supports the latest Linear & NTK (Neural Tangent Kernel) scaling techniques for LLaMA models. \
+For more details of these techniques, you can check out the links below:
+* Linear scaling: \
+https://arxiv.org/abs/2306.15595
+* NTK scaling: \
+https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+# Usage
+To use the Position Interpolation Techniques, you need to set the following options:
+```
+--truncate_to_model_max_length False
+--do_rope_scaling True
+```
+For linear scaling, set the extending ratio by:
+```
+--rope_pi_ratio 4
+```
+For NTK scaling, set the extending ratio by:
+```
+--rope_ntk_ratio 4
+```
+Here is an example of evaluation bash code:
+```
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 \
+    deepspeed examples/evaluation.py \
+    --answer_type text \
+    --model_name_or_path pinkmanlove/llama-7b-hf \
+    --dataset_path data/wiki_en_eval \
+    --deepspeed examples/ds_config.json \
+    --inference_batch_size_per_device 1 \
+    --truncate_to_model_max_length False \
+    --block_size 4096 \
+    --use_flash_attention True \
+    --do_rope_scaling True \
+    --rope_pi_ratio 2 \
+    --rope_ntk_ratio 4 \
+    --metric ppl
+```
\ No newline at end of file
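For intuition, the two scaling schemes touch rotary position embeddings (RoPE) at different points: linear scaling compresses the position indices, while NTK-aware scaling enlarges the rotary base. A minimal sketch of the math (not LMFlow's actual implementation):

```python
import torch

def rope_angles(positions, dim, base=10000.0):
    # Standard RoPE: angle[t, i] = t / base**(2i/dim)
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    return torch.outer(positions, inv_freq)

def linear_scaled(positions, dim, pi_ratio=4.0):
    # --rope_pi_ratio: squeeze a pi_ratio-times longer context back into
    # the position range the model was trained on.
    return rope_angles(positions / pi_ratio, dim)

def ntk_scaled(positions, dim, ntk_ratio=4.0, base=10000.0):
    # --rope_ntk_ratio: keep positions intact but stretch the base, which
    # mostly dilates the low-frequency (long-range) components.
    return rope_angles(positions, dim, base * ntk_ratio ** (dim / (dim - 2)))
```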
From 162e0cfec4a4b2fa25f4e80281a46868ace6137e Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 00:13:01 +0800
Subject: [PATCH 03/29] add vocab_extension readme

---
 vocab_extension.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 vocab_extension.md

diff --git a/vocab_extension.md b/vocab_extension.md
new file mode 100644
index 000000000..bb5217d1c
--- /dev/null
+++ b/vocab_extension.md
@@ -0,0 +1,22 @@
+# Train & Merge Tokenizer
+To automatically convert data, train a SentencePiece tokenizer, and merge the tokenizer, you can run the following script:
+```
+bash scripts/vocab_extension/train_merge_tokenizer.sh
+```
+Alternatively, you can run each of the three steps separately:
+
+# Convert JSON Data to TXT
+To convert JSON data to TXT for sentencepiece tokenizer training, run:
+```
+bash scripts/vocab_extension/convert_json_to_txt.sh
+```
+# Train SentencePiece Tokenizer
+To train a SentencePiece tokenizer, run:
+```
+bash scripts/vocab_extension/train_tokenizer.sh
+```
+# Merge New Tokenizer with the Original One
+To merge a new tokenizer with the original one, run:
+```
+bash scripts/vocab_extension/merge_tokenizer.sh
+```
\ No newline at end of file
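The three steps the README lists map onto the `sentencepiece` API roughly as follows (a sketch assuming the `sentencepiece_model_pb2` protobuf bindings that ship with the `sentencepiece` package; the file paths are placeholders):

```python
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2

# Train a new tokenizer on plain text (one sentence per line).
spm.SentencePieceTrainer.train(
    input="data/corpus.txt", model_prefix="new_tokenizer",
    model_type="bpe", vocab_size=20000,
)

# Merge: append every piece the new model learned that the original
# vocabulary does not already contain.
old, new = sp_pb2.ModelProto(), sp_pb2.ModelProto()
old.ParseFromString(open("old_tokenizer.model", "rb").read())
new.ParseFromString(open("new_tokenizer.model", "rb").read())
existing = {p.piece for p in old.pieces}
for p in new.pieces:
    if p.piece not in existing:
        old.pieces.append(sp_pb2.ModelProto.SentencePiece(piece=p.piece, score=0.0))
with open("merged_tokenizer.model", "wb") as f:
    f.write(old.SerializeToString())
```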
From 3467db70ae493c5a1906eaed1fe010264ec482f6 Mon Sep 17 00:00:00 2001
From: shizhediao <654745845@qq.com>
Date: Sun, 6 Aug 2023 00:23:50 +0800
Subject: [PATCH 04/29] update wechat qrcode

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index eb14f529b..9cc09360f 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@
 [![Doc](https://img.shields.io/badge/Website-Doc-ff69b4.svg)](https://optimalscale.github.io/LMFlow/)
 [![Embark](https://img.shields.io/badge/Discord-LMFlow-%237289da.svg?logo=discord)](https://discord.gg/u9VJNpzhvA)
 [![slack badge](https://img.shields.io/badge/Slack-Join-blueviolet?logo=slack&)](https://join.slack.com/t/lmflow/shared_invite/zt-1wju9nicy-woXbNtS~5MavHSAtiMxmxQ)
-[![WeChat badge](https://img.shields.io/badge/WeChat-Join-brightgreen?logo=wechat&)](https://i.imgloc.com/2023/07/13/VgJyaZ.jpeg)
+[![WeChat badge](https://img.shields.io/badge/WeChat-Join-brightgreen?logo=wechat&)](https://s1.ax1x.com/2023/08/06/pPAQTPI.jpg)
 
 An extensible, convenient, and efficient toolbox for finetuning large machine learning models, designed to be user-friendly, speedy and reliable, and accessible to the entire community.
@@ -395,7 +395,7 @@ Whether you are a beginner or an expert, we believe that you can benefit from th
 [![Embark](https://img.shields.io/badge/discord-LMFlow-%237289da.svg?logo=discord)](https://discord.gg/u9VJNpzhvA)
 [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/lmflow/shared_invite/zt-1wju9nicy-woXbNtS~5MavHSAtiMxmxQ)
-[![WeChat badge](https://img.shields.io/badge/WeChat-Join-brightgreen?logo=wechat&)](https://i.imgloc.com/2023/07/13/VgJyaZ.jpeg)
+[![WeChat badge](https://img.shields.io/badge/WeChat-Join-brightgreen?logo=wechat&)](https://s1.ax1x.com/2023/08/06/pPAQTPI.jpg)

From 1d9bb83f973bdc088bbb7111680432413a0c463a Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 00:26:22 +0800
Subject: [PATCH 05/29] changed readme position

---
 README.md                                               | 4 ++++
 vocab_extension.md => scripts/vocab_extension/README.md | 0
 2 files changed, 4 insertions(+)
 rename vocab_extension.md => scripts/vocab_extension/README.md (100%)

diff --git a/README.md b/README.md
index eb14f529b..5aa45c0bd 100644
--- a/README.md
+++ b/README.md
@@ -336,6 +336,10 @@ You can config the deepspeed under configs. Details can be referred at [DeepSpee
 
 Thanks to the great efforts of [llama.cpp](https://github.com/ggerganov/llama.cpp). It is possible for everyone to run their LLaMA models on CPU by 4-bit quantization. We provide a script to convert LLaMA LoRA weights to `.pt` files. You only need to use `convert-pth-to-ggml.py` in llama.cpp to perform quantization.
 
+### 4.4 Vocabulary List Extension
+
+Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/scripts/vocab_extension) for more details.
+
 ## 5. Model Release

diff --git a/vocab_extension.md b/scripts/vocab_extension/README.md
similarity index 100%
rename from vocab_extension.md
rename to scripts/vocab_extension/README.md

From 90c7cf86e1a6ee480ec96a54bad0701b5bbcd142 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 00:28:09 +0800
Subject: [PATCH 06/29] fix link bug

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5aa45c0bd..ae393a437 100644
--- a/README.md
+++ b/README.md
@@ -338,7 +338,7 @@ Thanks to the great efforts of [llama.cpp](https://github.com/ggerganov/llama.cp
 
 ### 4.4 Vocabulary List Extension
 
-Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/scripts/vocab_extension) for more details.
+Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/tree/main/scripts/vocab_extension) for more details.
 
 ## 5. Model Release
From 083a69d7b363780f1a73a963b156022710313168 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 00:32:54 +0800
Subject: [PATCH 07/29] change readme position

---
 README.md                        | 4 +++-
 .../Position_Interpolation.md    | 0
 2 files changed, 3 insertions(+), 1 deletion(-)
 rename Position_Interpolation_README.md => readme/Position_Interpolation.md (100%)

diff --git a/README.md b/README.md
index eb14f529b..26762357a 100644
--- a/README.md
+++ b/README.md
@@ -336,7 +336,9 @@ You can config the deepspeed under configs. Details can be referred at [DeepSpee
 
 Thanks to the great efforts of [llama.cpp](https://github.com/ggerganov/llama.cpp). It is possible for everyone to run their LLaMA models on CPU by 4-bit quantization. We provide a script to convert LLaMA LoRA weights to `.pt` files. You only need to use `convert-pth-to-ggml.py` in llama.cpp to perform quantization.
-
+### 4.5 Position Interpolation for LLaMA Models
+Now LMFlow supports the latest Linear & NTK (Neural Tangent Kernel) scaling techniques for LLaMA models. Check [position_interpolation](
+https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details
 ## 5. Model Release
 
 ### 5.1 Medical Model Checkpoints

diff --git a/Position_Interpolation_README.md b/readme/Position_Interpolation.md
similarity index 100%
rename from Position_Interpolation_README.md
rename to readme/Position_Interpolation.md

From c42568f0954ad5f6ce74333b4156697135adaf17 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 00:35:11 +0800
Subject: [PATCH 08/29] fix link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ae393a437..b3b388dfa 100644
--- a/README.md
+++ b/README.md
@@ -338,7 +338,7 @@ Thanks to the great efforts of [llama.cpp](https://github.com/ggerganov/llama.cp
 
 ### 4.4 Vocabulary List Extension
 
-Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/tree/main/scripts/vocab_extension) for more details.
+Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/blob/main/scripts/vocab_extension) for more details.
 
 ## 5. Model Release
From 23e7a67dfe27b07fb3a17e79bfd02b102d91a568 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 10:51:30 +0800
Subject: [PATCH 09/29] change font size of new readmes

---
 readme/Position_Interpolation.md  | 2 +-
 scripts/vocab_extension/README.md | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/readme/Position_Interpolation.md b/readme/Position_Interpolation.md
index df0d469f0..e89e961a8 100644
--- a/readme/Position_Interpolation.md
+++ b/readme/Position_Interpolation.md
@@ -5,7 +5,7 @@ For more details of these techniques, you can check out the links below:
 https://arxiv.org/abs/2306.15595
 * NTK scaling: \
 https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
-# Usage
+## Usage
 To use the Position Interpolation Techniques, you need to set the following options:
 ```
 --truncate_to_model_max_length False
diff --git a/scripts/vocab_extension/README.md b/scripts/vocab_extension/README.md
index bb5217d1c..5582d7d16 100644
--- a/scripts/vocab_extension/README.md
+++ b/scripts/vocab_extension/README.md
@@ -1,21 +1,22 @@
-# Train & Merge Tokenizer
+# Vocab Extension
+## Train & Merge Tokenizer
 To automatically convert data, train a SentencePiece tokenizer, and merge the tokenizer, you can run the following script:
 ```
 bash scripts/vocab_extension/train_merge_tokenizer.sh
 ```
 Alternatively, you can run each of the three steps separately:
 
-# Convert JSON Data to TXT
+## Convert JSON Data to TXT
 To convert JSON data to TXT for sentencepiece tokenizer training, run:
 ```
 bash scripts/vocab_extension/convert_json_to_txt.sh
 ```
-# Train SentencePiece Tokenizer
+## Train SentencePiece Tokenizer
 To train a SentencePiece tokenizer, run:
 ```
 bash scripts/vocab_extension/train_tokenizer.sh
 ```
-# Merge New Tokenizer with the Original One
+## Merge New Tokenizer with the Original One
 To merge a new tokenizer with the original one, run:
 ```
 bash scripts/vocab_extension/merge_tokenizer.sh

From ee4205e517594ee0c69be2303bc6f6658443a85a Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Sun, 6 Aug 2023 10:57:56 +0800
Subject: [PATCH 10/29] fix details in hf_decoder_model load tokenizer

---
 src/lmflow/models/hf_decoder_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
index aed7f751d..571bbc447 100644
--- a/src/lmflow/models/hf_decoder_model.py
+++ b/src/lmflow/models/hf_decoder_model.py
@@ -155,6 +155,7 @@ def __init__(
             )
         except RecursionError:
+            logger.warning("The tokenizer_config.json file doesn't set the special tokens. Using default values: <unk>, <s>, </s> for unknown token, bos token and eos token respectively.")
             if model_args.tokenizer_name:
                 tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, unk_token="<unk>",
                                                           bos_token="<s>",
                                                           eos_token="</s>",
                                                           **tokenizer_kwargs)
             elif model_args.model_name_or_path:
                 tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, unk_token="<unk>",
                                                           bos_token="<s>",
                                                           eos_token="</s>",
                                                           **tokenizer_kwargs)
             else:
                 raise ValueError(
                     "You are instantiating a new tokenizer from scratch. This is"
                     " not supported by this script. You can do it from another"
                     " script, save it, and load it from here, using"
                     " --tokenizer_name."
                 )
-
-            logger.warning("The tokenizer_config.json file doesn't set the special tokens. Using default values: <unk>, <s>, </s>")
 
         self.tokenizer = tokenizer
From d5dc68232bed954d1112532791d56a627430ebce Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 11:13:34 +0800
Subject: [PATCH 11/29] fix vocab extension bugs when using new transformers

---
 scripts/vocab_extension/merge_tokenizer.sh       |  2 +-
 scripts/vocab_extension/train_merge_tokenizer.sh |  2 +-
 utils/merge_tokenizer.py                         | 22 ++++++++++++++-----
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/scripts/vocab_extension/merge_tokenizer.sh b/scripts/vocab_extension/merge_tokenizer.sh
index 4dbb98896..0cd1722c0 100644
--- a/scripts/vocab_extension/merge_tokenizer.sh
+++ b/scripts/vocab_extension/merge_tokenizer.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 mkdir -p ./output_models/new_tokenizer
-python utils/merge_tokenizer.py --tokenizer_dir pinkmanlove/llama-7b-hf \
+python utils/merge_tokenizer.py --tokenizer_dir openlm-research/open_llama_3b \
     --chinese_sp_model_file ./output_models/new_tokenizer/example.model \
     --output_dir ./output_models/merged_tokenizer \
\ No newline at end of file
diff --git a/scripts/vocab_extension/train_merge_tokenizer.sh b/scripts/vocab_extension/train_merge_tokenizer.sh
index 943c81ebe..43398613a 100644
--- a/scripts/vocab_extension/train_merge_tokenizer.sh
+++ b/scripts/vocab_extension/train_merge_tokenizer.sh
@@ -19,5 +19,5 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
 # merge the new tokenizer with the old one
 mkdir -p ./output_models/merged_tokenizer
 python utils/merge_tokenizer.py --chinese_sp_model_file ./output_models/new_tokenizer/example.model \
-    --tokenizer_dir pinkmanlove/llama-7b-hf \
+    --tokenizer_dir openlm-research/open_llama_3b \
     --output_dir ./output_models/merged_tokenizer
\ No newline at end of file
diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py
index 17488f5c7..81cd0109e 100644
--- a/utils/merge_tokenizer.py
+++ b/utils/merge_tokenizer.py
@@ -20,7 +20,7 @@
     os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
 
     parser = argparse.ArgumentParser()
-    parser.add_argument('--tokenizer_dir', default='pinkmanlove/llama-7b-hf', type=str, required=False)
+    parser.add_argument('--tokenizer_dir', default='openlm-research/open_llama_3b', type=str, required=False)
     parser.add_argument('--chinese_sp_model_file', default='./output_models/new_tokenizer/example.model', type=str)
     parser.add_argument('--output_dir', default='./output_models/merged_tokenizer', type=str, required=False)
     args = parser.parse_args()
@@ -30,7 +30,12 @@
     output_dir = args.output_dir
 
     # load
-    old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    try:
+        old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False)
+    except RecursionError:
+        old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, unk_token="<unk>",
+                                                      bos_token="<s>",
+                                                      eos_token="</s>", use_fast=False)
     chinese_sp_model = spm.SentencePieceProcessor()
     chinese_sp_model.Load(chinese_sp_model_file)
 
@@ -56,15 +61,20 @@
     with open(output_sp_dir+'/merged_tokenizer.model', 'wb') as f:
         f.write(old_spm.SerializeToString())
 
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,vocab_file=output_sp_dir+'/merged_tokenizer.model')
-
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,vocab_file=output_sp_dir+'/merged_tokenizer.model', use_fast=False)
+    except RecursionError:
+        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,unk_token="<unk>",
+                                                  bos_token="<s>",
+                                                  eos_token="</s>",
+                                                  vocab_file=output_sp_dir+'/merged_tokenizer.model',
+                                                  use_fast=False)
     tokenizer.save_pretrained(output_hf_dir)
     logging.info(f"Merged tokenizer has been saved to %s",output_dir)
 
     # Test
-    old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
-    new_tokenizer = AutoTokenizer.from_pretrained(output_hf_dir)
+    new_tokenizer = tokenizer
     logging.info(f"Old tokenizer vocab size: %d",len(old_tokenizer))
     logging.info(f"New tokenizer vocab size: %d",len(new_tokenizer))
From 137c65c56013dfbfb90821580f0dc50d4e5af118 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 12:18:03 +0800
Subject: [PATCH 12/29] improve performance of training tokenizer

---
 scripts/vocab_extension/train_merge_tokenizer.sh | 3 ++-
 scripts/vocab_extension/train_tokenizer.sh       | 3 ++-
 utils/merge_tokenizer.py                         | 4 ++--
 utils/train_tokenizer.py                         | 6 ++++--
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/vocab_extension/train_merge_tokenizer.sh b/scripts/vocab_extension/train_merge_tokenizer.sh
index 43398613a..2e63e84b6 100644
--- a/scripts/vocab_extension/train_merge_tokenizer.sh
+++ b/scripts/vocab_extension/train_merge_tokenizer.sh
@@ -14,7 +14,8 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
     --model_type bpe \
     --output_dir ./output_models/new_tokenizer \
     --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
-    --vocab_size 20000
+    --vocab_size 20000 \
+    --max_sentencepiece_length 4
 
 # merge the new tokenizer with the old one
 mkdir -p ./output_models/merged_tokenizer
 python utils/merge_tokenizer.py --chinese_sp_model_file ./output_models/new_tokenizer/example.model \
diff --git a/scripts/vocab_extension/train_tokenizer.sh b/scripts/vocab_extension/train_tokenizer.sh
index f58347b6b..d61275499 100644
--- a/scripts/vocab_extension/train_tokenizer.sh
+++ b/scripts/vocab_extension/train_tokenizer.sh
@@ -4,4 +4,5 @@ python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_dat
     --model_type bpe \
     --output_dir ./output_models/new_tokenizer \
     --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \
-    --vocab_size 20000
\ No newline at end of file
+    --vocab_size 20000 \
+    --max_sentencepiece_length 4
\ No newline at end of file
diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py
index 81cd0109e..e0fdd87e7 100644
--- a/utils/merge_tokenizer.py
+++ b/utils/merge_tokenizer.py
@@ -81,5 +81,5 @@
     text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
 The primary use of LLaMA is research on large language models, including'''
     logging.info(f"Test text:\n %s",text)
-    logging.info(f"Tokenized by LLaMA tokenizer:%s",old_tokenizer.tokenize(text))
-    logging.info(f"Tokenized by Chinese-LLaMA tokenizer:%s",new_tokenizer.tokenize(text))
+    logging.info(f"Tokenized by original tokenizer:%s",old_tokenizer.tokenize(text))
+    logging.info(f"Tokenized by merged tokenizer:%s",new_tokenizer.tokenize(text))
\ No newline at end of file
diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py
index 31b1f79b5..48ce3c2b2 100644
--- a/utils/train_tokenizer.py
+++ b/utils/train_tokenizer.py
@@ -13,6 +13,7 @@
     parser.add_argument('--vocab_size', default=20000, type=int, required=False)
     parser.add_argument('--model_type', default='bpe', type=str, required=False)
     parser.add_argument('--user_defined_symbols', default='0,1,2,3,4,5,6,7,8,9,%', type=str, required=False)
+    parser.add_argument('--max_sentencepiece_length', default=4, type=int, required=False)
     args = parser.parse_args()
 
     dataset_path = args.dataset_path
@@ -20,10 +21,11 @@
     vocab_size = args.vocab_size
     model_type = args.model_type
     user_defined_symbols = args.user_defined_symbols
-
+    max_sentencepiece_length=args.max_sentencepiece_length
+
     def mkdir(path):
         if not os.path.exists(path):
             os.makedirs(path)
 
     mkdir(output_dir)
-    spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols))
+    spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --max_sentencepiece_length={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols,max_sentencepiece_length))
\ No newline at end of file

From 596790c28b311a0666ef3455b040f64feb11b7f0 Mon Sep 17 00:00:00 2001
From: rpan
Date: Mon, 7 Aug 2023 14:51:43 +0800
Subject: [PATCH 13/29] Use fp16 instead of bf16 in default scripts

Since only Ampere-architecture GPUs support bf16, while fp16 is supported
by more types of GPUs.
---
 scripts/run_finetune.sh                                   | 2 +-
 scripts/run_finetune_with_lora.sh                         | 2 +-
 scripts/run_finetune_with_lora_save_aggregated_weights.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/run_finetune.sh b/scripts/run_finetune.sh
index e6287bcd5..a9e766aa6 100755
--- a/scripts/run_finetune.sh
+++ b/scripts/run_finetune.sh
@@ -27,7 +27,7 @@ deepspeed ${deepspeed_args} \
     --block_size 512 \
     --per_device_train_batch_size 1 \
     --deepspeed configs/ds_config_zero3.json \
-    --bf16 \
+    --fp16 \
     --run_name finetune \
     --validation_split_percentage 0 \
     --logging_steps 20 \
diff --git a/scripts/run_finetune_with_lora.sh b/scripts/run_finetune_with_lora.sh
index 57190d600..b57bc58b9 100755
--- a/scripts/run_finetune_with_lora.sh
+++ b/scripts/run_finetune_with_lora.sh
@@ -28,7 +28,7 @@ deepspeed ${deepspeed_args} \
     --lora_r 8 \
     --save_aggregated_lora 0\
     --deepspeed configs/ds_config_zero2.json \
-    --bf16 \
+    --fp16 \
     --run_name finetune_with_lora \
     --validation_split_percentage 0 \
     --logging_steps 20 \
diff --git a/scripts/run_finetune_with_lora_save_aggregated_weights.sh b/scripts/run_finetune_with_lora_save_aggregated_weights.sh
index 46249c40e..70e907d95 100755
--- a/scripts/run_finetune_with_lora_save_aggregated_weights.sh
+++ b/scripts/run_finetune_with_lora_save_aggregated_weights.sh
@@ -29,7 +29,7 @@ deepspeed ${deepspeed_args} \
     --lora_r 8 \
     --save_aggregated_lora 1\
     --deepspeed configs/ds_config_zero2.json \
-    --bf16 \
+    --fp16 \
     --run_name finetune_with_lora \
     --validation_split_percentage 0 \
     --logging_steps 20 \
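If you do run on bf16-capable hardware, the capability check is a single call, so a launcher could pick the dtype instead of hard-coding it (a sketch, not part of the patch):

```python
import torch

# bf16 requires Ampere-class GPUs (e.g., A100/A40) or newer; fp16 works on
# far more models, which is why the default scripts now pass --fp16.
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
print(f"mixed-precision dtype: {dtype}")
```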
From db915332dacd61f9f0e302f60afc61b9bcebe6f2 Mon Sep 17 00:00:00 2001
From: rpan
Date: Mon, 7 Aug 2023 17:24:04 +0800
Subject: [PATCH 14/29] Add data download for finetune scripts

---
 scripts/run_finetune.sh                                   | 3 +++
 scripts/run_finetune_with_lora.sh                         | 3 +++
 scripts/run_finetune_with_lora_save_aggregated_weights.sh | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/scripts/run_finetune.sh b/scripts/run_finetune.sh
index a9e766aa6..4cc56aa5c 100755
--- a/scripts/run_finetune.sh
+++ b/scripts/run_finetune.sh
@@ -14,6 +14,9 @@ output_dir=${project_dir}/output_models/${exp_id}
 log_dir=${project_dir}/log/${exp_id}
 
 dataset_path=${project_dir}/data/alpaca/train
+if [ ! -d ${dataset_path} ]; then
+  cd data && ./download.sh alpaca && cd -
+fi
 
 mkdir -p ${output_dir} ${log_dir}
 
diff --git a/scripts/run_finetune_with_lora.sh b/scripts/run_finetune_with_lora.sh
index b57bc58b9..696e3ca5b 100755
--- a/scripts/run_finetune_with_lora.sh
+++ b/scripts/run_finetune_with_lora.sh
@@ -12,6 +12,9 @@ output_dir=${project_dir}/output_models/${exp_id}
 log_dir=${project_dir}/log/${exp_id}
 
 dataset_path=${project_dir}/data/alpaca/train
+if [ ! -d ${dataset_path} ]; then
+  cd data && ./download.sh alpaca && cd -
+fi
 
 mkdir -p ${output_dir} ${log_dir}
 
diff --git a/scripts/run_finetune_with_lora_save_aggregated_weights.sh b/scripts/run_finetune_with_lora_save_aggregated_weights.sh
index 70e907d95..087db8af1 100755
--- a/scripts/run_finetune_with_lora_save_aggregated_weights.sh
+++ b/scripts/run_finetune_with_lora_save_aggregated_weights.sh
@@ -13,6 +13,9 @@ log_dir=${project_dir}/log/${exp_id}
 
 dataset_path=${project_dir}/data/alpaca/train
 eval_dataset_path=${project_dir}/data/alpaca/test
+if [ ! -d ${dataset_path} ]; then
+  cd data && ./download.sh alpaca && cd -
+fi
 
 mkdir -p ${output_dir} ${log_dir}
From 74183fe4a94e504e8010a681bab8d9f2183d757a Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 17:25:57 +0800
Subject: [PATCH 15/29] fix import bugs when using flash_attn1

---
 src/lmflow/models/hf_decoder_model.py                      | 7 +------
 src/lmflow/utils/flash_attention/gpt2_flash_attention.py   | 7 +++----
 .../utils/flash_attention/gpt_neo_flash_attention.py       | 7 +++----
 .../utils/flash_attention/llama_flash_attention.py         | 9 ++++-----
 4 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
index 571bbc447..4e2f594af 100644
--- a/src/lmflow/models/hf_decoder_model.py
+++ b/src/lmflow/models/hf_decoder_model.py
@@ -75,12 +75,7 @@
         "A100": ["LlamaForCausalLM", "GPTNeoForCausalLM", "GPT2ForCausalLM", "BloomForCausalLM"],
         "A40": ["LlamaForCausalLM","GPTNeoForCausalLM", "GPT2ForCausalLM", "BloomForCausalLM"]
     }
-    if int(flash_attn.__version__.split(".")[0]) == 1:
-        GPU_SUPPORT_FLASH_ATTENTION = {
-            "A100": ["LlamaForCausalLM", "GPTNeoForCausalLM", "GPT2ForCausalLM", "BloomForCausalLM"],
-            "A40": ["GPTNeoForCausalLM", "GPT2ForCausalLM", "BloomForCausalLM"]
-        }
-except ImportError:
+except:
     pass
 
 class HFDecoderModel(DecoderModel, Tunable):
diff --git a/src/lmflow/utils/flash_attention/gpt2_flash_attention.py b/src/lmflow/utils/flash_attention/gpt2_flash_attention.py
index f9d46ff8a..25ad44390 100644
--- a/src/lmflow/utils/flash_attention/gpt2_flash_attention.py
+++ b/src/lmflow/utils/flash_attention/gpt2_flash_attention.py
@@ -8,11 +8,10 @@
 from einops import rearrange
 
-import flash_attn
-if int(flash_attn.__version__.split(".")[0]) == 1:
-    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
-if int(flash_attn.__version__.split(".")[0]) == 2:
+try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+except:
+    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
 
 from flash_attn.bert_padding import unpad_input, pad_input
 
diff --git a/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py b/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py
index 0af820299..20c9e4783 100644
--- a/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py
+++ b/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py
@@ -4,11 +4,10 @@
 import transformers
 from einops import rearrange
 
-import flash_attn
-if int(flash_attn.__version__.split(".")[0]) == 1:
-    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
-if int(flash_attn.__version__.split(".")[0]) == 2:
+try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+except:
+    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
 
 from flash_attn.bert_padding import unpad_input, pad_input
 
diff --git a/src/lmflow/utils/flash_attention/llama_flash_attention.py b/src/lmflow/utils/flash_attention/llama_flash_attention.py
index 91bdc828a..e55eab949 100644
--- a/src/lmflow/utils/flash_attention/llama_flash_attention.py
+++ b/src/lmflow/utils/flash_attention/llama_flash_attention.py
@@ -8,12 +8,11 @@
 from einops import rearrange
 
-import flash_attn
-if int(flash_attn.__version__.split(".")[0]) == 1:
-    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
-if int(flash_attn.__version__.split(".")[0]) == 2:
+try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
-
+except:
+    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+
 from flash_attn.bert_padding import unpad_input, pad_input
 

From 1854d129c5b7e821e8a2f7f08adb339398b8716a Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 17:28:59 +0800
Subject: [PATCH 16/29] code comments for flash_attn2

---
 src/lmflow/utils/flash_attention/gpt2_flash_attention.py    | 1 +
 src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py | 1 +
 src/lmflow/utils/flash_attention/llama_flash_attention.py   | 1 +
 3 files changed, 3 insertions(+)

diff --git a/src/lmflow/utils/flash_attention/gpt2_flash_attention.py b/src/lmflow/utils/flash_attention/gpt2_flash_attention.py
index 25ad44390..bac90b447 100644
--- a/src/lmflow/utils/flash_attention/gpt2_flash_attention.py
+++ b/src/lmflow/utils/flash_attention/gpt2_flash_attention.py
@@ -8,6 +8,7 @@
 from einops import rearrange
 
+#try to import flash_attn 2.x.x, if not, import flash_attn 1.x.x
 try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
 except:
diff --git a/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py b/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py
index 20c9e4783..49c3d50a0 100644
--- a/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py
+++ b/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py
@@ -4,6 +4,7 @@
 import transformers
 from einops import rearrange
 
+#try to import flash_attn 2.x.x, if not, import flash_attn 1.x.x
 try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
 except:
diff --git a/src/lmflow/utils/flash_attention/llama_flash_attention.py b/src/lmflow/utils/flash_attention/llama_flash_attention.py
index e55eab949..4159629c6 100644
--- a/src/lmflow/utils/flash_attention/llama_flash_attention.py
+++ b/src/lmflow/utils/flash_attention/llama_flash_attention.py
@@ -8,6 +8,7 @@
 from einops import rearrange
 
+#try to import flash_attn 2.x.x, if not, import flash_attn 1.x.x
 try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
 except:
From dff1f92eebc13443e80ef9171617412a5862a001 Mon Sep 17 00:00:00 2001
From: rpan
Date: Mon, 7 Aug 2023 19:32:33 +0800
Subject: [PATCH 17/29] Add auto data download for all scripts

---
 scripts/run_evaluation.sh             | 4 ++++
 scripts/run_evaluation_accelerator.sh | 4 ++++
 scripts/run_evaluation_with_lora.sh   | 5 +++++
 scripts/run_multistage_finetune.sh    | 3 +++
 scripts/run_raft_align.sh             | 4 ++++
 scripts/run_reward_modeling.sh        | 3 +++
 6 files changed, 23 insertions(+)

diff --git a/scripts/run_evaluation.sh b/scripts/run_evaluation.sh
index d94346af0..a1e786ec4 100755
--- a/scripts/run_evaluation.sh
+++ b/scripts/run_evaluation.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+if [ ! -d data/MedQA-USMLE ]; then
+  cd data && ./download.sh MedQA-USMLE && cd -
+fi
+
 CUDA_VISIBLE_DEVICES=0 \
     deepspeed examples/evaluation.py \
     --answer_type medmcqa \
diff --git a/scripts/run_evaluation_accelerator.sh b/scripts/run_evaluation_accelerator.sh
index 1820c0239..8959f6f4b 100644
--- a/scripts/run_evaluation_accelerator.sh
+++ b/scripts/run_evaluation_accelerator.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+if [ ! -d data/MedQA-USMLE ]; then
+  cd data && ./download.sh MedQA-USMLE && cd -
+fi
+
 CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml examples/evaluation.py \
     --answer_type usmle \
     --model_name_or_path gpt2-large \
diff --git a/scripts/run_evaluation_with_lora.sh b/scripts/run_evaluation_with_lora.sh
index d8a02162a..b83ad074c 100755
--- a/scripts/run_evaluation_with_lora.sh
+++ b/scripts/run_evaluation_with_lora.sh
@@ -3,6 +3,11 @@
 # --model_name_or_path specifies the original huggingface model
 # --lora_model_path specifies the model difference introduced by finetuning,
 #   i.e. the one saved by ./scripts/run_finetune_with_lora.sh
+
+if [ ! -d data/alpaca ]; then
+  cd data && ./download.sh alpaca && cd -
+fi
+
 CUDA_VISIBLE_DEVICES=0 \
     deepspeed examples/evaluation.py \
     --answer_type text \
diff --git a/scripts/run_multistage_finetune.sh b/scripts/run_multistage_finetune.sh
index 9d30746c6..701540e8f 100755
--- a/scripts/run_multistage_finetune.sh
+++ b/scripts/run_multistage_finetune.sh
@@ -11,6 +11,9 @@ project_dir=$(cd "$(dirname $0)"/..; pwd)
 output_dir=${project_dir}/output_models/${exp_id}
 log_dir=${project_dir}/log/${exp_id}
 dataset_path="${project_dir}/data/example_dataset/train"
+if [ ! -d ${dataset_path} ]; then
+  cd data && ./download.sh example_dataset && cd -
+fi
 
 mkdir -p ${output_dir} ${log_dir}
 
diff --git a/scripts/run_raft_align.sh b/scripts/run_raft_align.sh
index 29d18ff9f..9cb5a9717 100755
--- a/scripts/run_raft_align.sh
+++ b/scripts/run_raft_align.sh
@@ -11,6 +11,10 @@ project_dir=$(cd "$(dirname $0)"/..; pwd)
 output_dir=${project_dir}/output_models/${exp_id}
 log_dir=${project_dir}/log/${exp_id}
 
+if [ ! -d data/hh_rlhf ]; then
+  cd data && ./download.sh hh_rlhf && cd -
+fi
+
 mkdir -p ${output_dir} ${log_dir}
 
 export PYTHONPATH=.
diff --git a/scripts/run_reward_modeling.sh b/scripts/run_reward_modeling.sh
index 476661cde..7d9347851 100644
--- a/scripts/run_reward_modeling.sh
+++ b/scripts/run_reward_modeling.sh
@@ -14,6 +14,9 @@ output_dir=${project_dir}/output_models/${exp_id}
 log_dir=${project_dir}/log/${exp_id}
 
 dataset_path=${project_dir}/data/hh_rlhf/rm/hh_rlhf_rm_training.json
+if [ ! -d data/hh_rlhf ]; then
+  cd data && ./download.sh hh_rlhf && cd -
+fi
 
 mkdir -p ${output_dir} ${log_dir}
From 21c2923114339b81f662b7cd93f18b633ee4f4b6 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 20:30:10 +0800
Subject: [PATCH 18/29] fix style issues

---
 utils/merge_tokenizer.py | 13 +++++++++++--
 utils/train_tokenizer.py |  9 ++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py
index e0fdd87e7..6354dd1e9 100644
--- a/utils/merge_tokenizer.py
+++ b/utils/merge_tokenizer.py
@@ -11,7 +11,7 @@
     import torch
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer,LlamaTokenizer
 
 logging.basicConfig(level=logging.INFO)
 
@@ -36,6 +36,10 @@
         old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, unk_token="<unk>",
                                                       bos_token="<s>",
                                                       eos_token="</s>", use_fast=False)
+
+    if not isinstance(old_tokenizer,LlamaTokenizer):
+        raise ValueError("The tokenizer is not a LlamaTokenizer, we only support LlamaTokenizer for now.")
+
     chinese_sp_model = spm.SentencePieceProcessor()
     chinese_sp_model.Load(chinese_sp_model_file)
 
@@ -62,13 +66,18 @@
         f.write(old_spm.SerializeToString())
 
     try:
-        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,vocab_file=output_sp_dir+'/merged_tokenizer.model', use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrain(
+            pretrained_model_name_or_path=tokenizer_dir,
+            vocab_file=output_sp_dir+'/merged_tokenizer.model',
+            use_fast=False
+        )
     except RecursionError:
         tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,unk_token="<unk>",
                                                   bos_token="<s>",
                                                   eos_token="</s>",
                                                   vocab_file=output_sp_dir+'/merged_tokenizer.model',
                                                   use_fast=False)
+
     tokenizer.save_pretrained(output_hf_dir)
     logging.info(f"Merged tokenizer has been saved to %s",output_dir)
 
diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py
index 48ce3c2b2..98d081123 100644
--- a/utils/train_tokenizer.py
+++ b/utils/train_tokenizer.py
@@ -28,4 +28,11 @@
     def mkdir(path):
         if not os.path.exists(path):
             os.makedirs(path)
 
     mkdir(output_dir)
-    spm.SentencePieceTrainer.train('--input={} --model_prefix={} --model_type={} --vocab_size={} --user_defined_symbols={} --max_sentencepiece_length={} --minloglevel=1'.format(dataset_path,output_dir+'/example',model_type,vocab_size,user_defined_symbols,max_sentencepiece_length))
+    spm.SentencePieceTrainer.train(
+        f'--input={dataset_path}'
+        f' --model_prefix={output_dir}/example'
+        f' --model_type={model_type}'
+        f' --vocab_size={vocab_size}'
+        f' --user_defined_symbols={user_defined_symbols}'
+        f' --max_sentencepiece_length={max_sentencepiece_length}'
+    )
\ No newline at end of file
From 5ca18daae906bac755acb2b9034a8ac6f73ed908 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 20:35:38 +0800
Subject: [PATCH 19/29] change loglevel to 1

---
 utils/train_tokenizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py
index 98d081123..7b67f6c40 100644
--- a/utils/train_tokenizer.py
+++ b/utils/train_tokenizer.py
@@ -35,4 +35,5 @@
         f' --vocab_size={vocab_size}'
         f' --user_defined_symbols={user_defined_symbols}'
         f' --max_sentencepiece_length={max_sentencepiece_length}'
+        f' --minloglevel=1'
     )
\ No newline at end of file

From 3919c3a89e3ddbdafe1b9c0d1142d76f0bf390b8 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 20:43:32 +0800
Subject: [PATCH 20/29] optimize code style in merge_tokenizer.py

---
 utils/merge_tokenizer.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py
index 6354dd1e9..9931edcf6 100644
--- a/utils/merge_tokenizer.py
+++ b/utils/merge_tokenizer.py
@@ -33,9 +33,11 @@
     try:
         old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False)
     except RecursionError:
-        old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, unk_token="<unk>",
+        old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+                                                      unk_token="<unk>",
                                                       bos_token="<s>",
-                                                      eos_token="</s>", use_fast=False)
+                                                      eos_token="</s>",
+                                                      use_fast=False)
 
@@ -68,17 +70,18 @@
         f.write(old_spm.SerializeToString())
 
     try:
-        tokenizer = AutoTokenizer.from_pretrain(
+        tokenizer = AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path=tokenizer_dir,
             vocab_file=output_sp_dir+'/merged_tokenizer.model',
             use_fast=False
         )
     except RecursionError:
-        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,unk_token="<unk>",
-                                                  bos_token="<s>",
-                                                  eos_token="</s>",
-                                                  vocab_file=output_sp_dir+'/merged_tokenizer.model',
-                                                  use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir,
+                                                  unk_token="<unk>",
+                                                  bos_token="<s>",
+                                                  eos_token="</s>",
+                                                  vocab_file=output_sp_dir+'/merged_tokenizer.model',
+                                                  use_fast=False)

From 89b5e48d2c5047f3cb99eb8d7de42a8894eef351 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 21:42:55 +0800
Subject: [PATCH 21/29] update install.sh & README

---
 README.md  | 2 +-
 install.sh | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)
 create mode 100644 install.sh

diff --git a/README.md b/README.md
index 092026356..64ef6b251 100644
--- a/README.md
+++ b/README.md
@@ -213,7 +213,7 @@
 cd LMFlow
 conda create -n lmflow python=3.9 -y
 conda activate lmflow
 conda install mpi4py
-pip install -e .
+./install.sh
 ```
 
 ## 2.Prepare Dataset
diff --git a/install.sh b/install.sh
new file mode 100644
index 000000000..6452e5053
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pip install -e .
+
+gpu_state="$(nvidia-smi --query-gpu=name --format=csv,noheader)"
+if [[ *"A100"* == "${gpu_state}" -o *"A40"* == "${gpu_state}" ]]; then
+  echo "YES!!!!!"
+  pip install flash-attn==2.0.2
+fi
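The same hardware gate expressed in Python (a sketch; `flash-attn==2.0.2` is the pin `install.sh` uses, and A100/A40 are the two GPU models the repository's support table lists):

```python
import subprocess

def gpu_supports_flash_attn() -> bool:
    # Mirror install.sh: only install flash-attn on the GPU models that
    # LMFlow's scripts have validated (A100 and A40).
    names = subprocess.run(
        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
        capture_output=True, text=True, check=True,
    ).stdout
    return any(g in names for g in ("A100", "A40"))

if gpu_supports_flash_attn():
    subprocess.run(["pip", "install", "flash-attn==2.0.2"], check=True)
```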
+if [[ "${gpu_state}" == *"A100"* || "${gpu_state}" == *"A40"* ]]; then pip install flash-attn==2.0.2 -fi +fi \ No newline at end of file From f005663e099edc7736fee960a5157d0d37c00246 Mon Sep 17 00:00:00 2001 From: yaoguany <89233842+yaoguany@users.noreply.github.com> Date: Mon, 7 Aug 2023 22:43:42 +0800 Subject: [PATCH 23/29] add flash_attn2 readme --- README.md | 8 +++++--- readme/flash_attn2.md | 8 ++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 readme/flash_attn2.md diff --git a/README.md b/README.md index 64ef6b251..fe4aa31eb 100644 --- a/README.md +++ b/README.md @@ -338,12 +338,14 @@ Thanks to the great efforts of [llama.cpp](https://github.com/ggerganov/llama.cp ### 4.4 Vocabulary List Extension -Now you can train your own sentencepiece tokenizer and merge it with model's origin hf tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/blob/main/scripts/vocab_extension) for more details. +Now you can train your own sentencepiece tokenizer and merge it with model's origin hf tokenizer. Check out [vocab_extension](https://github.com/OptimalScale/LMFlow/blob/main/scripts/vocab_extension) for more details. ### 4.5 Position Interpolation for LLaMA Models -Now LMFlow supports the latest Linear & NTK (Neural Kernel theory) scaling techniques for LLaMA models. Check [postion_interpolation]( -https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details +Now LMFlow supports the latest Linear & NTK (Neural Kernel theory) scaling techniques for LLaMA models. Check out [postion_interpolation]( +https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details. +### 4.6 Flash Attention 2.0 +Now LMFlow supports the latest Flash Attention 2.0. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. ## 5. Model Release ### 5.1 Medical Model Checkpoints diff --git a/readme/flash_attn2.md b/readme/flash_attn2.md new file mode 100644 index 000000000..5e1edaf90 --- /dev/null +++ b/readme/flash_attn2.md @@ -0,0 +1,8 @@ +# Falsh Attention 2.0 +We're thrilled to announce that LMFlow now supports training and inference using flash attn2! This cutting-edge feature will take your language modeling to the next level. To use it, simply add ``` --use_flash_attention True ``` to the corresponding bash script. + +But that's not all - we're also excited to share that LMFlow now supports GPU models A40 and A100! This means faster and more efficient training for your models. + +And as if that wasn't enough, we've also expanded our supported model architectures to include the powerful ["LlamaForCausalLM", "GPTNeoForCausalLM", "BloomForCausalLM"]. With LMFlow, the possibilities for language modeling are endless. + +Upgrade to LMFlow now and experience the future of language modeling! \ No newline at end of file From 17cf48ea594cf1b6941f287059b7bb9fd66f83f5 Mon Sep 17 00:00:00 2001 From: yaoguany <89233842+yaoguany@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:29:01 +0800 Subject: [PATCH 24/29] improve flash_attn2 readme --- readme/flash_attn2.md | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/readme/flash_attn2.md b/readme/flash_attn2.md index 5e1edaf90..bdc94fd45 100644 --- a/readme/flash_attn2.md +++ b/readme/flash_attn2.md @@ -1,8 +1,17 @@ -# Falsh Attention 2.0 -We're thrilled to announce that LMFlow now supports training and inference using flash attn2! 
From f005663e099edc7736fee960a5157d0d37c00246 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 22:43:42 +0800
Subject: [PATCH 23/29] add flash_attn2 readme

---
 README.md             | 8 +++++---
 readme/flash_attn2.md | 8 ++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)
 create mode 100644 readme/flash_attn2.md

diff --git a/README.md b/README.md
index 64ef6b251..fe4aa31eb 100644
--- a/README.md
+++ b/README.md
@@ -338,12 +338,14 @@ Thanks to the great efforts of [llama.cpp](https://github.com/ggerganov/llama.cp
 
 ### 4.4 Vocabulary List Extension
 
-Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check [vocab_extension](https://github.com/OptimalScale/LMFlow/blob/main/scripts/vocab_extension) for more details.
+Now you can train your own sentencepiece tokenizer and merge it with the model's original HF tokenizer. Check out [vocab_extension](https://github.com/OptimalScale/LMFlow/blob/main/scripts/vocab_extension) for more details.
 
 ### 4.5 Position Interpolation for LLaMA Models
-Now LMFlow supports the latest Linear & NTK (Neural Tangent Kernel) scaling techniques for LLaMA models. Check [position_interpolation](
-https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details
+Now LMFlow supports the latest Linear & NTK (Neural Tangent Kernel) scaling techniques for LLaMA models. Check out [position_interpolation](
+https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details.
 
+### 4.6 Flash Attention 2.0
+Now LMFlow supports the latest Flash Attention 2.0. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details.
 ## 5. Model Release
 
 ### 5.1 Medical Model Checkpoints
diff --git a/readme/flash_attn2.md b/readme/flash_attn2.md
new file mode 100644
index 000000000..5e1edaf90
--- /dev/null
+++ b/readme/flash_attn2.md
@@ -0,0 +1,8 @@
+# Falsh Attention 2.0
+We're thrilled to announce that LMFlow now supports training and inference using flash attn2! This cutting-edge feature will take your language modeling to the next level. To use it, simply add ``` --use_flash_attention True ``` to the corresponding bash script.
+
+But that's not all - we're also excited to share that LMFlow now supports GPU models A40 and A100! This means faster and more efficient training for your models.
+
+And as if that wasn't enough, we've also expanded our supported model architectures to include the powerful ["LlamaForCausalLM", "GPTNeoForCausalLM", "BloomForCausalLM"]. With LMFlow, the possibilities for language modeling are endless.
+
+Upgrade to LMFlow now and experience the future of language modeling!
\ No newline at end of file

From 17cf48ea594cf1b6941f287059b7bb9fd66f83f5 Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 23:29:01 +0800
Subject: [PATCH 24/29] improve flash_attn2 readme

---
 readme/flash_attn2.md | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/readme/flash_attn2.md b/readme/flash_attn2.md
index 5e1edaf90..bdc94fd45 100644
--- a/readme/flash_attn2.md
+++ b/readme/flash_attn2.md
@@ -1,8 +1,17 @@
-# Falsh Attention 2.0
-We're thrilled to announce that LMFlow now supports training and inference using flash attn2! This cutting-edge feature will take your language modeling to the next level. To use it, simply add ``` --use_flash_attention True ``` to the corresponding bash script.
-
-But that's not all - we're also excited to share that LMFlow now supports GPU models A40 and A100! This means faster and more efficient training for your models.
-
-And as if that wasn't enough, we've also expanded our supported model architectures to include the powerful ["LlamaForCausalLM", "GPTNeoForCausalLM", "BloomForCausalLM"]. With LMFlow, the possibilities for language modeling are endless.
-
+# Flash Attention 2.0
+We're thrilled to announce that LMFlow now supports training and inference using **FlashAttention-2**! This cutting-edge feature will take your language modeling to the next level. To use it, simply add ``` --use_flash_attention True ``` to the corresponding bash script.
+Here is an example of how to use it:
+```
+#!/bin/bash
+
+deepspeed examples/evaluation.py \
+    --answer_type text \
+    --model_name_or_path pinkmanlove/llama-7b-hf \
+    --dataset_path data/wiki_en_eval \
+    --deepspeed examples/ds_config.json \
+    --inference_batch_size_per_device 1 \
+    --block_size 2048 \
+    --use_flash_attention True \
+    --metric ppl
+```
 Upgrade to LMFlow now and experience the future of language modeling!
\ No newline at end of file
From a9fb5c075c445c7c882734c3d2f6b5b870d84c4b Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 23:38:40 +0800
Subject: [PATCH 25/29] update readme

---
 readme/flash_attn2.md | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/readme/flash_attn2.md b/readme/flash_attn2.md
index bdc94fd45..0ee3beec7 100644
--- a/readme/flash_attn2.md
+++ b/readme/flash_attn2.md
@@ -3,15 +3,23 @@ We're thrilled to announce that LMFlow now supports training and inference using
 Here is an example of how to use it:
 ```
 #!/bin/bash
+pip install flash_attn==2.0.2
 
-deepspeed examples/evaluation.py \
-    --answer_type text \
-    --model_name_or_path pinkmanlove/llama-7b-hf \
-    --dataset_path data/wiki_en_eval \
-    --deepspeed examples/ds_config.json \
-    --inference_batch_size_per_device 1 \
-    --block_size 2048 \
-    --use_flash_attention True \
-    --metric ppl
+model=pinkmanlove/llama-7b-hf
+lora_args=""
+if [ $# -ge 1 ]; then
+  model=$1
+fi
+if [ $# -ge 2 ]; then
+  lora_args="--lora_model_path $2"
+fi
+
+CUDA_VISIBLE_DEVICES=0 \
+    deepspeed examples/chatbot.py \
+    --deepspeed configs/ds_config_chatbot.json \
+    --model_name_or_path ${model} \
+    --use_flash_attention True \
+    ${lora_args}
 ```
 
 Upgrade to LMFlow now and experience the future of language modeling!
\ No newline at end of file

From 7d53bf94ff7b325382f4f457a8c1a98fd115135c Mon Sep 17 00:00:00 2001
From: yaoguany <89233842+yaoguany@users.noreply.github.com>
Date: Mon, 7 Aug 2023 23:49:12 +0800
Subject: [PATCH 26/29] update readme bash code

---
 readme/flash_attn2.md | 23 ++++++++----------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/readme/flash_attn2.md b/readme/flash_attn2.md
index 0ee3beec7..3b9970c11 100644
--- a/readme/flash_attn2.md
+++ b/readme/flash_attn2.md
@@ -5,21 +5,14 @@
 #!/bin/bash
 pip install flash_attn==2.0.2
 
-model=pinkmanlove/llama-7b-hf
-lora_args=""
-if [ $# -ge 1 ]; then
-  model=$1
-fi
-if [ $# -ge 2 ]; then
-  lora_args="--lora_model_path $2"
-fi
-
-CUDA_VISIBLE_DEVICES=0 \
-    deepspeed examples/chatbot.py \
-    --deepspeed configs/ds_config_chatbot.json \
-    --model_name_or_path ${model} \
-    --use_flash_attention True \
-    ${lora_args}
+deepspeed --master_port=11000 \
+    examples/chatbot.py \
+    --deepspeed configs/ds_config_chatbot.json \
+    --model_name_or_path LMFlow/Full-Robin-7b-v2 \
+    --max_new_tokens 1024 \
+    --prompt_structure "###Human: {input_text}###Assistant:" \
+    --end_string "#" \
+    --use_flash_attention True
 ```
 
 Upgrade to LMFlow now and experience the future of language modeling!
\ No newline at end of file
+### 4.6 FlashAttention-2 +Now LMFlow supports the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. ## 5. Model Release ### 5.1 Medical Model Checkpoints From 3fe99153169a66627cc17acc9d52e67322813171 Mon Sep 17 00:00:00 2001 From: shizhediao <654745845@qq.com> Date: Tue, 8 Aug 2023 03:05:36 +0800 Subject: [PATCH 28/29] reorg --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7449ae700..47eb32199 100644 --- a/README.md +++ b/README.md @@ -346,7 +346,8 @@ Now LMFlow supports the latest Linear & NTK (Neural Kernel theory) scaling techn https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details. ### 4.6 FlashAttention-2 -Now LMFlow supports the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. +Now LMFlow supports the latest [Flash Attention-2](https://crfm.stanford.edu/2023/07/17/flash2.html). Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. + ## 5. Model Release ### 5.1 Medical Model Checkpoints From 9337a724649b3dbfc6298805135930ea6255aeda Mon Sep 17 00:00:00 2001 From: shizhediao <654745845@qq.com> Date: Tue, 8 Aug 2023 03:06:24 +0800 Subject: [PATCH 29/29] remove space --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 47eb32199..d85303ee9 100644 --- a/README.md +++ b/README.md @@ -346,7 +346,7 @@ Now LMFlow supports the latest Linear & NTK (Neural Kernel theory) scaling techn https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details. ### 4.6 FlashAttention-2 -Now LMFlow supports the latest [Flash Attention-2](https://crfm.stanford.edu/2023/07/17/flash2.html). Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. +Now LMFlow supports the latest [FlashAttention-2](https://crfm.stanford.edu/2023/07/17/flash2.html). Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. ## 5. Model Release