From 46de5832835a75ee6318a19e7c4e596696c32f02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=B4=E9=AB=98=E5=8D=87?= <w5688414@gmail.com>
Date: Wed, 13 Jul 2022 15:48:18 +0800
Subject: [PATCH] =?UTF-8?q?Update=20faq=20question=20for=20inbatch=20model?=
 =?UTF-8?q?=20loading=20simcse=20checkpoint=20and=20o=E2=80=A6=20(#2786)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update the FAQ on loading a SimCSE checkpoint into the In-Batch Negative model, and optimize the code and README

* Update Readme of neural search
---
 .../recall/in_batch_negative/README.md        | 26 ++++++++++++++++++-
 .../recall/in_batch_negative/base_model.py    |  6 -----
 .../scripts/train_batch_neg.sh                |  5 ++--
 .../in_batch_negative/train_batch_neg.py      |  4 +--
 .../neural_search/recall/milvus/README.md     |  2 +-
 .../neural_search/recall/simcse/README.md     |  4 +++
 6 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/applications/neural_search/recall/in_batch_negative/README.md b/applications/neural_search/recall/in_batch_negative/README.md
index 20ff72c78714..85592e1705d7 100644
--- a/applications/neural_search/recall/in_batch_negative/README.md
+++ b/applications/neural_search/recall/in_batch_negative/README.md
@@ -198,7 +198,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
     --max_seq_length 64 \
     --margin 0.2 \
     --train_set_file recall/train.csv \
-    --evaluate True \
+    --evaluate \
     --recall_result_dir "recall_result_dir" \
     --recall_result_file "recall_result.txt" \
     --hnsw_m 100 \
@@ -588,6 +588,30 @@ outputs {
 ```
 可以看到服务端返回了向量
 
+## FAQ
+
+#### 如何使用无监督 SimCSE 训练出的模型参数作为初始化参数，继续做有监督 In-Batch Negative 训练？
+
++ 使用 `--init_from_ckpt` 参数加载即可，下面是使用示例：
+
+```
+python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
+    train_batch_neg.py \
+    --device gpu \
+    --save_dir ./checkpoints/simcse_inbatch_negative \
+    --batch_size 64 \
+    --learning_rate 5E-5 \
+    --epochs 3 \
+    --output_emb_size 256 \
+    --save_steps 10 \
+    --max_seq_length 64 \
+    --margin 0.2 \
+    --train_set_file recall/train.csv  \
+    --init_from_ckpt simcse/model_20000/model_state.pdparams
+```
+
+
+
 ## Reference
 
 [1] Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih, Dense Passage Retrieval for Open-Domain Question Answering, Preprint 2020.
diff --git a/applications/neural_search/recall/in_batch_negative/base_model.py b/applications/neural_search/recall/in_batch_negative/base_model.py
index da9dd4827d53..a1068c6c179f 100644
--- a/applications/neural_search/recall/in_batch_negative/base_model.py
+++ b/applications/neural_search/recall/in_batch_negative/base_model.py
@@ -41,10 +41,6 @@ def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
                                                       output_emb_size,
                                                       weight_attr=weight_attr)
 
-    @paddle.jit.to_static(input_spec=[
-        paddle.static.InputSpec(shape=[None, None], dtype='int64'),
-        paddle.static.InputSpec(shape=[None, None], dtype='int64')
-    ])
     def get_pooled_embedding(self,
                              input_ids,
                              token_type_ids=None,
@@ -65,8 +61,6 @@ def get_semantic_embedding(self, data_loader):
         with paddle.no_grad():
             for batch_data in data_loader:
                 input_ids, token_type_ids = batch_data
-                input_ids = paddle.to_tensor(input_ids)
-                token_type_ids = paddle.to_tensor(token_type_ids)
 
                 text_embeddings = self.get_pooled_embedding(
                     input_ids, token_type_ids=token_type_ids)
diff --git a/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh b/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh
index fc40e1f1872b..f493b89b5fc3 100644
--- a/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh
+++ b/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh
@@ -42,7 +42,8 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
 #     --save_steps 10 \
 #     --max_seq_length 64 \
 #     --margin 0.2 \
-#     --train_set_file data/${root_path}/train.csv  \
+#     --evaluate \
+#     --train_set_file recall/train.csv  \
 #     --init_from_ckpt simcse/model_20000/model_state.pdparams
 
 # 加载post training的模型，模型放在simcse/post_model_10000
@@ -57,5 +58,5 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
 #     --save_steps 10 \
 #     --max_seq_length 64 \
 #     --margin 0.2 \
-#     --train_set_file data/${root_path}/train.csv  \
+#     --train_set_file recall/train.csv  \
 #     --init_from_ckpt simcse/post_model_10000/model_state.pdparams
diff --git a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py
index fccf1c5b17f7..222b02d16423 100644
--- a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py
+++ b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py
@@ -86,7 +86,7 @@
                     type=int, help="Recall number for each query from Ann index.")
 parser.add_argument("--evaluate_result", type=str, default='evaluate_result.txt',
                     help="evaluate_result")
-parser.add_argument('--evaluate', default=True, type=eval, choices=[True, False],
+parser.add_argument('--evaluate', action='store_true',
                     help='whether evaluate while training')
 args = parser.parse_args()
 # yapf: enable
@@ -294,7 +294,7 @@ def do_train():
                                                    'model_state.pdparams')
                     paddle.save(model.state_dict(), save_param_path)
                     tokenizer.save_pretrained(save_dir)
-        if args.evaluate:
+        if args.evaluate and rank == 0:
             print("evaluating")
             recall_5 = evaluate(model, corpus_data_loader, query_data_loader,
                                 recall_result_file, text_list, id2corpus)
diff --git a/applications/neural_search/recall/milvus/README.md b/applications/neural_search/recall/milvus/README.md
index 34d4837be048..abc9a0868459 100644
--- a/applications/neural_search/recall/milvus/README.md
+++ b/applications/neural_search/recall/milvus/README.md
@@ -206,6 +206,6 @@ python3 inference.py
 ```
 ## FAQ
 
-### 句子抽取向量后，利用milvus进行检索到了相同的句子，得到的距离不是0
+#### 抽取文本语义向量后，利用 Milvus 进行 ANN 检索查询到了完全相同的文本，但是计算出的距离为什么不是 0？
 
 使用的是近似索引，详情请参考Milvus官方文档，[索引创建机制](https://milvus.io/cn/docs/v2.0.x/index.md)
diff --git a/applications/neural_search/recall/simcse/README.md b/applications/neural_search/recall/simcse/README.md
index 080d2f7edbcf..9090dd6fcc1e 100644
--- a/applications/neural_search/recall/simcse/README.md
+++ b/applications/neural_search/recall/simcse/README.md
@@ -431,7 +431,11 @@ sh deploy.sh
 
 [0.5649663209915161, 0.03284594044089317]
 ```
+## FAQ
 
+#### SimCSE模型怎么部署？
+
++ SimCSE 使用的模型跟 In-batch Negatives 训练出来的模型网络结构是一样的，使用 In-batch Negatives 的部署流程即可，参考 [In-batch Negatives](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative/deploy/python)
 
 ## Reference
 [1] Gao, Tianyu, Xingcheng Yao, and Danqi Chen. “SimCSE: Simple Contrastive Learning of Sentence Embeddings.” ArXiv:2104.08821 [Cs], April 18, 2021. http://arxiv.org/abs/2104.08821.