From 46de5832835a75ee6318a19e7c4e596696c32f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E9=AB=98=E5=8D=87?= Date: Wed, 13 Jul 2022 15:48:18 +0800 Subject: [PATCH] =?UTF-8?q?Update=20faq=20question=20for=20inbatch=20model?= =?UTF-8?q?=20loading=20simcse=20checkpoint=20and=20o=E2=80=A6=20(#2786)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update faq question for inbatch model loading simcse checkpoint and optimize the code and readme * Update Readme of neural search --- .../recall/in_batch_negative/README.md | 26 ++++++++++++++++++- .../recall/in_batch_negative/base_model.py | 6 ----- .../scripts/train_batch_neg.sh | 5 ++-- .../in_batch_negative/train_batch_neg.py | 4 +-- .../neural_search/recall/milvus/README.md | 2 +- .../neural_search/recall/simcse/README.md | 4 +++ 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/applications/neural_search/recall/in_batch_negative/README.md b/applications/neural_search/recall/in_batch_negative/README.md index 20ff72c78714..85592e1705d7 100644 --- a/applications/neural_search/recall/in_batch_negative/README.md +++ b/applications/neural_search/recall/in_batch_negative/README.md @@ -198,7 +198,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ --max_seq_length 64 \ --margin 0.2 \ --train_set_file recall/train.csv \ - --evaluate True \ + --evaluate \ --recall_result_dir "recall_result_dir" \ --recall_result_file "recall_result.txt" \ --hnsw_m 100 \ @@ -588,6 +588,30 @@ outputs { ``` 可以看到服务端返回了向量 +## FAQ + +#### 如何基于无监督SimCSE训练出的模型参数作为参数初始化继续做有监督 In-Batch Negative 训练? + ++ 使用 `--init_from_ckpt` 参数加载即可,下面是使用示例: + +``` +python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ + train_batch_neg.py \ + --device gpu \ + --save_dir ./checkpoints/simcse_inbatch_negative \ + --batch_size 64 \ + --learning_rate 5E-5 \ + --epochs 3 \ + --output_emb_size 256 \ + --save_steps 10 \ + --max_seq_length 64 \ + --margin 0.2 \ + --train_set_file recall/train.csv \ + --init_from_ckpt simcse/model_20000/model_state.pdparams +``` + + + ## Reference [1] Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih, Dense Passage Retrieval for Open-Domain Question Answering, Preprint 2020. diff --git a/applications/neural_search/recall/in_batch_negative/base_model.py b/applications/neural_search/recall/in_batch_negative/base_model.py index da9dd4827d53..a1068c6c179f 100644 --- a/applications/neural_search/recall/in_batch_negative/base_model.py +++ b/applications/neural_search/recall/in_batch_negative/base_model.py @@ -41,10 +41,6 @@ def __init__(self, pretrained_model, dropout=None, output_emb_size=None): output_emb_size, weight_attr=weight_attr) - @paddle.jit.to_static(input_spec=[ - paddle.static.InputSpec(shape=[None, None], dtype='int64'), - paddle.static.InputSpec(shape=[None, None], dtype='int64') - ]) def get_pooled_embedding(self, input_ids, token_type_ids=None, @@ -65,8 +61,6 @@ def get_semantic_embedding(self, data_loader): with paddle.no_grad(): for batch_data in data_loader: input_ids, token_type_ids = batch_data - input_ids = paddle.to_tensor(input_ids) - token_type_ids = paddle.to_tensor(token_type_ids) text_embeddings = self.get_pooled_embedding( input_ids, token_type_ids=token_type_ids) diff --git a/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh b/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh index fc40e1f1872b..f493b89b5fc3 100644 --- a/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh +++ b/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh @@ -42,7 +42,8 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ # --save_steps 10 \ # --max_seq_length 64 \ # --margin 0.2 \ -# --train_set_file data/${root_path}/train.csv \ +# --evaluate \ +# --train_set_file recall/train.csv \ # --init_from_ckpt simcse/model_20000/model_state.pdparams # 加载post training的模型,模型放在simcse/post_model_10000 @@ -57,5 +58,5 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ # --save_steps 10 \ # --max_seq_length 64 \ # --margin 0.2 \ -# --train_set_file data/${root_path}/train.csv \ +# --train_set_file recall/train.csv \ # --init_from_ckpt simcse/post_model_10000/model_state.pdparams diff --git a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py index fccf1c5b17f7..222b02d16423 100644 --- a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py +++ b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py @@ -86,7 +86,7 @@ type=int, help="Recall number for each query from Ann index.") parser.add_argument("--evaluate_result", type=str, default='evaluate_result.txt', help="evaluate_result") -parser.add_argument('--evaluate', default=True, type=eval, choices=[True, False], +parser.add_argument('--evaluate', action='store_true', help='whether evaluate while training') args = parser.parse_args() # yapf: enable @@ -294,7 +294,7 @@ def do_train(): 'model_state.pdparams') paddle.save(model.state_dict(), save_param_path) tokenizer.save_pretrained(save_dir) - if args.evaluate: + if args.evaluate and rank == 0: print("evaluating") recall_5 = evaluate(model, corpus_data_loader, query_data_loader, recall_result_file, text_list, id2corpus) diff --git a/applications/neural_search/recall/milvus/README.md b/applications/neural_search/recall/milvus/README.md index 34d4837be048..abc9a0868459 100644 --- a/applications/neural_search/recall/milvus/README.md +++ b/applications/neural_search/recall/milvus/README.md @@ -206,6 +206,6 @@ python3 inference.py ``` ## FAQ -### 句子抽取向量后,利用milvus进行检索到了相同的句子,得到的距离不是0 +#### 抽取文本语义向量后,利用 Milvus 进行 ANN 检索查询到了完全相同的文本,但是计算出的距离为什么不是 0? 使用的是近似索引,详情请参考Milvus官方文档,[索引创建机制](https://milvus.io/cn/docs/v2.0.x/index.md) diff --git a/applications/neural_search/recall/simcse/README.md b/applications/neural_search/recall/simcse/README.md index 080d2f7edbcf..9090dd6fcc1e 100644 --- a/applications/neural_search/recall/simcse/README.md +++ b/applications/neural_search/recall/simcse/README.md @@ -431,7 +431,11 @@ sh deploy.sh [0.5649663209915161, 0.03284594044089317] ``` +## FAQ +#### SimCSE模型怎么部署? + ++ SimCSE使用的模型跟 In-batch Negatives 训练出来的模型网络结构是一样的,使用 In-batch Negatives 的部署流程即可,参考[In-batch Negatives](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative/deploy/python) ## Reference [1] Gao, Tianyu, Xingcheng Yao, and Danqi Chen. “SimCSE: Simple Contrastive Learning of Sentence Embeddings.” ArXiv:2104.08821 [Cs], April 18, 2021. http://arxiv.org/abs/2104.08821.