Commit 85e829e

Merge branch 'develop' into move_token_num_fetch_out_of_train_cycle

FrostML authored Sep 6, 2022
2 parents: 215050b + b88fc4e
Showing 24 changed files with 81 additions and 41 deletions.
3 changes: 2 additions & 1 deletion examples/benchmark/clue/mrc/run_c3.py
@@ -258,7 +258,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):

new_data = tokenizer(tokens_t_list,
text_pair=tokens_c_list,
is_split_into_words=True)
is_split_into_words='token')

# Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
# because length of each choice could be different.
@@ -305,6 +305,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset")

batchify_fn = lambda samples, fn=Dict({
'input_ids':
Pad(axis=1, pad_val=tokenizer.pad_token_id), # input
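The comment above describes padding each example along the choices axis so every choice reaches a common sequence length before batching into [batch_size, num_choices, seq_len]. A minimal NumPy sketch of that idea (the `pad_choices` helper and the sample ids are illustrative, not part of the diff):

```python
import numpy as np

def pad_choices(choice_ids, pad_val=0):
    # choice_ids: num_choices lists of token ids with different lengths
    max_len = max(len(ids) for ids in choice_ids)
    return np.array([ids + [pad_val] * (max_len - len(ids)) for ids in choice_ids])

# One example with 3 choices of lengths 4, 3 and 5.
example = [[101, 5, 6, 102], [101, 7, 102], [101, 8, 9, 10, 102]]
print(pad_choices(example).shape)  # (3, 5) -> [num_choices, seq_len]
```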
2 changes: 1 addition & 1 deletion examples/information_extraction/DuEE/sequence_labeling.py
@@ -98,7 +98,7 @@ def convert_example_to_feature(example,
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_len)

input_ids = tokenized_input['input_ids']
2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/eval.py
@@ -56,7 +56,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
is_split_into_words='token',
return_length=True)
labels = []

2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/predict.py
@@ -86,7 +86,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
is_split_into_words='token',
return_length=True)
labels = []

2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/train.py
@@ -105,7 +105,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
is_split_into_words='token',
return_length=True)
labels = []

(changed file, name not shown in this view)
@@ -116,7 +116,7 @@ def convert_to_features(example, tokenizer):
tokens = example[0]
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True)
is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
return tokenized_input['input_ids'], tokenized_input[
'token_type_ids'], tokenized_input['seq_len']
2 changes: 1 addition & 1 deletion examples/information_extraction/waybill_ie/run_ernie.py
@@ -40,7 +40,7 @@ def convert_to_features(example, tokenizer, label_vocab):
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True)
is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]
(changed file, name not shown in this view)
@@ -41,7 +41,7 @@ def convert_to_features(example, tokenizer, label_vocab):
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True)
is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]
2 changes: 1 addition & 1 deletion examples/sentiment_analysis/skep/predict_opinion.py
@@ -67,7 +67,7 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
tokens = example["tokens"]
encoded_inputs = tokenizer(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_length)
input_ids = np.array(encoded_inputs["input_ids"], dtype="int64")
token_type_ids = np.array(encoded_inputs["token_type_ids"], dtype="int64")
2 changes: 1 addition & 1 deletion examples/text_to_knowledge/ernie-ctm/data.py
@@ -37,7 +37,7 @@ def convert_example(example,
tokens = example["tokens"]
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_len)

if is_test:
2 changes: 1 addition & 1 deletion examples/text_to_knowledge/nptag/data.py
@@ -57,7 +57,7 @@ def convert_example(example,
tokens = list(example["text"]) + ["是"] + ["[MASK]"] * max_cls_len
inputs = tokenzier(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_length=max_seq_len)

label_indices = list(
4 changes: 2 additions & 2 deletions model_zoo/ernie-gen/train.py
@@ -134,7 +134,7 @@
default=-1,
type=int,
help=
"If > 0: set total number of training steps to perform. Override num_train_epochs."
"If > 0: set total number of training steps to perform. Override num_epochs."
)

args = parser.parse_args()
@@ -270,7 +270,7 @@ def train():
train_model = paddle.DataParallel(train_model)

num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
train_data_loader) * args.num_epochs

lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
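The `--max_steps` help string and the `num_training_steps` expression above encode a common convention: a positive `max_steps` fixes the total number of steps and overrides the epoch count. A small self-contained sketch of that logic (function name and numbers are illustrative only):

```python
def total_training_steps(max_steps: int, steps_per_epoch: int, num_epochs: int) -> int:
    # If max_steps > 0 it sets the total number of optimizer steps,
    # otherwise train for num_epochs full passes over the data loader.
    return max_steps if max_steps > 0 else steps_per_epoch * num_epochs

assert total_training_steps(-1, 1000, 3) == 3000   # default: epochs decide
assert total_training_steps(500, 1000, 3) == 500   # max_steps overrides num_epochs
```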
2 changes: 1 addition & 1 deletion model_zoo/gpt/run_msra_ner.py
@@ -76,7 +76,7 @@ def tokenize_and_align_labels(example,
example = example['tokens']
tokenized_input = tokenizer(example,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_len,
return_token_type_ids=False)

4 changes: 2 additions & 2 deletions paddlenlp/transformers/tokenizer_utils.py
@@ -986,7 +986,7 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
if is_split_into_words:
if is_split_into_words == True:
tokens = list(
itertools.chain(*(
self.tokenize(t, is_split_into_words=True, **kwargs)
@@ -1071,7 +1071,7 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
if is_split_into_words:
if is_split_into_words == True:
tokens = list(
itertools.chain(*(
self.tokenize(t, is_split_into_words=True, **kwargs)
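Both hunks replace a truthiness check with an explicit comparison. The reason, presumably, is that the new string value `'token'` is truthy, so the old `if is_split_into_words:` would have re-tokenized input that is already tokenized. A standalone illustration (not library code):

```python
for value in (True, 'token', False):
    old_branch = bool(value)          # old check: `if is_split_into_words:`
    new_branch = value == True        # new check, mirroring the diff  # noqa: E712
    print(value, old_branch, new_branch)

# True    -> old True,  new True   (list of words: tokenize each word)
# 'token' -> old True,  new False  (list of tokens: skip re-tokenization)
# False   -> old False, new False
```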
13 changes: 12 additions & 1 deletion paddlenlp/transformers/tokenizer_utils_base.py
@@ -2013,7 +2013,7 @@ def __call__(self,
List[List[str]]]] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
is_split_into_words: Union[bool, str] = False,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
return_position_ids: bool = False,
@@ -2061,6 +2061,10 @@ def __call__(self,
a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
and 'offset_mapping' preserving the original example and position
information will be added to the returned dictionary. Defaults to 0.
is_split_into_words (Union[bool, str], optional):
Whether the input is already split. `True` means the text is a list of words that still need to be tokenized;
`'token'` means the text is a list of tokens that are already tokenized and must not be tokenized again.
Defaults to `False`.
padding (bool, str or [PaddingStrategy], optional):
Activates and controls padding. Accepts the following values:
@@ -2201,6 +2205,13 @@ def _is_valid_text_input(t):
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples).")

# check `is_split_into_words` value
if isinstance(is_split_into_words,
              str) and is_split_into_words != 'token':
    raise ValueError(
        "the value of `is_split_into_words` should be one of: "
        "{True, False, 'token'} but received: <%s>" % is_split_into_words)

if is_split_into_words:
is_batched = isinstance(text,
(list, tuple)) and text and isinstance(
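Putting the docstring and the new validation together, a hedged usage sketch (assumes a paddlenlp build that includes this change; `ErnieTokenizer` and the sample strings are used purely as examples):

```python
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")

# A list of words that still needs to be tokenized.
words = ["金钱龟", "不分", "品种"]
encoded_words = tokenizer(words, is_split_into_words=True)

# A list of tokens (e.g. character-level NER data) that must not be split again.
tokens = ["金", "钱", "龟"]
encoded_tokens = tokenizer(tokens, is_split_into_words='token', return_length=True)

# Any other string value is rejected by the new check.
try:
    tokenizer(tokens, is_split_into_words='word')
except ValueError as err:
    print(err)
```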
22 changes: 15 additions & 7 deletions pipelines/README.md
@@ -77,21 +77,29 @@ python setup.py install
from pipelines.document_stores import FAISSDocumentStore
from pipelines.nodes import DensePassageRetriever, ErnieRanker

# Step1: Initialize a FaissDocumentStore to store texts of documents
# Step1: Preparing the data
documents = [
{'content': '金钱龟不分品种,只有生长地之分,在我国主要分布于广东、广西、福建、海南、香港、澳门等地,在国外主要分布于越南等亚热带国家和地区。',
'meta': {'name': 'test1.txt'}},
{'content': '衡量酒水的价格的因素很多的,酒水的血统(也就是那里产的,采用什么工艺等);存储的时间等等,酒水是一件很难标准化得商品,只要你敢要价,有买的那就值那个钱。',
'meta': {'name': 'test2.txt'}}
]

# Step2: Initialize a FaissDocumentStore to store texts of documents
document_store = FAISSDocumentStore(embedding_dim=768)
document_store.write_documents(documents)

# Step2: Initialize a DenseRetriever and build ANN index
retriever = DensePassageRetriever(document_store=document_store, query_embedding_model="rocketqa-zh-dureader-query-encoder")
# Step3: Initialize a DenseRetriever and build ANN index
retriever = DensePassageRetriever(document_store=document_store, query_embedding_model="rocketqa-zh-base-query-encoder",embed_title=False)
document_store.update_embeddings(retriever)

# Step3: Initialize a Ranker
ranker = ErnieRanker(model_name_or_path="rocketqa-zh-dureader-cross-encoder")
# Step4: Initialize a Ranker
ranker = ErnieRanker(model_name_or_path="rocketqa-base-cross-encoder")

# Step4: Initialize a SemanticSearchPipeline and ask questions
# Step5: Initialize a SemanticSearchPipeline and ask questions
from pipelines import SemanticSearchPipeline
pipeline = SemanticSearchPipeline(retriever, ranker)
prediction = pipeline.run(query="亚马逊河流的相关介绍")
prediction = pipeline.run(query="衡量酒水的价格的因素有哪些?")
```
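For reference, a small follow-up on inspecting the result of the quick-start above; the field names assume the `content`/`meta` document layout shown in Step 1 and may differ across pipelines versions:

```python
# Assumed result layout; adjust keys if your pipelines version differs.
for doc in prediction.get("documents", []):
    print(doc.meta.get("name"), getattr(doc, "score", None))
    print(doc.content[:50])
```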
### Quick Deployment

(changed file, name not shown in this view)
@@ -8,11 +8,12 @@
a. Install dependencies:
We provide a ready-made code example that builds an insurance FAQ question-answering system on [more than 8,000 insurance-domain QA pairs](https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/baoxianzhidao/intro.ipynb); you can try it out with the following commands:
```bash

git clone https://github.com/tvst/htbuilder.git
cd htbuilder/
python setup install
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
6 changes: 4 additions & 2 deletions pipelines/examples/frequently-asked-question/README.md
@@ -41,9 +41,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
4 changes: 3 additions & 1 deletion pipelines/examples/question-answering/Install_windows.md
@@ -12,7 +12,9 @@ a. Install dependencies:
git clone https://github.com/tvst/htbuilder.git
cd htbuilder/
python setup install
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
6 changes: 4 additions & 2 deletions pipelines/examples/question-answering/README.md
@@ -47,9 +47,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
4 changes: 3 additions & 1 deletion pipelines/examples/semantic-search/Install_windows.md
@@ -11,7 +11,9 @@ a. Install dependencies:
git clone https://github.com/tvst/htbuilder.git
cd htbuilder/
python setup install
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
6 changes: 4 additions & 2 deletions pipelines/examples/semantic-search/Neural_Search.md
@@ -23,9 +23,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
6 changes: 4 additions & 2 deletions pipelines/examples/semantic-search/README.md
@@ -52,9 +52,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
17 changes: 12 additions & 5 deletions pipelines/setup.py
@@ -14,21 +14,28 @@
import os
import setuptools
import sys
import io
import pipelines
import platform

long_description = "PIPELINES: An End to End Natural Language Proceessing Development Kit Based on ERNIE"
description = "Paddle-Pipelines: An End to End Natural Language Proceessing Development Kit Based on PaddleNLP"

with open("requirements.txt") as fin:
REQUIRED_PACKAGES = fin.read()


def read(*names, **kwargs):
with io.open(os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
return fp.read()


setuptools.setup(name="paddle-pipelines",
version=pipelines.__version__,
author="PaddlePaddle Speech and Language Team",
author_email="paddlenlp@baidu.com",
description=long_description,
long_description=long_description,
long_description_content_type="text/plain",
description=description,
long_description=read("README.md"),
long_description_content_type="text/markdown",
url="https://github.com/PaddlePaddle/PaddleNLP",
packages=setuptools.find_packages(
where='.',