Commit 85e829e

Merge branch 'develop' into move_token_num_fetch_out_of_train_cycle

FrostML authored Sep 6, 2022
2 parents: 215050b + b88fc4e
Showing 24 changed files with 81 additions and 41 deletions.
3 changes: 2 additions & 1 deletion examples/benchmark/clue/mrc/run_c3.py
@@ -258,7 +258,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):

new_data = tokenizer(tokens_t_list,
text_pair=tokens_c_list,
is_split_into_words=True)
is_split_into_words='token')

# Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
# because length of each choice could be different.
@@ -305,6 +305,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset")

batchify_fn = lambda samples, fn=Dict({
'input_ids':
Pad(axis=1, pad_val=tokenizer.pad_token_id), # input
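The comment above describes padding each example along the choices axis so every choice reaches a common sequence length before batching into [batch_size, num_choices, seq_len]. A minimal NumPy sketch of that idea (the `pad_choices` helper and the sample ids are illustrative, not part of the diff):

```python
import numpy as np

def pad_choices(choice_ids, pad_val=0):
    # choice_ids: num_choices lists of token ids with different lengths
    max_len = max(len(ids) for ids in choice_ids)
    return np.array([ids + [pad_val] * (max_len - len(ids)) for ids in choice_ids])

# One example with 3 choices of lengths 4, 3 and 5.
example = [[101, 5, 6, 102], [101, 7, 102], [101, 8, 9, 10, 102]]
print(pad_choices(example).shape)  # (3, 5) -> [num_choices, seq_len]
```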
2 changes: 1 addition & 1 deletion examples/information_extraction/DuEE/sequence_labeling.py
@@ -98,7 +98,7 @@ def convert_example_to_feature(example,
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_len)

input_ids = tokenized_input['input_ids']
2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/eval.py
@@ -56,7 +56,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
is_split_into_words='token',
return_length=True)
labels = []

2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/predict.py
@@ -86,7 +86,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
is_split_into_words='token',
return_length=True)
labels = []

2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/train.py
@@ -105,7 +105,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
is_split_into_words='token',
return_length=True)
labels = []

(changed file, name not shown in this view)
@@ -116,7 +116,7 @@ def convert_to_features(example, tokenizer):
tokens = example[0]
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True)
is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
return tokenized_input['input_ids'], tokenized_input[
'token_type_ids'], tokenized_input['seq_len']
2 changes: 1 addition & 1 deletion examples/information_extraction/waybill_ie/run_ernie.py
@@ -40,7 +40,7 @@ def convert_to_features(example, tokenizer, label_vocab):
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True)
is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]
(changed file, name not shown in this view)
@@ -41,7 +41,7 @@ def convert_to_features(example, tokenizer, label_vocab):
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True)
is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]
2 changes: 1 addition & 1 deletion examples/sentiment_analysis/skep/predict_opinion.py
@@ -67,7 +67,7 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
tokens = example["tokens"]
encoded_inputs = tokenizer(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_length)
input_ids = np.array(encoded_inputs["input_ids"], dtype="int64")
token_type_ids = np.array(encoded_inputs["token_type_ids"], dtype="int64")
2 changes: 1 addition & 1 deletion examples/text_to_knowledge/ernie-ctm/data.py
@@ -37,7 +37,7 @@ def convert_example(example,
tokens = example["tokens"]
tokenized_input = tokenizer(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_len)

if is_test:
2 changes: 1 addition & 1 deletion examples/text_to_knowledge/nptag/data.py
@@ -57,7 +57,7 @@ def convert_example(example,
tokens = list(example["text"]) + ["是"] + ["[MASK]"] * max_cls_len
inputs = tokenzier(tokens,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_length=max_seq_len)

label_indices = list(
4 changes: 2 additions & 2 deletions model_zoo/ernie-gen/train.py
@@ -134,7 +134,7 @@
default=-1,
type=int,
help=
"If > 0: set total number of training steps to perform. Override num_train_epochs."
"If > 0: set total number of training steps to perform. Override num_epochs."
)

args = parser.parse_args()
@@ -270,7 +270,7 @@ def train():
train_model = paddle.DataParallel(train_model)

num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
train_data_loader) * args.num_epochs

lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
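The `--max_steps` help string and the `num_training_steps` expression above encode a common convention: a positive `max_steps` fixes the total number of steps and overrides the epoch count. A small self-contained sketch of that logic (function name and numbers are illustrative only):

```python
def total_training_steps(max_steps: int, steps_per_epoch: int, num_epochs: int) -> int:
    # If max_steps > 0 it sets the total number of optimizer steps,
    # otherwise train for num_epochs full passes over the data loader.
    return max_steps if max_steps > 0 else steps_per_epoch * num_epochs

assert total_training_steps(-1, 1000, 3) == 3000   # default: epochs decide
assert total_training_steps(500, 1000, 3) == 500   # max_steps overrides num_epochs
```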
2 changes: 1 addition & 1 deletion model_zoo/gpt/run_msra_ner.py
@@ -76,7 +76,7 @@ def tokenize_and_align_labels(example,
example = example['tokens']
tokenized_input = tokenizer(example,
return_length=True,
is_split_into_words=True,
is_split_into_words='token',
max_seq_len=max_seq_len,
return_token_type_ids=False)

4 changes: 2 additions & 2 deletions paddlenlp/transformers/tokenizer_utils.py
@@ -986,7 +986,7 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
if is_split_into_words:
if is_split_into_words == True:
tokens = list(
itertools.chain(*(
self.tokenize(t, is_split_into_words=True, **kwargs)
@@ -1071,7 +1071,7 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
if is_split_into_words:
if is_split_into_words == True:
tokens = list(
itertools.chain(*(
self.tokenize(t, is_split_into_words=True, **kwargs)
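Both hunks replace a truthiness check with an explicit comparison. The reason, presumably, is that the new string value `'token'` is truthy, so the old `if is_split_into_words:` would have re-tokenized input that is already tokenized. A standalone illustration (not library code):

```python
for value in (True, 'token', False):
    old_branch = bool(value)          # old check: `if is_split_into_words:`
    new_branch = value == True        # new check, mirroring the diff  # noqa: E712
    print(value, old_branch, new_branch)

# True    -> old True,  new True   (list of words: tokenize each word)
# 'token' -> old True,  new False  (list of tokens: skip re-tokenization)
# False   -> old False, new False
```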
13 changes: 12 additions & 1 deletion paddlenlp/transformers/tokenizer_utils_base.py
@@ -2013,7 +2013,7 @@ def __call__(self,
List[List[str]]]] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
is_split_into_words: Union[bool, str] = False,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
return_position_ids: bool = False,
@@ -2061,6 +2061,10 @@ def __call__(self,
a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
and 'offset_mapping' preserving the original example and position
information will be added to the returned dictionary. Defaults to 0.
is_split_into_words (Union[bool, str], optional):
Whether the input is already split. `True` means the text is a list of words that still need to be tokenized;
`'token'` means the text is a list of tokens that are already tokenized and must not be tokenized again.
Defaults to `False`.
padding (bool, str or [PaddingStrategy], optional):
Activates and controls padding. Accepts the following values:
@@ -2201,6 +2205,13 @@ def _is_valid_text_input(t):
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples).")

# check `is_split_into_words` value
if isinstance(is_split_into_words,
              str) and is_split_into_words != 'token':
    raise ValueError(
        "the value of `is_split_into_words` should be one of: "
        "{True, False, 'token'} but received: <%s>" % is_split_into_words)

if is_split_into_words:
is_batched = isinstance(text,
(list, tuple)) and text and isinstance(
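Putting the docstring and the new validation together, a hedged usage sketch (assumes a paddlenlp build that includes this change; `ErnieTokenizer` and the sample strings are used purely as examples):

```python
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")

# A list of words that still needs to be tokenized.
words = ["金钱龟", "不分", "品种"]
encoded_words = tokenizer(words, is_split_into_words=True)

# A list of tokens (e.g. character-level NER data) that must not be split again.
tokens = ["金", "钱", "龟"]
encoded_tokens = tokenizer(tokens, is_split_into_words='token', return_length=True)

# Any other string value is rejected by the new check.
try:
    tokenizer(tokens, is_split_into_words='word')
except ValueError as err:
    print(err)
```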
22 changes: 15 additions & 7 deletions pipelines/README.md
@@ -77,21 +77,29 @@ python setup.py install
from pipelines.document_stores import FAISSDocumentStore
from pipelines.nodes import DensePassageRetriever, ErnieRanker

# Step1: Initialize a FaissDocumentStore to store texts of documents
# Step1: Preparing the data
documents = [
{'content': '金钱龟不分品种,只有生长地之分,在我国主要分布于广东、广西、福建、海南、香港、澳门等地,在国外主要分布于越南等亚热带国家和地区。',
'meta': {'name': 'test1.txt'}},
{'content': '衡量酒水的价格的因素很多的,酒水的血统(也就是那里产的,采用什么工艺等);存储的时间等等,酒水是一件很难标准化得商品,只要你敢要价,有买的那就值那个钱。',
'meta': {'name': 'test2.txt'}}
]

# Step2: Initialize a FaissDocumentStore to store texts of documents
document_store = FAISSDocumentStore(embedding_dim=768)
document_store.write_documents(documents)

# Step2: Initialize a DenseRetriever and build ANN index
retriever = DensePassageRetriever(document_store=document_store, query_embedding_model="rocketqa-zh-dureader-query-encoder")
# Step3: Initialize a DenseRetriever and build ANN index
retriever = DensePassageRetriever(document_store=document_store, query_embedding_model="rocketqa-zh-base-query-encoder",embed_title=False)
document_store.update_embeddings(retriever)

# Step3: Initialize a Ranker
ranker = ErnieRanker(model_name_or_path="rocketqa-zh-dureader-cross-encoder")
# Step4: Initialize a Ranker
ranker = ErnieRanker(model_name_or_path="rocketqa-base-cross-encoder")

# Step4: Initialize a SemanticSearchPipeline and ask questions
# Step5: Initialize a SemanticSearchPipeline and ask questions
from pipelines import SemanticSearchPipeline
pipeline = SemanticSearchPipeline(retriever, ranker)
prediction = pipeline.run(query="亚马逊河流的相关介绍")
prediction = pipeline.run(query="衡量酒水的价格的因素有哪些?")
```
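For reference, a small follow-up on inspecting the result of the quick-start above; the field names assume the `content`/`meta` document layout shown in Step 1 and may differ across pipelines versions:

```python
# Assumed result layout; adjust keys if your pipelines version differs.
for doc in prediction.get("documents", []):
    print(doc.meta.get("name"), getattr(doc, "score", None))
    print(doc.content[:50])
```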
### Quick Deployment

(changed file, name not shown in this view)
@@ -8,11 +8,12 @@
a. Install dependencies:
We provide a ready-made code example that builds an insurance FAQ question-answering system on [more than 8,000 insurance-domain QA pairs](https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/baoxianzhidao/intro.ipynb); you can try it out with the following commands:
```bash

git clone https://github.com/tvst/htbuilder.git
cd htbuilder/
python setup install
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
6 changes: 4 additions & 2 deletions pipelines/examples/frequently-asked-question/README.md
@@ -41,9 +41,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
4 changes: 3 additions & 1 deletion pipelines/examples/question-answering/Install_windows.md
@@ -12,7 +12,9 @@ a. Install dependencies:
git clone https://github.com/tvst/htbuilder.git
cd htbuilder/
python setup install
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
6 changes: 4 additions & 2 deletions pipelines/examples/question-answering/README.md
@@ -47,9 +47,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
4 changes: 3 additions & 1 deletion pipelines/examples/semantic-search/Install_windows.md
@@ -11,7 +11,9 @@ a. Install dependencies:
git clone https://github.com/tvst/htbuilder.git
cd htbuilder/
python setup install
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
6 changes: 4 additions & 2 deletions pipelines/examples/semantic-search/Neural_Search.md
@@ -23,9 +23,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
6 changes: 4 additions & 2 deletions pipelines/examples/semantic-search/README.md
@@ -52,9 +52,11 @@ b. Hardware environment:
c. Install dependencies:
First install PaddlePaddle by following the [official installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html), then install the dependencies below:
```bash
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 1) Install the pipelines package
# One-command install via pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
# Or install the latest version from source
cd ${HOME}/PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
[Note] All of the following steps should be run from the `pipelines` root directory; there is no need to change into subdirectories.
17 changes: 12 additions & 5 deletions pipelines/setup.py
@@ -14,21 +14,28 @@
import os
import setuptools
import sys
import io
import pipelines
import platform

long_description = "PIPELINES: An End to End Natural Language Proceessing Development Kit Based on ERNIE"
description = "Paddle-Pipelines: An End to End Natural Language Proceessing Development Kit Based on PaddleNLP"

with open("requirements.txt") as fin:
REQUIRED_PACKAGES = fin.read()


def read(*names, **kwargs):
with io.open(os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
return fp.read()


setuptools.setup(name="paddle-pipelines",
version=pipelines.__version__,
author="PaddlePaddle Speech and Language Team",
author_email="paddlenlp@baidu.com",
description=long_description,
long_description=long_description,
long_description_content_type="text/plain",
description=description,
long_description=read("README.md"),
long_description_content_type="text/markdown",
url="https://github.com/PaddlePaddle/PaddleNLP",
packages=setuptools.find_packages(
where='.',