Add FasterTokenizer on PPMiniLM #1542

Merged · 19 commits · Jan 11, 2022
Changes from 1 commit
update export
LiuChiachi committed Jan 11, 2022
commit baa4c1f7f0db730aace6082cb2ee601a9903b13f
11 changes: 10 additions & 1 deletion examples/model_compression/pp-minilm/README.md
@@ -189,7 +189,16 @@ sh run_clue.sh CLUEWSC2020 1e-4 32 50 128 0 ppminilm-6l-768h

#### Export the fine-tuned model

After training finishes, the `--save_inference_model` argument defaults to True, so the inference model is saved automatically.
After training finishes, you can pick the best-performing model for each dataset and export it:

```shell
export TASK_NAME=CLUEWSC2020
export MODEL_PATH=ppminilm-6l-768h
export LR=1e-4
export BS=32

python export_model.py --task_name ${TASK_NAME} --output_dir ${MODEL_PATH}/models/${TASK_NAME}/${LR}_${BS}/
```

The static-graph (deployment) model is saved under the same path as the dynamic-graph model, with the file names `inference.pdmodel`, `inference.pdiparams`, and `inference.pdiparams.info`.
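As a quick sanity check that the export produced a loadable model, something like the following sketch can be used (the path prefix is taken from the example above; `paddle.jit.load` loads by file prefix, but the exact way string inputs are fed can vary across Paddle versions):

```python
import paddle

# Load the exported static-graph model by its file prefix; this picks up
# inference.pdmodel and inference.pdiparams written by export_model.py.
model = paddle.jit.load(
    "ppminilm-6l-768h/models/CLUEWSC2020/1e-4_32/inference")
model.eval()
```

Because the FasterTokenizer op is fused into the exported graph, the deployed model consumes raw text rather than pre-tokenized ids.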

3 changes: 2 additions & 1 deletion examples/model_compression/pp-minilm/data.py
@@ -46,6 +46,7 @@ def convert_example(example,
     label_dtype = "int64" if label_list else "float32"
     # Get the label
     example['label'] = np.array(example["label"], dtype="int64")
+    label = example['label']
     # Convert raw text to feature
     if 'keyword' in example:  # CSL
         sentence1 = " ".join(example['keyword'])
@@ -85,6 +86,6 @@ def convert_example(example,
             text_pair=example['sentence2'],
             max_seq_len=max_seq_length)
     if not is_test:
-        return example['input_ids'], example['token_type_ids'], example['label']
+        return example['input_ids'], example['token_type_ids'], label
     else:
         return example['input_ids'], example['token_type_ids']
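The motivation for the new `label` variable, as the changed return statement suggests: later in `convert_example` the `example` dict is re-assigned to the tokenizer's output, which carries no `'label'` key, so the label must be captured first. A standalone sketch of the pattern (all values invented for the demo):

```python
import numpy as np

example = {"sentence": "some text", "label": 1}
example["label"] = np.array(example["label"], dtype="int64")
label = example["label"]  # capture before `example` is replaced

# Stand-in for the tokenizer call that re-assigns `example` wholesale:
example = {"input_ids": [1, 2, 3], "token_type_ids": [0, 0, 0]}

assert "label" not in example  # the key is gone from the new dict
print(example["input_ids"], example["token_type_ids"], label)  # label survives
```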
75 changes: 75 additions & 0 deletions examples/model_compression/pp-minilm/finetuning/export_model.py
@@ -0,0 +1,75 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import distutils.util

from paddlenlp.transformers import PPMiniLMForSequenceClassification

sys.path.append("../")
from data import METRIC_CLASSES


def parse_args():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(METRIC_CLASSES.keys()), )
    parser.add_argument(
        "--output_dir",
        default="best_clue_model",
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--save_inference_model_with_tokenizer",
        type=distutils.util.strtobool,
        default=True,
        help="Whether to save inference model with tokenizer.")

    args = parser.parse_args()
    return args


def do_export(args):
    save_path = os.path.join(args.output_dir, "inference")
    model = PPMiniLMForSequenceClassification.from_pretrained(args.output_dir)
    is_text_pair = True
    if args.task_name in ('tnews', 'iflytek', 'cluewsc2020'):
        is_text_pair = False
    model.to_static(
        save_path,
        use_faster_tokenizer=args.save_inference_model_with_tokenizer,
        is_text_pair=is_text_pair)


def print_arguments(args):
    """print arguments"""
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    do_export(args)
24 changes: 0 additions & 24 deletions examples/model_compression/pp-minilm/finetuning/run_clue.py
@@ -137,16 +137,6 @@ def parse_args():
         type=distutils.util.strtobool,
         default=False,
         help="Whether do train.")
-    parser.add_argument(
-        "--save_inference_model",
-        type=distutils.util.strtobool,
-        default=True,
-        help="Whether to save inference model.")
-    parser.add_argument(
-        "--save_inference_model_with_tokenizer",
-        type=distutils.util.strtobool,
-        default=True,
-        help="Whether to save inference model with tokenizer.")
     parser.add_argument(
         "--max_steps",
         default=-1,
@@ -384,18 +374,6 @@ def do_train(args):
print("best_acc: ", best_acc)


def export_model(args):
save_path = os.path.join(args.output_dir, "inference")
model = PPMiniLMForSequenceClassification.from_pretrained(args.output_dir)
is_text_pair = True
if args.task_name in ('tnews', 'iflytek', 'cluewsc2020'):
is_text_pair = False
model.to_static(
save_path,
use_faster_tokenizer=args.save_inference_model_with_tokenizer,
is_text_pair=is_text_pair)


def print_arguments(args):
"""print arguments"""
print('----------- Configuration Arguments -----------')
@@ -409,7 +387,5 @@ def print_arguments(args):
     print_arguments(args)
     if args.do_train:
         do_train(args)
-    if args.save_inference_model:
-        export_model(args)
     if args.do_eval:
         do_eval(args)
11 changes: 9 additions & 2 deletions examples/model_compression/pp-minilm/pruning/export_model.py
@@ -21,6 +21,7 @@
 import time
 import json
 from functools import partial
+import distutils.util
 
 import numpy as np
 import paddle
@@ -132,11 +133,17 @@ def parse_args():
         type=float,
         default=1.0,
         help="depth mult you want to export")
+    parser.add_argument(
+        "--use_faster_tokenizer",
+        type=distutils.util.strtobool,
+        default=True,
+        help="Whether to use FasterTokenizer to accelerate training or further inference."
+    )
     args = parser.parse_args()
     return args
 
 
-def do_train(args):
+def do_export(args):
     paddle.set_device("gpu" if args.n_gpu else "cpu")
     args.model_type = args.model_type.lower()
     args.task_name = args.task_name.lower()
@@ -250,4 +257,4 @@ def print_arguments(args):
 if __name__ == "__main__":
     args = parse_args()
     print_arguments(args)
-    do_train(args)
+    do_export(args)
@@ -135,10 +135,7 @@ def batch_generator_func_using_faster_tokenizer():
             batch_data[0].append(data['sentence1'])
             batch_data[1].append(data['sentence2'])
             if len(batch_data[0]) == batch_size:
-                yield {
-                    "input_ids": batch_data[0],
-                    "token_type_ids": batch_data[1]
-                }
+                yield {"text": batch_data[0], "text_pair": batch_data[1]}
                 batch_data = [[], []]
 
     paddleslim.quant.quant_post_static(
20 changes: 14 additions & 6 deletions paddlenlp/transformers/ppminilm/modeling.py
@@ -173,25 +173,33 @@ def to_static(self,
                     self,
                     input_spec=[

Review comment: Why is the input shape [None] in the Faster version, while the non-Faster version uses [None, None]?

Review comment: Does this to_static logic belong in FasterPretrainedModel, or is it better exposed to users of FasterTokenizer? @Steffy-zxf @wawltor @LiuChiachi

Reply from the PR author: When dtype is core.VarDesc.VarType.STRINGS, the shape argument has no effect.
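A minimal sketch of that point outside the diff (the `core` import path here is an assumption; the modeling file may import it differently):

```python
import paddle
from paddle.fluid import core  # assumed import; matches the `core` used above

# For string inputs the spec's shape is effectively ignored, so [None] suffices.
text_spec = paddle.static.InputSpec(
    shape=[None], dtype=core.VarDesc.VarType.STRINGS, name="text")
```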

                         paddle.static.InputSpec(
-                            shape=[None], dtype=core.VarDesc.VarType.STRINGS),
-                        paddle.static.InputSpec(
-                            shape=[None], dtype=core.VarDesc.VarType.STRINGS)
+                            shape=[None],
+                            dtype=core.VarDesc.VarType.STRINGS,
+                            name="text"), paddle.static.InputSpec(
+                                shape=[None],
+                                dtype=core.VarDesc.VarType.STRINGS,
+                                name="text_pair")
                     ])
             else:
                 model = paddle.jit.to_static(

Review comment: The base class FasterPretrainedModel already implements to_static; could this simply call the base-class method?

Reply from the PR author: Added the use_faster_tokenizer and pad_to_max_seq_len parameters, and added export support for models that take a text pair as input. @ZeyuChen @guoshengCS could you help take a look?

Reply from the PR author: The pad_to_max_len parameter has been removed, since it is not necessary.
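Taken together, a hypothetical call-site summary of the three export branches in this hunk (the checkpoint name and output prefixes are illustrative; only the keyword arguments come from this diff):

```python
from paddlenlp.transformers import PPMiniLMForSequenceClassification

# Any fine-tuned PP-MiniLM checkpoint directory would do here.
model = PPMiniLMForSequenceClassification.from_pretrained("ppminilm-6l-768h")

# Tokenizer fused into the graph, text-pair task: string inputs "text"/"text_pair".
model.to_static("infer_pair", use_faster_tokenizer=True, is_text_pair=True)
# Tokenizer fused into the graph, single-text task: one string input "text".
model.to_static("infer_single", use_faster_tokenizer=True, is_text_pair=False)
# No tokenizer in the graph: int64 inputs "input_ids"/"token_type_ids".
model.to_static("infer_ids", use_faster_tokenizer=False)
```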

                     self,
                     input_spec=[
                         paddle.static.InputSpec(
-                            shape=[None], dtype=core.VarDesc.VarType.STRINGS)
+                            shape=[None],
+                            dtype=core.VarDesc.VarType.STRINGS,
+                            name="text")
                     ])
         else:
             model = paddle.jit.to_static(
                 self,
                 input_spec=[
                     paddle.static.InputSpec(
-                        shape=[None, None], dtype="int64"),  # input_ids
+                        shape=[None, None], dtype="int64",
+                        name="input_ids"),  # input_ids
                     paddle.static.InputSpec(
-                        shape=[None, None], dtype="int64")  # segment_ids
+                        shape=[None, None],
+                        dtype="int64",
+                        name="token_type_ids")  # segment_ids
                 ])
         paddle.jit.save(model, output_path)
         logger.info("Already save the static model to the path %s" %