solve conflicts
fix infer perf

remove useless comments
LiuChiachi committed Dec 10, 2021
1 parent ca13671 commit 8d0af8b
Showing 7 changed files with 317 additions and 68 deletions.
28 changes: 14 additions & 14 deletions examples/model_compression/PP-MiniLM/README.md
@@ -255,6 +255,8 @@ cd ..

Because the dynamic-shape feature is enabled, the shape ranges to collect must be configured. Paddle Inference provides an interface for this: first run offline sample data through the model to record the shape range of every intermediate tensor; the input-shape ranges of the tensors in the TRT subgraph can then be set directly from the results tuned in that step, completing the automatic shape-range setup. Once collection finishes, simply point the config at the recorded results to enable the tuned_dynamic_shape feature. In this example, first run infer.py with the --collect_shape flag set, then run infer.py again without it. For example:

+This example runs inference on the quantized model with the inference/infer.py script on a single NVIDIA Tesla T4 GPU, with CUDA 11.1, cuDNN 8.1, and TensorRT 7.2.

INT8 inference script:

```shell
python infer.py --task_name ${task} --model_path $MODEL_PATH --use_trt
```
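Under the hood, the --collect_shape flag switches between two Paddle Inference config calls. Below is a minimal sketch of the two-pass flow, assuming the Paddle 2.x inference API; the model paths and shape-file name are illustrative:

```python
from paddle import inference

SHAPE_FILE = "shape_range_info.pbtxt"  # illustrative output path

# Illustrative model files; infer.py derives these from --model_path.
config = inference.Config("model.pdmodel", "model.pdiparams")
config.enable_use_gpu(100, 0)

collect_shape = True  # pass 1: True; pass 2: False
if collect_shape:
    # Pass 1: run sample data once and record the min/max/opt shape of
    # every intermediate tensor into SHAPE_FILE.
    config.collect_shape_range_info(SHAPE_FILE)
else:
    # Pass 2: enable the TRT engine and let its subgraph read the tuned
    # dynamic-shape ranges collected in pass 1.
    config.enable_tensorrt_engine(
        workspace_size=1 << 30,
        precision_mode=inference.PrecisionType.Int8,
        max_batch_size=32,
        min_subgraph_size=5,
        use_static=False,
        use_calib_mode=False)
    config.enable_tuned_tensorrt_dynamic_shape(SHAPE_FILE, True)

predictor = inference.create_predictor(config)
```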

@@ -277,27 +279,25 @@

### Performance Test
-This example runs inference on the quantized model with the inference/infer.py script on a single NVIDIA Tesla T4 GPU, with CUDA 11.1, cuDNN 8.1, and TensorRT 7.2.

-The performance test uses the model trained on the TNEWS dataset; the three rows of the table below are the fine-tuned model, the model after OFA pruning and distillation, and the quantized model (quantization method mse, calibration set size 4). We measure the total prediction time on dev (excluding the first 20 steps).
+The test environment is the same as above: a single NVIDIA Tesla T4 GPU with CUDA 11.1, cuDNN 8.1, and TensorRT 7.2. The models were trained on the TNEWS dataset; the three rows of the table below are the fine-tuned model, the model after OFA pruning and distillation, and the quantized model (quantization method mse, calibration set size 4). We measure the total prediction time on dev (excluding the first 20 steps).

-With PaddleSlim pruning and quantization, the model's inference is 255.86% faster than the original BERT-base model; pruning alone accounts for an 87.98% speedup.
-
-|                                    | Average latency (s) | Speedup |
-| ---------------------------------- | ------------------- | ------- |
-| BERT                               | 20.64               | 0       |
-| FP32                               | 12.61               | 63.68%  |
-| FP32 + pruning                     | 10.98               | 87.98%  |
-| FP32 + pruning + INT8 quantization | 5.80                | 255.86% |
-
-
-INT8 inference script:
+Run the performance test script to get the latency of the FP32, pruned, and quantized models; the reported figure is the average of 5 durations printed outside the --collect_shape phase:

```shell

-sh infer.sh
+sh infer_perf.sh
```

```shell
cd ..
```

+With PaddleSlim pruning and quantization, the model's inference is 255.86% faster than the original BERT-base model; pruning alone accounts for an 87.98% speedup.
+
+|                                    | Average latency (s) | Speedup |
+| ---------------------------------- | ------------------- | ------- |
+| BERT                               | 20.64               | 0       |
+| FP32                               | 12.61               | 63.68%  |
+| FP32 + pruning                     | 10.98               | 87.98%  |
+| FP32 + pruning + INT8 quantization | 5.80                | 255.86% |
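As a sanity check, the Speedup column follows directly from the average latencies, reading speedup as baseline time over model time minus one:

```python
# Verify the Speedup column against the BERT baseline latency.
base = 20.64  # BERT average latency in seconds
for name, t in [("FP32", 12.61),
                ("FP32 + pruning", 10.98),
                ("FP32 + pruning + INT8 quantization", 5.80)]:
    print(f"{name}: {base / t - 1:.2%}")  # 63.68%, 87.98%, 255.86%
```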
@@ -33,7 +33,7 @@
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.transformers import RobertaModel, RobertaTokenizer
from paddlenlp.transformers import ErnieModel, ErnieForSequenceClassification, ErnieTokenizer
-from paddlenlp.transformers.distill_utils import to_distill, calc_minilm_loss_multi_relation
+from paddlenlp.transformers.distill_utils import to_distill, calc_multi_relation_loss

MODEL_CLASSES = {
"roberta": (RobertaModel, RobertaTokenizer),
@@ -245,6 +245,7 @@ def __init__(self, input_file, tokenizer, max_seq_length):
line = line[:max_seq_length]
tokenized_example = tokenizer(line, max_seq_len=max_seq_length)
input_ids.append(tokenized_example['input_ids'])

self.inputs = np.asarray(input_ids)
f.close()

Expand Down Expand Up @@ -396,7 +397,7 @@ def do_train(args):
input_ids = batch[0]
attention_mask = paddle.unsqueeze(
(input_ids == pad_token_id
-).astype(paddle.get_default_dtype()) * -1e9,
+).astype(paddle.get_default_dtype()) * -1e4,
axis=[1, 2])
with paddle.amp.auto_cast(
args.use_amp,
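The mask constant change from -1e9 to -1e4 reads as a mixed-precision fix: under paddle.amp.auto_cast the additive mask may be computed in float16, which cannot represent -1e9, so the mask overflows to -inf and can propagate NaNs through the softmax. A quick check of that assumption:

```python
import numpy as np

# float16's largest finite magnitude is 65504, so -1e9 overflows to -inf,
# while -1e4 is exactly representable and still masks effectively.
print(np.float16(-1e9))  # -inf
print(np.float16(-1e4))  # -10000.0
```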
@@ -408,35 +409,27 @@
q_t, q_s = teacher.outputs.q, student.outputs.q
batch_size = q_t.shape[0]
pad_seq_len = q_t.shape[2]
-loss_qr1, loss_qr2, loss_qr3 = calc_minilm_loss_multi_relation(
+loss_q = calc_multi_relation_loss(
kl_loss_fct, q_s, q_t, attention_mask,
args.num_relation_heads, args.alpha, args.beta)
del q_t, q_s
# K-K relation
k_t, k_s = teacher.outputs.k, student.outputs.k
-loss_kr1, loss_kr2, loss_kr3 = calc_minilm_loss_multi_relation(
+loss_k = calc_multi_relation_loss(
kl_loss_fct, k_s, k_t, attention_mask,
args.num_relation_heads, args.alpha, args.beta)
del k_t, k_s

# V-V relation
v_t, v_s = teacher.outputs.v, student.outputs.v
-loss_vr1, loss_vr2, loss_vr3 = calc_minilm_loss_multi_relation(
+loss_v = calc_multi_relation_loss(
kl_loss_fct, v_s, v_t, attention_mask,
args.num_relation_heads, args.alpha, args.beta)

del v_t, v_s

-loss1 = (loss_qr1 + loss_kr1 + loss_vr1)
-loss1 /= args.num_relation_heads * pad_seq_len * batch_size
-
-loss2 = loss_qr2 + loss_kr2 + loss_vr2
-loss2 /= args.num_relation_heads * pad_seq_len * batch_size
-
-loss3 = loss_qr3 + loss_kr3 + loss_vr3
-loss3 /= args.num_relation_heads * pad_seq_len * batch_size
-loss = (1 - args.alpha - args.beta
-) * loss1 + loss2 * args.alpha + loss3 * args.beta
+loss = loss_q + loss_k + loss_v
+loss /= args.num_relation_heads * pad_seq_len * batch_size

if args.use_amp:
scaler.scale(loss).backward()
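For the refactor above to be behavior-preserving, calc_multi_relation_loss must fold in the alpha/beta weighting that the old code applied by hand. A sketch of the implied equivalence, under the assumption (not verified against distill_utils) that each new per-relation loss is the already-weighted sum of the three relation losses the old helper returned:

```python
# Old: per relation r in {q, k, v}, calc_minilm_loss_multi_relation returned
# (l1_r, l2_r, l3_r); the training loop then combined them as
#   loss = (1 - alpha - beta) * sum_r l1_r + alpha * sum_r l2_r + beta * sum_r l3_r
# normalized by num_relation_heads * pad_seq_len * batch_size.
#
# New: one call per relation type; for loss_q + loss_k + loss_v (normalized
# once) to match, each call must return the weighted combination below.
def weighted_relation_loss(l1, l2, l3, alpha, beta):
    """Hypothetical: what calc_multi_relation_loss must compute per relation."""
    return (1 - alpha - beta) * l1 + alpha * l2 + beta * l3
```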
@@ -453,10 +446,10 @@ def do_train(args):
train_cost_avg.record(train_run_cost)
if global_step % args.logging_steps == 0:
logger.info(
"global step: %d, epoch: %d, batch: %d, loss: %f, loss1: %f, loss2: %f, loss3: %f,"
"global step: %d, epoch: %d, batch: %d, loss: %f, "
"lr: %f, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
-% (global_step, epoch, step, loss, loss1, loss2, loss3,
-optimizer.get_lr(), train_cost_avg.get_average(),
+% (global_step, epoch, step, loss, optimizer.get_lr(),
+train_cost_avg.get_average(),
total_samples / args.logging_steps, total_samples /
(args.logging_steps * train_cost_avg.get_average())))
total_samples = 0
10 changes: 0 additions & 10 deletions examples/model_compression/PP-MiniLM/inference/infer.py
@@ -14,7 +14,6 @@

import argparse
import os
-import time
from functools import partial
import numpy as np

@@ -112,7 +111,7 @@ def parse_args():
"--tokenizer_path",
default='../general_distill/ernie-batchbatch-50w_400000/',
type=str,
-required=True,
help="The directory for tokenizer.", )
parser.add_argument(
"--model_path",
@@ -190,7 +188,6 @@ def create_predictor(cls, args):
config.switch_use_feed_fetch_ops(False) # could be deleted
if args.use_trt:
if args.int8:
print("int8")
config.enable_tensorrt_engine(
workspace_size=1 << 30,
precision_mode=inference.PrecisionType.Int8,
@@ -227,18 +224,15 @@ def create_predictor(cls, args):
predictor.get_output_handle(name)
for name in predictor.get_output_names()
]
-cls.time = 0.0

return cls(predictor, input_handles, output_handles)

def predict_batch(self, data):
for input_field, input_handle in zip(data, self.input_handles):
input_handle.copy_from_cpu(input_field.numpy() if isinstance(
input_field, paddle.Tensor) else input_field)
-time1 = time.time()
self.predictor.run()
paddle.fluid.core._cuda_synchronize(self.device)
-self.time += time.time() - time1
output = [
output_handle.copy_to_cpu() for output_handle in self.output_handles
]
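With the ad-hoc self.time bookkeeping stripped out of predict_batch, wall-clock measurement moves to the new infer_perf.sh flow. A hypothetical external timing loop over the Predictor built above (the helper name and the feed_batches iterable are illustrative):

```python
import time

def time_predictor(p, feed_batches, n_warmup=20):
    """Time p.predictor.run() over pre-tokenized numpy batches, skipping
    the first n_warmup iterations as the performance test does."""
    total = 0.0
    for i, batch in enumerate(feed_batches):
        for field, handle in zip(batch, p.input_handles):
            handle.copy_from_cpu(field)
        start = time.time()
        p.predictor.run()
        # Note: without a device synchronize here, GPU time can be
        # under-measured; the removed code called
        # paddle.fluid.core._cuda_synchronize for exactly this reason.
        if i >= n_warmup:
            total += time.time() - start
    return total
```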
@@ -258,9 +252,6 @@ def predict(self, dataset, collate_fn, args, batch_size=1):
outputs = []
metric.reset()
for i, data in enumerate(data_loader):
-# warmup for performance test
-if i < 20:
-    continue
if len(data) == 2:
output = self.predict_batch(data)
else:
@@ -272,7 +263,6 @@ def predict(self, dataset, collate_fn, args, batch_size=1):
if len(data) > 2:
res = metric.accumulate()
print("task name: %s, acc: %s, " % (args.task_name, res), end='')
print("time: ", self.time)

return outputs

Expand Down
26 changes: 0 additions & 26 deletions examples/model_compression/PP-MiniLM/inference/infer.sh

This file was deleted.

