Update general distill in ppminilm (#1520)
LiuChiachi authored Dec 28, 2021
1 parent a16c4bc commit d82766b
Showing 5 changed files with 15 additions and 15 deletions.
2 changes: 1 addition & 1 deletion examples/model_compression/pp-minilm/README.md
@@ -81,7 +81,7 @@ The PP-MiniLM compression scheme starts from task-agnostic knowledge distillation of the pretrained model (Task-a
 │ └── run_clue.sh        # Launch script for fine-tuning on CLUE
 │ └── run_one_search.sh  # Fine-tuning script for a single dataset
 │ └── run_all_search.sh  # Fine-tuning script over the CLUE datasets
-│ └── export_model.sh    # Script for exporting the fine-tuned deployment model
+│ └── export_model.py    # Script for exporting the fine-tuned deployment model
 ├── pruning              # Pruning/distillation directory
 │ └── prune.py           # Pruning/distillation script
 │ └── prune.sh           # Pruning/distillation launch script
@@ -29,9 +29,9 @@ cd ..

The `general_distill.py` arguments are described below (the sketch after this list shows how the two model-type arguments resolve to classes):

-- `model_type` indicates the student model type; currently only 'ernie' and 'roberta' are supported.
+- `model_type` indicates the student model type; currently only 'ppminilm' and 'roberta' are supported.
 - `num_relation_heads` is the number of relation heads: typically 64 for a large-size teacher model and 48 for a base-size one.
-- `teacher_model_type` indicates the teacher model type; currently only 'ernie' and 'roberta' are supported.
+- `teacher_model_type` indicates the teacher model type; currently only 'roberta' is supported.
 - `teacher_layer_index` is the teacher model layer used during distillation.
 - `student_layer_index` is the student model layer used during distillation.
 - `teacher_model_name_or_path` is the teacher model name, e.g. `'roberta-wwm-ext-large'`.
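To make the argument-to-class mapping concrete, here is a minimal Python sketch. It is not part of the commit; it simply mirrors the MODEL_CLASSES dict in the general_distill.py hunk that follows, and the argument values are illustrative.

# Minimal sketch (not from the repository) of how --model_type and
# --teacher_model_type select PaddleNLP classes; mirrors MODEL_CLASSES below.
from paddlenlp.transformers import (
    PPMiniLMForSequenceClassification,
    PPMiniLMTokenizer,
    RobertaModel,
    RobertaTokenizer,
)

MODEL_CLASSES = {
    "roberta": (RobertaModel, RobertaTokenizer),
    "ppminilm": (PPMiniLMForSequenceClassification, PPMiniLMTokenizer),
}

student_class, _ = MODEL_CLASSES["ppminilm"]                  # --model_type ppminilm
teacher_class, teacher_tok_class = MODEL_CLASSES["roberta"]   # --teacher_model_type roberta
tokenizer = teacher_tok_class.from_pretrained("roberta-wwm-ext-large")  # --teacher_model_name_or_path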
@@ -32,12 +32,12 @@
 from paddlenlp.utils.tools import TimeCostAverage
 from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.transformers import RobertaModel, RobertaTokenizer
-from paddlenlp.transformers import ErnieModel, ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import PPMiniLMModel, PPMiniLMForSequenceClassification, PPMiniLMTokenizer
 from paddlenlp.transformers.distill_utils import to_distill, calc_multi_relation_loss

 MODEL_CLASSES = {
     "roberta": (RobertaModel, RobertaTokenizer),
-    "ernie": (ErnieForSequenceClassification, ErnieTokenizer)
+    "ppminilm": (PPMiniLMForSequenceClassification, PPMiniLMTokenizer)
 }


@@ -47,14 +47,14 @@ def parse_args():
     # Required parameters
     parser.add_argument(
         "--model_type",
-        default="ernie",
+        default="ppminilm",
         type=str,
         required=True,
         help="Model type selected in the list: " +
         ", ".join(MODEL_CLASSES.keys()), )
     parser.add_argument(
         "--teacher_model_type",
-        default="ernie",
+        default="roberta",
         type=str,
         required=True,
         help="Model type selected in the list: " +
@@ -276,28 +276,28 @@ def do_train(args):
     # For student
     model_class, _ = MODEL_CLASSES[args.model_type]
     if args.num_layers == 6:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=6,
             hidden_act='relu',
             intermediate_size=3072,
             hidden_size=768)  # layer: 6
     elif args.num_layers == 4:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=4,
             hidden_act='relu',
             intermediate_size=1024,
             hidden_size=256,
             num_attention_heads=16)  # layer: 4
     else:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=2,
             hidden_act='relu',
             hidden_size=128,
             intermediate_size=512)  # layer: 2
-    student = model_class(ernie)
+    student = model_class(ppminilm)

     teacher = teacher_model_class.from_pretrained(
         args.teacher_model_name_or_path)
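For orientation, here is a hedged sketch of what the 6-layer branch above assembles end to end. In the actual script `vocab_size` comes from the teacher tokenizer, so the literal value below is an assumption.

# Sketch of the 6-layer student built by the branch above; the vocab_size
# literal stands in for tokenizer.vocab_size (21128 assumes a Chinese
# RoBERTa-wwm vocabulary).
from paddlenlp.transformers import PPMiniLMModel, PPMiniLMForSequenceClassification

backbone = PPMiniLMModel(
    vocab_size=21128,        # assumption; the script uses tokenizer.vocab_size
    num_hidden_layers=6,
    hidden_act='relu',
    intermediate_size=3072,
    hidden_size=768)
student = PPMiniLMForSequenceClassification(backbone)  # what model_class(ppminilm) does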
@@ -47,7 +47,7 @@ cp ../../../../paddlenlp/transformers/distill_utils.py ${output_dir}/


 python3 -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" general_distill.py \
-    --model_type ernie \
+    --model_type ppminilm \
     --num_relation_heads ${numH} \
     --teacher_model_type ${teacher} \
     --teacher_layer_index ${teacher_layer_index} \
6 changes: 3 additions & 3 deletions paddlenlp/transformers/distill_utils.py
@@ -21,7 +21,7 @@
 from paddle.fluid.data_feeder import convert_dtype

 from paddlenlp.utils.log import logger
-from paddlenlp.transformers import ErnieForSequenceClassification
+from paddlenlp.transformers import PPMiniLMForSequenceClassification
 from paddlenlp.transformers import TinyBertForPretraining
 from paddlenlp.transformers import BertForSequenceClassification

@@ -208,15 +208,15 @@ def to_distill(self,
     if return_qkv:
         # forward function of student class should be replaced for distributed training.
         TinyBertForPretraining._forward = minilm_pretraining_forward
-        ErnieForSequenceClassification._forward = minilm_pretraining_forward
+        PPMiniLMForSequenceClassification._forward = minilm_pretraining_forward
     else:
         TinyBertForPretraining._forward = tinybert_forward

     def init_func(layer):
         if isinstance(layer, (MultiHeadAttention, TransformerEncoderLayer,
                               TransformerEncoder, TinyBertForPretraining,
                               BertForSequenceClassification,
-                              ErnieForSequenceClassification)):
+                              PPMiniLMForSequenceClassification)):
             layer.forward = layer._forward
         if isinstance(layer, TransformerEncoder):
             layer.return_layer_outputs = return_layer_outputs
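The hunk above relies on a forward-swapping pattern: each class that participates in distillation carries a `_forward` variant that also returns the tensors the loss needs (e.g. Q/K/V when `return_qkv` is set), and `init_func` installs it on every matching sublayer. Below is a hedged, self-contained sketch of that mechanism with generic names; it is not the library's API, only an illustration.

# Self-contained sketch of the forward-swapping mechanism used by to_distill.
# TinyClassifier and to_distill_sketch are illustrative names, not PaddleNLP API.
import paddle
import paddle.nn as nn

class TinyClassifier(nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 2)

    def forward(self, x):
        return self.linear(x)

    def _forward(self, x):
        logits = self.linear(x)
        return logits, x          # also expose an intermediate tensor

def to_distill_sketch(model):
    # Analogue of init_func above: swap in the distillation-aware forward.
    for layer in model.sublayers(include_self=True):
        if isinstance(layer, TinyClassifier):
            layer.forward = layer._forward
    return model

model = to_distill_sketch(TinyClassifier())
logits, hidden = model(paddle.randn([4, 8]))  # forward now returns both tensors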
