Commit a4f8019

Merge pull request PaddlePaddle#49 from LiuChiachi/fix-lstm-distill-sst2-bug
Update usage of tokenizer in glue and distill lstm
2 parents f4d4eac + 920688f commit a4f8019

4 files changed (+9, -23 lines)

examples/glue/run_glue.py

Lines changed: 4 additions & 18 deletions

@@ -264,26 +264,12 @@ def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
     else:
         example = tokenizer(
             example[0], text_pair=example[1], max_seq_len=max_seq_length)
-    '''
-    tokens_raw = [tokenizer(l) for l in example]
-    # Truncate to the truncate_length,
-    tokens_trun = _truncate_seqs(tokens_raw, max_seq_length)
-    # Concate the sequences with special tokens
-    tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
-    tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
-                                          len(tokens_trun))
-    # Convert the token to ids
-    input_ids = tokenizer.convert_tokens_to_ids(tokens)
-    valid_length = len(input_ids)
-    # The mask has 1 for real tokens and 0 for padding tokens. Only real
-    # tokens are attended to.
-    # input_mask = [1] * len(input_ids)
-    '''
+
     if not is_test:
-        return example['input_ids'], example['segment_ids'], len(example[
+        return example['input_ids'], example['token_type_ids'], len(example[
             'input_ids']), label
     else:
-        return example['input_ids'], example['segment_ids'], len(example[
+        return example['input_ids'], example['token_type_ids'], len(example[
             'input_ids'])

@@ -312,7 +298,7 @@ def do_train(args):
         train_dataset, batch_size=args.batch_size, shuffle=True)
     batchify_fn = lambda samples, fn=Tuple(
         Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
-        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
         Stack(),  # length
         Stack(dtype="int64" if train_dataset.get_labels() else "float32")  # label
     ): [data for i, data in enumerate(fn(samples)) if i != 2]
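Note: the substance of this hunk is a key rename. Newer PaddleNLP tokenizers return the sentence-pair segment ids under 'token_type_ids' rather than 'segment_ids', and expose the matching padding value as tokenizer.pad_token_type_id. A minimal sketch of the updated flow, assuming a bert-base-uncased checkpoint and made-up input sentences:

from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode a sentence pair, as convert_example does for two-sentence tasks.
example = tokenizer(
    'The quick brown fox.',
    text_pair='It jumped over the lazy dog.',
    max_seq_len=128)

# The returned dict now carries 'token_type_ids' (formerly 'segment_ids').
input_ids = example['input_ids']
token_type_ids = example['token_type_ids']
valid_length = len(input_ids)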

examples/model_compression/distill_lstm/README.md

Lines changed: 1 addition & 1 deletion

@@ -62,7 +62,7 @@ wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
 cd ../../glue
 export CUDA_VISIBLE_DEVICES=0
 export TASK_NAME=SST-2
-python -u ./run_bert_finetune.py \
+python -u ./run_glue.py \
     --model_type bert \
     --model_name_or_path bert-base-uncased \
     --task_name $TASK_NAME \

examples/model_compression/distill_lstm/data.py

Lines changed: 2 additions & 2 deletions

@@ -282,7 +282,7 @@ def create_distill_loader(task_name,
     if task_name == 'qqp':
         batchify_fn = lambda samples, fn=Tuple(
             Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert input
-            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert segment
+            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # bert segment
             Pad(axis=0, pad_val=pad_val),  # small input_ids
             Stack(dtype="int64"),  # small seq len
             Pad(axis=0, pad_val=pad_val),  # small input_ids
@@ -292,7 +292,7 @@ def create_distill_loader(task_name,
     else:
         batchify_fn = lambda samples, fn=Tuple(
             Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert input
-            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # bert segment
+            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # bert segment
             Pad(axis=0, pad_val=pad_val),  # small input_ids
             Stack(dtype="int64"),  # small seq len
             Stack(dtype="int64")  # small label
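For context on the batchify_fn being patched here: paddlenlp.data.Tuple composes one collate function per field, so each Pad/Stack is applied to the corresponding position of every sample in the batch. A small illustrative sketch (pad values and samples are made up; the field layout mirrors the non-QQP branch above):

from paddlenlp.data import Pad, Stack, Tuple

# Hypothetical pad values standing in for tokenizer.pad_token_id,
# tokenizer.pad_token_type_id and the small model's pad_val.
bert_pad_id, bert_pad_type_id, small_pad_val = 0, 0, 1

batchify_fn = Tuple(
    Pad(axis=0, pad_val=bert_pad_id),       # bert input_ids
    Pad(axis=0, pad_val=bert_pad_type_id),  # bert token_type_ids
    Pad(axis=0, pad_val=small_pad_val),     # small input_ids
    Stack(dtype="int64"),                   # small seq len
    Stack(dtype="int64"),                   # small label
)

# Each sample: (bert_ids, bert_type_ids, small_ids, small_len, label).
samples = [
    ([101, 2023, 102], [0, 0, 0], [5, 7], 2, 1),
    ([101, 102], [0, 0], [9], 1, 0),
]
bert_ids, type_ids, small_ids, lens, labels = batchify_fn(samples)
# Shorter rows are padded to the batch maximum, e.g. bert_ids has shape (2, 3).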

examples/model_compression/distill_lstm/utils.py

Lines changed: 2 additions & 2 deletions

@@ -160,8 +160,8 @@ def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
         is_split_into_words=is_tokenized)

     if not is_test:
-        return example['input_ids'], example['segment_ids'], len(example[
+        return example['input_ids'], example['token_type_ids'], len(example[
             'input_ids']), label

-    return example['input_ids'], example['segment_ids'], len(example[
+    return example['input_ids'], example['token_type_ids'], len(example[
         'input_ids'])
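The same key rename once more. If preprocessing code had to run against both old and new tokenizer versions, a defensive lookup would avoid the KeyError this commit fixes; a hypothetical helper, not part of the patch:

def get_token_type_ids(example):
    # Newer PaddleNLP tokenizers emit 'token_type_ids'; older
    # releases used 'segment_ids'. Fall back for compatibility.
    if 'token_type_ids' in example:
        return example['token_type_ids']
    return example['segment_ids']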
