update model

TanyaZhao · Jul 16, 2020 · ac17256 · ac17256
1 parent 4f7ae09
commit ac17256
Show file tree

Hide file tree

Showing 36 changed files with 31,017 additions and 0 deletions.
diff --git a/configs/bert.json b/configs/bert.json
@@ -0,0 +1,26 @@
+{
+    "bert_frozen": "false",
+    "hidden_size": 768,
+    "hidden_dropout_prob": 0.2,
+    "classifier_sign": "multi_nonlinear",
+    "clip_grad": 1,
+    "bert_config": {
+        "attention_probs_dropout_prob": 0.1,
+        "directionality": "bidi", 
+        "hidden_act": "gelu", 
+        "hidden_dropout_prob": 0.1, 
+        "hidden_size": 768, 
+        "initializer_range": 0.02, 
+        "intermediate_size": 3072, 
+        "max_position_embeddings": 512,
+        "num_attention_heads": 12, 
+        "num_hidden_layers": 12, 
+        "pooler_fc_size": 768, 
+        "pooler_num_attention_heads": 12, 
+        "pooler_num_fc_layers": 3, 
+        "pooler_size_per_head": 128, 
+        "pooler_type": "first_token_transform", 
+        "type_vocab_size": 2, 
+        "vocab_size": 21128
+    }
+}
diff --git a/configs/default.cfg b/configs/default.cfg
@@ -0,0 +1,36 @@
+[Bert]
+bert_model = ../pretrained_bert/bert-base-cased
+bert_config = ../configs/bert.json
+
+[Data]
+data_dir = ../datasets/conll04/mrc4ere
+train_file = %(data_dir)s/train_dev.json
+dev_file = %(data_dir)s/dev.json
+test_file = %(data_dir)s/test.json
+max_seq_length = 200
+max_query_length = 32
+doc_stride = 128
+
+[Save]
+output_dir = ../ckpt/default
+config_file = ../configs/default.cfg
+result_dir = ../log/output_results/
+
+[Run]
+seed = 3306
+task_name = None
+epochs = 10.0
+learning_rate = 5e-5
+checkpoint = 200
+train_batch_size = 8
+dev_batch_size = 8
+test_batch_size = 8
+export_model = True
+do_train = store_true
+do_eval = store_true
+loss_type = ce
+use_cuda = True
+local_rank = -1
+warmup_proportion = 0.1
+gradient_accumulation_steps = 1
+
diff --git a/layers/__init__.py b/layers/__init__.py
@@ -0,0 +1 @@
+
diff --git a/layers/__pycache__/__init__.cpython-36.pyc b/layers/__pycache__/__init__.cpython-36.pyc
diff --git a/layers/__pycache__/__init__.cpython-37.pyc b/layers/__pycache__/__init__.cpython-37.pyc
diff --git a/layers/__pycache__/bert_layernorm.cpython-36.pyc b/layers/__pycache__/bert_layernorm.cpython-36.pyc
diff --git a/layers/__pycache__/bert_layernorm.cpython-37.pyc b/layers/__pycache__/bert_layernorm.cpython-37.pyc
diff --git a/layers/__pycache__/classifier.cpython-36.pyc b/layers/__pycache__/classifier.cpython-36.pyc
diff --git a/layers/__pycache__/classifier.cpython-37.pyc b/layers/__pycache__/classifier.cpython-37.pyc
diff --git a/layers/bert_layernorm.py b/layers/bert_layernorm.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+
+# Author: Xiaoy LI
+# Last update: 2019.04.04
+# First create: 2019.03.29
+# Description:
+# bert_layernorm.py
+
+
+
+import os
+import sys
+import copy
+import json
+import math
+import logging
+import tarfile
+import tempfile
+import shutil
+
+
+root_path = "/".join(os.path.realpath(__file__).split("/")[:-2])
+if root_path not in sys.path:
+    sys.path.insert(0, root_path)
+
+
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+
+
+class BertLayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-12):
+        # construct a layernorm module in the TF style
+        # epsilon inside the square are not
+        super(BertLayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+
+    def forward(self, x):
+        u = x.mean(-1, keepdim=True)
+        s = (x - u).pow(2).mean(-1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        return self.weight * x + self.bias
diff --git a/layers/classifier.py b/layers/classifier.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+
+# Author: Xiaoy LI
+# Last update: 2019.04.02
+# First create: 2019.04.02
+# Description:
+#
+
+
+import os
+import sys
+
+
+
+root_path = "/".join(os.path.realpath(__file__).split("/")[:-2])
+if root_path not in sys.path:
+    sys.path.insert(0, root_path)
+
+
+from models.bert_basic_model import *
+from layers.bert_layernorm import BertLayerNorm
+
+
+class SingleLinearClassifier(nn.Module):
+    def __init__(self, hidden_size, num_label):
+        super(SingleLinearClassifier, self).__init__()
+        self.num_label = num_label
+        self.classifier = nn.Linear(hidden_size, num_label)
+
+    def forward(self, input_features):
+        features_output = self.classifier(input_features)
+
+        return features_output
+
+
+class MultiNonLinearClassifier(nn.Module):
+    def __init__(self, hidden_size, num_label):
+        super(MultiNonLinearClassifier, self).__init__()
+        self.num_label = num_label
+        self.classifier1 = nn.Linear(hidden_size, int(hidden_size / 2))
+        self.classifier2 = nn.Linear(int(hidden_size / 2), num_label)
+
+    def forward(self, input_features):
+        features_output1 = self.classifier1(input_features)
+        features_output1 = nn.ReLU()(features_output1)
+        features_output2 = self.classifier2(features_output1)
+        return features_output2
+
diff --git a/prepare_data/__init__.py b/prepare_data/__init__.py
@@ -0,0 +1 @@
+
diff --git a/prepare_data/__pycache__/__init__.cpython-36.pyc b/prepare_data/__pycache__/__init__.cpython-36.pyc
diff --git a/prepare_data/__pycache__/__init__.cpython-37.pyc b/prepare_data/__pycache__/__init__.cpython-37.pyc
diff --git a/prepare_data/__pycache__/apply_text_norm.cpython-37.pyc b/prepare_data/__pycache__/apply_text_norm.cpython-37.pyc
diff --git a/prepare_data/__pycache__/data_utils.cpython-36.pyc b/prepare_data/__pycache__/data_utils.cpython-36.pyc
diff --git a/prepare_data/__pycache__/data_utils.cpython-37.pyc b/prepare_data/__pycache__/data_utils.cpython-37.pyc
diff --git a/prepare_data/__pycache__/input_features.cpython-37.pyc b/prepare_data/__pycache__/input_features.cpython-37.pyc
diff --git a/prepare_data/__pycache__/mrc_example.cpython-36.pyc b/prepare_data/__pycache__/mrc_example.cpython-36.pyc
diff --git a/prepare_data/__pycache__/mrc_processor.cpython-36.pyc b/prepare_data/__pycache__/mrc_processor.cpython-36.pyc
diff --git a/prepare_data/__pycache__/mrc_utils.cpython-36.pyc b/prepare_data/__pycache__/mrc_utils.cpython-36.pyc
diff --git a/prepare_data/__pycache__/squad_example.cpython-37.pyc b/prepare_data/__pycache__/squad_example.cpython-37.pyc
diff --git a/prepare_data/__pycache__/squad_processor.cpython-37.pyc b/prepare_data/__pycache__/squad_processor.cpython-37.pyc
diff --git a/prepare_data/data_utils.py b/prepare_data/data_utils.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import csv
+import logging
+import argparse
+import random
+import numpy as np
+from tqdm import tqdm, trange
+import torch
+
+root_path = "/".join(os.path.realpath(__file__).split("/")[:-2])
+if root_path not in sys.path:
+    sys.path.insert(0, root_path)
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, \
+    SequentialSampler
+
+class DataProcessor(object):
+    # base class for data converts for sequence classification datasets
+    def get_train_examples(self, data_dir):
+        # get a collection of "InputExample" for the train set
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        # gets a collections of "InputExample" for the dev set
+        raise NotImplementedError()
+
+    def get_labels(self):
+        # gets the list of labels for this data set
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        # reads a tab separated value file.
+        with open(input_file, "r") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                lines.append(line)
+            return lines
+
+def generate_mini_batch_input(all_features, mini_batch_idx, config):
+    batch = [all_features[idx] for idx in mini_batch_idx]
+    input_ids = torch.tensor([[f.input_ids for f in group.input_features] for group in batch], dtype=torch.long)
+    input_mask = torch.tensor([[f.input_mask for f in group.input_features] for group in batch], dtype=torch.long)
+    segment_ids = torch.tensor([[f.segment_ids for f in group.input_features] for group in batch], dtype=torch.long)
+    label_ids = torch.tensor([[f.label_id for f in group.input_features] for group in batch], dtype=torch.long)
+    valid_ids = torch.tensor([[f.valid_id for f in group.input_features] for group in batch], dtype=torch.long)
+    label_mask = torch.tensor([[f.label_mask for f in group.input_features] for group in batch], dtype=torch.long)
+    # label_mask = np.array([[f.label_mask for f in group_f] for group_f in batch_features])
+    input_types = [group.type for group in batch]
+    # entity_types = [[f.entity_type for f in group_f] for group_f in batch]
+    # relations = [[f.relations for f in group_f] for group_f in batch]
+    entity_types = [group.entity_type for group in batch] # batch_size
+    relations = [group.relations for group in batch]
+    doc_tokens = [group.doc_tokens for group in batch]
+
+    input_ids = input_ids.view(-1, config.max_seq_length)  # batch * 3, max_seq_length
+    input_mask = input_mask.view(-1, config.max_seq_length)
+    segment_ids = segment_ids.view(-1, config.max_seq_length)
+    label_ids = label_ids.view(-1, config.max_seq_length)
+    valid_ids = valid_ids.view(-1, config.max_seq_length)
+    label_mask = label_mask.view(-1, config.max_seq_length)
+    # label_mask = np.reshape(label_mask, (-1, config.max_seq_length))
+
+    return input_ids, input_mask, segment_ids, label_ids, valid_ids, label_mask, input_types, entity_types, relations, doc_tokens
+
diff --git a/prepare_data/mrc_example.py b/prepare_data/mrc_example.py
@@ -0,0 +1,72 @@
+class InputExample(object):
+    # a single training / test example for simple sequence classification
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """
+        Construct s input Example.
+        Args:
+            guid: unqiue id for the example.
+            text_a: string, the untokenzied text of the first seq. for single sequence
+                tasks, only this sequction msut be specified.
+            text_b: (Optional) string, the untokenized text of the second sequence.
+            label: (Optional) string, the label of the example, This should be specifi
+                for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+class GroupFeature(object):
+    # a single set of features of data
+    def __init__(self, doc_tokens, q_type, entity_type, relations, input_features):
+        self.doc_tokens = doc_tokens
+        self.type = q_type
+        self.entity_type = entity_type
+        self.relations = relations
+        self.input_features = input_features
+
+
+class InputFeature(object):
+    # a single set of features of data
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, label_mask, valid_id):
+        self.input_ids = input_ids # ques_i + doc_token
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+        self.label_mask = label_mask
+        self.valid_id = valid_id
+
+
+class MRCExample(object):
+    """A single training/test example for the Squad dataset."""
+
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 doc_tokens,
+                 entity_type,
+                 q_type,
+                 relations,
+                 label=None,
+                 orig_answer_text=None,
+                 start_position=None,
+                 end_position=None):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.doc_tokens = doc_tokens
+        self.label = label
+        self.q_type = q_type
+        self.entity_type = entity_type
+        self.relations = relations
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        s = ""
+        s += "qas_id: %s" % (self.qas_id)
+        s += ", question_text: %s" % (
+            self.question_text)
+        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
+        s += ", label: [%s]" % (" ".join(self.label))
+        return s
diff --git a/prepare_data/mrc_processor.py b/prepare_data/mrc_processor.py
@@ -0,0 +1,58 @@
+import os
+import sys
+import csv
+
+from .mrc_utils import *
+
+class DataProcessor(object):
+    # base class for data converts for sequence classification datasets
+    def get_train_examples(self, data_dir):
+        # get a collection of "InputExample" for the train set
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        # gets a collections of "InputExample" for the dev set
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        # reads a tab separated value file.
+        with open(input_file, "r") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                lines.append(line)
+            return lines
+
+class MRCProcessor(DataProcessor):
+
+    def get_train_examples(self, data_dir):
+        train_examples = read_squad_examples(data_dir, is_training=True)
+        return train_examples
+
+    def get_dev_examples(self, data_dir):
+        dev_examples = read_squad_examples(data_dir, is_training=False)
+        return dev_examples
+
+    def get_test_examples(self, data_dir):
+        test_examples = read_squad_examples(data_dir, is_training=False)
+        return test_examples
+
+    def get_labels(self, datasets):
+        label_list = ['[CLS]','[SEP]']
+        for dataset in datasets:
+            for example in dataset:
+                for tmp in list(set(example.label)):
+                    if tmp not in label_list:
+                        label_list.append(tmp)
+
+        return label_list
+
+    def get_entity_types(self, datasets="conll04"):
+        return ["loc", "peop", "org", "other"]
+
+
+
+
+
+