Skip to content

Commit

Permalink
update model
Browse files Browse the repository at this point in the history
  • Loading branch information
TanyaZhao committed Jul 16, 2020
1 parent 4f7ae09 commit ac17256
Show file tree
Hide file tree
Showing 36 changed files with 31,017 additions and 0 deletions.
26 changes: 26 additions & 0 deletions configs/bert.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"bert_frozen": "false",
"hidden_size": 768,
"hidden_dropout_prob": 0.2,
"classifier_sign": "multi_nonlinear",
"clip_grad": 1,
"bert_config": {
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"type_vocab_size": 2,
"vocab_size": 21128
}
}
36 changes: 36 additions & 0 deletions configs/default.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
[Bert]
bert_model = ../pretrained_bert/bert-base-cased
bert_config = ../configs/bert.json

[Data]
data_dir = ../datasets/conll04/mrc4ere
train_file = %(data_dir)s/train_dev.json
dev_file = %(data_dir)s/dev.json
test_file = %(data_dir)s/test.json
max_seq_length = 200
max_query_length = 32
doc_stride = 128

[Save]
output_dir = ../ckpt/default
config_file = ../configs/default.cfg
result_dir = ../log/output_results/

[Run]
seed = 3306
task_name = None
epochs = 10.0
learning_rate = 5e-5
checkpoint = 200
train_batch_size = 8
dev_batch_size = 8
test_batch_size = 8
export_model = True
do_train = store_true
do_eval = store_true
loss_type = ce
use_cuda = True
local_rank = -1
warmup_proportion = 0.1
gradient_accumulation_steps = 1

1 change: 1 addition & 0 deletions layers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file added layers/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added layers/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added layers/__pycache__/bert_layernorm.cpython-36.pyc
Binary file not shown.
Binary file added layers/__pycache__/bert_layernorm.cpython-37.pyc
Binary file not shown.
Binary file added layers/__pycache__/classifier.cpython-36.pyc
Binary file not shown.
Binary file added layers/__pycache__/classifier.cpython-37.pyc
Binary file not shown.
51 changes: 51 additions & 0 deletions layers/bert_layernorm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-



# Author: Xiaoy LI
# Last update: 2019.04.04
# First create: 2019.03.29
# Description:
# bert_layernorm.py



import os
import sys
import copy
import json
import math
import logging
import tarfile
import tempfile
import shutil


# Put the parent of this file's directory on sys.path so that sibling
# packages can be imported with absolute names.  os.path.dirname is used
# instead of splitting on "/" so the result is also correct on platforms
# whose path separator is not "/".
root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
if root_path not in sys.path:
    sys.path.insert(0, root_path)



import torch
from torch import nn
from torch.nn import CrossEntropyLoss



class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension, written in the TF style.

    The epsilon is added to the variance *inside* the square root.

    Args:
        hidden_size: size of the learnable ``weight`` (initialised to ones)
            and ``bias`` (initialised to zeros) vectors.
        eps: value added to the variance before taking the square root.
    """

    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance: the mean of the squared deviations.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias
51 changes: 51 additions & 0 deletions layers/classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-



# Author: Xiaoy LI
# Last update: 2019.04.02
# First create: 2019.04.02
# Description:
#


import os
import sys



# Put the parent of this file's directory on sys.path so that the absolute
# imports that follow can be resolved.  os.path.dirname is used instead of
# splitting on "/" so the result is also correct on platforms whose path
# separator is not "/".
root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
if root_path not in sys.path:
    sys.path.insert(0, root_path)


from models.bert_basic_model import *
from layers.bert_layernorm import BertLayerNorm


class SingleLinearClassifier(nn.Module):
    """Classification head made of a single linear projection.

    Args:
        hidden_size: size of the last dimension of the input features.
        num_label: number of output scores produced per input vector.
    """

    def __init__(self, hidden_size, num_label):
        super().__init__()
        self.num_label = num_label
        self.classifier = nn.Linear(hidden_size, num_label)

    def forward(self, input_features):
        """Project ``input_features`` to ``num_label`` scores."""
        return self.classifier(input_features)


class MultiNonLinearClassifier(nn.Module):
    """Two-layer classification head: linear -> ReLU -> linear.

    The intermediate layer has ``int(hidden_size / 2)`` units.

    Args:
        hidden_size: size of the last dimension of the input features.
        num_label: number of output scores produced per input vector.
    """

    def __init__(self, hidden_size, num_label):
        super().__init__()
        self.num_label = num_label
        bottleneck = int(hidden_size / 2)
        self.classifier1 = nn.Linear(hidden_size, bottleneck)
        self.classifier2 = nn.Linear(bottleneck, num_label)

    def forward(self, input_features):
        """Return the ``num_label`` scores for ``input_features``."""
        hidden = nn.functional.relu(self.classifier1(input_features))
        return self.classifier2(hidden)

1 change: 1 addition & 0 deletions prepare_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file added prepare_data/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added prepare_data/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added prepare_data/__pycache__/mrc_utils.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
70 changes: 70 additions & 0 deletions prepare_data/data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import csv
import logging
import argparse
import random
import numpy as np
from tqdm import tqdm, trange
import torch

# Put the parent of this file's directory on sys.path so that sibling
# packages can be imported with absolute names.  os.path.dirname is used
# instead of splitting on "/" so the result is also correct on platforms
# whose path separator is not "/".
root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
if root_path not in sys.path:
    sys.path.insert(0, root_path)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, \
SequentialSampler

class DataProcessor(object):
    """Base class for data converters for sequence classification datasets.

    Subclasses are expected to override the ``get_*`` methods; the versions
    here only raise ``NotImplementedError``.
    """

    def get_train_examples(self, data_dir):
        """Return a collection of ``InputExample`` for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return a collection of ``InputExample`` for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab-separated file and return its rows.

        Args:
            input_file: path of the file to read.
            quotechar: forwarded unchanged to ``csv.reader``.

        Returns:
            A list with one list of string fields per row.
        """
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding, and pass newline="" so the csv module
        # does its own line-ending handling (as its documentation requires).
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

def generate_mini_batch_input(all_features, mini_batch_idx, config):
    """Assemble the tensors and metadata for one mini-batch.

    Args:
        all_features: indexable collection of group objects. Each group
            exposes ``input_features`` (an iterable of objects carrying
            ``input_ids``, ``input_mask``, ``segment_ids``, ``label_id``,
            ``valid_id`` and ``label_mask``) plus ``type``, ``entity_type``,
            ``relations`` and ``doc_tokens``.
        mini_batch_idx: indices into ``all_features`` selecting the batch.
        config: object providing ``max_seq_length``.

    Returns:
        A 10-tuple ``(input_ids, input_mask, segment_ids, label_ids,
        valid_ids, label_mask, input_types, entity_types, relations,
        doc_tokens)``. The first six are ``torch.long`` tensors reshaped to
        ``(-1, config.max_seq_length)``, so the features of all groups are
        flattened into rows. The last four are plain lists with one entry
        per selected group.
    """
    selected = [all_features[i] for i in mini_batch_idx]
    seq_len = config.max_seq_length

    def flatten(attr):
        # Gather `attr` from every feature of every group, then fold the
        # group dimension into the row dimension.
        nested = [
            [getattr(feature, attr) for feature in group.input_features]
            for group in selected
        ]
        return torch.tensor(nested, dtype=torch.long).view(-1, seq_len)

    input_ids = flatten("input_ids")
    input_mask = flatten("input_mask")
    segment_ids = flatten("segment_ids")
    label_ids = flatten("label_id")
    valid_ids = flatten("valid_id")
    label_mask = flatten("label_mask")

    # Per-group metadata, one entry per selected group.
    input_types = [group.type for group in selected]
    entity_types = [group.entity_type for group in selected]
    relations = [group.relations for group in selected]
    doc_tokens = [group.doc_tokens for group in selected]

    return (
        input_ids,
        input_mask,
        segment_ids,
        label_ids,
        valid_ids,
        label_mask,
        input_types,
        entity_types,
        relations,
        doc_tokens,
    )

72 changes: 72 additions & 0 deletions prepare_data/mrc_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
class InputExample:
    """A single training / test example for simple sequence classification.

    Args:
        guid: unique id for the example.
        text_a: string, the untokenized text of the first sequence. For
            single-sequence tasks, only this sequence must be specified.
        text_b: (Optional) string, the untokenized text of the second
            sequence.
        label: (Optional) string, the label of the example. This should be
            specified for train and dev examples, but not for test examples.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

class GroupFeature:
    """A group of input features together with the metadata they share.

    Note that the ``q_type`` argument is stored under the attribute name
    ``type``; every other argument is stored under its own name.
    """

    def __init__(self, doc_tokens, q_type, entity_type, relations, input_features):
        self.doc_tokens, self.type = doc_tokens, q_type
        self.entity_type, self.relations = entity_type, relations
        self.input_features = input_features


class InputFeature:
    """A single set of features of data.

    Every constructor argument is stored unchanged under its own name.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, label_mask, valid_id):
        # Original author note on input_ids: ques_i + doc_token
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.label_mask, self.valid_id = label_mask, valid_id


class MRCExample(object):
    """A single training/test example for the Squad dataset.

    Args:
        qas_id: identifier of the example.
        question_text: text of the question.
        doc_tokens: iterable of token strings (joined with spaces in the
            textual representation).
        entity_type: stored unchanged as ``entity_type``.
        q_type: stored unchanged as ``q_type``.
        relations: stored unchanged as ``relations``.
        label: (Optional) iterable of label strings, or ``None``.
        orig_answer_text, start_position, end_position: accepted so the
            signature stays compatible with existing callers, but currently
            ignored; they are not stored on the instance.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 entity_type,
                 q_type,
                 relations,
                 label=None,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.label = label
        self.q_type = q_type
        self.entity_type = entity_type
        self.relations = relations

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        # `label` defaults to None (for unlabeled examples); render it as an
        # empty list instead of failing inside str.join.
        label_text = "" if self.label is None else " ".join(self.label)
        doc_text = " ".join(self.doc_tokens)
        return (
            f"qas_id: {self.qas_id}"
            f", question_text: {self.question_text}"
            f", doc_tokens: [{doc_text}]"
            f", label: [{label_text}]"
        )
58 changes: 58 additions & 0 deletions prepare_data/mrc_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
import sys
import csv

from .mrc_utils import *

class DataProcessor(object):
    """Base class for data converters for sequence classification datasets.

    Subclasses are expected to override the ``get_*`` methods; the versions
    here only raise ``NotImplementedError``.
    """

    def get_train_examples(self, data_dir):
        """Return a collection of ``InputExample`` for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return a collection of ``InputExample`` for the dev set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab-separated file and return its rows.

        Args:
            input_file: path of the file to read.
            quotechar: forwarded unchanged to ``csv.reader``.

        Returns:
            A list with one list of string fields per row.
        """
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding, and pass newline="" so the csv module
        # does its own line-ending handling (as its documentation requires).
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

class MRCProcessor(DataProcessor):
    """Processor that loads examples through ``read_squad_examples``."""

    def get_train_examples(self, data_dir):
        """Return ``read_squad_examples(data_dir, is_training=True)``."""
        return read_squad_examples(data_dir, is_training=True)

    def get_dev_examples(self, data_dir):
        """Return ``read_squad_examples(data_dir, is_training=False)``."""
        return read_squad_examples(data_dir, is_training=False)

    def get_test_examples(self, data_dir):
        """Return ``read_squad_examples(data_dir, is_training=False)``."""
        return read_squad_examples(data_dir, is_training=False)

    def get_labels(self, datasets):
        """Collect the distinct labels used by the examples in ``datasets``.

        Args:
            datasets: iterable of datasets, each an iterable of examples
                exposing a ``label`` attribute (an iterable of hashable
                labels, or ``None``).

        Returns:
            A list starting with ``'[CLS]'`` and ``'[SEP]'``, followed by
            every other label in the order it is first encountered.
        """
        label_list = ['[CLS]', '[SEP]']
        seen = set(label_list)
        for dataset in datasets:
            for example in dataset:
                if example.label is None:
                    # Unlabeled example: contributes no labels.
                    continue
                # Walk the labels in their original order instead of through
                # set(...): set iteration order of strings changes between
                # interpreter runs (hash randomization), which would make
                # the label -> index mapping differ from run to run.
                for tag in example.label:
                    if tag not in seen:
                        seen.add(tag)
                        label_list.append(tag)
        return label_list

    def get_entity_types(self, datasets="conll04"):
        """Return the fixed list of entity types.

        The ``datasets`` argument is accepted but not used: the same list is
        returned whatever its value.
        """
        return ["loc", "peop", "org", "other"]






Loading

0 comments on commit ac17256

Please sign in to comment.