-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
36 changed files
with
31,017 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{ | ||
"bert_frozen": "false", | ||
"hidden_size": 768, | ||
"hidden_dropout_prob": 0.2, | ||
"classifier_sign": "multi_nonlinear", | ||
"clip_grad": 1, | ||
"bert_config": { | ||
"attention_probs_dropout_prob": 0.1, | ||
"directionality": "bidi", | ||
"hidden_act": "gelu", | ||
"hidden_dropout_prob": 0.1, | ||
"hidden_size": 768, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 3072, | ||
"max_position_embeddings": 512, | ||
"num_attention_heads": 12, | ||
"num_hidden_layers": 12, | ||
"pooler_fc_size": 768, | ||
"pooler_num_attention_heads": 12, | ||
"pooler_num_fc_layers": 3, | ||
"pooler_size_per_head": 128, | ||
"pooler_type": "first_token_transform", | ||
"type_vocab_size": 2, | ||
"vocab_size": 21128 | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
[Bert] | ||
bert_model = ../pretrained_bert/bert-base-cased | ||
bert_config = ../configs/bert.json | ||
|
||
[Data] | ||
data_dir = ../datasets/conll04/mrc4ere | ||
train_file = %(data_dir)s/train_dev.json | ||
dev_file = %(data_dir)s/dev.json | ||
test_file = %(data_dir)s/test.json | ||
max_seq_length = 200 | ||
max_query_length = 32 | ||
doc_stride = 128 | ||
|
||
[Save] | ||
output_dir = ../ckpt/default | ||
config_file = ../configs/default.cfg | ||
result_dir = ../log/output_results/ | ||
|
||
[Run] | ||
seed = 3306 | ||
task_name = None | ||
epochs = 10.0 | ||
learning_rate = 5e-5 | ||
checkpoint = 200 | ||
train_batch_size = 8 | ||
dev_batch_size = 8 | ||
test_batch_size = 8 | ||
export_model = True | ||
do_train = store_true | ||
do_eval = store_true | ||
loss_type = ce | ||
use_cuda = True | ||
local_rank = -1 | ||
warmup_proportion = 0.1 | ||
gradient_accumulation_steps = 1 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
|
||
# Author: Xiaoy LI | ||
# Last update: 2019.04.04 | ||
# First create: 2019.03.29 | ||
# Description: | ||
# bert_layernorm.py | ||
|
||
|
||
|
||
import os | ||
import sys | ||
import copy | ||
import json | ||
import math | ||
import logging | ||
import tarfile | ||
import tempfile | ||
import shutil | ||
|
||
|
||
root_path = "/".join(os.path.realpath(__file__).split("/")[:-2]) | ||
if root_path not in sys.path: | ||
sys.path.insert(0, root_path) | ||
|
||
|
||
|
||
import torch | ||
from torch import nn | ||
from torch.nn import CrossEntropyLoss | ||
|
||
|
||
|
||
class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension, written in the TF style.

    The epsilon is added to the variance inside the square root.
    """

    def __init__(self, hidden_size, eps=1e-12):
        """Create the learnable scale and shift vectors.

        Args:
            hidden_size: length of the ``weight`` (ones) and ``bias`` (zeros)
                parameter vectors.
            eps: constant added to the variance before taking the square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it.

        Args:
            x: input tensor; statistics are taken over dimension ``-1``.

        Returns:
            ``weight * normalized + bias``, where ``normalized`` has the
            last-dimension mean removed and is divided by
            ``sqrt(variance + eps)``.
        """
        mean = x.mean(-1, keepdim=True)
        # Biased variance: the mean of the squared deviations.
        variance = (x - mean).pow(2).mean(-1, keepdim=True)
        denominator = torch.sqrt(variance + self.variance_epsilon)
        normalized = (x - mean) / denominator
        return self.weight * normalized + self.bias
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
|
||
|
||
# Author: Xiaoy LI | ||
# Last update: 2019.04.02 | ||
# First create: 2019.04.02 | ||
# Description: | ||
# | ||
|
||
|
||
import os | ||
import sys | ||
|
||
|
||
|
||
root_path = "/".join(os.path.realpath(__file__).split("/")[:-2]) | ||
if root_path not in sys.path: | ||
sys.path.insert(0, root_path) | ||
|
||
|
||
from models.bert_basic_model import * | ||
from layers.bert_layernorm import BertLayerNorm | ||
|
||
|
||
class SingleLinearClassifier(nn.Module):
    """Classification head made of a single linear projection."""

    def __init__(self, hidden_size, num_label):
        """Build the projection from ``hidden_size`` features to ``num_label`` outputs.

        Args:
            hidden_size: size of the last dimension of the input features.
            num_label: number of output units; also stored as ``self.num_label``.
        """
        super().__init__()
        self.num_label = num_label
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_label)

    def forward(self, input_features):
        """Return the linear projection of ``input_features``."""
        return self.classifier(input_features)
|
||
|
||
class MultiNonLinearClassifier(nn.Module):
    """Two-layer classification head: linear -> ReLU -> linear."""

    def __init__(self, hidden_size, num_label):
        """Build the two projections.

        Args:
            hidden_size: size of the last dimension of the input features.
            num_label: number of output units; also stored as ``self.num_label``.
        """
        super(MultiNonLinearClassifier, self).__init__()
        # The intermediate width is half the input width, truncated to an int.
        half_size = int(hidden_size / 2)
        self.num_label = num_label
        self.classifier1 = nn.Linear(hidden_size, half_size)
        self.classifier2 = nn.Linear(half_size, num_label)

    def forward(self, input_features):
        """Project to the half-width space, apply ReLU, then project to the labels.

        Args:
            input_features: tensor whose last dimension equals ``hidden_size``.

        Returns:
            Tensor whose last dimension equals ``num_label``.
        """
        # ReLU has no state, so use the functional form rather than building
        # a fresh nn.ReLU module on every forward pass.
        hidden = nn.functional.relu(self.classifier1(input_features))
        return self.classifier2(hidden)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
import sys | ||
import csv | ||
import logging | ||
import argparse | ||
import random | ||
import numpy as np | ||
from tqdm import tqdm, trange | ||
import torch | ||
|
||
root_path = "/".join(os.path.realpath(__file__).split("/")[:-2]) | ||
if root_path not in sys.path: | ||
sys.path.insert(0, root_path) | ||
|
||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, \ | ||
SequentialSampler | ||
|
||
class DataProcessor(object):
    """Base class for data converters for sequence classification datasets."""

    def get_train_examples(self, data_dir):
        """Return the collection of "InputExample" for the train set.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return the collection of "InputExample" for the dev set.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of labels for this data set.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab separated value file.

        Args:
            input_file: path of the file to open in text mode.
            quotechar: forwarded unchanged to ``csv.reader``.

        Returns:
            A list with one list of fields per row of the file.
        """
        with open(input_file, "r") as handle:
            rows = list(csv.reader(handle, delimiter="\t", quotechar=quotechar))
        return rows
|
||
def generate_mini_batch_input(all_features, mini_batch_idx, config):
    """Assemble the tensors and metadata lists for one mini batch.

    Args:
        all_features: indexable collection of group objects. Each group
            exposes ``input_features`` (an iterable of per-input feature
            objects) plus ``type``, ``entity_type``, ``relations`` and
            ``doc_tokens``.
        mini_batch_idx: indices into ``all_features`` selecting the batch.
        config: object providing ``max_seq_length``.

    Returns:
        A 10-tuple ``(input_ids, input_mask, segment_ids, label_ids,
        valid_ids, label_mask, input_types, entity_types, relations,
        doc_tokens)``. The first six are ``torch.long`` tensors reshaped to
        ``(-1, config.max_seq_length)``; the last four are plain lists with
        one entry per selected group.
    """
    batch = [all_features[idx] for idx in mini_batch_idx]
    seq_len = config.max_seq_length

    def flatten(attr_name):
        # Collect one attribute for every feature of every group, then fold
        # the group axis into the row axis.
        # NOTE(review): the original comment gave the row count as
        # "batch * 3" - presumably three inputs per group; confirm with caller.
        nested = [
            [getattr(feature, attr_name) for feature in group.input_features]
            for group in batch
        ]
        return torch.tensor(nested, dtype=torch.long).view(-1, seq_len)

    input_ids = flatten("input_ids")
    input_mask = flatten("input_mask")
    segment_ids = flatten("segment_ids")
    label_ids = flatten("label_id")
    valid_ids = flatten("valid_id")
    label_mask = flatten("label_mask")

    # Group-level metadata stays as Python lists (one entry per group).
    input_types = [group.type for group in batch]
    entity_types = [group.entity_type for group in batch]
    relations = [group.relations for group in batch]
    doc_tokens = [group.doc_tokens for group in batch]

    return (
        input_ids,
        input_mask,
        segment_ids,
        label_ids,
        valid_ids,
        label_mask,
        input_types,
        entity_types,
        relations,
        doc_tokens,
    )
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
class InputExample(object):
    """A single training / test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Construct an input example.

        Args:
            guid: unique id for the example.
            text_a: string, the untokenized text of the first sequence. For
                single sequence tasks, only this sequence must be specified.
            text_b: (Optional) string, the untokenized text of the second
                sequence.
            label: (Optional) string, the label of the example. This should
                be specified for train and dev examples, but not for test
                examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label
|
||
class GroupFeature(object):
    """A single set of features of data, grouped under one document."""

    def __init__(self, doc_tokens, q_type, entity_type, relations, input_features):
        """Store the group-level fields.

        Args:
            doc_tokens: stored as ``self.doc_tokens``.
            q_type: stored under the attribute name ``type``.
            entity_type: stored as ``self.entity_type``.
            relations: stored as ``self.relations``.
            input_features: stored as ``self.input_features``.
        """
        self.doc_tokens, self.type = doc_tokens, q_type
        self.entity_type, self.relations = entity_type, relations
        self.input_features = input_features
|
||
|
||
class InputFeature(object):
    """A single set of features of data for one model input."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, label_mask, valid_id):
        """Store the per-input fields unchanged.

        Args:
            input_ids: stored as ``self.input_ids`` (original note:
                "ques_i + doc_token").
            input_mask: stored as ``self.input_mask``.
            segment_ids: stored as ``self.segment_ids``.
            label_id: stored as ``self.label_id``.
            label_mask: stored as ``self.label_mask``.
            valid_id: stored as ``self.valid_id``.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.label_mask, self.valid_id = label_mask, valid_id
|
||
|
||
class MRCExample(object):
    """A single training/test example for the Squad dataset."""

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 entity_type,
                 q_type,
                 relations,
                 label=None,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        """Store the fields of one example.

        Args:
            qas_id: identifier of the example.
            question_text: text of the question.
            doc_tokens: sequence of string tokens of the document.
            entity_type: stored as ``self.entity_type``.
            q_type: stored as ``self.q_type``.
            relations: stored as ``self.relations``.
            label: (Optional) sequence of string labels.
            orig_answer_text: (Optional) stored as ``self.orig_answer_text``.
            start_position: (Optional) stored as ``self.start_position``.
            end_position: (Optional) stored as ``self.end_position``.
        """
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.label = label
        self.q_type = q_type
        self.entity_type = entity_type
        self.relations = relations
        # Keep the optional answer fields instead of dropping them silently.
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Return a one-line summary of the id, question, tokens and labels."""
        # label defaults to None; joining None would raise TypeError, so an
        # unlabeled example is rendered with an empty label list.
        labels = self.label if self.label is not None else []
        parts = [
            "qas_id: %s" % (self.qas_id),
            "question_text: %s" % (self.question_text),
            "doc_tokens: [%s]" % (" ".join(self.doc_tokens)),
            "label: [%s]" % (" ".join(labels)),
        ]
        return ", ".join(parts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import os | ||
import sys | ||
import csv | ||
|
||
from .mrc_utils import * | ||
|
||
class DataProcessor(object):
    """Base class for data converters for sequence classification datasets."""

    def get_train_examples(self, data_dir):
        """Return the collection of "InputExample" for the train set.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return the collection of "InputExample" for the dev set.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab separated value file.

        Args:
            input_file: path of the file to open in text mode.
            quotechar: forwarded unchanged to ``csv.reader``.

        Returns:
            A list with one list of fields per row of the file.
        """
        with open(input_file, "r") as tsv_file:
            parsed = csv.reader(tsv_file, delimiter="\t", quotechar=quotechar)
            return list(parsed)
|
||
class MRCProcessor(DataProcessor):
    """Data processor that loads examples through ``read_squad_examples``."""

    def get_train_examples(self, data_dir):
        """Read the examples at ``data_dir`` with ``is_training=True``."""
        train_examples = read_squad_examples(data_dir, is_training=True)
        return train_examples

    def get_dev_examples(self, data_dir):
        """Read the examples at ``data_dir`` with ``is_training=False``."""
        dev_examples = read_squad_examples(data_dir, is_training=False)
        return dev_examples

    def get_test_examples(self, data_dir):
        """Read the examples at ``data_dir`` with ``is_training=False``."""
        test_examples = read_squad_examples(data_dir, is_training=False)
        return test_examples

    def get_labels(self, datasets):
        """Collect the distinct labels found in ``datasets``.

        Args:
            datasets: iterable of datasets; each dataset is an iterable of
                examples exposing an iterable ``label`` attribute.

        Returns:
            A list starting with ``'[CLS]'`` and ``'[SEP]'`` followed by every
            other label in the order it is first encountered.
        """
        label_list = ['[CLS]', '[SEP]']
        # Walk the labels in their original order rather than through a
        # temporary set: set iteration order for strings varies between
        # processes (hash randomization), which would make the label order
        # differ from run to run.
        seen = set(label_list)
        for dataset in datasets:
            for example in dataset:
                for tag in example.label:
                    if tag not in seen:
                        seen.add(tag)
                        label_list.append(tag)

        return label_list

    def get_entity_types(self, datasets="conll04"):
        """Return the fixed entity type list; ``datasets`` is not consulted."""
        return ["loc", "peop", "org", "other"]
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.