added crf, f1 score 0.65
hemingkx committed Nov 17, 2020
1 parent d3bd8b3 commit 77cd904
Showing 5 changed files with 285 additions and 4 deletions.
25 changes: 25 additions & 0 deletions calculate.py
@@ -1,3 +1,6 @@
import torch


def find_entities(xs, ys, id2word, id2label, label_type, res=[]):
"""get entities in one sentence x with label y"""
entity = []
@@ -56,3 +59,25 @@ def f1_score(data_loader, id2word, id2label, model, device):
return (2 * acc * recall) / (acc + recall)
else:
return 0


def f1_crf_score(data_loader, id2word, id2label, model, device):
entity_pred = []
entity_label = []
for idx, batch_samples in enumerate(data_loader):
sentences, labels, lens = batch_samples
sentences = sentences.to(device)
labels = labels.to(device)
_, label_pred = model.forward(sentences)
label_pred = torch.tensor(label_pred, dtype=torch.long)
# the CRF output is already decoded (no softmax conversion needed), so use the "label" type
entity_pred = find_entities(sentences, label_pred, id2word, id2label, "label", entity_pred)
entity_label = find_entities(sentences, labels, id2word, id2label, "label", entity_label)
entity_right = [i for i in entity_pred if i in entity_label]
print("entity_pred: ", len(entity_pred), "entity_label: ", len(entity_label), "entity_right: ", len(entity_right))
if len(entity_right) != 0:
acc = float(len(entity_right)) / len(entity_pred)
recall = float(len(entity_right)) / len(entity_label)
return (2 * acc * recall) / (acc + recall)
else:
return 0
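
A note on the metric: the score above is entity-level micro F1, where a predicted entity only counts as correct if the identical entity appears in the gold list. A minimal self-contained sketch with hypothetical entity tuples (the exact tuple layout that find_entities builds may differ):

# Hypothetical (sentence_id, entity_text, entity_type) tuples -- illustrative only.
entity_label = [(0, "北京", "LOC"), (0, "张三", "PER"), (1, "上海", "LOC")]
entity_pred = [(0, "北京", "LOC"), (1, "上海", "ORG")]
entity_right = [e for e in entity_pred if e in entity_label]  # true positives
precision = len(entity_right) / len(entity_pred)              # 1 / 2 = 0.5
recall = len(entity_right) / len(entity_label)                # 1 / 3 ≈ 0.33
f1 = 2 * precision * recall / (precision + recall)            # 0.4
print(precision, recall, f1)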
2 changes: 1 addition & 1 deletion config.py
@@ -17,7 +17,7 @@
lr = 0.001
betas = (0.9, 0.999)
epochs = 10
gpu = '2'
gpu = '3'

label2id = {
"O": 0,
158 changes: 158 additions & 0 deletions crf.py
@@ -0,0 +1,158 @@
import time
import torch
import torch.nn as nn

START_TAG = "<START>"
STOP_TAG = "<STOP>"


def argmax(vec):
# return the argmax as a python int
_, idx = torch.max(vec, 1)
return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
max_score = vec[0, argmax(vec)]
max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
return max_score + \
torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


class BiLSTM_CRF_MODIFY_PARALLEL(nn.Module):

def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, device):
super(BiLSTM_CRF_MODIFY_PARALLEL, self).__init__()
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.vocab_size = vocab_size
self.tag_to_ix = tag_to_ix
# equal to vocab.label_size
self.tagset_size = len(tag_to_ix)
self.device = device
self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim,
num_layers=2, bidirectional=True, batch_first=True)

# Maps the output of the LSTM into tag space.
self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)

# Matrix of transition parameters. Entry i,j is the score of
# transitioning *to* i *from* j.
# registered as a Parameter so it is trained and moved by model.to(device)
self.transitions = nn.Parameter(
torch.randn(self.tagset_size, self.tagset_size))

# These two statements enforce the constraint that we never transfer
# to the start tag and we never transfer from the stop tag

self.transitions.data[tag_to_ix[START_TAG], :] = -10000
self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
self.hidden = self.init_hidden()

def init_hidden(self):
return (torch.randn(2, 1, self.hidden_dim),
torch.randn(2, 1, self.hidden_dim))

def _forward_alg_new_parallel(self, feats):
# Do the forward algorithm to compute the partition function
init_alphas = torch.full([feats.shape[0], self.tagset_size], -10000.).to(self.device)
# START_TAG has all of the score.
init_alphas[:, self.tag_to_ix[START_TAG]] = 0.

# Wrap in a variable so that we will get automatic backprop
# Iterate through the sentence
forward_var_list = []
forward_var_list.append(init_alphas)
for feat_index in range(feats.shape[1]): # -1
gamar_r_l = torch.stack([forward_var_list[feat_index]] * feats.shape[2]).transpose(0, 1).to(self.device)
t_r1_k = torch.unsqueeze(feats[:, feat_index, :], 1).transpose(1, 2).to(self.device) # +1
aa = gamar_r_l + t_r1_k + torch.unsqueeze(self.transitions, 0)
forward_var_list.append(torch.logsumexp(aa, dim=2))
terminal_var = forward_var_list[-1] + self.transitions[self.tag_to_ix[STOP_TAG]].repeat([feats.shape[0], 1])
alpha = torch.logsumexp(terminal_var, dim=1)
return alpha

def _get_lstm_features_parallel(self, sentence):
self.hidden = self.init_hidden()
embeds = self.word_embeds(sentence)
lstm_out, self.hidden = self.lstm(embeds)
lstm_feats = self.hidden2tag(lstm_out)
return lstm_feats

def _score_sentence_parallel(self, feats, tags):
# Gives the score of provided tag sequences

score = torch.zeros(tags.shape[0]).to(self.device)
tags = torch.cat([torch.full([tags.shape[0], 1],
self.tag_to_ix[START_TAG]).long().to(self.device), tags.to(self.device)], dim=1).to(self.device)
for i in range(feats.shape[1]):
feat = feats[:, i, :]
score = score + \
self.transitions[tags[:, i + 1], tags[:, i]] + feat[range(feat.shape[0]), tags[:, i + 1]].to(self.device)
score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[:, -1]]
return score

def _viterbi_decode_new(self, feats):
backpointers = []
# Initialize the viterbi variables in log space
init_vvars = torch.full((1, self.tagset_size), -10000.).to(self.device)
init_vvars[0][self.tag_to_ix[START_TAG]] = 0

# forward_var at step i holds the viterbi variables for step i-1
forward_var_list = []
forward_var_list.append(init_vvars)
# feats.shape: [11, 6]
for feat_index in range(feats.shape[0]):
gamar_r_l = torch.stack([forward_var_list[feat_index]] * feats.shape[1]).to(self.device)
gamar_r_l = torch.squeeze(gamar_r_l)
next_tag_var = gamar_r_l + self.transitions
# bptrs_t=torch.argmax(next_tag_var,dim=0)
viterbivars_t, bptrs_t = torch.max(next_tag_var, dim=1)

t_r1_k = torch.unsqueeze(feats[feat_index], 0).to(self.device)
forward_var_new = torch.unsqueeze(viterbivars_t, 0) + t_r1_k

forward_var_list.append(forward_var_new)
backpointers.append(bptrs_t.tolist())

# Transition to STOP_TAG
# shape: torch.Size([1, 6])
terminal_var = forward_var_list[-1] + self.transitions[self.tag_to_ix[STOP_TAG]]
# one number (2 for example)
best_tag_id = torch.argmax(terminal_var).tolist()
path_score = terminal_var[0][best_tag_id]

# Follow the back pointers to decode the best path.
best_path = [best_tag_id]
for bptrs_t in reversed(backpointers):
# bptrs_t is like: [3, 3, 3, 3, 3, 3]
best_tag_id = bptrs_t[best_tag_id]
best_path.append(best_tag_id)
# Pop off the start tag (we don't want to return that to the caller)
start = best_path.pop()
assert start == self.tag_to_ix[START_TAG] # Sanity check
best_path.reverse()
return path_score, best_path

def _viterbi_decode_new_parallel(self, feats):
path_scores = []
best_paths = []
for index in range(feats.shape[0]):
path_score, best_path = self._viterbi_decode_new(feats[index])
best_paths.append(best_path)
path_scores.append(path_score)
return path_scores, best_paths

def neg_log_likelihood_parallel(self, sentences, tags):
feats = self._get_lstm_features_parallel(sentences)
forward_score = self._forward_alg_new_parallel(feats)
gold_score = self._score_sentence_parallel(feats, tags)
return torch.sum(forward_score - gold_score)

def forward(self, sentence):  # don't confuse this with _forward_alg above.
# Get the emission scores from the BiLSTM
lstm_feats = self._get_lstm_features_parallel(sentence)
# Find the best path, given the features.
score, tag_seq = self._viterbi_decode_new_parallel(lstm_feats)
return score, tag_seq
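
A minimal CPU usage sketch of the class above (toy tag map and random data, not the project's actual vocab; note the tag map must contain the <START> and <STOP> entries that __init__ indexes):

import torch
from crf import BiLSTM_CRF_MODIFY_PARALLEL, START_TAG, STOP_TAG

tag_to_ix = {"O": 0, "B-PER": 1, "I-PER": 2, START_TAG: 3, STOP_TAG: 4}  # toy tag map
device = torch.device("cpu")
model = BiLSTM_CRF_MODIFY_PARALLEL(vocab_size=100, tag_to_ix=tag_to_ix,
                                   embedding_dim=16, hidden_dim=8, device=device)
sentences = torch.randint(0, 100, (2, 7))   # batch of 2 padded sentences, length 7
tags = torch.randint(0, 3, (2, 7))          # gold tags drawn from O / B-PER / I-PER
loss = model.neg_log_likelihood_parallel(sentences, tags)  # training objective
scores, paths = model(sentences)            # Viterbi-decoded tag id sequences
print(loss.item(), paths[0])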
44 changes: 42 additions & 2 deletions run.py
@@ -5,13 +5,14 @@
import config
import numpy as np
from model import BiLSTM_CRF
from crf import BiLSTM_CRF_MODIFY_PARALLEL
from dev_split import dev_split
from data_process import Processor
from Vocabulary import Vocabulary
from data_loader import NERDataset
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from train import train, test, sample_test
from train import train, test, sample_test, crf_train, crf_test

input_array = [[1642, 1291, 40, 2255, 970, 46, 124, 1604, 1915, 547, 0, 173,
303, 124, 1029, 52, 20, 2839, 2, 2255, 2078, 1553, 225, 540,
@@ -107,6 +108,45 @@ def simple_run():
run(word_train, label_train, word_dev, label_dev, vocab, device, kf_index)


def crf_simple_run():
"""train with crf"""
# select the GPU id specified in config
if config.gpu != '':
device = torch.device(f"cuda:{config.gpu}")
else:
device = torch.device("cpu")
# process the data: separate text and labels
processor = Processor(config)
processor.data_process()
# build the vocabulary
vocab = Vocabulary(config)
vocab.get_vocab()
# split out a dev set
word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
# simple run without k-fold
crf_run(word_train, label_train, word_dev, label_dev, vocab, device)


def crf_run(word_train, label_train, word_dev, label_dev, vocab, device):
# build dataset
train_dataset = NERDataset(word_train, label_train, vocab, config.label2id)
dev_dataset = NERDataset(word_dev, label_dev, vocab, config.label2id)
# build data_loader
train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
shuffle=True, collate_fn=train_dataset.collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
shuffle=True, collate_fn=dev_dataset.collate_fn)
# model
model = BiLSTM_CRF_MODIFY_PARALLEL(vocab.vocab_size(), vocab.label2id,
config.embedding_size, config.hidden_size, device)
model.to(device)
# loss and optimizer
optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=config.betas)
# train and test
crf_train(train_loader, dev_loader, vocab, model, optimizer, device)
crf_test(config.test_dir, vocab, model, device)


def run(word_train, label_train, word_dev, label_dev, vocab, device, kf_index):
# build dataset
train_dataset = NERDataset(word_train, label_train, vocab, config.label2id)
@@ -136,4 +176,4 @@ def run(word_train, label_train, word_dev, label_dev, vocab, device, kf_index):


if __name__ == '__main__':
simple_run()
crf_simple_run()
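
crf_run above pulls several hyperparameters from config; a hedged sketch of the fields the CRF path appears to read (only lr, betas, epochs, gpu and label2id are visible in this diff, the other values are placeholders):

# Illustrative config.py fields used by crf_run / crf_test -- values are placeholders.
batch_size = 32            # DataLoader batch size
embedding_size = 128       # passed to BiLSTM_CRF_MODIFY_PARALLEL as embedding_dim
hidden_size = 256          # passed as hidden_dim
lr = 0.001
betas = (0.9, 0.999)
epochs = 10
gpu = '3'
train_dir = "./data/train.npz"   # assumed .npz with "words" / "labels" arrays
test_dir = "./data/test.npz"
label2id = {"O": 0}              # plus the dataset's B-/I- tags

Note that the model is built with vocab.label2id rather than config.label2id, and BiLSTM_CRF_MODIFY_PARALLEL indexes tag_to_ix["<START>"] and tag_to_ix["<STOP>"], so that label map needs to include both special tags.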
60 changes: 59 additions & 1 deletion train.py
@@ -3,7 +3,7 @@

import config
from data_loader import NERDataset
from calculate import f1_score
from calculate import f1_score, f1_crf_score

import numpy as np

@@ -39,6 +39,33 @@ def train(train_loader, dev_loader, vocab, model, loss_function, optimizer, device):
print("Training Finished!")


def crf_train(train_loader, dev_loader, vocab, model, optimizer, device):
"""train the model and test model performance"""
# start training
for epoch in range(config.epochs):
# step number in one epoch: 336
for idx, batch_samples in enumerate(train_loader):
x, y, lens = batch_samples
x = x.to(device)
y = y.to(device)
model.zero_grad()
# compute the loss (CRF negative log-likelihood)
loss = model.neg_log_likelihood_parallel(x, y)
# backpropagate
loss.backward()
# update parameters
optimizer.step()
optimizer.zero_grad()
if idx % 100 == 0:
with torch.no_grad():
# dev loss calculation
dev_loss, f1 = crf_dev(dev_loader, vocab, model, device)
print("epoch: ", epoch, ", index: ", idx, ", train loss: ", loss.item(),
", f1 score: ", f1, ", dev loss: ", dev_loss)
print("Training Finished!")


def sample_test(test_input, test_label, model, device):
"""test model performance on a specific sample"""
test_input = test_input.to(device)
@@ -66,6 +93,22 @@ def dev(dev_loader, vocab, model, loss_function, device):
return dev_loss, f1


def crf_dev(dev_loader, vocab, model, device):
"""test model performance on dev-set"""
dev_loss = 0
for _, test_samples in enumerate(dev_loader):
x_test, y_test, lens_ = test_samples
x_test = x_test.to(device)
y_test = y_test.to(device)
model.zero_grad()
# accumulate the dev-set negative log-likelihood
dev_loss += model.neg_log_likelihood_parallel(x_test, y_test)
dev_loss = float(dev_loss) / len(dev_loader)
# f1_score calculation
f1 = f1_crf_score(dev_loader, vocab.id2word, vocab.id2label, model, device)
return dev_loss, f1


def test(dataset_dir, vocab, model, loss_function, device, kf_index):
"""test model performance on the final test set"""
data = np.load(dataset_dir, allow_pickle=True)
@@ -79,3 +122,18 @@ def test(dataset_dir, vocab, model, loss_function, device, kf_index):
test_loss, f1 = dev(test_loader, vocab, model, loss_function, device)
print("Kf epoch: ", kf_index, ", final test loss: ", test_loss, ", f1 score: ", f1)
return test_loss, f1


def crf_test(dataset_dir, vocab, model, device):
"""test model performance on the final test set"""
data = np.load(dataset_dir, allow_pickle=True)
word_test = data["words"]
label_test = data["labels"]
# build dataset
test_dataset = NERDataset(word_test, label_test, vocab, config.label2id)
# build data_loader
test_loader = DataLoader(test_dataset, batch_size=config.batch_size,
shuffle=True, collate_fn=test_dataset.collate_fn)
test_loss, f1 = crf_dev(test_loader, vocab, model, device)
print("final test loss: ", test_loss, ", f1 score: ", f1)
return test_loss, f1
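
As a quick sanity check, the counters printed by f1_crf_score map directly onto precision and recall; with hypothetical counts (not taken from an actual run):

# e.g. "entity_pred: 1000  entity_label: 950  entity_right: 650" (hypothetical)
n_pred, n_label, n_right = 1000, 950, 650
precision = n_right / n_pred                          # 0.65
recall = n_right / n_label                            # ≈ 0.684
f1 = 2 * precision * recall / (precision + recall)    # ≈ 0.667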
