Commit

dev_split function of train_data
hemingkx committed Nov 15, 2020
1 parent 05b37d9 commit fd63746
Showing 3 changed files with 38 additions and 12 deletions.
data_loader.py: 8 changes (3 additions, 5 deletions)
@@ -4,16 +4,14 @@


 class NERDataset(Dataset):
-    def __init__(self, dataset_dir, vocab, label2id):
+    def __init__(self, words, labels, vocab, label2id):
         self.vocab = vocab
-        self.dataset = self.preprocess(np.load(dataset_dir, allow_pickle=True))
+        self.dataset = self.preprocess(words, labels)
         self.label2id = label2id

-    def preprocess(self, data):
+    def preprocess(self, words, labels):
         """convert the data to ids"""
         processed = []
-        words = data["words"]
-        labels = data["labels"]
         for (word, label) in zip(words, labels):
             word_id = [self.vocab.word_id(w_) for w_ in word]
             label_id = [self.vocab.label_id(l_) for l_ in label]
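
Since NERDataset now takes arrays instead of a file path, the caller is responsible for loading the .npz file. A minimal sketch of the new call pattern, assembled only from pieces that appear in this commit (the config keys and Vocabulary usage are as in dev_split.py):

    import numpy as np

    import config
    from data_loader import NERDataset
    from Vocabulary import Vocabulary

    # build the vocabulary, then load the preprocessed arrays
    vocab = Vocabulary(config)
    vocab.get_vocab()
    data = np.load(config.train_dir, allow_pickle=True)
    dataset = NERDataset(data["words"], data["labels"], vocab, config.label2id)
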
dev_split.py: 25 changes (25 additions, 0 deletions)
@@ -0,0 +1,25 @@
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+import config
+from data_process import Processor
+from Vocabulary import Vocabulary
+
+
+def dev_split(dataset_dir):
+    data = np.load(dataset_dir, allow_pickle=True)
+    words = data["words"]
+    labels = data["labels"]
+    x_train, x_dev, y_train, y_dev = train_test_split(words, labels, test_size=0.1, random_state=0)
+    return x_train, x_dev, y_train, y_dev
+
+
+if __name__ == "__main__":
+    # process the data: separate text and labels
+    processor = Processor(config)
+    processor.data_process()
+    # build the vocabulary
+    vocab = Vocabulary(config)
+    vocab.get_vocab()
+    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
+    print(len(word_train))
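
train_test_split keeps each sentence paired with its tag sequence while holding out 10% of the training data as a dev set; random_state=0 pins the shuffle so the split is reproducible across runs. A self-contained illustration with dummy data (not the project's arrays):

    from sklearn.model_selection import train_test_split

    # ten dummy "sentences", each with a matching tag sequence
    words = [[f"tok{i}a", f"tok{i}b"] for i in range(10)]
    labels = [["O", "O"] for _ in range(10)]

    x_train, x_dev, y_train, y_dev = train_test_split(words, labels, test_size=0.1, random_state=0)
    print(len(x_train), len(x_dev))  # -> 9 1
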
train.py: 17 changes (10 additions, 7 deletions)
@@ -9,7 +9,7 @@
 from data_loader import NERDataset
 from model import BiLSTM_CRF
 from calculate import f1_score
-# from sklearn.cross_validation import train_test_split
+from dev_split import dev_split

 import numpy as np
 # print the full numpy array
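
The deleted comment referenced sklearn.cross_validation, the old module name that scikit-learn removed in version 0.20; train_test_split now lives in sklearn.model_selection, which is what the new dev_split.py imports:

    # old location, removed in scikit-learn 0.20:
    # from sklearn.cross_validation import train_test_split
    # current location, as used by dev_split.py:
    from sklearn.model_selection import train_test_split
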
@@ -64,13 +64,16 @@
     # build the vocabulary
     vocab = Vocabulary(config)
     vocab.get_vocab()
+    # split off the dev set
+    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
+    # build dataset
+    train_dataset = NERDataset(word_train, label_train, vocab, config.label2id)
+    dev_dataset = NERDataset(word_dev, label_dev, vocab, config.label2id)
     # build data_loader
-    train_dataset = NERDataset(config.train_dir, vocab, config.label2id)
     train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                               shuffle=True, collate_fn=train_dataset.collate_fn)
-    test_dataset = NERDataset(config.test_dir, vocab, config.label2id)
-    test_loader = DataLoader(test_dataset, batch_size=config.batch_size,
-                             shuffle=True, collate_fn=test_dataset.collate_fn)
+    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
+                            shuffle=True, collate_fn=dev_dataset.collate_fn)
     # model
     model = BiLSTM_CRF(embedding_size=config.embedding_size,
                        hidden_size=config.hidden_size,
@@ -107,7 +110,7 @@
             optimizer.zero_grad()
             if idx % 100 == 0:
                 with torch.no_grad():
-                    for _, test_samples in enumerate(test_loader):
+                    for _, test_samples in enumerate(dev_loader):
                         x_test, y_test, lens_ = test_samples
                         x_test = x_test.to(device)
                         y_test = y_test.to(device)
@@ -117,7 +120,7 @@
                         # compute the test loss
                         test_loss = loss_function(y_pred, y_test)
                         # f1_score calculation
-                        f1 = f1_score(test_loader, vocab.id2word, vocab.id2label, model, device)
+                        f1 = f1_score(dev_loader, vocab.id2word, vocab.id2label, model, device)
                         print("epoch: ", epoch, ", index: ", idx, ", train loss: ", loss.item(),
                               ", f1 score: ", f1, ", test loss: ", test_loss.item())
    print("Training Finished!")
