Skip to content

Commit

Permalink
fix bilstm of train
Browse files (browse the repository at this point in the history)
  • Loading branch information
yongzhuo committed Jan 1, 2020
1 parent fd1ca60 commit 00f9cd0
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 155 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<p align="center">
<img src="macropodus_images/macropodus_logo.png" width="480"/>
<img src="macropodus_images/macropodus_logo.png" width="320"/>
</p>

# [Macropodus](https://github.com/yongzhuo/Macropodus)
Expand Down
2 changes: 1 addition & 1 deletion macropodus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,6 @@
num2chi = num2chi

# 是否使用深度学习模型
use_dl=False
use_dl=True
if use_dl:
from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects
2 changes: 1 addition & 1 deletion macropodus/conf/path_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@
path_seg_pku_1998_train = os.path.join(path_root, "data/corpus/seg_pku_1998/train.json")

# path of training model save dir
path_model_dir = os.path.join(path_root, "data/model")
path_model_dir = os.path.join(path_root, "data", "model")
28 changes: 16 additions & 12 deletions macropodus/network/base/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,12 @@ def callback(self):
评价函数、早停
:return: callback
"""
cb_em = [
tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", min_delta=1e-8, patience=self.patience),
tf.keras.callbacks.ModelCheckpoint(monitor="val_loss", mode="min", filepath=self.path_model, verbose=1,
save_best_only=True, save_weights_only=False), ]
# import datetime
# self.path_model_dir = os.path.join(self.path_model_dir, "plugins/profile", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
cb_em = [tf.keras.callbacks.ModelCheckpoint(monitor="val_loss", mode="min", filepath=self.path_model, verbose=1, save_best_only=True, save_weights_only=False),
tf.keras.callbacks.TensorBoard(log_dir=os.path.join(self.path_model_dir, "logs"), batch_size=self.batch_size, update_freq='batch'),
tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", min_delta=1e-8, patience=self.patience),
]
return cb_em

def create_compile(self):
Expand Down Expand Up @@ -169,16 +171,18 @@ def fit_generator(self, embed, rate=1):
pg = PreprocessGenerator(self.path_model_l2i_i2l)
_, len_train = pg.preprocess_label2set(self.hyper_parameters["data"]["train_data"])
data_fit_generator = pg.preprocess_label_question_to_idx_fit_generator(embedding_type=self.hyper_parameters["embedding_type"],
batch_size=self.batch_size,
path=self.hyper_parameters["data"]["train_data"],
embed=embed,
rate=rate)
crf_mode=self.hyper_parameters["model"]["crf_mode"],
path=self.hyper_parameters["data"]["train_data"],
batch_size=self.batch_size,
embed=embed,
rate=rate)
_, len_val = pg.preprocess_label2set(self.hyper_parameters["data"]["val_data"])
data_dev_generator = pg.preprocess_label_question_to_idx_fit_generator(embedding_type=self.hyper_parameters["embedding_type"],
batch_size=self.batch_size,
path=self.hyper_parameters["data"]["val_data"],
embed=embed,
rate=rate)
crf_mode=self.hyper_parameters["model"]["crf_mode"],
path=self.hyper_parameters["data"]["val_data"],
batch_size=self.batch_size,
embed=embed,
rate=rate)
steps_per_epoch = len_train // self.batch_size
validation_steps = len_val // self.batch_size
# 训练模型
Expand Down
2 changes: 1 addition & 1 deletion macropodus/network/layers/crf.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ def loss(self, y_true, y_pred):
self.sequence_lengths,
transition_params=self.transitions)
# loss_crf = tf.reduce_mean(-log_likelihood)
return tf.reduce_mean(-log_likelihood)
# return tf.math.log(loss_crf)
return tf.reduce_mean(-log_likelihood)

def compute_output_shape(self, input_shape):
if self.mode == 'pad':
Expand Down
14 changes: 9 additions & 5 deletions macropodus/network/predict/predict_w2v_bilstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@
from keras_bert import Tokenizer
import numpy as np
import macropodus
import codecs
import pickle
import codecs
import json
import os


path_model_dir = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/model"


path_dir = path_model_dir # + "/ner_albert_bilstm_people_199801"
# 加载模型结构
model = model_from_json(open(path_dir+"/graph.json", "r", encoding="utf-8").read(),
Expand All @@ -27,11 +31,12 @@

# reader tokenizer
token_dict = {}
path_dict = os.path.join(path_embedding_albert, "vocab.txt")
path_dict = os.path.join(path_model_dir, "vocab.txt")
with codecs.open(path_dict, 'r', 'utf8') as reader:
for line in reader:
token = line.strip()
token_dict[token] = len(token_dict)
token_dict = json.loads(token)

vocab_size = len(token_dict)
tokenizer = Tokenizer(token_dict)
# params
Expand Down Expand Up @@ -59,7 +64,7 @@ def sentence2idx(text):
x_ = np.array(x)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
return [x_1, x_2]
return x_1

while True:
print("请输入:")
Expand All @@ -70,4 +75,3 @@ def sentence2idx(text):
res_idxs = [np.argmax(rl) for rl in res_list]
res_label = [l2i_i2l["i2l"][str(ri)] if str(ri) in l2i_i2l["i2l"] else "O" for ri in res_idxs]
print(res_label[:len(ques)])

199 changes: 97 additions & 102 deletions macropodus/network/preprocess/preprocess_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class PreprocessGenerator:
"""
数据预处理, 输入为csv格式, [label,ques]
"""

def __init__(self, path_model_l2i_i2l):
self.path_model_l2i_i2l = path_model_l2i_i2l
self.l2i_i2l = None
Expand Down Expand Up @@ -75,14 +76,15 @@ def preprocess_label2set(self, path):
file_csv.close()
return label_sets, len_all

def preprocess_label_question_to_idx_fit_generator(self, embedding_type, batch_size, path, embed, rate=1):
def preprocess_label_question_to_idx_fit_generator(self, embedding_type, batch_size, path, embed, rate=1, crf_mode='reg'):
"""
fit_generator用, 将句子, 类标转化为数字idx
:param embedding_type: str, like 'albert'
:param batch_size: int, like 64
:param path: str, like 'train.json'
:param embed: class, like embed
:param rate: float, like 0.9
:param crf_mode: str, like 'reg', 'pad'
:return: yield
"""
# 首先获取label,set,即存在的具体类
Expand All @@ -108,72 +110,94 @@ def preprocess_label_question_to_idx_fit_generator(self, embedding_type, batch_s
if len_ql <= 500: # sample时候不生效,使得语料足够训练
len_ql = len_all

def process_line(line, embed, use_len_seq=True):
def process_line(line, embed, l2i_i2l):
"""
关键:对每一条数据操作,获取label和问句index
:param line: str, like '大漠帝国'
:param embed: class, like embed
:param use_len_seq: boolean, True or False
对每一条数据操作,获取label和问句index
:param line:
:param embed:
:param l2i_i2l:
:return:
"""

# 对每一条数据操作,对question和label进行padding
ques_label = json.loads(line.strip())
label_org = ques_label["label"]
label_index = [l2i_i2l["l2i"][lr] for lr in label_org]
len_sequence = len(label_index)
que_embed = embed.sentence2idx(ques_label["question"])
# padding label
len_leave = embed.len_max - len(label_index)
if len_leave >= 0:
label_index_leave = [li for li in label_index] + [l2i_i2l["l2i"]["O"] for i in range(len_leave)]
else:
label_index_leave = label_index[0:embed.len_max]
if use_len_seq:
return [que_embed[0], que_embed[1], len_sequence], label_index_leave
# len_sequence = len(label_index)
que_embed = embed.sentence2idx("".join(ques_label["question"]))
# label padding
if embedding_type in ['bert', 'albert']:
# padding label
len_leave = embed.len_max - len(label_index) - 2
if len_leave >= 0:
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + [li for li in label_index] + [
l2i_i2l["l2i"]["<PAD>"]] + [l2i_i2l["l2i"]["<PAD>"] for i in range(len_leave)]
else:
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + label_index[0:embed.len_max - 2] + [
l2i_i2l["l2i"]["<PAD>"]]
else:
return [que_embed, len_sequence], label_index_leave
# padding label
len_leave = embed.len_max - len(label_index) # -2
if len_leave >= 0:
label_index_leave = [li for li in label_index] + [l2i_i2l["l2i"]["<PAD>"] for i in range(len_leave)]
else:
label_index_leave = label_index[0:embed.len_max]
# 转为one-hot
label_res = to_categorical(label_index_leave, num_classes=len(l2i_i2l["l2i"]))
return que_embed, label_res

while True:
file_csv = open(path, "r", encoding="utf-8")
cout_all_line = 0
cnt = 0
x, y = [], []
file_csv = open(path, "r", encoding="utf-8")
cout_all_line = 0
cnt = 0
x, y = [], []
for line in file_csv:
# 跳出循环
if len_ql < cout_all_line:
break
for line in file_csv:
cout_all_line += 1
if line.strip():
x_line, y_line = process_line(line, embed, use_len_seq=True)
x.append(x_line)
y.append(y_line)
cnt += 1
if cnt == batch_size:
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_3 = np.array([x[2] for x in x_])
cout_all_line += 1
if line.strip():
# 一个json一个json处理
# 备注:最好训练前先处理,使得ques长度小于等于len_max(word2vec), len_max-2(bert, albert)
x_line, y_line = process_line(line, embed, l2i_i2l)
x.append(x_line)
y.append(y_line.tolist())
cnt += 1
# 使用fit_generator时候, 每个batch_size进行yield
if cnt == batch_size:
# 通过两种方式处理: 1.嵌入类型(bert, word2vec, random), 2.条件随机场(CRF:'pad', 'reg')类型
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_3 = np.array([x[2] for x in x_])
if crf_mode == 'pad':
x_all = [x_1, x_2, x_3]
elif crf_mode == 'reg':
x_all = [x_1, x_2]
else:
x_all = [x_1, x_2]
else:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
if crf_mode == 'pad':
x_all = [x_1, x_2]
elif crf_mode == 'reg':
x_all = [x_1]
else:
x_all, y_ = np.array(x), np.array(y)
x_all = [x_1]

cnt = 0
yield (x_all, y_)
x, y =[], []
file_csv.close()
print("preprocess_label_ques_to_idx ok")
cnt = 0
yield (x_all, y_)
x, y = [], []

def preprocess_label_question_to_idx_fit(self, embedding_type, path, embed, rate=1, batch_size=64, crf_mode='reg', fit_type='fit'):
def preprocess_label_question_to_idx_fit(self, embedding_type, path, embed, rate=1, crf_mode='reg'):
"""
fit用, 关键:对每一条数据操作,获取label和问句index
:param embedding_type: str, like 'albert'
:param path: str, like 'train.json'
:param embed: class, like embed
:param rate: float, like 0.9
:param batch_size: int, like 64
:param crf_mode: str, like 'reg', 'pad'
:param fit_type: str, like 'fit', 'fit_generator'
:return: np.array
"""
# 首先获取label,set,即存在的具体类
Expand Down Expand Up @@ -216,11 +240,13 @@ def process_line(line, embed, l2i_i2l):
# label padding
if embedding_type in ['bert', 'albert']:
# padding label
len_leave = embed.len_max - len(label_index) -2
len_leave = embed.len_max - len(label_index) - 2
if len_leave >= 0:
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + [li for li in label_index] + [l2i_i2l["l2i"]["<PAD>"]] + [l2i_i2l["l2i"]["<PAD>"] for i in range(len_leave)]
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + [li for li in label_index] + [
l2i_i2l["l2i"]["<PAD>"]] + [l2i_i2l["l2i"]["<PAD>"] for i in range(len_leave)]
else:
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + label_index[0:embed.len_max-2] + [l2i_i2l["l2i"]["<PAD>"]]
label_index_leave = [l2i_i2l["l2i"]["<PAD>"]] + label_index[0:embed.len_max - 2] + [
l2i_i2l["l2i"]["<PAD>"]]
else:
# padding label
len_leave = embed.len_max - len(label_index) # -2
Expand Down Expand Up @@ -248,59 +274,28 @@ def process_line(line, embed, l2i_i2l):
x.append(x_line)
y.append(y_line.tolist())
cnt += 1
# 使用fit_generator时候, 每个batch_size进行yield
if fit_type=='fit_generator' and cnt == batch_size:
# 通过两种方式处理: 1.嵌入类型(bert, word2vec, random), 2.条件随机场(CRF:'pad', 'reg')类型
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_3 = np.array([x[2] for x in x_])
if crf_mode == 'pad':
x_all = [x_1, x_2, x_3]
elif crf_mode == 'reg':
x_all = [x_1, x_2]
else:
x_all = [x_1, x_2]
else:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
if crf_mode == 'pad':
x_all = [x_1, x_2]
elif crf_mode == 'reg':
x_all = [x_1]
else:
x_all = [x_1]

cnt = 0
yield (x_all, y_)
x, y =[], []
# 使用fit的时候, return返回
if fit_type=='fit':
# 通过两种方式处理: 1.嵌入类型(bert, word2vec, random), 2.条件随机场(CRF:'pad', 'reg')类型
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_3 = np.array([x[2] for x in x_])
if crf_mode=='pad':
x_all = [x_1, x_2, x_3]
elif crf_mode=='reg':
x_all = [x_1, x_2]
else:
x_all = [x_1, x_2]
# 通过两种方式处理: 1.嵌入类型(bert, word2vec, random), 2.条件随机场(CRF:'pad', 'reg')类型
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_3 = np.array([x[2] for x in x_])
if crf_mode == 'pad':
x_all = [x_1, x_2, x_3]
elif crf_mode == 'reg':
x_all = [x_1, x_2]
else:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
if crf_mode=='pad':
x_all = [x_1, x_2]
elif crf_mode=='reg':
x_all = x_1
else:
x_all = x_1
# 使用fit的时候, return返回
return x_all, y_


x_all = [x_1, x_2]
else:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
if crf_mode == 'pad':
x_all = [x_1, x_2]
elif crf_mode == 'reg':
x_all = x_1
else:
x_all = x_1
# 使用fit的时候, return返回
return x_all, y_
Loading

0 comments on commit 00f9cd0

Please sign in to comment.