forked from zhaoyingjun/chatbot
Commit
zhaoyingjun committed Jun 9, 2022 (1 parent: 2d4b339, commit: 0202b9a)
Showing 81 changed files with 511,752 additions and 0 deletions.
Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/getConfig.py (15 additions, 0 deletions)
@@ -0,0 +1,15 @@
# coding=utf-8
import os
from configparser import ConfigParser

config_file = os.getcwd() + '/config/seq2seq.ini'
if not os.path.exists(config_file):
    config_file = os.path.dirname(os.getcwd()) + '/config/seq2seq.ini'
print(config_file)

def get_config(config_path=config_file):
    parser = ConfigParser()
    parser.read(config_path, encoding='utf-8')
    # merge the int, float and string options into one flat dict
    _conf_ints = [(key, int(value)) for key, value in parser.items('ints')]
    _conf_floats = [(key, float(value)) for key, value in parser.items('floats')]
    _conf_strings = [(key, str(value)) for key, value in parser.items('strings')]
    return dict(_conf_ints + _conf_floats + _conf_strings)
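
For reference, a minimal sketch of how the merged configuration dict is typically consumed; the key names come from seq2seq.ini, shown in the next file, and the standalone call below is only an illustration, not part of the commit:

# illustration only: load the merged config and read a few hyperparameters
from config import getConfig

gConfig = getConfig.get_config()   # reads config/seq2seq.ini by default
print(gConfig['mode'])             # 'train' (from [strings])
print(gConfig['batch_size'])       # 64 (from [ints])
print(gConfig['min_loss'])         # 0.2 (from [floats])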
Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/seq2seq.ini (33 additions, 0 deletions)
@@ -0,0 +1,33 @@
[strings]
# Mode : train, test, serve
mode = train
train_data = train_data
seq_data = train_data/seq.data
vocab_inp_path = train_data/inp.vocab
vocab_tar_path = train_data/tar.vocab
# raw training corpus file
resource_data = train_data/xiaohuangji50w.conv
# training sample files after splitting
split_train_data = train_data/seq_data_
# markers that identify conversation starts (E) and utterance lines (M) in the raw corpus
e = E
m = M
model_data = model_data
log_dir = log_dir

[ints]
# vocabulary size
# 20,000 is a reasonable size
vocab_inp_size = 20000
vocab_tar_size = 20000
embedding_dim = 128
train_epoch = 10
# typical options : 128, 256, 512, 1024
layer_size = 512
batch_size = 64
# maximum sentence length
max_length = 20
number_work = 2

[floats]
# minimum loss: training can stop once the model's loss reaches this level
min_loss = 0.2
Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/data_util.py (73 additions, 0 deletions)
@@ -0,0 +1,73 @@
# coding=utf-8
import json
import os
import re
import jieba
from zhon.hanzi import punctuation
from config import getConfig
import io
import tensorflow as tf

# load the configuration file
gConfig = {}
gConfig = getConfig.get_config()
conv_path = gConfig['resource_data']
vocab_inp_path = gConfig['vocab_inp_path']
vocab_tar_path = gConfig['vocab_tar_path']
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
seq_train = gConfig['seq_data']

def predata_util():
    # make sure the raw training corpus exists; warn and exit if it does not
    if not os.path.exists(conv_path):
        print("Cannot find the corpus to process; please check that it exists under train_data")
        exit()
    # create a new file to hold the processed conversation corpus
    seq_file = open(gConfig['seq_data'], 'w', encoding='utf-8')
    # open the raw corpus and process it line by line
    with open(conv_path, encoding='utf-8') as f:
        one_conv = ""  # buffer for one complete conversation
        i = 0
        # iterate over the corpus
        for line in f:
            line = line.strip('\n')
            line = re.sub(r"[%s]+" % punctuation, "", line)  # strip punctuation
            if line == '':
                continue
            # a new conversation starts here, so flush the one just finished
            if line[0] == gConfig['e']:
                if one_conv:
                    seq_file.write(one_conv[:-1] + '\n')
                    i = i + 1
                    if i % 1000 == 0:
                        print('progress:', i)
                one_conv = ""
            # an utterance line: segment it with jieba and append it to the current conversation
            elif line[0] == gConfig['m']:
                one_conv = one_conv + str(" ".join(jieba.cut(line.split(' ')[1]))) + '\t'  # one question or answer
    # done, close the output file
    seq_file.close()

def create_vocab(lang, vocab_path, vocab_size):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=3)
    tokenizer.fit_on_texts(lang)
    vocab = json.loads(tokenizer.to_json(ensure_ascii=False))
    vocab['index_word'] = tokenizer.index_word
    vocab['word_index'] = tokenizer.word_index
    vocab['document_count'] = tokenizer.document_count
    vocab = json.dumps(vocab, ensure_ascii=False)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write(vocab)
    print("vocabulary saved to: {}".format(vocab_path))

def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

predata_util()
lines = io.open(seq_train, encoding='UTF-8').readlines()
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
input_lang, target_lang = zip(*word_pairs)
create_vocab(input_lang, vocab_inp_path, vocab_inp_size)
create_vocab(target_lang, vocab_tar_path, vocab_tar_size)
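
For reference, a minimal sketch of how the vocabulary written by create_vocab can be loaded back; it mirrors the tokenize helper in execute.py below and assumes data_util.py has already been run so that train_data/inp.vocab exists:

# illustration only: round-trip the saved vocabulary through tokenizer_from_json
import json
import tensorflow as tf

with open('train_data/inp.vocab', 'r', encoding='utf-8') as f:
    vocab_json = json.dumps(json.load(f), ensure_ascii=False)
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(vocab_json)
print(tokenizer.texts_to_sequences(['start 你好 end']))  # word ids for a sample sentence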
Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/execute.py (153 additions, 0 deletions)
@@ -0,0 +1,153 @@
# coding=utf-8
# import dependencies
import json
import os
import sys
import time
import tensorflow as tf
import horovod.tensorflow as hvd
import seq2seqModel
from config import getConfig
import io

hvd.init()
# initialize the hyperparameter dict from the config file
gConfig = {}
gConfig = getConfig.get_config()
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size']

max_length_inp = gConfig['max_length']
max_length_tar = gConfig['max_length']

log_dir = gConfig['log_dir']
writer = tf.summary.create_file_writer(log_dir)

# wrap each training sentence with start/end markers
def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# read the training corpus, convert words to ids with the pre-built vocabularies, and pad
def read_data(path):
    path = os.getcwd() + '/' + path
    if not os.path.exists(path):
        path = os.path.dirname(os.getcwd()) + '/' + path
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
    input_lang, target_lang = zip(*word_pairs)
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    input_tensor = input_tokenizer.texts_to_sequences(input_lang)
    target_tensor = target_tokenizer.texts_to_sequences(target_lang)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp,
                                                                 padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,
                                                                  padding='post')
    return input_tensor, input_tokenizer, target_tensor, target_tokenizer

# rebuild a tokenizer from a pre-generated vocabulary file
def tokenize(vocab_file):
    # read the tokenizer config saved by data_util.py and restore the tokenizer from it
    with open(vocab_file, 'r', encoding='utf-8') as f:
        tokenize_config = json.dumps(json.load(f), ensure_ascii=False)
        lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenize_config)
    # the returned tokenizer handles the word-to-id conversion; padding is done by the caller
    return lang_tokenizer

input_tensor, input_token, target_tensor, target_token = read_data(gConfig['seq_data'])
steps_per_epoch = len(input_tensor) // (gConfig['batch_size'] * hvd.size())
BUFFER_SIZE = len(input_tensor)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
enc_hidden = seq2seqModel.encoder.initialize_hidden_state()
dataset = dataset.shard(hvd.size(), hvd.rank())

# training loop
def train():
    # read the training corpus and map it through the pre-built vocabularies
    print("Preparing data in %s" % gConfig['train_data'])
    print('steps per epoch: {}'.format(steps_per_epoch))
    # if a pretrained model already exists, restore it and continue training
    checkpoint_dir = gConfig['model_data']
    ckpt = tf.io.gfile.listdir(checkpoint_dir)
    if ckpt:
        print("reload pretrained model")
        seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # the tf.data pipeline above handles concurrent reads and keeps the input side efficient
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    start_time = time.time()
    #current_loss=2
    #min_loss=gConfig['min_loss']
    epoch = 0
    current_steps = 0
    train_epoch = gConfig['train_epoch']
    # train for train_epoch epochs (the commented-out min_loss early stop is kept for reference)
    while epoch < train_epoch:
        start_time_epoch = time.time()
        total_loss = 0
        # one epoch of training, steps_per_epoch steps long
        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = seq2seqModel.training_step(inp, targ, target_token, enc_hidden, batch == 0)
            total_loss += batch_loss
            print('epoch:{} batch:{} batch_loss: {}'.format(epoch, batch, batch_loss))
        # after the epoch, compute the average per-step time and loss
        step_time_epoch = (time.time() - start_time_epoch) / steps_per_epoch
        step_loss = total_loss / steps_per_epoch
        current_steps += steps_per_epoch
        epoch_time_total = (time.time() - start_time)
        print('total steps: {} total time: {} avg time per step this epoch: {} avg loss per step {:.4f}'
              .format(current_steps, epoch_time_total, step_time_epoch, step_loss))
        # save the checkpoint for this epoch
        seq2seqModel.checkpoint.save(file_prefix=checkpoint_prefix)
        sys.stdout.flush()
        epoch = epoch + 1
        with writer.as_default():
            tf.summary.scalar('loss', step_loss, step=epoch)

# prediction: generate a reply for the given input sentence
def predict(sentence):
    # rebuild the tokenizers from the saved vocabularies
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    # restore the trained model
    checkpoint_dir = gConfig['model_data']
    seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    # add the start/end markers to the input sentence
    sentence = preprocess_sentence(sentence)
    # convert words to ids
    inputs = input_tokenizer.texts_to_sequences([sentence])
    # pad to the maximum input length
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    # initialize the hidden state
    hidden = [tf.zeros((1, units))]
    # encode the input sentence
    enc_out, enc_hidden = seq2seqModel.encoder(inputs, hidden)
    dec_hidden = enc_hidden
    # decoding starts from the id of the 'start' token
    dec_input = tf.expand_dims([target_tokenizer.word_index['start']], 0)
    # decode step by step, up to the maximum target length
    for t in range(max_length_tar):
        # take the most probable id via argmax
        predictions, dec_hidden, attention_weights = seq2seqModel.decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        # stop at the 'end' token; otherwise map the id back to a word and append it to the result
        if target_tokenizer.index_word[predicted_id] == 'end':
            break
        result += str(target_tokenizer.index_word[predicted_id]) + ' '
        # feed the predicted id back as the next decoder input
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

# entry point: start the mode selected in the config
if __name__ == '__main__':
    # if a config file is passed on the command line, read the hyperparameters from it; otherwise use the default
    if len(sys.argv) - 1:
        gConfig = getConfig.get_config(sys.argv[1])
    else:
        gConfig = getConfig.get_config()
    print('\n>> execution mode : %s\n' % (gConfig['mode']))
    if gConfig['mode'] == 'train':
        print('starting model training')
        train()
    elif gConfig['mode'] == 'serve':
        print('serve mode: run the web app to interact with the bot')
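
Since execute.py initializes Horovod and shards the dataset by hvd.rank(), it is meant to be launched once per worker, for example with horovodrun -np 2 python execute.py (the worker count is only an example). Once training has written a checkpoint under model_data, a minimal sketch of querying the model directly from Python rather than through the web front end; the sample sentence is arbitrary:

# illustration only: restore the latest checkpoint and generate one reply
import execute

reply = execute.predict('你好')  # predict() restores the checkpoint internally
print(reply)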
Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/seq2seqModel.py (120 additions, 0 deletions)
@@ -0,0 +1,120 @@
# import dependencies
import tensorflow as tf
from config import getConfig
import horovod.tensorflow as hvd

tf.config.experimental_run_functions_eagerly(True)
hvd.init()
# initialize the hyperparameter dict
gConfig = {}
gConfig = getConfig.get_config()
# pull vocab_inp_size, vocab_tar_size, embedding_dim, units, etc. from the config
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size'] * hvd.size()

# encoder
class Encoder(tf.keras.Model):
    # set up the embedding and GRU layers
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True,
                                       recurrent_initializer='glorot_uniform')

    # forward pass
    def call(self, x, hidden):
        x_emb = self.embedding(x)
        output, state = self.gru(x_emb, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

# Bahdanau attention, one of the standard attention mechanisms
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        # attention sub-layers
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # add a time axis to the query so it can be combined with values
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # combine values and the expanded query, apply tanh, and project to a scalar score
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        # softmax over the time axis turns the scores into attention weights
        attention_weights = tf.nn.softmax(score, axis=1)
        # weight the encoder outputs (values) by the attention weights
        context_vector = attention_weights * values
        # sum over the time axis to get the final context vector
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        # set up batch_sz, dec_units, embedding, gru, fc and attention
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, y, hidden, enc_output):
        # compute attention over the encoder output with the decoder hidden state to get the context vector
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # embed the decoder input
        y = self.embedding(y)
        # concatenate the context vector with the input embedding and feed the GRU
        y = tf.concat([tf.expand_dims(context_vector, 1), y], axis=-1)
        output, state = self.gru(y)
        # reshape the GRU output and project to the vocabulary with the dense layer
        output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(output)
        return y, state, attention_weights

# loss function
def loss_function(real, pred):
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # mask out padding (id 0) so it does not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)  # cast the boolean mask to the loss dtype
    loss_ *= mask
    return tf.reduce_mean(loss_)

# instantiate the encoder, decoder, optimizer and checkpoint
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
optimizer = tf.keras.optimizers.Adam()
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

@tf.function
def training_step(inp, targ, targ_lang, enc_hidden, first_batch, allreduce=True):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # teacher forcing: every sequence in the batch starts from the 'start' token
        dec_input = tf.expand_dims([targ_lang.word_index['start']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
        step_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    if allreduce:
        # average gradients across Horovod workers
        tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    if first_batch:
        # on the first batch, broadcast the initial variables from rank 0 to all workers
        hvd.broadcast_variables(variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)
    return step_loss
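
As a quick sanity check of the tensor shapes flowing through the model, a minimal sketch that pushes dummy data through one encoder step and one decoder step, assuming the hyperparameters from seq2seq.ini above (layer_size 512, vocabulary sizes 20000); the batch size of 4 and length of 20 are arbitrary example values:

# illustration only: shape check of one encoder step and one decoder step
import tensorflow as tf
import seq2seqModel as m

batch, length = 4, 20
inp = tf.zeros((batch, length), dtype=tf.int32)
hidden = tf.zeros((batch, m.units))
enc_out, enc_hidden = m.encoder(inp, hidden)       # (4, 20, 512) and (4, 512)
dec_input = tf.zeros((batch, 1), dtype=tf.int32)   # normally the id of the 'start' token
logits, dec_hidden, attn = m.decoder(dec_input, enc_hidden, enc_out)
print(logits.shape)   # (4, 20000): one distribution over the target vocabulary per example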