ReleaseV1.0
zhaoyingjun committed Jun 9, 2022
1 parent 2d4b339 commit 0202b9a
Showing 81 changed files with 511,752 additions and 0 deletions.
@@ -0,0 +1,15 @@
# coding=utf-8
import os
from configparser import ConfigParser  # SafeConfigParser is deprecated; ConfigParser is the Python 3 name

# Locate config/seq2seq.ini relative to the working directory (or its parent)
config_file = os.getcwd() + '/config/seq2seq.ini'
if not os.path.exists(config_file):
    config_file = os.path.dirname(os.getcwd()) + '/config/seq2seq.ini'
print(config_file)

def get_config(config_path=None):
    # An optional path may be passed in (execute.py forwards sys.argv[1] when given)
    parser = ConfigParser()
    parser.read(config_path or config_file, encoding='utf-8')
    # Collect the ints, floats and strings sections into one flat dict
    _conf_ints = [(key, int(value)) for key, value in parser.items('ints')]
    _conf_floats = [(key, float(value)) for key, value in parser.items('floats')]
    _conf_strings = [(key, str(value)) for key, value in parser.items('strings')]
    return dict(_conf_ints + _conf_floats + _conf_strings)
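
For illustration, a minimal sketch of how the rest of the project consumes get_config(); the dict keys come straight from the [strings], [ints] and [floats] sections of seq2seq.ini, and the values below assume the defaults shipped in this commit and that the script is run from the project directory.

from config import getConfig

gConfig = getConfig.get_config()
# Keys are merged from the three sections; the type follows the section they came from.
print(gConfig['mode'])            # 'train'  (from [strings])
print(gConfig['vocab_inp_size'])  # 20000    (from [ints], already an int)
print(gConfig['min_loss'])        # 0.2      (from [floats], already a float)
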
33 changes: 33 additions & 0 deletions Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/seq2seq.ini
@@ -0,0 +1,33 @@
[strings]
# Mode : train, test, serve
mode = train
train_data = train_data
seq_data = train_data/seq.data
vocab_inp_path = train_data/inp.vocab
vocab_tar_path = train_data/tar.vocab
# Raw training corpus file
resource_data = train_data/xiaohuangji50w.conv
# Prefix for the split training sample files
split_train_data = train_data/seq_data_
# Markers that identify conversation starts (E) and utterance lines (M) in the raw corpus
e = E
m = M
model_data = model_data
log_dir = log_dir
[ints]
# vocabulary size
# 20,000 is a reasonable size
vocab_inp_size = 20000
vocab_tar_size = 20000
embedding_dim = 128
train_epoch = 10
# typical options : 128, 256, 512, 1024
layer_size = 512
batch_size = 64
# maximum sentence length
max_length = 20
number_work = 2
[floats]
# Minimum loss: once the model's loss reaches this level, training stops
min_loss = 0.2
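
The e = E and m = M markers above describe the layout of the raw corpus that data_util.py parses: an E line opens a conversation block and each M line carries one utterance after a space. A hedged sketch of such a block (the utterances are made-up placeholders, not lines from the actual xiaohuangji corpus):

sample_conv = (
    "E\n"
    "M 你好\n"    # first utterance (question)
    "M 你也好\n"  # second utterance (answer)
)
# predata_util() in data_util.py turns each such block into one tab-separated,
# jieba-segmented line of train_data/seq.data.
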

73 changes: 73 additions & 0 deletions Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/data_util.py
@@ -0,0 +1,73 @@
# coding=utf-8
import json
import os
import re
import jieba
from zhon.hanzi import punctuation
from config import getConfig
import io
import tensorflow as tf

# Load the hyper-parameter configuration file
gConfig = {}
gConfig = getConfig.get_config()
conv_path = gConfig['resource_data']
vocab_inp_path = gConfig['vocab_inp_path']
vocab_tar_path = gConfig['vocab_tar_path']
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
seq_train = gConfig['seq_data']

def predata_util():
    # Check that the raw training corpus exists; warn and exit if it does not
    if not os.path.exists(conv_path):
        print("Cannot find the file to process; please confirm it exists in the train_data directory")
        exit()
    # Create a new file to hold the processed dialogue corpus
    seq_train = open(gConfig['seq_data'], 'w', encoding='utf-8')
    # Open the raw corpus and process it line by line
    with open(conv_path, encoding='utf-8') as f:
        one_conv = ""  # holds one complete conversation
        i = 0
        # Loop over the corpus
        for line in f:
            line = line.strip('\n')
            line = re.sub(r"[%s]+" % punctuation, "", line)  # strip punctuation
            if line == '':
                continue
            # If this line starts a new conversation, save the one just processed
            if line[0] == gConfig['e']:
                if one_conv:
                    seq_train.write(one_conv[:-1] + '\n')
                    i = i + 1
                    if i % 1000 == 0:
                        print('Processing progress:', i)
                one_conv = ""
            # If this is an utterance line, segment it with jieba and append it to the conversation
            elif line[0] == gConfig['m']:
                one_conv = one_conv + str(" ".join(jieba.cut(line.split(' ')[1]))) + '\t'  # one question or answer
    # Done; close the output file
    seq_train.close()

def create_vocab(lang, vocab_path, vocab_size):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=3)
    tokenizer.fit_on_texts(lang)
    vocab = json.loads(tokenizer.to_json(ensure_ascii=False))
    vocab['index_word'] = tokenizer.index_word
    vocab['word_index'] = tokenizer.word_index
    vocab['document_count'] = tokenizer.document_count
    vocab = json.dumps(vocab, ensure_ascii=False)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write(vocab)
    print("Vocabulary saved to: {}".format(vocab_path))

def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# Build seq.data first, then read it back to create the input/target vocabularies
predata_util()
lines = io.open(seq_train, encoding='UTF-8').readlines()
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
input_lang, target_lang = zip(*word_pairs)
create_vocab(input_lang, vocab_inp_path, vocab_inp_size)
create_vocab(target_lang, vocab_tar_path, vocab_tar_size)
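
To make the preprocessing output concrete, a small hedged sketch of the segmentation step used above: the example sentences are made up, but the transformation (jieba word segmentation joined by spaces, question and answer separated by a tab) is what predata_util() writes to seq.data and what create_vocab() later fits the Tokenizer on.

import jieba

question = "你今天过得怎么样"
answer = "挺好的谢谢你"
# One seq.data line: "<segmented question>\t<segmented answer>"
line = " ".join(jieba.cut(question)) + "\t" + " ".join(jieba.cut(answer))
print(line)  # e.g. "你 今天 过得 怎么样\t挺好 的 谢谢 你" (exact segmentation depends on jieba's dictionary)
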

153 changes: 153 additions & 0 deletions Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/execute.py
@@ -0,0 +1,153 @@
# coding=utf-8
# Import dependencies
import json
import os
import sys
import time
import tensorflow as tf
import horovod.tensorflow as hvd
import seq2seqModel
from config import getConfig
import io

hvd.init()
# Initialise the hyper-parameter dictionary and assign the relevant values
gConfig = {}
gConfig = getConfig.get_config()
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size']

max_length_inp = gConfig['max_length']
max_length_tar = gConfig['max_length']

log_dir = gConfig['log_dir']
writer = tf.summary.create_file_writer(log_dir)

# Preprocess the training corpus: wrap each sentence with the start and end markers
def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# Read the training corpus, convert words to numbers using the pre-built vocabularies, and pad the sequences
def read_data(path):
    path = os.getcwd() + '/' + path
    if not os.path.exists(path):
        path = os.path.dirname(os.getcwd()) + '/' + path
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
    input_lang, target_lang = zip(*word_pairs)
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    input_tensor = input_tokenizer.texts_to_sequences(input_lang)
    target_tensor = target_tokenizer.texts_to_sequences(target_lang)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp,
                                                                 padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,
                                                                  padding='post')
    return input_tensor, input_tokenizer, target_tensor, target_tokenizer

# Rebuild a tokenizer from the saved vocabulary file for word2number conversion
def tokenize(vocab_file):
    # Read the pre-generated tokenizer config from the vocabulary file and rebuild the tokenizer
    with open(vocab_file, 'r', encoding='utf-8') as f:
        tokenize_config = json.dumps(json.load(f), ensure_ascii=False)
        lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenize_config)
    # The tokenizer is then used for word2number conversion and padding
    return lang_tokenizer

input_tensor, input_token, target_tensor, target_token = read_data(gConfig['seq_data'])
steps_per_epoch = len(input_tensor) // (gConfig['batch_size'] * hvd.size())
BUFFER_SIZE = len(input_tensor)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
enc_hidden = seq2seqModel.encoder.initialize_hidden_state()
dataset = dataset.shard(hvd.size(), hvd.rank())

# Training function
def train():
    # Read the training data converted with the pre-generated vocabularies
    print("Preparing data in %s" % gConfig['train_data'])
    print('Training steps per epoch: {}'.format(steps_per_epoch))
    # If a pre-trained model already exists, load it and continue training
    checkpoint_dir = gConfig['model_data']
    ckpt = tf.io.gfile.listdir(checkpoint_dir)
    if ckpt:
        print("reload pretrained model")
        seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # tf.data.Dataset is used to load the training data; it parallelises reads and improves training throughput
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    start_time = time.time()
    # current_loss=2
    # min_loss=gConfig['min_loss']
    epoch = 0
    current_steps = 0
    train_epoch = gConfig['train_epoch']
    # Main training loop; it stops after train_epoch epochs (the min_loss early-stop check above is left commented out)
    while epoch < train_epoch:
        start_time_epoch = time.time()
        total_loss = 0
        # Train for one epoch, i.e. steps_per_epoch steps
        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = seq2seqModel.training_step(inp, targ, target_token, enc_hidden, batch == 0)
            total_loss += batch_loss
            print('epoch: {} batch: {} batch_loss: {}'.format(epoch, batch, batch_loss))
        # After each epoch, compute the average step time and average loss for this epoch
        step_time_epoch = (time.time() - start_time_epoch) / steps_per_epoch
        step_loss = total_loss / steps_per_epoch
        current_steps += steps_per_epoch
        epoch_time_total = (time.time() - start_time)
        print('Total training steps: {} total time: {} avg step time this epoch: {} avg step loss {:.4f}'
              .format(current_steps, epoch_time_total, step_time_epoch, step_loss))
        # Save the model trained in this epoch, updating the checkpoint files
        seq2seqModel.checkpoint.save(file_prefix=checkpoint_prefix)
        sys.stdout.flush()
        epoch = epoch + 1
        with writer.as_default():
            tf.summary.scalar('loss', step_loss, step=epoch)

# Prediction function: generate the next utterance from the given context
def predict(sentence):
    # Read the pre-generated tokenizer configs from the vocabulary files
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    # Load the pre-trained model
    checkpoint_dir = gConfig['model_data']
    seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    # Wrap the input sentence with the start and end markers
    sentence = preprocess_sentence(sentence)
    # word2number conversion (texts_to_sequences expects a list of texts)
    inputs = input_tokenizer.texts_to_sequences([sentence])
    # Pad to the maximum input length
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    # Initialise the hidden state
    hidden = [tf.zeros((1, units))]
    # Encode the input context and extract its features
    enc_out, enc_hidden = seq2seqModel.encoder(inputs, hidden)
    dec_hidden = enc_hidden
    # The decoder input starts from the id of the start marker
    dec_input = tf.expand_dims([target_tokenizer.word_index['start']], 0)
    # Decode step by step, up to the maximum sentence length
    for t in range(max_length_tar):
        # Get the decoder output and take the id with the highest probability via argmax
        predictions, dec_hidden, attention_weights = seq2seqModel.decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        # Stop decoding at the end marker; otherwise convert number2word and append to the result
        if target_tokenizer.index_word[predicted_id] == 'end':
            break
        result += str(target_tokenizer.index_word[predicted_id]) + ' '
        # Feed the predicted id back as the decoder input for the next time step
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

# Entry point: start the appropriate working mode according to the configured hyper-parameters
if __name__ == '__main__':
    # If a config file is given on the command line, read the hyper-parameters from it; otherwise use the default file
    if len(sys.argv) > 1:
        gConfig = getConfig.get_config(sys.argv[1])
    else:
        gConfig = getConfig.get_config()
    print('\n>> Executor mode : %s\n' % (gConfig['mode']))
    if gConfig['mode'] == 'train':
        print('Starting model training')
        train()
    elif gConfig['mode'] == 'serve':
        print('Serve mode: please run the web application for interactive chat')
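
A hedged, self-contained sketch of the round trip that tokenize() relies on: data_util.py saves tokenizer.to_json() to the vocab file, and this script restores it with tokenizer_from_json before mapping words to ids. The toy corpus below is illustrative only.

import tensorflow as tf

# Fit a tokenizer on a toy corpus (stand-in for the jieba-segmented seq.data lines).
texts = ["start 你 好 end", "start 今天 天气 不错 end"]
tok = tf.keras.preprocessing.text.Tokenizer(num_words=100)
tok.fit_on_texts(texts)

# Serialise as JSON (what create_vocab() writes) and rebuild it (what tokenize() does).
vocab_json = tok.to_json()
restored = tf.keras.preprocessing.text.tokenizer_from_json(vocab_json)

# The restored tokenizer maps the same words to the same ids.
print(tok.texts_to_sequences(["start 你 好 end"]))
print(restored.texts_to_sequences(["start 你 好 end"]))

Because the script calls hvd.init() and shards the dataset by rank, it is presumably launched with Horovod's launcher, for example horovodrun -np 2 python execute.py; that launch command reflects standard Horovod usage and is not part of this commit.
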

120 changes: 120 additions & 0 deletions Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/seq2seqModel.py
@@ -0,0 +1,120 @@
# Import dependencies
import tensorflow as tf
from config import getConfig
import horovod.tensorflow as hvd
tf.config.experimental_run_functions_eagerly(True)
hvd.init()
# Initialise the hyper-parameter dictionary
gConfig = {}
gConfig = getConfig.get_config()
# Assign vocab_inp_size, vocab_tar_size, embedding_dim, units, etc. from the hyper-parameter dictionary
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size'] * hvd.size()

# Encoder
class Encoder(tf.keras.Model):
    # Initialise the default parameters
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True,
                                       recurrent_initializer='glorot_uniform')
    # Forward pass
    def call(self, x, hidden):
        x_emb = self.embedding(x)
        output, state = self.gru(x_emb, initial_state=hidden)
        return output, state
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

# Bahdanau attention, one of the most commonly used attention mechanisms
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        # Initialise the attention layers
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    def call(self, query, values):
        # Add a time axis to the query so it can be added to the values
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # Add the projected values and hidden state, apply tanh, and project down to a one-dimensional score
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        # Softmax the scores into a probability distribution over time steps
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weight the values (encoder outputs) by the attention weights
        context_vector = attention_weights * values
        # Sum the weighted values over the time axis to get the final context vector
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        # Initialise batch_sz, dec_units, embedding, gru, fc and attention
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)
    def call(self, y, hidden, enc_output):
        # First compute attention over enc_output with the decoder hidden state to get the context vector
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # Embed the decoder input
        y = self.embedding(y)
        # Concatenate the context vector with the embedded decoder input and feed it to the GRU
        y = tf.concat([tf.expand_dims(context_vector, 1), y], axis=-1)
        output, state = self.gru(y)
        # Reshape the GRU output and pass it through the dense layer to get the final logits
        output = tf.reshape(output, (-1, output.shape[2]))
        y = self.fc(output)
        return y, state, attention_weights

# Loss function
def loss_function(real, pred):
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # Mask out the padded positions (index 0) so they do not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)  # cast the boolean mask to the loss dtype
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Instantiate the encoder, decoder, optimizer and checkpoint
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
optimizer = tf.keras.optimizers.Adam()
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

@tf.function
def training_step(inp, targ, targ_lang, enc_hidden, first_batch, allreduce=True):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['start']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    step_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    if allreduce:
        tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    if first_batch:
        hvd.broadcast_variables(variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)
    return step_loss
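
As a hedged illustration of the tensor shapes flowing through BahdanauAttention above (batch size, sequence length and units follow the seq2seq.ini defaults: 64, 20 and 512), a standalone sketch that mirrors the score / softmax / weighted-sum steps without importing this module:

import tensorflow as tf

batch, seq_len, units = 64, 20, 512
values = tf.random.normal((batch, seq_len, units))   # encoder outputs (enc_output)
query = tf.random.normal((batch, units))             # decoder hidden state

W1, W2 = tf.keras.layers.Dense(units), tf.keras.layers.Dense(units)
V = tf.keras.layers.Dense(1)

score = V(tf.nn.tanh(W1(values) + W2(tf.expand_dims(query, 1))))    # (batch, seq_len, 1)
attention_weights = tf.nn.softmax(score, axis=1)                    # (batch, seq_len, 1)
context_vector = tf.reduce_sum(attention_weights * values, axis=1)  # (batch, units)

print(context_vector.shape)  # (64, 512)
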




