diff --git a/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/getConfig.py b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/getConfig.py
new file mode 100644
index 0000000..1226508
--- /dev/null
+++ b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/getConfig.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+import os
+from configparser import ConfigParser  # SafeConfigParser is deprecated and removed in Python 3.12
+config_file = os.getcwd() + '/config/seq2seq.ini'
+if not os.path.exists(config_file):
+    config_file = os.path.dirname(os.getcwd()) + '/config/seq2seq.ini'
+print(config_file)
+def get_config():
+    parser = ConfigParser()
+    parser.read(config_file, encoding='utf-8')
+    # get the ints, floats and strings
+    _conf_ints = [(key, int(value)) for key, value in parser.items('ints')]
+    _conf_floats = [(key, float(value)) for key, value in parser.items('floats')]
+    _conf_strings = [(key, str(value)) for key, value in parser.items('strings')]
+    return dict(_conf_ints + _conf_floats + _conf_strings)
diff --git a/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/seq2seq.ini b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/seq2seq.ini
new file mode 100644
index 0000000..3f61709
--- /dev/null
+++ b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/seq2seq.ini
@@ -0,0 +1,33 @@
+[strings]
+# Mode : train, test, serve
+mode = train
+train_data = train_data
+seq_data = train_data/seq.data
+vocab_inp_path = train_data/inp.vocab
+vocab_tar_path = train_data/tar.vocab
+# Raw training corpus file
+resource_data = train_data/xiaohuangji50w.conv
+# Prefix of the split training sample files
+split_train_data = train_data/seq_data_
+# Markers that identify a conversation block (E) and an utterance line (M) in the raw file
+e = E
+m = M
+model_data = model_data
+log_dir = log_dir
+[ints]
+# vocabulary size
+# 20,000 is a reasonable size
+vocab_inp_size = 20000
+vocab_tar_size = 20000
+embedding_dim = 128
+train_epoch = 10
+# typical options : 128, 256, 512, 1024
+layer_size = 512
+batch_size = 64
+# Maximum sentence length
+max_length = 20
+number_work = 2
+[floats]
+# Minimum loss: training stops once the model loss reaches this level
+min_loss = 0.2
+
diff --git a/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/data_util.py b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/data_util.py
new file mode 100644
index 0000000..8af751f
--- /dev/null
+++ b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/data_util.py
@@ -0,0 +1,73 @@
+# coding=utf-8
+import json
+import os
+import re
+import jieba
+from zhon.hanzi import punctuation
+from config import getConfig
+import io
+import tensorflow as tf
+
+# Load the hyper-parameter configuration file
+gConfig = {}
+gConfig = getConfig.get_config()
+conv_path = gConfig['resource_data']
+vocab_inp_path = gConfig['vocab_inp_path']
+vocab_tar_path = gConfig['vocab_tar_path']
+vocab_inp_size = gConfig['vocab_inp_size']
+vocab_tar_size = gConfig['vocab_tar_size']
+seq_train = gConfig['seq_data']
+def predata_util():
+    # Check whether the raw corpus file exists; if not, print a reminder and stop
+    if not os.path.exists(conv_path):
+        print("Cannot find the corpus file to process; please check that it exists under train_data")
+        exit()
+    # Create a new file to hold the processed conversation corpus
+    seq_train = open(gConfig['seq_data'], 'w', encoding='utf-8')
+    # Open the raw corpus and process it line by line
+    with open(conv_path, encoding='utf-8') as f:
+        one_conv = ""  # Holds one complete conversation
+        i = 0
+        # Loop over the corpus
+        for line in f:
+            line = line.strip('\n')
+            line = re.sub(r"[%s]+" % punctuation, "", line)  # Strip punctuation
+            if line == '':
+                continue
+            # A new conversation starts here, so write out the one just collected
+            if line[0] == gConfig['e']:
+                if one_conv:
+                    seq_train.write(one_conv[:-1] + '\n')
+                    i = i + 1
+                    if i % 1000 == 0:
+                        print('Processing progress:', i)
+                one_conv = ""
+            # An utterance line: segment it with jieba and append it to the current conversation
+            elif line[0] == gConfig['m']:
+                one_conv = one_conv + str(" ".join(jieba.cut(line.split(' ')[1]))) + '\t'  # Store one question or answer
+    # Processing finished; close the output file
+    seq_train.close()
+
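+# --- Optional sanity check (a hedged sketch, not part of the original pipeline) ---
+# create_vocab() below saves the Keras Tokenizer with to_json(); execute.py later
+# rebuilds it with tf.keras.preprocessing.text.tokenizer_from_json(). This helper
+# mirrors that round trip on a single sentence so the saved vocab file can be
+# inspected after preprocessing. The function name and sample sentence are
+# illustrative assumptions; nothing calls this function automatically.
+def check_vocab(vocab_path, sample="start 你好 end"):
+    with open(vocab_path, 'r', encoding='utf-8') as f:
+        tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
+            json.dumps(json.load(f), ensure_ascii=False))
+    # Words outside the saved vocabulary are mapped to the OOV token.
+    print(sample, '->', tokenizer.texts_to_sequences([sample]))
+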
+def create_vocab(lang, vocab_path, vocab_size):
+    # oov_token must be a string; the original integer value 3 breaks the JSON round trip
+    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token='<unk>')
+    tokenizer.fit_on_texts(lang)
+    vocab = json.loads(tokenizer.to_json(ensure_ascii=False))
+    vocab['index_word'] = tokenizer.index_word
+    vocab['word_index'] = tokenizer.word_index
+    vocab['document_count'] = tokenizer.document_count
+    vocab = json.dumps(vocab, ensure_ascii=False)
+    with open(vocab_path, 'w', encoding='utf-8') as f:
+        f.write(vocab)
+    print("Vocabulary saved to: {}".format(vocab_path))
+
+def preprocess_sentence(w):
+    w = 'start ' + w + ' end'
+    return w
+
+# Build the processed corpus first, then read it back to fit the vocabularies
+predata_util()
+lines = io.open(seq_train, encoding='UTF-8').readlines()
+word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
+input_lang, target_lang = zip(*word_pairs)
+create_vocab(input_lang, vocab_inp_path, vocab_inp_size)
+create_vocab(target_lang, vocab_tar_path, vocab_tar_size)
+
diff --git a/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/execute.py b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/execute.py
new file mode 100644
index 0000000..f4b0cc8
--- /dev/null
+++ b/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/execute.py
@@ -0,0 +1,153 @@
+# coding=utf-8
+# Import dependencies
+import json
+import os
+import sys
+import time
+import tensorflow as tf
+import horovod.tensorflow as hvd
+import seq2seqModel
+from config import getConfig
+import io
+
+hvd.init()
+# Initialise the hyper-parameter dictionary and read the corresponding values
+gConfig = {}
+gConfig = getConfig.get_config()
+vocab_inp_size = gConfig['vocab_inp_size']
+vocab_tar_size = gConfig['vocab_tar_size']
+embedding_dim = gConfig['embedding_dim']
+units = gConfig['layer_size']
+BATCH_SIZE = gConfig['batch_size']
+
+max_length_inp = gConfig['max_length']
+max_length_tar = gConfig['max_length']
+
+log_dir = gConfig['log_dir']
+writer = tf.summary.create_file_writer(log_dir)
+# Add the 'start' and 'end' markers around each utterance
+def preprocess_sentence(w):
+    w = 'start ' + w + ' end'
+    return w
+# Read the training corpus, convert words to ids with the pre-built vocabularies and pad the sequences
+def read_data(path):
+    path = os.getcwd() + '/' + path
+    if not os.path.exists(path):
+        path = os.path.dirname(os.getcwd()) + '/' + path
+    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
+    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
+    input_lang, target_lang = zip(*word_pairs)
+    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
+    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
+    input_tensor = input_tokenizer.texts_to_sequences(input_lang)
+    target_tensor = target_tokenizer.texts_to_sequences(target_lang)
+    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp,
+                                                                 padding='post')
+    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,
+                                                                  padding='post')
+    return input_tensor, input_tokenizer, target_tensor, target_tokenizer
+# Rebuild a tokenizer from a saved vocabulary file for the word2number conversion
+def tokenize(vocab_file):
+    # Read the pre-generated tokenizer config from the vocabulary file and rebuild the tokenizer
+    with open(vocab_file, 'r', encoding='utf-8') as f:
+        tokenize_config = json.dumps(json.load(f), ensure_ascii=False)
+        lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenize_config)
+    # The tokenizer is then used for the word2number conversion and padding
+    return lang_tokenizer
+input_tensor, input_token, target_tensor, target_token = read_data(gConfig['seq_data'])
+steps_per_epoch = len(input_tensor) // (gConfig['batch_size'] * hvd.size())
+BUFFER_SIZE = len(input_tensor)
+dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
+dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
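+# --- Optional debugging sketch (an assumption, not part of the original code) ---
+# Each element of the pipeline above should be an (input, target) pair of int32
+# tensors of shape (BATCH_SIZE, max_length). The helper below prints those shapes
+# for one batch; call it manually, e.g. peek_one_batch(dataset), when checking the
+# word2number conversion and padding. Under Horovod every worker runs this whole
+# script (typically launched with something like `horovodrun -np 2 python execute.py`),
+# which is why the dataset is sharded by hvd.rank() a few lines below.
+def peek_one_batch(ds):
+    for inp, targ in ds.take(1):
+        print('input batch shape :', inp.shape)
+        print('target batch shape:', targ.shape)
+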
+enc_hidden = seq2seqModel.encoder.initialize_hidden_state()
+dataset = dataset.shard(hvd.size(), hvd.rank())
+# Define the training function
+def train():
+    # The corpus has already been read and converted with the pre-built word2number vocabularies
+    print("Preparing data in %s" % gConfig['train_data'])
+    print('Training steps per epoch: {}'.format(steps_per_epoch))
+    # If a pre-trained model already exists, restore it and continue training from there
+    checkpoint_dir = gConfig['model_data']
+    ckpt = tf.io.gfile.listdir(checkpoint_dir)
+    if ckpt:
+        print("reload pretrained model")
+        seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
+
+    # Training data is fed through tf.data.Dataset, which speeds up concurrent reads and improves training efficiency
+    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
+    start_time = time.time()
+    #current_loss=2
+    #min_loss=gConfig['min_loss']
+    epoch = 0
+    train_epoch = gConfig['train_epoch']
+    # Start the training loop; it is also meant to stop once the loss drops below the min_loss hyper-parameter
+    while epoch