zn-nlp committed on Oct 18, 2019
1 parent 8f6ac6f · commit 4d42278
Showing 20 changed files with 1,706 additions and 0 deletions.
@@ -0,0 +1,8 @@
train_path = "datasets/AutoMaster_TrainSet.csv"
test_path = "datasets/AutoMaster_TestSet.csv"
train_seg_path = "data/train_seg_sample.txt"  # segment of train file
test_seg_path = "data/test_seg_sample.txt"  # segment of test file

stop_words_path = "datasets/stop_words.txt"
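A minimal usage sketch for these sample paths; the module name sample_config and the loading code are assumptions for illustration, not part of this commit.

# Hypothetical consumer of the sample paths above, assuming the settings
# module is importable as `sample_config`.
import sample_config as cfg

with open(cfg.stop_words_path, encoding='utf-8') as f:
    stop_words = set(line.strip() for line in f if line.strip())
with open(cfg.train_seg_path, encoding='utf-8') as f:
    first_line = f.readline().strip()
print(len(stop_words), first_line[:50])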
Empty file.
@@ -0,0 +1,64 @@
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors

from src.utils.data_utils import read_lines, dump_pkl


def get_sentence(sentence_tag, word_sep=' ', pos_sep='|'):
    """
    Rebuild a plain sentence from a POS-tagged line by stripping the tags.
    :param sentence_tag: tagged sentence, e.g. "word|n word|v"
    :param word_sep: separator between tokens
    :param pos_sep: separator between a word and its POS tag
    :return: sentence with the tags removed, tokens joined by word_sep
    """
    words = []
    for item in sentence_tag.split(word_sep):
        if pos_sep in item:
            index = item.rindex(pos_sep)
            words.append(item[:index])
        else:
            words.append(item.strip())
    return word_sep.join(words)


def extract_sentence(train_seg_path, test_seg_path, col_sep='\t'):
    """Collect the text column of every segmented train/test line."""
    ret = []
    lines = read_lines(train_seg_path)
    lines += read_lines(test_seg_path)
    for line in lines:
        if col_sep in line:
            index = line.index(col_sep)
            word_tag = line[index + 1:]
            sentence = get_sentence(word_tag)
            ret.append(sentence)
    return ret


def save_sentence(lines, sentence_path):
    with open(sentence_path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write('%s\n' % line.strip())
    print('save sentence:%s' % sentence_path)


def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model (gensim 3.x API: `size`/`iter`; gensim 4.x renamed these to
    # `vector_size`/`epochs` and replaced `model.vocab` with `key_to_index`)
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # reload the binary model and dump a {word: vector} dict to a pickle file
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
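A hedged driver sketch for build() above, reusing the segmented sample paths from the first settings file; the sentence, binary, and pickle output locations are assumed for illustration, not taken from this commit.

# Hypothetical driver for build(); output paths below are assumptions.
if __name__ == '__main__':
    build(train_seg_path="data/train_seg_sample.txt",
          test_seg_path="data/test_seg_sample.txt",
          out_path="data/word2vec.pkl",         # pickled {word: vector} dict
          sentence_path="data/sentences.txt",   # plain-text corpus fed to LineSentence
          w2v_bin_path="data/w2v.bin",
          min_count=1)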
@@ -0,0 +1,48 @@
import os
import pathlib


# pwd_path = os.path.abspath(os.path.dirname(__file__))
# project root: two levels above this file
pwd_path = pathlib.Path(os.path.abspath(__file__)).parent.parent

# Raw training data (Chinese corpus).
raw_train_paths = [
    # os.path.join(pwd_path, '../data/cn/CGED/CGED18_HSK_TrainingSet.xml'),
    # os.path.join(pwd_path, '../data/cn/CGED/CGED17_HSK_TrainingSet.xml'),
    # os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml'),
    os.path.join(pwd_path, '../data/cn/CGED/sample_HSK_TrainingSet.xml'),
]

output_dir = os.path.join(pwd_path, 'datasets')
# Training data path.
train_path = os.path.join(output_dir, 'AutoMaster_TrainSet.csv')
# Validation data path.
test_path = os.path.join(output_dir, 'AutoMaster_TestSet.csv')

# paddle_train config
save_vocab_path = os.path.join(output_dir, 'vocab.txt')
model_save_dir = os.path.join(output_dir, 'paddle_model')

vocab_max_size = 5000
vocab_min_count = 5
hidden_dim = 512

use_cuda = False

batch_size = 64
epochs = 40
rnn_hidden_dim = 128
maxlen = 400
dropout = 0.0
gpu_id = 0
# segment of train file
train_seg_path = os.path.join(output_dir, 'train_set.seg.csv')
# segment of test file
test_seg_path = os.path.join(output_dir, 'test_set.seg.csv')

stop_words_path = os.path.join(output_dir, 'stop_words.txt')


if not os.path.exists(output_dir):
    os.makedirs(output_dir)
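A small sketch of how a training script might pick up these settings; the `src.config` import path is an assumption. Note that os.path.join accepts the pathlib.Path in pwd_path, so the derived paths come back as plain strings.

# Hypothetical consumer of the settings above; the import path is assumed.
from src import config

print(config.train_path)                                 # .../datasets/AutoMaster_TrainSet.csv
print(config.use_cuda, config.batch_size, config.epochs)
# os.path.join(Path, str) returns a str, so file APIs can use it directly.
assert isinstance(config.train_path, str)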
@@ -0,0 +1,66 @@
import re

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

from utils.data_utils import load_list


class Feature(object):
    """Turn raw texts into padded word-index features, flat or per-sentence."""

    def __init__(self, data=None,
                 feature_type='tfidf_char',
                 feature_vec_path=None,
                 is_infer=False,
                 min_count=1,
                 word_vocab=None,
                 max_len=400):
        self.data_set = data
        self.feature_type = feature_type
        self.feature_vec_path = feature_vec_path
        self.sentence_symbol = load_list(path='datasets/sentence_symbol.txt')
        self.stop_words = load_list(path='datasets/stop_words.txt')
        self.is_infer = is_infer
        self.min_count = min_count
        self.word_vocab = word_vocab
        self.max_len = max_len

    def get_feature(self):
        if self.feature_type == 'vectorize':
            data_feature = self.vectorize(self.data_set)
        elif self.feature_type == 'doc_vectorize':
            data_feature = self.doc_vectorize(self.data_set)
        else:
            raise ValueError('unsupported feature_type: %s' % self.feature_type)
        return data_feature

    def vectorize(self, data_set):
        # map each text to a sequence of word indexes, then pad/truncate to max_len
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(data_set)
        sequences = tokenizer.texts_to_sequences(data_set)

        # word_index = tokenizer.word_index
        data_feature = pad_sequences(sequences, maxlen=self.max_len)
        return data_feature

    def doc_vectorize(self, data_set, max_sentences=16):
        # split each document into short sentences on the symbols in sentence_symbol,
        # then index up to max_sentences sentences of up to max_len words each
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(data_set)

        data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
        for i, sentence in enumerate(data_set):
            sentence_symbols = "".join(self.sentence_symbol)
            split = "[" + sentence_symbols + "]"
            short_sents = re.split(split, sentence)
            for j, sent in enumerate(short_sents):
                if j < max_sentences and sent.strip():
                    words = text_to_word_sequence(sent)
                    k = 0
                    for w in words:
                        if k < self.max_len:
                            if w in tokenizer.word_index:
                                data_feature[i, j, k] = tokenizer.word_index[w]
                                k += 1
        # word_index = tokenizer.word_index
        return data_feature
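A hedged usage sketch of the Feature class above; the sample texts are made up, and it assumes datasets/sentence_symbol.txt and datasets/stop_words.txt exist, since the constructor loads both.

# Hypothetical example: flat index features vs. per-sentence features.
texts = ["the engine stalls at idle. replaced the spark plugs",
         "brake pedal feels soft! bleed the brake lines"]

flat = Feature(data=texts, feature_type='vectorize', max_len=50).get_feature()
print(flat.shape)     # (2, 50): one padded index sequence per text

nested = Feature(data=texts, feature_type='doc_vectorize', max_len=50).get_feature()
print(nested.shape)   # (2, 16, 50): up to 16 sentences per text, 50 words each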
@@ -0,0 +1,82 @@
import pandas as pd

import config

start_token = u"<s>"
end_token = u"<e>"
unk_token = u"<unk>"

max_question_len = 100
max_dialogue_len = 800
max_report_len = 100

# def data_reader(path, col_sep='\t'):
#     contents, labels = [], []
#     with open(path, mode='r', encoding='utf-8') as f:
#         for line in f:
#             line = line.strip()
#             if col_sep in line:
#                 index = line.index(col_sep)
#                 label = line[:index].strip()
#                 labels.append(label)
#                 content = line[index + 1:].strip()
#             else:
#                 content = line
#             contents.append(content)
#     return contents, labels


def read_data(path):
    """Read the segmented train CSV, drop rows over the length limits,
    and wrap each field with start/end tokens."""
    df = pd.read_csv(path, encoding='utf-8')
    question_lens = df['Question'].apply(lambda x: len(x.split(" ")))
    dialogue_lens = df['Dialogue'].apply(lambda x: len(x.split(" ")))
    report_lens = df['Report'].apply(lambda x: len(x.split(" ")))
    data = []
    for i in range(len(df)):
        if question_lens[i] > max_question_len or dialogue_lens[i] > max_dialogue_len \
                or report_lens[i] > max_report_len:
            continue
        item = df.iloc[i]
        data.append([[start_token] + item['Question'].split(" ") + [end_token],
                     [start_token] + item['Dialogue'].split(" ") + [end_token],
                     [start_token] + item['Report'].split(" ") + [end_token]])
    return data


def read_test_data(path):
    """Read the segmented test CSV, truncate fields to the max lengths,
    and wrap them with start/end tokens."""
    df = pd.read_csv(path, encoding='utf-8')
    data = []
    for i in range(len(df)):
        item = df.iloc[i]
        question_vec = item['Question'].split(" ")[0:max_question_len]
        dialogue_vec = item['Dialogue'].split(" ")[0:max_dialogue_len]
        data.append([[start_token] + question_vec + [end_token],
                     [start_token] + dialogue_vec + [end_token]])
    return data


# def read_samples_by_string(path):
#     train = pd.read_csv(path, encoding='utf-8')
#     lines = []
#     for k in ['Question', 'Dialogue', 'Report']:
#         train_values = list(train[k].values)
#         lines.extend(train_values)
#
#     for line in lines:
#         line = line.strip()
#         if not line:
#             continue
#         parts = line.lower().strip()
#         yield parts


def build_dataset(path):
    """Collect the Question, Dialogue and Report columns as a flat list of lines."""
    print('Read data, path:{0}'.format(path))
    train = pd.read_csv(path, encoding='utf-8')
    lines = []
    for k in ['Question', 'Dialogue', 'Report']:
        lines.extend(list(train[k].values))

    return lines

# word_lst = []
# for i in data_content:
#     word_lst.extend(i.split())
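A hedged sketch of how these readers might be wired to the config paths defined earlier; the driver itself and the printed summary are assumptions for illustration.

# Hypothetical driver for the readers above, using the paths from config.
if __name__ == '__main__':
    train_data = read_data(config.train_path)        # [Question, Dialogue, Report] token lists per row
    test_data = read_test_data(config.test_path)     # [Question, Dialogue] token lists per row
    corpus_lines = build_dataset(config.train_path)  # flat list of raw text fields
    print(len(train_data), len(test_data), len(corpus_lines))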