init
zn-nlp committed Oct 18, 2019
1 parent 8f6ac6f commit 4d42278
Showing 20 changed files with 1,706 additions and 0 deletions.
8 changes: 8 additions & 0 deletions config.py
@@ -0,0 +1,8 @@


train_path = "datasets/AutoMaster_TrainSet.csv"
test_path = "datasets/AutoMaster_TestSet.csv"
train_seg_path = "data/train_seg_sample.txt" # segment of train file
test_seg_path = "data/test_seg_sample.txt" # segment of test file

stop_words_path = "datasets/stop_words.txt"
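
These paths are consumed by the other modules in the commit; below is a minimal sketch of loading the stop-word list (not part of the commit, and it assumes one stop word per line):

import config

with open(config.stop_words_path, encoding='utf-8') as f:
    stop_words = set(line.strip() for line in f if line.strip())
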
Empty file added paddle_model/__init__.py
Empty file.
64 changes: 64 additions & 0 deletions paddle_model/build_w2v.py
@@ -0,0 +1,64 @@
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors

from src.utils.data_utils import read_lines, dump_pkl


def get_sentence(sentence_tag, word_sep=' ', pos_sep='|'):
"""
文本拼接
:param sentence_tag:
:param word_sep:
:param pos_sep:
:return:
"""
words = []
for item in sentence_tag.split(word_sep):
if pos_sep in item:
index = item.rindex(pos_sep)
words.append(item[:index])
else:
words.append(item.strip())
return word_sep.join(words)


def extract_sentence(train_seg_path, test_seg_path, col_sep='\t'):
ret = []
lines = read_lines(train_seg_path)
lines += read_lines(test_seg_path)
for line in lines:
if col_sep in line:
index = line.index(col_sep)
word_tag = line[index + 1:]
            sentence = get_sentence(word_tag)  # already a word_sep-joined string, no extra join needed
ret.append(sentence)
return ret


def save_sentence(lines, sentence_path):
with open(sentence_path, 'w', encoding='utf-8') as f:
for line in lines:
f.write('%s\n' % line.strip())
print('save sentence:%s' % sentence_path)


def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
save_sentence(sentences, sentence_path)
print('train w2v model...')
# train model
w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
size=256, window=5, min_count=min_count, iter=40)
w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
print("save %s ok." % w2v_bin_path)
# test
# sim = w2v.wv.similarity('大', '小')
# print('大 vs 小 similarity score:', sim)
# load model
model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
word_dict = {}
for word in model.vocab:
word_dict[word] = model[word]
dump_pkl(word_dict, out_path, overwrite=True)
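
A minimal invocation sketch for build() (not part of the commit). The paths mirror the top-level config.py, 'data/sentences.txt' and 'data/word2vec.pkl' are illustrative output locations, and the size=/iter= keyword arguments above assume gensim 3.x:

if __name__ == '__main__':
    build('data/train_seg_sample.txt',
          'data/test_seg_sample.txt',
          out_path='data/word2vec.pkl',        # pickled {word: vector} dict
          sentence_path='data/sentences.txt',  # plain-text sentences fed to LineSentence
          w2v_bin_path='data/w2v.bin',
          min_count=5)
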
48 changes: 48 additions & 0 deletions paddle_model/config.py
@@ -0,0 +1,48 @@
import os
import pathlib


# pwd_path = os.path.abspath(os.path.dirname(__file__))
pwd_path = pathlib.Path(os.path.abspath(__file__)).parent.parent

# Training data path.
# chinese corpus
raw_train_paths = [
# os.path.join(pwd_path, '../data/cn/CGED/CGED18_HSK_TrainingSet.xml'),
# os.path.join(pwd_path, '../data/cn/CGED/CGED17_HSK_TrainingSet.xml'),
# os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml'),
os.path.join(pwd_path, '../data/cn/CGED/sample_HSK_TrainingSet.xml'),
]

output_dir = os.path.join(pwd_path, 'datasets')
# Training data path.
train_path = os.path.join(output_dir, 'AutoMaster_TrainSet.csv')
# Test data path.
test_path = os.path.join(output_dir, 'AutoMaster_TestSet.csv')

# paddle_train config
save_vocab_path = os.path.join(output_dir, 'vocab.txt')
model_save_dir = os.path.join(output_dir, 'paddle_model')

vocab_max_size = 5000
vocab_min_count = 5
hidden_dim = 512

use_cuda = False

batch_size = 64
epochs = 40
rnn_hidden_dim = 128
maxlen = 400
dropout = 0.0
gpu_id = 0
# segment of train file
train_seg_path = os.path.join(output_dir, 'train_set.seg.csv')
# segment of test file
test_seg_path = os.path.join(output_dir, 'test_set.seg.csv')

stop_words_path = os.path.join(output_dir, 'stop_words.txt')


if not os.path.exists(output_dir):
os.makedirs(output_dir)
66 changes: 66 additions & 0 deletions paddle_model/feature.py
@@ -0,0 +1,66 @@
from utils.data_utils import load_list
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import re


class Feature(object):

def __init__(self, data=None,
feature_type='tfidf_char',
feature_vec_path=None,
is_infer=False,
min_count=1,
word_vocab=None,
max_len=400):
self.data_set = data
self.feature_type = feature_type
self.feature_vec_path = feature_vec_path
self.sentence_symbol = load_list(path='datasets/sentence_symbol.txt')
self.stop_words = load_list(path='datasets/stop_words.txt')
self.is_infer = is_infer
self.min_count = min_count
self.word_vocab = word_vocab
self.max_len = max_len

def get_feature(self):
if self.feature_type == 'vectorize':
data_feature = self.vectorize(self.data_set)
elif self.feature_type == 'doc_vectorize':
            data_feature = self.doc_vectorize(self.data_set)
        else:
            raise ValueError('unsupported feature_type: %s' % self.feature_type)

        return data_feature

def vectorize(self, data_set):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_set)
        # texts_to_sequences (not fit_on_sequences, which returns None) maps the texts to index sequences
        sequences = tokenizer.texts_to_sequences(data_set)

# word_index = tokenizer.word_index
data_feature = pad_sequences(sequences, maxlen=self.max_len)

return data_feature

def doc_vectorize(self, data_set, max_sentences=16):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_set)

data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
for i, sentence in enumerate(data_set):
sentence_symbols = "".join(self.sentence_symbol)
split = "[" + sentence_symbols + "]"
short_sents = re.split(split, sentence)
for j, sent in enumerate(short_sents):
if j < max_sentences and sent.strip():
words = text_to_word_sequence(sent)
k = 0
for w in words:
if k < self.max_len:
if w in tokenizer.word_index:
data_feature[i, j, k] = tokenizer.word_index[w]
k += 1
# word_index = tokenizer.word_index
return data_feature
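
A minimal usage sketch for the Feature class (not part of the commit). The sample texts and the 'doc_vectorize' feature type are illustrative, and the constructor expects datasets/sentence_symbol.txt and datasets/stop_words.txt to exist:

sample_texts = ['技师 说 ： 你好 发动机 故障 灯 亮 了', '车主 说 ： 方向机 重 助力 泵 异响']
feat = Feature(data=sample_texts, feature_type='doc_vectorize', max_len=400)
doc_matrix = feat.get_feature()  # shape: (len(sample_texts), 16, 400)
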


82 changes: 82 additions & 0 deletions paddle_model/reader.py
@@ -0,0 +1,82 @@
import pandas as pd
import config

start_token = u"<s>"
end_token = u"<e>"
unk_token = u"<unk>"

max_question_len = 100
max_dialogue_len = 800
max_report_len = 100

# def data_reader(path, col_sep='\t'):
# contents, labels = [], []
# with open(path, mode='r', encoding='utf-8') as f:
# for line in f:
# line = line.strip()
# if col_sep in line:
# index = line.index(col_sep)
# label = line[:index].strip()
# labels.append(label)
# content = line[index + 1:].strip()
# else:
# content = line
# contents.append(content)
# return contents, labels


def read_data(path):
df = pd.read_csv(path, encoding='utf-8')
question_lens = df['Question'].apply(lambda x: len(x.split(" ")))
dialogue_lens = df['Dialogue'].apply(lambda x: len(x.split(" ")))
report_lens = df['Report'].apply(lambda x: len(x.split(" ")))
data = []
for i in range(len(df)):
if question_lens[i] > max_question_len or dialogue_lens[i] > max_dialogue_len or report_lens[i] > max_report_len:
continue
item = df.iloc[i]
data.append([[start_token] + item['Question'].split(" ") + [end_token],
[start_token] + item['Dialogue'].split(" ") + [end_token],
[start_token] + item['Report'].split(" ") + [end_token]])
return data


def read_test_data(path):
df = pd.read_csv(path, encoding='utf-8')
data = []
for i in range(len(df)):
item = df.iloc[i]
question_vec = item['Question'].split(" ")[0:max_question_len]
dialogue_vec = item['Dialogue'].split(" ")[0:max_dialogue_len]
data.append([[start_token] + question_vec + [end_token],
[start_token] + dialogue_vec + [end_token]])
return data


# def read_samples_by_string(path):
# train = pd.read_csv(path, encoding='utf-8')
# lines = []
# for k in ['Question', 'Dialogue', 'Report']:
# train_values = list(train[k].values)
# lines.extend(train_values)
#
# for line in lines:
# line = line.strip()
# if not line:
# continue
# parts = line.lower().strip()
# yield parts


def build_dataset(path):
print('Read data, path:{0}'.format(path))
train = pd.read_csv(path, encoding='utf-8')
lines = []
for k in ['Question', 'Dialogue', 'Report']:
lines.extend(list(train[k].values))

return lines

# word_lst = []
# for i in data_content:
# word_lst.extend(i.split())
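
A minimal usage sketch for the reader functions (not part of the commit); config.train_path and config.test_path come from paddle_model/config.py above:

if __name__ == '__main__':
    train_samples = read_data(config.train_path)     # [[question, dialogue, report], ...], each a token list
    test_samples = read_test_data(config.test_path)  # [[question, dialogue], ...]
    raw_lines = build_dataset(config.train_path)     # flat list of Question/Dialogue/Report strings
    print(len(train_samples), len(test_samples), len(raw_lines))
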