zn-nlp committed on Oct 18, 2019
1 parent 8f6ac6f · commit 4d42278
Showing 20 changed files with 1,706 additions and 0 deletions.
@@ -0,0 +1,8 @@
train_path = "datasets/AutoMaster_TrainSet.csv"
test_path = "datasets/AutoMaster_TestSet.csv"
train_seg_path = "data/train_seg_sample.txt"  # segment of train file
test_seg_path = "data/test_seg_sample.txt"  # segment of test file

stop_words_path = "datasets/stop_words.txt"
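A minimal usage sketch for these sample paths; the module name sample_config and the loading code are assumptions for illustration, not part of this commit.

# Hypothetical consumer of the sample paths above, assuming the settings
# module is importable as `sample_config`.
import sample_config as cfg

with open(cfg.stop_words_path, encoding='utf-8') as f:
    stop_words = set(line.strip() for line in f if line.strip())
with open(cfg.train_seg_path, encoding='utf-8') as f:
    first_line = f.readline().strip()
print(len(stop_words), first_line[:50])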
Empty file.
@@ -0,0 +1,64 @@
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors

from src.utils.data_utils import read_lines, dump_pkl


def get_sentence(sentence_tag, word_sep=' ', pos_sep='|'):
    """
    Rebuild a plain sentence from a POS-tagged line by stripping the tags.
    :param sentence_tag: tagged sentence, e.g. "word|n word|v"
    :param word_sep: separator between tokens
    :param pos_sep: separator between a word and its POS tag
    :return: sentence with the tags removed, tokens joined by word_sep
    """
    words = []
    for item in sentence_tag.split(word_sep):
        if pos_sep in item:
            index = item.rindex(pos_sep)
            words.append(item[:index])
        else:
            words.append(item.strip())
    return word_sep.join(words)


def extract_sentence(train_seg_path, test_seg_path, col_sep='\t'):
    """Collect the text column of every segmented train/test line."""
    ret = []
    lines = read_lines(train_seg_path)
    lines += read_lines(test_seg_path)
    for line in lines:
        if col_sep in line:
            index = line.index(col_sep)
            word_tag = line[index + 1:]
            sentence = get_sentence(word_tag)
            ret.append(sentence)
    return ret


def save_sentence(lines, sentence_path):
    with open(sentence_path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write('%s\n' % line.strip())
    print('save sentence:%s' % sentence_path)


def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model (gensim 3.x API: `size`/`iter`; gensim 4.x renamed these to
    # `vector_size`/`epochs` and replaced `model.vocab` with `key_to_index`)
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # reload the binary model and dump a {word: vector} dict to a pickle file
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
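A hedged driver sketch for build() above, reusing the segmented sample paths from the first settings file; the sentence, binary, and pickle output locations are assumed for illustration, not taken from this commit.

# Hypothetical driver for build(); output paths below are assumptions.
if __name__ == '__main__':
    build(train_seg_path="data/train_seg_sample.txt",
          test_seg_path="data/test_seg_sample.txt",
          out_path="data/word2vec.pkl",         # pickled {word: vector} dict
          sentence_path="data/sentences.txt",   # plain-text corpus fed to LineSentence
          w2v_bin_path="data/w2v.bin",
          min_count=1)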
@@ -0,0 +1,48 @@
import os
import pathlib


# pwd_path = os.path.abspath(os.path.dirname(__file__))
# project root: two levels above this file
pwd_path = pathlib.Path(os.path.abspath(__file__)).parent.parent

# Raw training data (Chinese corpus).
raw_train_paths = [
    # os.path.join(pwd_path, '../data/cn/CGED/CGED18_HSK_TrainingSet.xml'),
    # os.path.join(pwd_path, '../data/cn/CGED/CGED17_HSK_TrainingSet.xml'),
    # os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml'),
    os.path.join(pwd_path, '../data/cn/CGED/sample_HSK_TrainingSet.xml'),
]

output_dir = os.path.join(pwd_path, 'datasets')
# Training data path.
train_path = os.path.join(output_dir, 'AutoMaster_TrainSet.csv')
# Validation data path.
test_path = os.path.join(output_dir, 'AutoMaster_TestSet.csv')

# paddle_train config
save_vocab_path = os.path.join(output_dir, 'vocab.txt')
model_save_dir = os.path.join(output_dir, 'paddle_model')

vocab_max_size = 5000
vocab_min_count = 5
hidden_dim = 512

use_cuda = False

batch_size = 64
epochs = 40
rnn_hidden_dim = 128
maxlen = 400
dropout = 0.0
gpu_id = 0
# segment of train file
train_seg_path = os.path.join(output_dir, 'train_set.seg.csv')
# segment of test file
test_seg_path = os.path.join(output_dir, 'test_set.seg.csv')

stop_words_path = os.path.join(output_dir, 'stop_words.txt')


if not os.path.exists(output_dir):
    os.makedirs(output_dir)
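A small sketch of how a training script might pick up these settings; the `src.config` import path is an assumption. Note that os.path.join accepts the pathlib.Path in pwd_path, so the derived paths come back as plain strings.

# Hypothetical consumer of the settings above; the import path is assumed.
from src import config

print(config.train_path)                                 # .../datasets/AutoMaster_TrainSet.csv
print(config.use_cuda, config.batch_size, config.epochs)
# os.path.join(Path, str) returns a str, so file APIs can use it directly.
assert isinstance(config.train_path, str)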
@@ -0,0 +1,66 @@
import re

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

from utils.data_utils import load_list


class Feature(object):
    """Turn raw texts into padded word-index features, flat or per-sentence."""

    def __init__(self, data=None,
                 feature_type='tfidf_char',
                 feature_vec_path=None,
                 is_infer=False,
                 min_count=1,
                 word_vocab=None,
                 max_len=400):
        self.data_set = data
        self.feature_type = feature_type
        self.feature_vec_path = feature_vec_path
        self.sentence_symbol = load_list(path='datasets/sentence_symbol.txt')
        self.stop_words = load_list(path='datasets/stop_words.txt')
        self.is_infer = is_infer
        self.min_count = min_count
        self.word_vocab = word_vocab
        self.max_len = max_len

    def get_feature(self):
        if self.feature_type == 'vectorize':
            data_feature = self.vectorize(self.data_set)
        elif self.feature_type == 'doc_vectorize':
            data_feature = self.doc_vectorize(self.data_set)
        else:
            raise ValueError('unsupported feature_type: %s' % self.feature_type)
        return data_feature

    def vectorize(self, data_set):
        # map each text to a sequence of word indexes, then pad/truncate to max_len
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(data_set)
        sequences = tokenizer.texts_to_sequences(data_set)

        # word_index = tokenizer.word_index
        data_feature = pad_sequences(sequences, maxlen=self.max_len)
        return data_feature

    def doc_vectorize(self, data_set, max_sentences=16):
        # split each document into short sentences on the symbols in sentence_symbol,
        # then index up to max_sentences sentences of up to max_len words each
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(data_set)

        data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
        for i, sentence in enumerate(data_set):
            sentence_symbols = "".join(self.sentence_symbol)
            split = "[" + sentence_symbols + "]"
            short_sents = re.split(split, sentence)
            for j, sent in enumerate(short_sents):
                if j < max_sentences and sent.strip():
                    words = text_to_word_sequence(sent)
                    k = 0
                    for w in words:
                        if k < self.max_len:
                            if w in tokenizer.word_index:
                                data_feature[i, j, k] = tokenizer.word_index[w]
                                k += 1
        # word_index = tokenizer.word_index
        return data_feature
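A hedged usage sketch of the Feature class above; the sample texts are made up, and it assumes datasets/sentence_symbol.txt and datasets/stop_words.txt exist, since the constructor loads both.

# Hypothetical example: flat index features vs. per-sentence features.
texts = ["the engine stalls at idle. replaced the spark plugs",
         "brake pedal feels soft! bleed the brake lines"]

flat = Feature(data=texts, feature_type='vectorize', max_len=50).get_feature()
print(flat.shape)     # (2, 50): one padded index sequence per text

nested = Feature(data=texts, feature_type='doc_vectorize', max_len=50).get_feature()
print(nested.shape)   # (2, 16, 50): up to 16 sentences per text, 50 words each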
@@ -0,0 +1,82 @@
import pandas as pd

import config

start_token = u"<s>"
end_token = u"<e>"
unk_token = u"<unk>"

max_question_len = 100
max_dialogue_len = 800
max_report_len = 100

# def data_reader(path, col_sep='\t'):
#     contents, labels = [], []
#     with open(path, mode='r', encoding='utf-8') as f:
#         for line in f:
#             line = line.strip()
#             if col_sep in line:
#                 index = line.index(col_sep)
#                 label = line[:index].strip()
#                 labels.append(label)
#                 content = line[index + 1:].strip()
#             else:
#                 content = line
#             contents.append(content)
#     return contents, labels


def read_data(path):
    """Read the segmented train CSV, drop rows over the length limits,
    and wrap each field with start/end tokens."""
    df = pd.read_csv(path, encoding='utf-8')
    question_lens = df['Question'].apply(lambda x: len(x.split(" ")))
    dialogue_lens = df['Dialogue'].apply(lambda x: len(x.split(" ")))
    report_lens = df['Report'].apply(lambda x: len(x.split(" ")))
    data = []
    for i in range(len(df)):
        if question_lens[i] > max_question_len or dialogue_lens[i] > max_dialogue_len \
                or report_lens[i] > max_report_len:
            continue
        item = df.iloc[i]
        data.append([[start_token] + item['Question'].split(" ") + [end_token],
                     [start_token] + item['Dialogue'].split(" ") + [end_token],
                     [start_token] + item['Report'].split(" ") + [end_token]])
    return data


def read_test_data(path):
    """Read the segmented test CSV, truncate fields to the max lengths,
    and wrap them with start/end tokens."""
    df = pd.read_csv(path, encoding='utf-8')
    data = []
    for i in range(len(df)):
        item = df.iloc[i]
        question_vec = item['Question'].split(" ")[0:max_question_len]
        dialogue_vec = item['Dialogue'].split(" ")[0:max_dialogue_len]
        data.append([[start_token] + question_vec + [end_token],
                     [start_token] + dialogue_vec + [end_token]])
    return data


# def read_samples_by_string(path):
#     train = pd.read_csv(path, encoding='utf-8')
#     lines = []
#     for k in ['Question', 'Dialogue', 'Report']:
#         train_values = list(train[k].values)
#         lines.extend(train_values)
#
#     for line in lines:
#         line = line.strip()
#         if not line:
#             continue
#         parts = line.lower().strip()
#         yield parts


def build_dataset(path):
    """Collect the Question, Dialogue and Report columns as a flat list of lines."""
    print('Read data, path:{0}'.format(path))
    train = pd.read_csv(path, encoding='utf-8')
    lines = []
    for k in ['Question', 'Dialogue', 'Report']:
        lines.extend(list(train[k].values))

    return lines

# word_lst = []
# for i in data_content:
#     word_lst.extend(i.split())
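A hedged sketch of how these readers might be wired to the config paths defined earlier; the driver itself and the printed summary are assumptions for illustration.

# Hypothetical driver for the readers above, using the paths from config.
if __name__ == '__main__':
    train_data = read_data(config.train_path)        # [Question, Dialogue, Report] token lists per row
    test_data = read_test_data(config.test_path)     # [Question, Dialogue] token lists per row
    corpus_lines = build_dataset(config.train_path)  # flat list of raw text fields
    print(len(train_data), len(test_data), len(corpus_lines))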