Custom rules dev #19

Merged
merged 14 commits into from Nov 6, 2016
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,4 +5,4 @@ __pycache__
*.model
*.log
.DS_Store
*log.txt
*log.txt
11 changes: 11 additions & 0 deletions Chatbot/.gitignore
@@ -0,0 +1,11 @@
*.bin
__pycache__
*.pyc
*.train
*.model
*.log
.DS_Store
*log.txt
Taiba
QuestionAnswering/data/processed/reply
jieba_dictionary/dict.txt.big
15 changes: 15 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/KeywordMatcher.py
@@ -0,0 +1,15 @@
from .matcher import Matcher

class KeywordMatcher(Matcher):

    """
    Compares phrase similarity based on TF-IDF.
    """

    def __init__(self):
        self.vecModel = None
        # TODO: build or load the TF-IDF model

    def match(self, query):
        # TODO: not yet implemented
        raise NotImplementedError
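KeywordMatcher is still a stub in this PR. For reference, a minimal sketch of how the TF-IDF comparison could be wired up with gensim; all names below are illustrative, not part of this changeset:

from gensim import corpora, models, similarities

def build_tfidf_index(seg_titles):
    # seg_titles: list of token lists, e.g. the output of TitlesSegmentation
    dictionary = corpora.Dictionary(seg_titles)
    corpus = [dictionary.doc2bow(title) for title in seg_titles]
    tfidf = models.TfidfModel(corpus)
    index = similarities.MatrixSimilarity(tfidf[corpus])
    return dictionary, tfidf, index

def tfidf_match(query_tokens, dictionary, tfidf, index):
    # Returns (best_index, score) of the title most similar to the query.
    sims = index[tfidf[dictionary.doc2bow(query_tokens)]]
    best = max(range(len(sims)), key=lambda i: sims[i])
    return best, float(sims[best])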
File renamed without changes.
1 change: 1 addition & 0 deletions Chatbot/QuestionAnswering/Matcher/deepLearning.py
@@ -0,0 +1 @@
#TODO
79 changes: 79 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/fuzzyMatcher.py
@@ -0,0 +1,79 @@
from .matcher import Matcher
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

class FuzzyMatcher(Matcher):

    """
    Compares phrase similarity based on Levenshtein distance.
    """

    def __init__(self, segLib="Taiba", removeStopWords=False):
        super().__init__(segLib)
        self.cleanStopWords = removeStopWords
        if removeStopWords:
            self.loadStopWords("data/stopwords/chinese_sw.txt")
            self.loadStopWords("data/stopwords/specialMarks.txt")

    def joinTitles(self):
        self.segTitles = ["".join(title) for title in self.segTitles]

    def tieBreak(self, query, i, j):
        """
        When stripping stopwords leaves two titles with the same match
        score, pick the better one by comparing against the original text.

        Args:
            - query: the user's input
            - i: index of the first candidate title
            - j: index of the second candidate title

        Return: (target, index)
            - target: the better-matching title
            - index : that title's id
        """
        raw1 = self.titles[i]
        raw2 = self.titles[j]

        r1 = fuzz.ratio(query, raw1)
        r2 = fuzz.ratio(query, raw2)

        if r1 > r2:
            return (raw1, i)
        else:
            return (raw2, j)

    def match(self, query):
        """
        Takes a user query; if a similar sentence exists in the corpus,
        returns that sentence together with its index. Stopword removal
        is controlled by the removeStopWords flag set in the constructor.

        Args:
            - query: the sentence the user wants to look up
        """
        ratio = -1
        target = ""
        target_idx = -1

        if self.cleanStopWords:
            mQuery = [word for word in self.wordSegmentation(query)
                      if word not in self.stopwords]
            mQuery = "".join(mQuery)
            title_list = self.segTitles
        else:
            title_list = self.titles
            mQuery = query

        for index, title in enumerate(title_list):

            newRatio = fuzz.ratio(mQuery, title)

            if newRatio > ratio:
                ratio = newRatio
                target = title
                target_idx = index

            elif self.cleanStopWords and newRatio == ratio:
                target, target_idx = self.tieBreak(query, target_idx, index)

        self.similarity = ratio
        return target, target_idx
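A hypothetical usage sketch for the class above; the title-file path is made up, while the stopword paths come from this PR:

matcher = FuzzyMatcher(segLib="jieba", removeStopWords=True)
matcher.loadTitles("data/Titles.txt")            # hypothetical corpus path
matcher.TitlesSegmentation(cleanStopwords=True)
matcher.joinTitles()                             # fuzz.ratio expects strings, not token lists
title, idx = matcher.match("今天天氣如何")
print(title, idx, matcher.getSimilarity())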
121 changes: 121 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/matcher.py
@@ -0,0 +1,121 @@
import logging
import os

import jieba
import Taiba

class Matcher(object):

    """
    Compares the user's input sentence against the target corpus and
    returns the most similar sentence in the corpus.
    """

    def __init__(self, segLib="Taiba"):

        logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.titles = []     # all titles to match against
        self.segTitles = []  # titles after word segmentation

        self.stopwords = set()
        self.similarity = 1.

        if segLib == "Taiba":
            self.useTaiba = True
        else:
            self.useTaiba = False

    def jiebaCustomSetting(self, dict_path, usr_dict_path):

        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for word in dic:
                jieba.add_word(word.strip('\n'))

    def TaibaCustomSetting(self, usr_dict):

        with open(usr_dict, 'r', encoding='utf-8') as dic:
            for word in dic:
                Taiba.add_word(word.strip('\n'))

    def loadStopWords(self, path):
        with open(path, 'r', encoding='utf-8') as sw:
            for word in sw:
                self.stopwords.add(word.strip('\n'))

    def loadTitles(self, path):

        with open(path, 'r', encoding='utf-8') as data:
            self.titles = [line.strip('\n') for line in data]

    def match(self, query):
        """
        Takes a user query; if an identical sentence exists in the corpus,
        returns that sentence together with its index, otherwise None.

        Args:
            - query: the user's input

        Return: (title, index)
            - title: the matching title
            - index: that title's index
        """
        for index, title in enumerate(self.titles):
            if title == query:
                return title, index
        return None

    def getSimilarity(self):
        return self.similarity

    def wordSegmentation(self, string):

        if self.useTaiba:
            return Taiba.lcut(string, CRF=True)
        else:
            # Materialize jieba's generator so the tokens can be reused
            # after being written to the cache file below.
            return list(jieba.cut(string, cut_all=True))

    def TitlesSegmentation(self, cleanStopwords=False):

        """
        Segments every title in self.titles and stores the result
        in self.segTitles.

        Args:
            - cleanStopwords: whether to strip stopwords from the titles
        """

        logging.info("Preparing to segment titles")

        count = 0

        if not os.path.exists('data/SegTitles.txt'):

            self.segTitles = []
            for title in self.titles:

                if cleanStopwords:
                    clean = [word for word in self.wordSegmentation(title)
                             if word not in self.stopwords]
                    self.segTitles.append(clean)
                else:
                    self.segTitles.append(self.wordSegmentation(title))

                count += 1
                if count % 1000 == 0:
                    logging.info("Segmented the first %d titles" % count)

            with open('data/SegTitles.txt', 'w', encoding="utf-8") as seg_title:
                for title in self.segTitles:
                    seg_title.write(' '.join(title) + '\n')
            logging.info("Title segmentation complete; results cached to data/SegTitles.txt")
        else:
            logging.info("Found a previous segmentation result, loading...")
            with open('data/SegTitles.txt', 'r', encoding="utf-8") as seg_title:
                for line in seg_title:
                    line = line.strip('\n')
                    seg = line.split()

                    if cleanStopwords:
                        seg = [word for word in seg
                               if word not in self.stopwords]
                    self.segTitles.append(seg)
            logging.info("Loaded %d titles" % len(self.segTitles))
14 changes: 14 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/vectorMatcher.py
@@ -0,0 +1,14 @@
from gensim import models, corpora
from sklearn import svm

from .matcher import Matcher

class VectorMatcher(Matcher):

    def __init__(self):
        self.vecModel = None
        # TODO: load a sentence/word vector model

    def match(self, query):
        # TODO: not yet implemented
        raise NotImplementedError
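VectorMatcher is likewise a stub. One plausible direction, sketched under the assumption of a pre-trained word2vec model in gensim format; the model path and helper names are hypothetical:

import numpy as np
from gensim.models import KeyedVectors

def sentence_vector(tokens, wv):
    # Average the vectors of in-vocabulary tokens; zero vector if none match.
    vecs = [wv[w] for w in tokens if w in wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(wv.vector_size)

def cosine(a, b):
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

# wv = KeyedVectors.load_word2vec_format("data/word2vec.bin", binary=True)  # hypothetical path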
6 changes: 6 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/wordBagMatcher.py
@@ -0,0 +1,6 @@
from .matcher import Matcher

# This matcher has been superseded by the optimized WordWeightMatcher.

class WordBagMatcher(Matcher):
    pass
108 changes: 108 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/wordWeightMatcher.py
@@ -0,0 +1,108 @@
import math
import logging

import gensim

from collections import defaultdict

from .matcher import Matcher

class WordWeightMatcher(Matcher):

    """
    Compares phrase similarity using word weights.
    """

    def __init__(self, segLib="Taiba"):

        super().__init__(segLib)

        self.wordDictionary = defaultdict(int)  # occurrence count of each word
        self.totalWords = 0                     # total number of words
        self.wordWeights = defaultdict(int)     # weight of each word

    def initialize(self):
        logging.info("Initializing module...")
        self.TitlesSegmentation()
        self.buildWordDictionary()
        self.loadStopWords("data/stopwords/chinese_sw.txt")
        self.loadStopWords("data/stopwords/specialMarks.txt")
        self.calculateWeight()
        logging.info("Initialization complete :>")

    def buildWordDictionary(self):

        for title in self.segTitles:
            for word in title:
                self.wordDictionary[word] += 1
                self.totalWords += 1
        logging.info("Word counting complete")

    def buildWordBag(self):
        # Note: gensim's Dictionary expects token lists, hence segTitles.
        dictionary = gensim.corpora.Dictionary(self.segTitles)

    def calculateWeight(self):
        # For the mathematical derivation of the algorithm, see:
        # 非主流自然语言处理——遗忘算法系列(四):改进TF-IDF权重公式
        # http://www.52nlp.cn/forgetnlp4
        # The weight stored here is the second factor, i.e. -1 * log(N/T).

        for word, count in self.wordDictionary.items():
            self.wordWeights[word] = -1 * math.log10(count / self.totalWords)
        logging.info("Weight calculation complete")

    def getCooccurrence(self, q1, q2):

        # TODO: NEEDS OPTIMIZATION (this is O(len(q1) * len(q2)))
        res = []
        for word in q1:
            if word in q2:
                res.append(word)
        return res

    def getWordWeight(self, word, n=1):
        # TODO: FIX N
        return n * self.wordWeights[word]

    def match(self, query, sort=False):

        """
        Takes a user query and returns the most similar sentence in the
        corpus together with its index.
        """

        max_similarity = -1
        target = ""
        target_idx = -1

        segQuery = [word for word in self.wordSegmentation(query)
                    if word not in self.stopwords]

        for index, title in enumerate(self.segTitles):

            if len(title) == 0:
                continue

            allWordsWeight = 0.
            coWordsWeight = 0.

            coWords = self.getCooccurrence(title, segQuery)

            for word in coWords:
                coWordsWeight += self.getWordWeight(word)

            for word in title:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)
            for word in segQuery:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)

            # Guard against a full overlap, which would otherwise divide by zero.
            if allWordsWeight == 0:
                similarity = float("inf")
            else:
                similarity = coWordsWeight / allWordsWeight

            if similarity > max_similarity:
                max_similarity = similarity
                target = title
                target_idx = index

        self.similarity = max_similarity * 100  # normalize to a percentage scale

        return target, target_idx
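A worked example of the weighting above, using made-up counts: in a toy corpus of 10 words where "天氣" occurs twice and "的" occurs five times, the rarer word carries the higher weight, so sharing it contributes more to the similarity ratio.

import math

total = 10
weight_rare = -1 * math.log10(2 / total)    # ~0.699 for the rarer word "天氣"
weight_common = -1 * math.log10(5 / total)  # ~0.301 for the common word "的"
# similarity = sum of co-occurring word weights / sum of non-co-occurring word weights
print(weight_rare, weight_common)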
5 changes: 5 additions & 0 deletions Chatbot/QuestionAnswering/Readme.md
@@ -0,0 +1,5 @@
## Simple Question Answering

The current QA module is built on top of [PTT-Push-Generator](https://github.com/zake7749/PTT-Push-Generator).

*Note:* the QA dataset has not been uploaded yet, so please disable the QA feature when testing.
3 changes: 3 additions & 0 deletions Chatbot/QuestionAnswering/__init__.py
@@ -0,0 +1,3 @@
import os
import sys
sys.path.append(os.path.dirname(__file__))