forked from zake7749/Chatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
40 changed files
with
1,258,299 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,4 +5,5 @@ __pycache__ | |
*.model | ||
*.log | ||
.DS_Store | ||
*log.txt | ||
*log.txt | ||
Taiba/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
*.bin | ||
__pycache__ | ||
*.pyc | ||
*.train | ||
*.model | ||
*.log | ||
.DS_Store | ||
*log.txt | ||
Taiba | ||
QuestionAnswering/data/processed/reply | ||
jieba_dictionary/dict.txt.big |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from .matcher import Matcher | ||
|
||
class KeywordMatcher(Matcher):

    """
    Phrase similarity matching based on TF-IDF keyword weights.
    Not yet implemented — both methods below are stubs.
    """

    def __init__(self):
        # NOTE(review): does not call super().__init__(), so base-class
        # attributes (titles, segTitles, stopwords, ...) are never set up —
        # confirm whether this is intentional before implementing.
        self.vecModel = None
        # TODO

    def match(self, query):
        # The original body held only a comment, which is a syntax error
        # (a `def` needs at least one statement). Fail loudly until the
        # TF-IDF matcher is implemented.
        raise NotImplementedError("KeywordMatcher.match is not implemented yet")
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#TODO |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
from .matcher import Matcher | ||
from fuzzywuzzy import fuzz | ||
from fuzzywuzzy import process | ||
|
||
class FuzzyMatcher(Matcher):

    """
    Phrase similarity matching based on Levenshtein (edit) distance,
    using fuzzywuzzy's ratio as the score.
    """

    def __init__(self, segLib="Taiba", removeStopWords=False):
        super().__init__(segLib)
        self.cleanStopWords = removeStopWords
        if removeStopWords:
            self.loadStopWords("data/stopwords/chinese_sw.txt")
            self.loadStopWords("data/stopwords/specialMarks.txt")

    def joinTitles(self):
        # Collapse each segmented title back into a single string.
        self.segTitles = ["".join(segments) for segments in self.segTitles]

    def tieBreak(self, query, i, j):
        """
        Break a tie that appears after stop-word removal by re-scoring
        the two candidates against their ORIGINAL (unfiltered) titles.

        Args:
            query: the raw user input
            i: index of the first candidate title
            j: index of the second candidate title
        Returns:
            (target, index) — the better-matching raw title and its id
        """
        candidate_i = self.titles[i]
        candidate_j = self.titles[j]

        score_i = fuzz.ratio(query, candidate_i)
        score_j = fuzz.ratio(query, candidate_j)

        if score_i > score_j:
            return (candidate_i, i)
        return (candidate_j, j)

    def match(self, query, custom_title=None):
        """
        Find the corpus sentence most similar to the user query.

        Args:
            query: the sentence to look up
            custom_title: an alternative title list to match against
                (only honoured when stop-word removal is disabled —
                with cleanStopWords the pre-segmented titles are used)
        Returns:
            (target, target_idx) — best-matching title and its index;
            also stores the best score in self.similarity.
        """
        best_score = -1
        best_title = ""
        best_idx = -1

        if self.cleanStopWords:
            # Strip stop words from the query and compare against the
            # stop-word-free segmented titles.
            probe = "".join(word for word in self.wordSegmentation(query)
                            if word not in self.stopwords)
            candidates = self.segTitles
        else:
            probe = query
            candidates = self.titles if custom_title is None else custom_title

        for idx, candidate in enumerate(candidates):
            score = fuzz.ratio(probe, candidate)

            if score > best_score:
                best_score, best_title, best_idx = score, candidate, idx
            elif self.cleanStopWords and score == best_score:
                # Equal scores on filtered text: decide on the raw titles.
                best_title, best_idx = self.tieBreak(query, best_idx, idx)

        self.similarity = best_score
        return best_title, best_idx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
import logging | ||
import os | ||
|
||
import jieba | ||
import Taiba | ||
|
||
class Matcher(object):

    """
    Base matcher: compares a user query against a corpus of titles and
    returns the most similar title. Subclasses override match(); this
    base implementation only does exact matching.
    """

    def __init__(self, segLib="Taiba"):
        """
        Args:
            segLib: segmentation backend — "Taiba" selects Taiba,
                any other value selects jieba.
        """
        logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.titles = []     # all candidate titles to match against
        self.segTitles = []  # word-segmented titles

        self.stopwords = set()
        self.similarity = 1.

        # Select the segmentation backend once; wordSegmentation() checks this flag.
        self.useTaiba = (segLib == "Taiba")

    def jiebaCustomSetting(self, dict_path, usr_dict_path):
        """Load a custom main dictionary and a user dictionary into jieba."""
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for word in dic:
                jieba.add_word(word.strip('\n'))

    def TaibaCustomSetting(self, usr_dict):
        """Load a user dictionary (one word per line) into Taiba."""
        with open(usr_dict, 'r', encoding='utf-8') as dic:
            for word in dic:
                Taiba.add_word(word.strip('\n'))

    def loadStopWords(self, path):
        """Add every line of the file at `path` to the stop-word set."""
        with open(path, 'r', encoding='utf-8') as sw:
            for word in sw:
                self.stopwords.add(word.strip('\n'))

    def loadTitles(self, path):
        """Read one title per line from `path` into self.titles."""
        with open(path, 'r', encoding='utf-8') as data:
            self.titles = [line.strip('\n') for line in data]

    def match(self, query):
        """
        Exact match: if an identical sentence exists in the corpus,
        return it together with its index.

        Args:
            query: the user's input
        Returns:
            (title, index) on an exact hit, otherwise None.
        """
        # Fix: the original assigned an unused `result = None` and fell off
        # the end implicitly; the explicit return makes the contract clear.
        for index, title in enumerate(self.titles):
            if title == query:
                return title, index
        return None

    def getSimilarity(self):
        """Return the similarity score stored by the last match() call."""
        return self.similarity

    def wordSegmentation(self, string):
        """Segment `string` with the backend chosen at construction time."""
        if self.useTaiba:
            return Taiba.lcut(string, CRF=True)
        else:
            # NOTE: jieba.cut returns a generator, Taiba.lcut a list —
            # callers that iterate once (as below) handle both.
            return jieba.cut(string, cut_all=True)

    def TitlesSegmentation(self, cleanStopwords=False):
        """
        Segment every title in self.titles and store the result in
        self.segTitles, caching it to data/SegTitles.txt. On later runs
        the cache file is loaded instead of re-segmenting.

        Args:
            cleanStopwords: drop stop words from the segmented titles
        """
        logging.info("正準備將 titles 斷詞")

        count = 0

        if not os.path.exists('data/SegTitles.txt'):
            self.segTitles = []
            for title in self.titles:
                if cleanStopwords:
                    clean = [word for word in self.wordSegmentation(title)
                             if word not in self.stopwords]
                    self.segTitles.append(clean)
                else:
                    self.segTitles.append(self.wordSegmentation(title))

                count += 1
                if count % 1000 == 0:
                    logging.info("已斷詞完前 %d 篇文章" % count)

            # Persist the segmentation so the next run can skip this pass.
            with open('data/SegTitles.txt', 'w', encoding="utf-8") as seg_title:
                for title in self.segTitles:
                    seg_title.write(' '.join(title) + '\n')
            logging.info("完成標題斷詞,結果已暫存至 data/SegTitles.txt")
        else:
            logging.info("偵測到先前的標題斷詞結果,讀取中...")
            with open('data/SegTitles.txt', 'r', encoding="utf-8") as seg_title:
                for line in seg_title:
                    line = line.strip('\n')
                    seg = line.split()

                    if cleanStopwords:
                        # The cache may predate stop-word removal, so filter again.
                        seg = [word for word in seg
                               if word not in self.stopwords]
                    self.segTitles.append(seg)
                logging.info("%d 個標題已完成載入" % len(self.segTitles))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from gensim import models,corpora | ||
from sklearn import svm | ||
|
||
from . import Matcher | ||
|
||
class VectorMatcher(Matcher):

    """
    Phrase similarity matching based on word/sentence vectors.
    Not yet implemented — both methods below are stubs.
    """

    def __init__(self):
        # NOTE(review): does not call super().__init__(), so base-class
        # attributes (titles, segTitles, stopwords, ...) are never set up —
        # confirm whether this is intentional before implementing.
        self.vecModel = None
        # TODO

    def match(self, query):
        # The original body held only a comment, which is a syntax error
        # (a `def` needs at least one statement). Fail loudly until the
        # vector matcher is implemented.
        raise NotImplementedError("VectorMatcher.match is not implemented yet")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from .matcher import Matcher | ||
|
||
# 此匹配模組已優化為 WordWeightMatcher | ||
|
||
class WordBagMatcher(Matcher):
    """Bag-of-words matcher — superseded by WordWeightMatcher; kept as an empty placeholder."""
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import math | ||
import logging | ||
|
||
import gensim | ||
|
||
from collections import defaultdict | ||
|
||
from .matcher import Matcher | ||
|
||
class WordWeightMatcher(Matcher):

    """
    Phrase similarity matching based on corpus word weights
    (a simplified TF-IDF-style scheme).
    """

    def __init__(self, segLib="Taiba"):
        super().__init__(segLib)

        self.wordDictionary = defaultdict(int)  # occurrence count of each word
        self.totalWords = 0                     # total token count in the corpus
        self.wordWeights = defaultdict(int)     # computed weight of each word

    def initialize(self):
        """Segment titles, count words, load stop words and compute weights."""
        logging.info("初始化模塊中...")
        self.TitlesSegmentation()
        self.buildWordDictionary()
        self.loadStopWords("data/stopwords/chinese_sw.txt")
        self.loadStopWords("data/stopwords/specialMarks.txt")
        self.calculateWeight()
        logging.info("初始化完成 :>")

    def buildWordDictionary(self):
        """Count every token in the segmented titles."""
        for title in self.segTitles:
            for word in title:
                self.wordDictionary[word] += 1
                self.totalWords += 1
        logging.info("詞記數完成")

    def buildWordBag(self):
        # NOTE(review): the result is discarded, and self.titles holds raw
        # strings rather than token lists — presumably this should build
        # from self.segTitles and keep the dictionary; verify before use.
        dictionary = gensim.corpora.Dictionary(self.titles)

    def calculateWeight(self):
        # Derivation of the weighting scheme:
        # 非主流自然语言处理——遗忘算法系列(四):改进TF-IDF权重公式
        # http://www.52nlp.cn/forgetnlp4
        # The stored weight is the second factor, i.e. -1 * log(N/T).
        for word, count in self.wordDictionary.items():
            self.wordWeights[word] = -1 * math.log10(count / self.totalWords)
        logging.info("詞統計完成")

    def getCooccurrence(self, q1, q2):
        """Return the words of q1 that also appear in q2 (with q1's duplicates)."""
        # TODO NEED OPTIMIZE!!!! — O(len(q1)*len(q2)); a set intersection
        # would be linear, but would drop duplicates, so left as-is for now.
        res = []
        for word in q1:
            if word in q2:
                res.append(word)
        return res

    def getWordWeight(self, word, n=1):
        # TODO FIX N — the multiplier n is currently always 1.
        return n * self.wordWeights[word]

    def match(self, query, sort=False):
        """
        Find the corpus title whose weighted word overlap with the query
        is highest.

        Returns:
            (target, target_idx) — the best segmented title and its index;
            ("", -1) when no title could be scored. The score (percent
            scale) is stored in self.similarity.
        """
        max_similarity = -1
        target = ""
        # Bug fix: the original initialized `index` here but returned
        # `target_idx`, raising NameError whenever no title was scorable.
        target_idx = -1

        segQuery = [word for word in self.wordSegmentation(query)
                    if word not in self.stopwords]

        for index, title in enumerate(self.segTitles):

            if len(title) == 0:
                continue

            allWordsWeight = 0.
            coWordsWeight = 0.

            coWords = self.getCooccurrence(title, segQuery)

            for word in coWords:
                coWordsWeight += self.getWordWeight(word)

            # Weight of the symmetric difference (words NOT shared).
            for word in title:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)
            for word in segQuery:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)

            if allWordsWeight == 0.:
                # Bug fix: every word is shared, i.e. a perfect overlap —
                # the original divided by zero here. Treat it as maximal.
                similarity = float("inf")
            else:
                similarity = coWordsWeight / allWordsWeight

            if similarity > max_similarity:
                max_similarity = similarity
                target = title
                target_idx = index

        self.similarity = max_similarity * 100  # normalize to a 0-100 scale

        return target, target_idx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
## 簡易問答 | ||
|
||
目前的 QA 是基於 [PTT-Push-Generator](https://github.com/zake7749/PTT-Push-Generator) 進行。 | ||
|
||
**注意:** 目前仍未上傳 QA 的資料集,進行測試時請先關閉 QA 功能
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import os
import sys
# Add this package's own directory to sys.path so its modules can be
# imported without package-qualified names.
sys.path.append(os.path.dirname(__file__))
Oops, something went wrong.