Custom rules dev #19

Merged
merged 14 commits into from Nov 6, 2016
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,4 +5,4 @@ __pycache__
*.model
*.log
.DS_Store
*log.txt
*log.txt
11 changes: 11 additions & 0 deletions Chatbot/.gitignore
@@ -0,0 +1,11 @@
*.bin
__pycache__
*.pyc
*.train
*.model
*.log
.DS_Store
*log.txt
Taiba
QuestionAnswering/data/processed/reply
jieba_dictionary/dict.txt.big
15 changes: 15 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/KeywordMatcher.py
@@ -0,0 +1,15 @@
from .matcher import Matcher

class KeywordMatcher(Matcher):

    """
    Compares phrase similarity based on TF-IDF.
    """

    def __init__(self):
        self.vecModel = None
        # TODO: build or load the TF-IDF model

    def match(self, query):
        # TODO: not yet implemented
        raise NotImplementedError
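KeywordMatcher is still a stub in this PR. For reference, a minimal sketch of how the TF-IDF comparison could be wired up with gensim; all names below are illustrative, not part of this changeset:

from gensim import corpora, models, similarities

def build_tfidf_index(seg_titles):
    # seg_titles: list of token lists, e.g. the output of TitlesSegmentation
    dictionary = corpora.Dictionary(seg_titles)
    corpus = [dictionary.doc2bow(title) for title in seg_titles]
    tfidf = models.TfidfModel(corpus)
    index = similarities.MatrixSimilarity(tfidf[corpus])
    return dictionary, tfidf, index

def tfidf_match(query_tokens, dictionary, tfidf, index):
    # Returns (best_index, score) of the title most similar to the query.
    sims = index[tfidf[dictionary.doc2bow(query_tokens)]]
    best = max(range(len(sims)), key=lambda i: sims[i])
    return best, float(sims[best])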
File renamed without changes.
1 change: 1 addition & 0 deletions Chatbot/QuestionAnswering/Matcher/deepLearning.py
@@ -0,0 +1 @@
#TODO
79 changes: 79 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/fuzzyMatcher.py
@@ -0,0 +1,79 @@
from .matcher import Matcher
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

class FuzzyMatcher(Matcher):

    """
    Compares phrase similarity based on Levenshtein distance.
    """

    def __init__(self, segLib="Taiba", removeStopWords=False):
        super().__init__(segLib)
        self.cleanStopWords = removeStopWords
        if removeStopWords:
            self.loadStopWords("data/stopwords/chinese_sw.txt")
            self.loadStopWords("data/stopwords/specialMarks.txt")

    def joinTitles(self):
        self.segTitles = ["".join(title) for title in self.segTitles]

    def tieBreak(self, query, i, j):
        """
        When stripping stopwords leaves two titles with the same match
        score, pick the better one by comparing against the original text.

        Args:
            - query: the user's input
            - i: index of the first candidate title
            - j: index of the second candidate title

        Return: (target, index)
            - target: the better-matching title
            - index : that title's id
        """
        raw1 = self.titles[i]
        raw2 = self.titles[j]

        r1 = fuzz.ratio(query, raw1)
        r2 = fuzz.ratio(query, raw2)

        if r1 > r2:
            return (raw1, i)
        else:
            return (raw2, j)

    def match(self, query):
        """
        Takes a user query; if a similar sentence exists in the corpus,
        returns that sentence together with its index. Stopword removal
        is controlled by the removeStopWords flag set in the constructor.

        Args:
            - query: the sentence the user wants to look up
        """
        ratio = -1
        target = ""
        target_idx = -1

        if self.cleanStopWords:
            mQuery = [word for word in self.wordSegmentation(query)
                      if word not in self.stopwords]
            mQuery = "".join(mQuery)
            title_list = self.segTitles
        else:
            title_list = self.titles
            mQuery = query

        for index, title in enumerate(title_list):

            newRatio = fuzz.ratio(mQuery, title)

            if newRatio > ratio:
                ratio = newRatio
                target = title
                target_idx = index

            elif self.cleanStopWords and newRatio == ratio:
                target, target_idx = self.tieBreak(query, target_idx, index)

        self.similarity = ratio
        return target, target_idx
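A hypothetical usage sketch for the class above; the title-file path is made up, while the stopword paths come from this PR:

matcher = FuzzyMatcher(segLib="jieba", removeStopWords=True)
matcher.loadTitles("data/Titles.txt")            # hypothetical corpus path
matcher.TitlesSegmentation(cleanStopwords=True)
matcher.joinTitles()                             # fuzz.ratio expects strings, not token lists
title, idx = matcher.match("今天天氣如何")
print(title, idx, matcher.getSimilarity())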
121 changes: 121 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/matcher.py
@@ -0,0 +1,121 @@
import logging
import os

import jieba
import Taiba

class Matcher(object):

    """
    Compares the user's input sentence against the target corpus and
    returns the most similar sentence in the corpus.
    """

    def __init__(self, segLib="Taiba"):

        logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.titles = []     # all titles to match against
        self.segTitles = []  # titles after word segmentation

        self.stopwords = set()
        self.similarity = 1.

        if segLib == "Taiba":
            self.useTaiba = True
        else:
            self.useTaiba = False

    def jiebaCustomSetting(self, dict_path, usr_dict_path):

        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for word in dic:
                jieba.add_word(word.strip('\n'))

    def TaibaCustomSetting(self, usr_dict):

        with open(usr_dict, 'r', encoding='utf-8') as dic:
            for word in dic:
                Taiba.add_word(word.strip('\n'))

    def loadStopWords(self, path):
        with open(path, 'r', encoding='utf-8') as sw:
            for word in sw:
                self.stopwords.add(word.strip('\n'))

    def loadTitles(self, path):

        with open(path, 'r', encoding='utf-8') as data:
            self.titles = [line.strip('\n') for line in data]

    def match(self, query):
        """
        Takes a user query; if an identical sentence exists in the corpus,
        returns that sentence together with its index, otherwise None.

        Args:
            - query: the user's input

        Return: (title, index)
            - title: the matching title
            - index: that title's index
        """
        for index, title in enumerate(self.titles):
            if title == query:
                return title, index
        return None

    def getSimilarity(self):
        return self.similarity

    def wordSegmentation(self, string):

        if self.useTaiba:
            return Taiba.lcut(string, CRF=True)
        else:
            # Materialize jieba's generator so the tokens can be reused
            # after being written to the cache file below.
            return list(jieba.cut(string, cut_all=True))

    def TitlesSegmentation(self, cleanStopwords=False):

        """
        Segments every title in self.titles and stores the result
        in self.segTitles.

        Args:
            - cleanStopwords: whether to strip stopwords from the titles
        """

        logging.info("Preparing to segment titles")

        count = 0

        if not os.path.exists('data/SegTitles.txt'):

            self.segTitles = []
            for title in self.titles:

                if cleanStopwords:
                    clean = [word for word in self.wordSegmentation(title)
                             if word not in self.stopwords]
                    self.segTitles.append(clean)
                else:
                    self.segTitles.append(self.wordSegmentation(title))

                count += 1
                if count % 1000 == 0:
                    logging.info("Segmented the first %d titles" % count)

            with open('data/SegTitles.txt', 'w', encoding="utf-8") as seg_title:
                for title in self.segTitles:
                    seg_title.write(' '.join(title) + '\n')
            logging.info("Title segmentation complete; results cached to data/SegTitles.txt")
        else:
            logging.info("Found a previous segmentation result, loading...")
            with open('data/SegTitles.txt', 'r', encoding="utf-8") as seg_title:
                for line in seg_title:
                    line = line.strip('\n')
                    seg = line.split()

                    if cleanStopwords:
                        seg = [word for word in seg
                               if word not in self.stopwords]
                    self.segTitles.append(seg)
            logging.info("Loaded %d titles" % len(self.segTitles))
14 changes: 14 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/vectorMatcher.py
@@ -0,0 +1,14 @@
from gensim import models, corpora
from sklearn import svm

from .matcher import Matcher

class VectorMatcher(Matcher):

    def __init__(self):
        self.vecModel = None
        # TODO: load a sentence/word vector model

    def match(self, query):
        # TODO: not yet implemented
        raise NotImplementedError
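VectorMatcher is likewise a stub. One plausible direction, sketched under the assumption of a pre-trained word2vec model in gensim format; the model path and helper names are hypothetical:

import numpy as np
from gensim.models import KeyedVectors

def sentence_vector(tokens, wv):
    # Average the vectors of in-vocabulary tokens; zero vector if none match.
    vecs = [wv[w] for w in tokens if w in wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(wv.vector_size)

def cosine(a, b):
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

# wv = KeyedVectors.load_word2vec_format("data/word2vec.bin", binary=True)  # hypothetical path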
6 changes: 6 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/wordBagMatcher.py
@@ -0,0 +1,6 @@
from .matcher import Matcher

# This matcher has been superseded by the optimized WordWeightMatcher.

class WordBagMatcher(Matcher):
    pass
108 changes: 108 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/wordWeightMatcher.py
@@ -0,0 +1,108 @@
import math
import logging

import gensim

from collections import defaultdict

from .matcher import Matcher

class WordWeightMatcher(Matcher):

    """
    Compares phrase similarity using word weights.
    """

    def __init__(self, segLib="Taiba"):

        super().__init__(segLib)

        self.wordDictionary = defaultdict(int)  # occurrence count of each word
        self.totalWords = 0                     # total number of words
        self.wordWeights = defaultdict(int)     # weight of each word

    def initialize(self):
        logging.info("Initializing module...")
        self.TitlesSegmentation()
        self.buildWordDictionary()
        self.loadStopWords("data/stopwords/chinese_sw.txt")
        self.loadStopWords("data/stopwords/specialMarks.txt")
        self.calculateWeight()
        logging.info("Initialization complete :>")

    def buildWordDictionary(self):

        for title in self.segTitles:
            for word in title:
                self.wordDictionary[word] += 1
                self.totalWords += 1
        logging.info("Word counting complete")

    def buildWordBag(self):
        # Note: gensim's Dictionary expects token lists, hence segTitles.
        dictionary = gensim.corpora.Dictionary(self.segTitles)

    def calculateWeight(self):
        # For the mathematical derivation of the algorithm, see:
        # 非主流自然语言处理——遗忘算法系列(四):改进TF-IDF权重公式
        # http://www.52nlp.cn/forgetnlp4
        # The weight stored here is the second factor, i.e. -1 * log(N/T).

        for word, count in self.wordDictionary.items():
            self.wordWeights[word] = -1 * math.log10(count / self.totalWords)
        logging.info("Weight calculation complete")

    def getCooccurrence(self, q1, q2):

        # TODO: NEEDS OPTIMIZATION (this is O(len(q1) * len(q2)))
        res = []
        for word in q1:
            if word in q2:
                res.append(word)
        return res

    def getWordWeight(self, word, n=1):
        # TODO: FIX N
        return n * self.wordWeights[word]

    def match(self, query, sort=False):

        """
        Takes a user query and returns the most similar sentence in the
        corpus together with its index.
        """

        max_similarity = -1
        target = ""
        target_idx = -1

        segQuery = [word for word in self.wordSegmentation(query)
                    if word not in self.stopwords]

        for index, title in enumerate(self.segTitles):

            if len(title) == 0:
                continue

            allWordsWeight = 0.
            coWordsWeight = 0.

            coWords = self.getCooccurrence(title, segQuery)

            for word in coWords:
                coWordsWeight += self.getWordWeight(word)

            for word in title:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)
            for word in segQuery:
                if word not in coWords:
                    allWordsWeight += self.getWordWeight(word)

            # Guard against a full overlap, which would otherwise divide by zero.
            if allWordsWeight == 0:
                similarity = float("inf")
            else:
                similarity = coWordsWeight / allWordsWeight

            if similarity > max_similarity:
                max_similarity = similarity
                target = title
                target_idx = index

        self.similarity = max_similarity * 100  # normalize to a percentage scale

        return target, target_idx
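A worked example of the weighting above, using made-up counts: in a toy corpus of 10 words where "天氣" occurs twice and "的" occurs five times, the rarer word carries the higher weight, so sharing it contributes more to the similarity ratio.

import math

total = 10
weight_rare = -1 * math.log10(2 / total)    # ~0.699 for the rarer word "天氣"
weight_common = -1 * math.log10(5 / total)  # ~0.301 for the common word "的"
# similarity = sum of co-occurring word weights / sum of non-co-occurring word weights
print(weight_rare, weight_common)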
5 changes: 5 additions & 0 deletions Chatbot/QuestionAnswering/Readme.md
@@ -0,0 +1,5 @@
## Simple Question Answering

The current QA module is built on top of [PTT-Push-Generator](https://github.com/zake7749/PTT-Push-Generator).

*Note:* the QA dataset has not been uploaded yet, so please disable the QA feature when testing.
3 changes: 3 additions & 0 deletions Chatbot/QuestionAnswering/__init__.py
@@ -0,0 +1,3 @@
import os
import sys
sys.path.append(os.path.dirname(__file__))