Skip to content

Commit

Permalink
Optimize the searching strategy of question answering.
Browse files Browse the repository at this point in the history
  • Loading branch information
zake7749 committed Dec 26, 2016
1 parent 986214c commit 3854be2
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 1 deletion.
9 changes: 8 additions & 1 deletion Chatbot/QuestionAnswering/Matcher/bm25Matcher.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import math

from .matcher import Matcher
from .quickSearch import QuickSearcher


class bestMatchingMatcher(Matcher):

Expand All @@ -24,6 +26,8 @@ def __init__(self, segLib="Taiba", removeStopWords=False):
self.k1 = 1.5
self.b = 0.75

self.searcher = QuickSearcher() # 問句篩選

if removeStopWords:
self.loadStopWords("data/stopwords/chinese_sw.txt")
self.loadStopWords("data/stopwords/specialMarks.txt")
Expand All @@ -35,6 +39,7 @@ def initialize(self,ngram=1):
self.TitlesSegmentation() # 將 self.titles 斷詞為 self.segTitles
#self.calculateIDF() # 依照斷詞後結果, 計算每個詞的 idf value
self.initBM25()
self.searcher.buildInvertedIndex(self.segTitles)

"""NEED MORE DISCUSSION
#for n in range(0,ngram):
Expand Down Expand Up @@ -137,7 +142,9 @@ def match(self, query):
target = ''
target_idx = -1

for index in range(self.D):
target_index = self.searcher.quickSearch(seg_query) # 只取出必要的 titles

for index in target_index:
score = self.sim(seg_query, index)
if score > max:
target_idx = index
Expand Down
39 changes: 39 additions & 0 deletions Chatbot/QuestionAnswering/Matcher/quickSearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
class QuickSearcher(object):

    """
    Inverted index mapping each word to the IDs of the documents that
    contain it, used to narrow a query down to candidate documents with
    set operations instead of scanning every document.
    """

    def __init__(self, docs=None):

        """
        Create an empty index, optionally populated from ``docs``.

        Args:
            - docs: optional iterable of already-segmented documents
                    (each document is an iterable of words). When given,
                    the index is built immediately; when omitted, call
                    ``buildInvertedIndex`` later.
        """

        # word -> set of document IDs containing that word
        self.inverted_word_dic = dict()

        # Honor the constructor argument instead of silently dropping it.
        if docs is not None:
            self.buildInvertedIndex(docs)

    def buildInvertedIndex(self, docs):

        """
        Build the word -> document ID inverted index.

        A document's ID is its position in ``docs``. Entries are added to
        whatever the index already holds, so calling this again merges the
        new postings into the existing ones (IDs restart from 0 each call).

        Args:
            - docs: the documents to index; every doc must already be
                    segmented into words.
        """

        for doc_id, doc in enumerate(docs):
            for word in doc:
                # setdefault creates the posting set on first sight of a word
                self.inverted_word_dic.setdefault(word, set()).add(doc_id)

    def quickSearch(self, query):

        """
        Return the IDs of all documents sharing at least one word with
        the query.

        Args:
            - query: an already-segmented query (iterable of words).

        Returns:
            A new set of document IDs; words that are not in the index
            contribute nothing. The set is empty when no word matches.
        """

        result = set()
        for word in query:
            # In-place union; unknown words fall back to an empty tuple.
            result.update(self.inverted_word_dic.get(word, ()))

        return result

0 comments on commit 3854be2

Please sign in to comment.