Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

add: approximate words search #5

Merged
merged 5 commits into from
Jan 9, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
add: decoupling query expansion from lexicon
  • Loading branch information
jarvis0 committed Jan 8, 2022
commit 35bc80f26e9b4d17c851d1561359723dcddab0fd
4 changes: 2 additions & 2 deletions data/dumps/collection.pkl
Git LFS file not shown
4 changes: 2 additions & 2 deletions data/dumps/lexicon.pkl
Git LFS file not shown
96 changes: 0 additions & 96 deletions notebooks/dict_multiprocessing.ipynb

This file was deleted.

2 changes: 1 addition & 1 deletion src/typing_assistant/bm25_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
tic = time.time()
bm25 = BM25Okapi(tokenized_corpus)
print('index', time.time() - tic)
n = 1
n = 10
tic = time.time()
for _ in range(n):
query = 'running football player player in field across football'
Expand Down
6 changes: 3 additions & 3 deletions src/typing_assistant/indexing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .collector import Collection
from .lex import Lexicon
from .parallel_indexer import InvertedIndex
from .collection import Collection
from .indexer import InvertedIndex
from .lexicon import Lexicon

__all__ = [
'Collection',
Expand Down
25 changes: 16 additions & 9 deletions src/typing_assistant/indexing/indexer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import itertools
from collections import Counter, defaultdict
from typing import DefaultDict, List, Tuple

from .collector import Collection
from joblib import Parallel, delayed

from . import Collection


class Posting:
Expand All @@ -18,17 +21,21 @@ class InvertedIndex:

def __init__(self, collection: Collection):
self.collection: Collection = collection
self.index: DefaultDict[str, List[Posting]] = defaultdict(list)
self.inv_index: DefaultDict[str, List[Posting]] = defaultdict(list)

def __index_document(self, doc_id: int):
document = self.collection.get_document(doc_id)
@staticmethod
def __index_document(doc_id, document):
term_frequencies = Counter(document.tokens)
update_dict = {term: self.index[term] + [Posting(doc_id, freq)] for term, freq in term_frequencies.items()}
self.index.update(update_dict)
partial_term_postings = [(term, [Posting(doc_id, freq)]) for term, freq in term_frequencies.items()]
return partial_term_postings

def index_collection(self):
for doc_id in self.collection.get_docs_id():
self.__index_document(doc_id)
map_responses = Parallel(n_jobs=4)(delayed(InvertedIndex.__index_document)(
doc_id,
self.collection.get_document(doc_id),
) for doc_id in self.collection.get_docs_id())
for term, postings in itertools.chain(*map_responses):
self.inv_index[term] += postings

def get_items(self) -> List[Tuple[str, List[Posting]]]:
return self.index.items()
return self.inv_index.items()
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import math
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from typing import Dict, List, Optional
from typing import Dict, List

from .collector import Collection
from .indexer import InvertedIndex, Posting
from . import Collection, InvertedIndex
from .indexer import Posting


class WordLexicon:
Expand All @@ -27,8 +25,6 @@ class Lexicon:

def __init__(self):
self.lexicon: Dict[str, WordLexicon] = {}
self.matcher: Optional[SequenceMatcher] = None
self.terms: Optional[List[str]] = None

def __add_word_lexicon(self, collection_size: int, term: str, postings: List[Posting]):
self.lexicon[term] = WordLexicon(
Expand All @@ -48,22 +44,5 @@ def get_words_lexicon(self) -> List[WordLexicon]:
def get_word_lexicon(self, word: str) -> WordLexicon:
return self.lexicon[word]

def init_query_mode(self):
self.matcher = SequenceMatcher(isjunk=None, autojunk=False)

def expand_query(self, query_words: List[str]) -> List[str]:
cutoff = 0.8
exact_words = set(query_words) & set(self.lexicon)
approx_words = set(query_words) - exact_words
exact_query_words = [w for w in query_words if w not in approx_words]
query_expansion = defaultdict(float)
for q_word in approx_words:
self.matcher.set_seq2(q_word)
for term in self.lexicon:
self.matcher.set_seq1(term)
if self.matcher.real_quick_ratio() >= cutoff and \
self.matcher.quick_ratio() >= cutoff and \
self.matcher.ratio() >= cutoff:
query_expansion[term] += self.matcher.ratio()
query_expansion.update(dict(Counter(exact_query_words)))
return query_expansion
def get_terms(self) -> List[str]:
return self.lexicon.keys()
40 changes: 0 additions & 40 deletions src/typing_assistant/indexing/parallel_indexer.py

This file was deleted.

3 changes: 1 addition & 2 deletions src/typing_assistant/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@
print('load lexicon', time.time() - tic)
print('lexicon entries', len(lexicon.get_words_lexicon()))

lexicon.init_query_mode()
ranker = OkapiBM25Ranker(collection, lexicon)
n = 1
n = 10
tic = time.time()
for _ in range(n):
query = 'running football player player in field across football'
Expand Down
29 changes: 29 additions & 0 deletions src/typing_assistant/ranking/query_expander.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from typing import DefaultDict, Dict, List

from ..indexing import Lexicon


class QueryExpander:
    """Expands a tokenized query against a lexicon's vocabulary.

    Words already present in the lexicon are kept as-is (weighted by their
    occurrence count in the query); unknown words are replaced by every
    lexicon term whose similarity ratio meets a cutoff, weighted by that
    ratio.
    """

    def __init__(self, lexicon: Lexicon):
        # Materialize once: Lexicon.get_terms() may return a dict view that
        # would otherwise track later mutations of the lexicon.
        self.terms: List[str] = list(lexicon.get_terms())
        # A single reusable matcher; autojunk disabled so long terms are not
        # silently degraded by difflib's popular-element heuristic.
        self.matcher: SequenceMatcher = SequenceMatcher(isjunk=None, autojunk=False)

    def expand_query(self, query_words: List[str], cutoff: float = 0.8) -> Dict[str, float]:
        """Return a mapping of expansion term -> weight for *query_words*.

        Args:
            query_words: the tokenized query.
            cutoff: minimum difflib similarity ratio for an approximate
                match (defaults to the original hard-coded 0.8).

        Returns:
            Dict of term -> weight. Known words map to their count in the
            query; approximate matches map to the sum of their similarity
            ratios. (The original annotation said List[str], but a dict
            has always been returned.)
        """
        known_words = set(query_words) & set(self.terms)
        unknown_words = set(query_words) - known_words
        known_query_words = [w for w in query_words if w not in unknown_words]
        query_expansion: DefaultDict[str, float] = defaultdict(float)
        for q_word in unknown_words:
            self.matcher.set_seq2(q_word)
            for term in self.terms:
                self.matcher.set_seq1(term)
                # Cheap upper bounds first: real_quick_ratio() and
                # quick_ratio() prune before the expensive ratio() call.
                if self.matcher.real_quick_ratio() >= cutoff and \
                        self.matcher.quick_ratio() >= cutoff and \
                        self.matcher.ratio() >= cutoff:
                    query_expansion[term] += self.matcher.ratio()
        # Exact matches keep their query frequency as an integer weight.
        query_expansion.update(dict(Counter(known_query_words)))
        return query_expansion
4 changes: 3 additions & 1 deletion src/typing_assistant/ranking/ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import defaultdict
from typing import List, Tuple

from .query_expander import QueryExpander
from ..indexing import Collection, Lexicon


Expand All @@ -10,13 +11,14 @@ class OkapiBM25Ranker:
def __init__(self, collection: Collection, lexicon: Lexicon, kappa: float = 1.5, beta: float = 0.75):
self.collection: Collection = collection
self.lexicon: Lexicon = lexicon
self.query_expander: QueryExpander = QueryExpander(lexicon)
self.kappa: float = kappa
self.beta: float = beta
self.avgdl: float = sum(x.tot_freq for x in self.lexicon.get_words_lexicon()) / self.collection.get_size()

def lookup_query(self, query: str) -> List[Tuple[str, float]]:
query_words = list(re.findall(r'\w+', query))
expanded_query_words = self.lexicon.expand_query(query_words)
expanded_query_words = self.query_expander.expand_query(query_words)
tf, idf = defaultdict(lambda: defaultdict(int)), {}
for w in expanded_query_words:
word_lexicon = self.lexicon.get_word_lexicon(w)
Expand Down