Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

add: approximate words search #5

Merged
merged 5 commits into from
Jan 9, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
add: decoupling query expansion from lexicon
  • Loading branch information
jarvis0 committed Jan 8, 2022
commit 35bc80f26e9b4d17c851d1561359723dcddab0fd
4 changes: 2 additions & 2 deletions data/dumps/collection.pkl
Git LFS file not shown
4 changes: 2 additions & 2 deletions data/dumps/lexicon.pkl
Git LFS file not shown
96 changes: 0 additions & 96 deletions notebooks/dict_multiprocessing.ipynb

This file was deleted.

2 changes: 1 addition & 1 deletion src/typing_assistant/bm25_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
tic = time.time()
bm25 = BM25Okapi(tokenized_corpus)
print('index', time.time() - tic)
n = 1
n = 10
tic = time.time()
for _ in range(n):
query = 'running football player player in field across football'
Expand Down
6 changes: 3 additions & 3 deletions src/typing_assistant/indexing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .collector import Collection
from .lex import Lexicon
from .parallel_indexer import InvertedIndex
from .collection import Collection
from .indexer import InvertedIndex
from .lexicon import Lexicon

__all__ = [
'Collection',
Expand Down
25 changes: 16 additions & 9 deletions src/typing_assistant/indexing/indexer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import itertools
from collections import Counter, defaultdict
from typing import DefaultDict, List, Tuple

from .collector import Collection
from joblib import Parallel, delayed

from . import Collection


class Posting:
Expand All @@ -18,17 +21,21 @@ class InvertedIndex:

def __init__(self, collection: Collection):
self.collection: Collection = collection
self.index: DefaultDict[str, List[Posting]] = defaultdict(list)
self.inv_index: DefaultDict[str, List[Posting]] = defaultdict(list)

def __index_document(self, doc_id: int):
document = self.collection.get_document(doc_id)
@staticmethod
def __index_document(doc_id, document):
term_frequencies = Counter(document.tokens)
update_dict = {term: self.index[term] + [Posting(doc_id, freq)] for term, freq in term_frequencies.items()}
self.index.update(update_dict)
partial_term_postings = [(term, [Posting(doc_id, freq)]) for term, freq in term_frequencies.items()]
return partial_term_postings

def index_collection(self):
for doc_id in self.collection.get_docs_id():
self.__index_document(doc_id)
map_responses = Parallel(n_jobs=4)(delayed(InvertedIndex.__index_document)(
doc_id,
self.collection.get_document(doc_id),
) for doc_id in self.collection.get_docs_id())
for term, postings in itertools.chain(*map_responses):
self.inv_index[term] += postings

def get_items(self) -> List[Tuple[str, List[Posting]]]:
return self.index.items()
return self.inv_index.items()
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import math
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from typing import Dict, List, Optional
from typing import Dict, List

from .collector import Collection
from .indexer import InvertedIndex, Posting
from . import Collection, InvertedIndex
from .indexer import Posting


class WordLexicon:
Expand All @@ -27,8 +25,6 @@ class Lexicon:

def __init__(self):
self.lexicon: Dict[str, WordLexicon] = {}
self.matcher: Optional[SequenceMatcher] = None
self.terms: Optional[List[str]] = None

def __add_word_lexicon(self, collection_size: int, term: str, postings: List[Posting]):
self.lexicon[term] = WordLexicon(
Expand All @@ -48,22 +44,5 @@ def get_words_lexicon(self) -> List[WordLexicon]:
def get_word_lexicon(self, word: str) -> WordLexicon:
return self.lexicon[word]

def init_query_mode(self):
self.matcher = SequenceMatcher(isjunk=None, autojunk=False)

def expand_query(self, query_words: List[str]) -> List[str]:
cutoff = 0.8
exact_words = set(query_words) & set(self.lexicon)
approx_words = set(query_words) - exact_words
exact_query_words = [w for w in query_words if w not in approx_words]
query_expansion = defaultdict(float)
for q_word in approx_words:
self.matcher.set_seq2(q_word)
for term in self.lexicon:
self.matcher.set_seq1(term)
if self.matcher.real_quick_ratio() >= cutoff and \
self.matcher.quick_ratio() >= cutoff and \
self.matcher.ratio() >= cutoff:
query_expansion[term] += self.matcher.ratio()
query_expansion.update(dict(Counter(exact_query_words)))
return query_expansion
def get_terms(self) -> List[str]:
return self.lexicon.keys()
40 changes: 0 additions & 40 deletions src/typing_assistant/indexing/parallel_indexer.py

This file was deleted.

3 changes: 1 addition & 2 deletions src/typing_assistant/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@
print('load lexicon', time.time() - tic)
print('lexicon entries', len(lexicon.get_words_lexicon()))

lexicon.init_query_mode()
ranker = OkapiBM25Ranker(collection, lexicon)
n = 1
n = 10
tic = time.time()
for _ in range(n):
query = 'running football player player in field across football'
Expand Down
29 changes: 29 additions & 0 deletions src/typing_assistant/ranking/query_expander.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from typing import DefaultDict, Dict, List

from ..indexing import Lexicon


class QueryExpander:
    """Expands a tokenized query against a lexicon's vocabulary.

    Words already present in the lexicon are kept as-is (weighted by their
    occurrence count in the query); unknown words are replaced by every
    lexicon term whose similarity ratio meets a cutoff, weighted by that
    ratio.
    """

    def __init__(self, lexicon: Lexicon):
        # Materialize once: Lexicon.get_terms() may return a dict view that
        # would otherwise track later mutations of the lexicon.
        self.terms: List[str] = list(lexicon.get_terms())
        # A single reusable matcher; autojunk disabled so long terms are not
        # silently degraded by difflib's popular-element heuristic.
        self.matcher: SequenceMatcher = SequenceMatcher(isjunk=None, autojunk=False)

    def expand_query(self, query_words: List[str], cutoff: float = 0.8) -> Dict[str, float]:
        """Return a mapping of expansion term -> weight for *query_words*.

        Args:
            query_words: the tokenized query.
            cutoff: minimum difflib similarity ratio for an approximate
                match (defaults to the original hard-coded 0.8).

        Returns:
            Dict of term -> weight. Known words map to their count in the
            query; approximate matches map to the sum of their similarity
            ratios. (The original annotation said List[str], but a dict
            has always been returned.)
        """
        known_words = set(query_words) & set(self.terms)
        unknown_words = set(query_words) - known_words
        known_query_words = [w for w in query_words if w not in unknown_words]
        query_expansion: DefaultDict[str, float] = defaultdict(float)
        for q_word in unknown_words:
            self.matcher.set_seq2(q_word)
            for term in self.terms:
                self.matcher.set_seq1(term)
                # Cheap upper bounds first: real_quick_ratio() and
                # quick_ratio() prune before the expensive ratio() call.
                if self.matcher.real_quick_ratio() >= cutoff and \
                        self.matcher.quick_ratio() >= cutoff and \
                        self.matcher.ratio() >= cutoff:
                    query_expansion[term] += self.matcher.ratio()
        # Exact matches keep their query frequency as an integer weight.
        query_expansion.update(dict(Counter(known_query_words)))
        return query_expansion
4 changes: 3 additions & 1 deletion src/typing_assistant/ranking/ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import defaultdict
from typing import List, Tuple

from .query_expander import QueryExpander
from ..indexing import Collection, Lexicon


Expand All @@ -10,13 +11,14 @@ class OkapiBM25Ranker:
def __init__(self, collection: Collection, lexicon: Lexicon, kappa: float = 1.5, beta: float = 0.75):
self.collection: Collection = collection
self.lexicon: Lexicon = lexicon
self.query_expander: QueryExpander = QueryExpander(lexicon)
self.kappa: float = kappa
self.beta: float = beta
self.avgdl: float = sum(x.tot_freq for x in self.lexicon.get_words_lexicon()) / self.collection.get_size()

def lookup_query(self, query: str) -> List[Tuple[str, float]]:
query_words = list(re.findall(r'\w+', query))
expanded_query_words = self.lexicon.expand_query(query_words)
expanded_query_words = self.query_expander.expand_query(query_words)
tf, idf = defaultdict(lambda: defaultdict(int)), {}
for w in expanded_query_words:
word_lexicon = self.lexicon.get_word_lexicon(w)
Expand Down