Skip to content

Commit

Permalink
fix/update_solver_kwargs (#46)
Browse files: browse the repository at this point in the history
* fix/update_solver_kwargs

The `context` dict argument of the solver methods has been dropped in favor of an explicit `lang` keyword argument.

* fix/update_solver_kwargs

The `context` dict argument of the solver methods has been dropped in favor of an explicit `lang` keyword argument.

* docstrings
  • Loading branch information
JarbasAl authored Aug 4, 2024
1 parent 91014b4 commit c3d9e22
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 55 deletions.
84 changes: 46 additions & 38 deletions ovos_classifiers/opm/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,55 +87,64 @@ def transform(self, utterances: List[str],


class HeuristicSummarizerPlugin(TldrSolver):
"""heuristic summarizer, picks best sentences based on word frequencies"""
"""Heuristic summarizer that picks the best sentences based on word frequencies."""

def get_tldr(self, document, context=None):
context = context or {}
lang = context.get("lang") or "en"
def get_tldr(self, document: str, lang: Optional[str] = None) -> str:
"""
Summarizes the given document using word frequencies.
Args:
document (str): The document to summarize.
lang (Optional[str]): The language of the document. Defaults to "en".
Returns:
str: The summarized text.
"""
lang = lang or "en"
return WordFrequencySummarizer().summarize(document, lang)


class BM25MultipleChoiceSolver(MultipleChoiceSolver):
"""select best answer to a question from a list of options """
"""Selects the best answer to a question from a list of options using the BM25 algorithm."""

# plugin methods to override
def rerank(self, query: str, options: List[str],
context: Optional[dict] = None) -> List[Tuple[float, str]]:
def rerank(self, query: str, options: List[str], lang: Optional[str] = None) -> List[Tuple[float, str]]:
"""
rank options list, returning a list of tuples (score, text)
Ranks the options list, returning a list of tuples (score, text).
Args:
query (str): The query string.
options (List[str]): The list of options to rank.
lang (Optional[str]): The language of the query and options. Defaults to None.
Returns:
List[Tuple[float, str]]: A list of tuples containing the score and the option text.
"""
from ovos_classifiers.heuristics.machine_comprehension import rank_answers
context = context or {}
try:
lang = context.get("lang")
stopwords = get_stopwords(lang)
except: # in case nltk is not available or stopwords dataset download fails for any reason
stopwords = []
return sorted([(s, a) for a, s in rank_answers(query, options, stopwords).items()],
key=lambda k: k[0], reverse=True)

def select_answer(self, query, options, context=None):
"""
query and options assured to be in self.default_lang
return best answer from options list
"""
context = context or {}
try:
from ovos_classifiers.heuristics.machine_comprehension import get_best_answer
lang = context.get("lang")
stopwords = get_stopwords(lang)
except: # in case nltk is not available or stopwords dataset download fails for any reason
stopwords = []
return get_best_answer(query, options, stopwords)
stopwords = []
if lang:
try:
stopwords = get_stopwords(lang)
except Exception: # In case nltk is not available or stopwords dataset download fails for any reason
pass

ranked_answers = rank_answers(query, options, stopwords)
return sorted([(s, a) for a, s in ranked_answers.items()], key=lambda k: k[0], reverse=True)


class BM25SolverPlugin(EvidenceSolver):
"""extract best sentence from text that answers the question, using BM25 algorithm"""
"""Extracts the best sentence from text that answers the question using the BM25 algorithm."""

def get_best_passage(self, evidence, question, context=None):
def get_best_passage(self, evidence: str, question: str, lang: Optional[str] = None) -> str:
"""
evidence and question assured to be in self.default_lang
returns summary of provided document
Extracts the best passage from the evidence that answers the question.
Args:
evidence (str): The evidence text to search within.
question (str): The question to answer.
lang (Optional[str]): The language of the evidence and question. Defaults to None.
Returns:
str: The best passage that answers the question.
"""
bm25 = BM25()

Expand All @@ -145,9 +154,8 @@ def get_best_passage(self, evidence, question, context=None):
corpus = [word_tokenize(s) for s in sents]
bm25.fit(corpus)
scores = bm25.search(word_tokenize(question))
ans = max([s for s in zip(scores, corpus)],
key=lambda k: k[0])[1]
return " ".join(ans)
best_sentence = max(zip(scores, corpus), key=lambda k: k[0])[1]
return " ".join(best_sentence)


class HeuristicKeywordExtractorPlugin(KeywordExtractor):
Expand Down
73 changes: 56 additions & 17 deletions ovos_classifiers/opm/nltk.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# these plugins require nltk and may download external data/models at runtime
import random
from typing import Optional, Tuple, Dict

from nltk import pos_tag as _pt
from nltk.corpus import wordnet as wn
from ovos_plugin_manager.templates.keywords import KeywordExtractor
from ovos_plugin_manager.templates.language import LanguageDetector
from ovos_plugin_manager.templates.postag import PosTagger
from ovos_plugin_manager.templates.solvers import QuestionSolver
from ovos_plugin_manager.templates.keywords import KeywordExtractor
from quebra_frases import span_indexed_word_tokenize

from ovos_classifiers.datasets.wordnet import Wordnet
Expand All @@ -15,16 +16,32 @@


class WordnetSolverPlugin(QuestionSolver):
""" question answerer that uses wordnet for definitions synonyms and antonyms"""
"""A question answerer that uses WordNet for definitions, synonyms, and antonyms."""
enable_tx = True
priority = 80

def __init__(self, config=None):
def __init__(self, config: Optional[Dict] = None):
"""
Initializes the WordnetSolverPlugin with a given configuration.
Args:
config (Optional[Dict]): Configuration dictionary. Defaults to None.
"""
config = config or {}
config["lang"] = "en" # only english supported
config["lang"] = "en" # Only English supported
super().__init__(config)

def get_data_key(self, query, lang="en"):
def get_data_key(self, query: str, lang: str = "en") -> Tuple[Optional[str], str]:
"""
Determines the type of data (definition, synonyms, antonyms) requested by the query.
Args:
query (str): The query string.
lang (str): The language of the query. Defaults to "en".
Returns:
Tuple[Optional[str], str]: A tuple containing the data type key and the processed query.
"""
query = HeuristicExtractor.extract_subject(query, lang) or query

# TODO localization
Expand Down Expand Up @@ -53,29 +70,51 @@ def get_data_key(self, query, lang="en"):
return None, query

# officially exported Solver methods
def get_data(self, query, context=None):
pos = wn.NOUN # TODO check context for postag
def get_data(self, query: str, lang: Optional[str] = None) -> Dict[str, str]:
"""
Retrieves WordNet data for the given query.
Args:
query (str): The query string.
lang (Optional[str]): The language of the query. Defaults to None.
Returns:
Dict[str, str]: A dictionary containing WordNet data such as lemmas, antonyms, definitions, etc.
"""
pos = wn.NOUN # TODO: Check context for part of speech
synsets = wn.synsets(query, pos=pos)
if not len(synsets):
return {}
synset = synsets[0]
res = {"lemmas": Wordnet.get_lemmas(query, pos=pos, synset=synset),
"antonyms": Wordnet.get_antonyms(query, pos=pos, synset=synset),
"holonyms": Wordnet.get_holonyms(query, pos=pos, synset=synset),
"hyponyms": Wordnet.get_hyponyms(query, pos=pos, synset=synset),
"hypernyms": Wordnet.get_hypernyms(query, pos=pos, synset=synset),
"root_hypernyms": Wordnet.get_root_hypernyms(query, pos=pos, synset=synset),
"definition": Wordnet.get_definition(query, pos=pos, synset=synset)}
res = {
"lemmas": Wordnet.get_lemmas(query, pos=pos, synset=synset),
"antonyms": Wordnet.get_antonyms(query, pos=pos, synset=synset),
"holonyms": Wordnet.get_holonyms(query, pos=pos, synset=synset),
"hyponyms": Wordnet.get_hyponyms(query, pos=pos, synset=synset),
"hypernyms": Wordnet.get_hypernyms(query, pos=pos, synset=synset),
"root_hypernyms": Wordnet.get_root_hypernyms(query, pos=pos, synset=synset),
"definition": Wordnet.get_definition(query, pos=pos, synset=synset)
}
return res

def get_spoken_answer(self, query, context=None):
lang = context.get("lang") or self.default_lang
def get_spoken_answer(self, query: str, lang: Optional[str] = None) -> Optional[str]:
"""
Generates a spoken answer for the given query.
Args:
query (str): The query string.
lang (Optional[str]): The language of the query. Defaults to None.
Returns:
Optional[str]: The spoken answer, if available.
"""
lang = lang or self.default_lang
lang = lang.split("-")[0]
# extract the best keyword with some regexes or fallback to RAKE
k, query = self.get_data_key(query, lang)
if not query:
query = HeuristicExtractor.extract_subject(query, lang) or query
data = self.search(query, context)
data = self.search(query, lang=lang)
if k and k in data:
v = data[k]
if k in ["lemmas", "antonyms"] and len(v):
Expand Down

0 comments on commit c3d9e22

Please sign in to comment.