Skip to content

Commit

Permalink
feat: Rule-based noun detection
Browse files Browse the repository at this point in the history
  • Loading branch information
ppodolsky committed Jun 23, 2023
1 parent 2528166 commit 5441c3a
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 13 deletions.
2 changes: 1 addition & 1 deletion summa-core/src/components/default_tokenizers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use tantivy::tokenizer::{LowerCaser, RawTokenizer, RemoveLongFilter, SimpleToken
use super::summa_tokenizer::SummaTokenizer;

/// List of stop words mixed for multiple languages
const STOP_WORDS: [&str; 318] = [
pub const STOP_WORDS: [&str; 318] = [
"a",
"an",
"and",
Expand Down
2 changes: 1 addition & 1 deletion summa-core/src/components/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ mod summa_document;
mod summa_tokenizer;

pub use custom_serializer::NamedFieldDocument;
pub use default_tokenizers::default_tokenizers;
pub use default_tokenizers::{default_tokenizers, STOP_WORDS};
pub use fruit_extractors::{build_fruit_extractor, FruitExtractor, IntermediateExtractionResult};
pub use index_holder::{cleanup_index, IndexHolder};
pub use index_registry::IndexRegistry;
Expand Down
32 changes: 23 additions & 9 deletions summa-core/src/components/query_parser/morphology/english.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,34 @@
use std::collections::HashSet;

use regex::RegexSet;

use crate::components::query_parser::morphology::Morphology;

/// Field-less marker type that carries the English implementation of the
/// `Morphology` trait (see the `impl Morphology for EnglishMorphology` block).
#[derive(Default, Clone)]
pub struct EnglishMorphology {}

impl Morphology for EnglishMorphology {
// NOTE(review): this listing is a rendered diff whose +/- markers and indentation were
// stripped. The first `is_singular` .. `None` run appears to be the body being removed and
// the `thread_local!` section the body replacing it — confirm against the repository
// before reusing this text as-is.
/// Derives the opposite grammatical-number form of `word`.
///
/// Returns the plural when `pluralize_rs` reports `word` as singular, the
/// singular when it reports `word` as plural, and `None` when neither check
/// succeeds. In the `thread_local!` variant, `None` is also returned up front
/// for stop words and for words matching one of the `NOT_A_NOUN` patterns.
fn derive_tenses(&self, word: &str) -> Option<String> {
let is_singular = pluralize_rs::is_singular(word);
let is_plural = pluralize_rs::is_plural(word);

if is_singular {
Some(pluralize_rs::to_plural(word))
} else if is_plural {
Some(pluralize_rs::to_singular(word))
} else {
None
// Thread-local pair: a regex set matching words that end in a digit or in "ing",
// and a hash set built from the shared `STOP_WORDS` list.
thread_local! {
static NOT_A_NOUN: (RegexSet, HashSet<&'static str>) = (RegexSet::new(&[
r"\d$",
r"ing$",
]).expect("cannot compile regex"), HashSet::from_iter(crate::components::default_tokenizers::STOP_WORDS.into_iter()));
}
NOT_A_NOUN.with(|(not_a_noun_regex, stop_words)| {
// Stop words and pattern matches get no derived form at all.
if stop_words.contains(word) || not_a_noun_regex.is_match(word) {
return None;
}
let is_singular = pluralize_rs::is_singular(word);
let is_plural = pluralize_rs::is_plural(word);
// Singular input yields its plural; plural input yields its singular.
if is_singular {
Some(pluralize_rs::to_plural(word))
} else if is_plural {
Some(pluralize_rs::to_singular(word))
} else {
None
}
})
}

fn detect_ners(&self, _: &str) -> Vec<String> {
Expand Down
4 changes: 2 additions & 2 deletions summa-core/src/components/query_parser/summa_ql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1178,8 +1178,8 @@ mod tests {
);
query_parser.query_parser_config.0.morphology_configs = morphology_configs;
query_parser.query_parser_config.0.query_language = Some("en".to_string());
let query = query_parser.parse_query("red search engine");
assert_eq!(format!("{:?}", query), "Ok(BooleanQuery { subqueries: [(Should, DisjunctionMaxQuery { disjuncts: [TermQuery(Term(field=0, type=Str, \"red\")), TermQuery(Term(field=0, type=Str, \"reds\"))], tie_breaker: 0.3 }), (Should, TermQuery(Term(field=0, type=Str, \"search\"))), (Should, DisjunctionMaxQuery { disjuncts: [TermQuery(Term(field=0, type=Str, \"engine\")), TermQuery(Term(field=0, type=Str, \"engines\"))], tie_breaker: 0.3 })] })");
let query = query_parser.parse_query("red1 search engine going");
assert_eq!(format!("{:?}", query), "");
let query = query_parser.parse_query("iso 34-1:2022");
assert_eq!(format!("{:?}", query), "Ok(BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \"iso\"))), (Should, TermQuery(Term(field=0, type=Str, \"34\"))), (Should, TermQuery(Term(field=0, type=Str, \"1\")))] })");
}
Expand Down

0 comments on commit 5441c3a

Please sign in to comment.