From 5441c3a37552f787474d45ba7b17b97d2c7b2bda Mon Sep 17 00:00:00 2001 From: pasha Date: Fri, 23 Jun 2023 16:59:34 +0300 Subject: [PATCH] feat: Rule-based noun detection --- .../src/components/default_tokenizers.rs | 2 +- summa-core/src/components/mod.rs | 2 +- .../query_parser/morphology/english.rs | 32 +++++++++++++------ .../src/components/query_parser/summa_ql.rs | 4 +-- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/summa-core/src/components/default_tokenizers.rs b/summa-core/src/components/default_tokenizers.rs index af80df72..065a0e0c 100644 --- a/summa-core/src/components/default_tokenizers.rs +++ b/summa-core/src/components/default_tokenizers.rs @@ -3,7 +3,7 @@ use tantivy::tokenizer::{LowerCaser, RawTokenizer, RemoveLongFilter, SimpleToken use super::summa_tokenizer::SummaTokenizer; /// List of stop words mixed for multiple languages -const STOP_WORDS: [&str; 318] = [ +pub const STOP_WORDS: [&str; 318] = [ "a", "an", "and", diff --git a/summa-core/src/components/mod.rs b/summa-core/src/components/mod.rs index e3c0c623..cecdcb37 100644 --- a/summa-core/src/components/mod.rs +++ b/summa-core/src/components/mod.rs @@ -13,7 +13,7 @@ mod summa_document; mod summa_tokenizer; pub use custom_serializer::NamedFieldDocument; -pub use default_tokenizers::default_tokenizers; +pub use default_tokenizers::{default_tokenizers, STOP_WORDS}; pub use fruit_extractors::{build_fruit_extractor, FruitExtractor, IntermediateExtractionResult}; pub use index_holder::{cleanup_index, IndexHolder}; pub use index_registry::IndexRegistry; diff --git a/summa-core/src/components/query_parser/morphology/english.rs b/summa-core/src/components/query_parser/morphology/english.rs index f2aeec9a..c8997345 100644 --- a/summa-core/src/components/query_parser/morphology/english.rs +++ b/summa-core/src/components/query_parser/morphology/english.rs @@ -1,3 +1,7 @@ +use std::collections::HashSet; + +use regex::RegexSet; + use crate::components::query_parser::morphology::Morphology; #[derive(Default, Clone)] @@ -5,16 +9,26 @@ pub struct EnglishMorphology {} impl Morphology for EnglishMorphology { fn derive_tenses(&self, word: &str) -> Option { - let is_singular = pluralize_rs::is_singular(word); - let is_plural = pluralize_rs::is_plural(word); - - if is_singular { - Some(pluralize_rs::to_plural(word)) - } else if is_plural { - Some(pluralize_rs::to_singular(word)) - } else { - None + thread_local! { + static NOT_A_NOUN: (RegexSet, HashSet<&'static str>) = (RegexSet::new(&[ + r"\d$", + r"ing$", + ]).expect("cannot compile regex"), HashSet::from_iter(crate::components::default_tokenizers::STOP_WORDS.into_iter())); } + NOT_A_NOUN.with(|(not_a_noun_regex, stop_words)| { + if stop_words.contains(word) || not_a_noun_regex.is_match(word) { + return None; + } + let is_singular = pluralize_rs::is_singular(word); + let is_plural = pluralize_rs::is_plural(word); + if is_singular { + Some(pluralize_rs::to_plural(word)) + } else if is_plural { + Some(pluralize_rs::to_singular(word)) + } else { + None + } + }) } fn detect_ners(&self, _: &str) -> Vec { diff --git a/summa-core/src/components/query_parser/summa_ql.rs b/summa-core/src/components/query_parser/summa_ql.rs index e20c16a5..fdc43b07 100644 --- a/summa-core/src/components/query_parser/summa_ql.rs +++ b/summa-core/src/components/query_parser/summa_ql.rs @@ -1178,8 +1178,8 @@ mod tests { ); query_parser.query_parser_config.0.morphology_configs = morphology_configs; query_parser.query_parser_config.0.query_language = Some("en".to_string()); - let query = query_parser.parse_query("red search engine"); - assert_eq!(format!("{:?}", query), "Ok(BooleanQuery { subqueries: [(Should, DisjunctionMaxQuery { disjuncts: [TermQuery(Term(field=0, type=Str, \"red\")), TermQuery(Term(field=0, type=Str, \"reds\"))], tie_breaker: 0.3 }), (Should, TermQuery(Term(field=0, type=Str, \"search\"))), (Should, DisjunctionMaxQuery { disjuncts: [TermQuery(Term(field=0, type=Str, \"engine\")), TermQuery(Term(field=0, type=Str, \"engines\"))], tie_breaker: 0.3 })] })"); + let query = query_parser.parse_query("red1 search engine going"); + assert_eq!(format!("{:?}", query), ""); let query = query_parser.parse_query("iso 34-1:2022"); assert_eq!(format!("{:?}", query), "Ok(BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \"iso\"))), (Should, TermQuery(Term(field=0, type=Str, \"34\"))), (Should, TermQuery(Term(field=0, type=Str, \"1\")))] })"); }