Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion milli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.0.1"
byteorder = "1.4.3"
charabia = { version = "0.6.0", default-features = false }
charabia = { version = "0.7.0", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.6"
either = "1.8.0"
Expand Down Expand Up @@ -70,6 +70,10 @@ hebrew = ["charabia/hebrew"]

# allow japanese specialized tokenization
japanese = ["charabia/japanese"]
japanese-transliteration = ["charabia/japanese-transliteration"]

# allow korean specialized tokenization
korean = ["charabia/korean"]

# allow thai specialized tokenization
thai = ["charabia/thai"]
6 changes: 3 additions & 3 deletions milli/src/search/matches/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
/// Structure used to build a Matcher allowing to customize formatting tags.
pub struct MatcherBuilder<'a, A> {
matching_words: MatchingWords,
tokenizer: Tokenizer<'a, A>,
tokenizer: Tokenizer<'a, 'a, A>,
crop_marker: Option<String>,
highlight_prefix: Option<String>,
highlight_suffix: Option<String>,
}

impl<'a, A> MatcherBuilder<'a, A> {
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self {
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
Self {
matching_words,
tokenizer,
Expand Down Expand Up @@ -106,7 +106,7 @@ pub struct MatchBounds {
pub struct Matcher<'t, 'm, A> {
text: &'t str,
matching_words: &'m MatchingWords,
tokenizer: &'m Tokenizer<'m, A>,
tokenizer: &'m Tokenizer<'m, 'm, A>,
crop_marker: &'m str,
highlight_prefix: &'m str,
highlight_suffix: &'m str,
Expand Down
8 changes: 4 additions & 4 deletions milli/src/search/query_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::hash::Hash;
use std::rc::Rc;
use std::{fmt, mem};

use charabia::classifier::ClassifiedTokenIter;
use charabia::normalizer::NormalizedTokenIter;
use charabia::{SeparatorKind, TokenKind};
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;
Expand Down Expand Up @@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> {
/// (the criterion `typo` will be ignored)
pub fn build<A: AsRef<[u8]>>(
&self,
query: ClassifiedTokenIter<A>,
query: NormalizedTokenIter<A>,
) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
let primitive_query = create_primitive_query(query, self.words_limit);
if !primitive_query.is_empty() {
Expand Down Expand Up @@ -778,7 +778,7 @@ impl PrimitiveQueryPart {
/// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree.
fn create_primitive_query<A>(
query: ClassifiedTokenIter<A>,
query: NormalizedTokenIter<A>,
words_limit: Option<usize>,
) -> PrimitiveQuery
where
Expand Down Expand Up @@ -892,7 +892,7 @@ mod test {
terms_matching_strategy: TermsMatchingStrategy,
authorize_typos: bool,
words_limit: Option<usize>,
query: ClassifiedTokenIter<A>,
query: NormalizedTokenIter<A>,
) -> Result<Option<(Operation, PrimitiveQuery)>> {
let primitive_query = create_primitive_query(query, words_limit);
if !primitive_query.is_empty() {
Expand Down
4 changes: 2 additions & 2 deletions milli/src/update/index_documents/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1575,11 +1575,11 @@ mod tests {
let rtxn = index.read_txn().unwrap();

// Only the first document should match.
let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
assert_eq!(count, 1);

// Only the second document should match.
let count = index.word_docids.get(&rtxn, "").unwrap().unwrap().len();
let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
assert_eq!(count, 1);

let mut search = crate::Search::new(&rtxn, &index);
Expand Down