Merge #736

bors[bot] · ManyTheFish · web-flow · commit 6a10e8570781 · 2023-01-03T15:44:41.000Z
736: Update charabia r=curquiza a=ManyTheFish

Update Charabia to the latest version.

> We are now Romanizing Chinese characters into Pinyin.
> Note that we keep the accents because they are in fact never typed directly by the end user; moreover, changing an accent leads to a different Chinese character, and I don't have sufficient knowledge to forecast the impact of removing accents in this context.

Co-authored-by: ManyTheFish <many@meilisearch.com>
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
@@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.0.1"
 byteorder = "1.4.3"
-charabia = { version = "0.6.0", default-features = false }
+charabia = { version = "0.7.0", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
 either = "1.8.0"
@@ -70,6 +70,10 @@ hebrew = ["charabia/hebrew"]
 
 # allow japanese specialized tokenization
 japanese = ["charabia/japanese"]
+japanese-transliteration = ["charabia/japanese-transliteration"]
+
+# allow korean specialized tokenization
+korean = ["charabia/korean"]
 
 # allow thai specialized tokenization
 thai = ["charabia/thai"]
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
@@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
 /// Structure used to build a Matcher allowing to customize formating tags.
 pub struct MatcherBuilder<'a, A> {
     matching_words: MatchingWords,
-    tokenizer: Tokenizer<'a, A>,
+    tokenizer: Tokenizer<'a, 'a, A>,
     crop_marker: Option<String>,
     highlight_prefix: Option<String>,
     highlight_suffix: Option<String>,
 }
 
 impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self {
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
         Self {
             matching_words,
             tokenizer,
@@ -106,7 +106,7 @@ pub struct MatchBounds {
 pub struct Matcher<'t, 'm, A> {
     text: &'t str,
     matching_words: &'m MatchingWords,
-    tokenizer: &'m Tokenizer<'m, A>,
+    tokenizer: &'m Tokenizer<'m, 'm, A>,
     crop_marker: &'m str,
     highlight_prefix: &'m str,
     highlight_suffix: &'m str,
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
@@ -6,7 +6,7 @@ use std::hash::Hash;
 use std::rc::Rc;
 use std::{fmt, mem};
 
-use charabia::classifier::ClassifiedTokenIter;
+use charabia::normalizer::NormalizedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
@@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> {
     ///   (the criterion `typo` will be ignored)
     pub fn build<A: AsRef<[u8]>>(
         &self,
-        query: ClassifiedTokenIter<A>,
+        query: NormalizedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let primitive_query = create_primitive_query(query, self.words_limit);
         if !primitive_query.is_empty() {
@@ -778,7 +778,7 @@ impl PrimitiveQueryPart {
 /// Create primitive query from tokenized query string,
 /// the primitive query is an intermediate state to build the query tree.
 fn create_primitive_query<A>(
-    query: ClassifiedTokenIter<A>,
+    query: NormalizedTokenIter<A>,
     words_limit: Option<usize>,
 ) -> PrimitiveQuery
 where
@@ -892,7 +892,7 @@ mod test {
             terms_matching_strategy: TermsMatchingStrategy,
             authorize_typos: bool,
             words_limit: Option<usize>,
-            query: ClassifiedTokenIter<A>,
+            query: NormalizedTokenIter<A>,
         ) -> Result<Option<(Operation, PrimitiveQuery)>> {
             let primitive_query = create_primitive_query(query, words_limit);
             if !primitive_query.is_empty() {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
@@ -1575,11 +1575,11 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
 
         // Only the first document should match.
-        let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
+        let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
         assert_eq!(count, 1);
 
         // Only the second document should match.
-        let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len();
+        let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
         assert_eq!(count, 1);
 
         let mut search = crate::Search::new(&rtxn, &index);