Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

Commit 6a10e85

Browse files
Merge #736
736: Update charabia r=curquiza a=ManyTheFish Update Charabia to the latest version. > We are now Romanizing Chinese characters into Pinyin. > Note that we keep the accents because they are in fact never typed directly by the end-user; moreover, changing an accent leads to a different Chinese character, and I don't have sufficient knowledge to forecast the impact of removing accents in this context. Co-authored-by: ManyTheFish <many@meilisearch.com>
2 parents c505fa9 + 7f88c4f commit 6a10e85

File tree

4 files changed

+14
-10
lines changed

4 files changed

+14
-10
lines changed

milli/Cargo.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
99
bincode = "1.3.3"
1010
bstr = "1.0.1"
1111
byteorder = "1.4.3"
12-
charabia = { version = "0.6.0", default-features = false }
12+
charabia = { version = "0.7.0", default-features = false }
1313
concat-arrays = "0.1.2"
1414
crossbeam-channel = "0.5.6"
1515
either = "1.8.0"
@@ -70,6 +70,10 @@ hebrew = ["charabia/hebrew"]
7070

7171
# allow japanese specialized tokenization
7272
japanese = ["charabia/japanese"]
73+
japanese-transliteration = ["charabia/japanese-transliteration"]
74+
75+
# allow korean specialized tokenization
76+
korean = ["charabia/korean"]
7377

7478
# allow thai specialized tokenization
7579
thai = ["charabia/thai"]

milli/src/search/matches/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
1414
/// Structure used to build a Matcher, allowing customization of the formatting tags.
1515
pub struct MatcherBuilder<'a, A> {
1616
matching_words: MatchingWords,
17-
tokenizer: Tokenizer<'a, A>,
17+
tokenizer: Tokenizer<'a, 'a, A>,
1818
crop_marker: Option<String>,
1919
highlight_prefix: Option<String>,
2020
highlight_suffix: Option<String>,
2121
}
2222

2323
impl<'a, A> MatcherBuilder<'a, A> {
24-
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self {
24+
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
2525
Self {
2626
matching_words,
2727
tokenizer,
@@ -106,7 +106,7 @@ pub struct MatchBounds {
106106
pub struct Matcher<'t, 'm, A> {
107107
text: &'t str,
108108
matching_words: &'m MatchingWords,
109-
tokenizer: &'m Tokenizer<'m, A>,
109+
tokenizer: &'m Tokenizer<'m, 'm, A>,
110110
crop_marker: &'m str,
111111
highlight_prefix: &'m str,
112112
highlight_suffix: &'m str,

milli/src/search/query_tree.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::hash::Hash;
66
use std::rc::Rc;
77
use std::{fmt, mem};
88

9-
use charabia::classifier::ClassifiedTokenIter;
9+
use charabia::normalizer::NormalizedTokenIter;
1010
use charabia::{SeparatorKind, TokenKind};
1111
use roaring::RoaringBitmap;
1212
use slice_group_by::GroupBy;
@@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> {
270270
/// (the criterion `typo` will be ignored)
271271
pub fn build<A: AsRef<[u8]>>(
272272
&self,
273-
query: ClassifiedTokenIter<A>,
273+
query: NormalizedTokenIter<A>,
274274
) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
275275
let primitive_query = create_primitive_query(query, self.words_limit);
276276
if !primitive_query.is_empty() {
@@ -778,7 +778,7 @@ impl PrimitiveQueryPart {
778778
/// Create primitive query from tokenized query string,
779779
/// the primitive query is an intermediate state to build the query tree.
780780
fn create_primitive_query<A>(
781-
query: ClassifiedTokenIter<A>,
781+
query: NormalizedTokenIter<A>,
782782
words_limit: Option<usize>,
783783
) -> PrimitiveQuery
784784
where
@@ -892,7 +892,7 @@ mod test {
892892
terms_matching_strategy: TermsMatchingStrategy,
893893
authorize_typos: bool,
894894
words_limit: Option<usize>,
895-
query: ClassifiedTokenIter<A>,
895+
query: NormalizedTokenIter<A>,
896896
) -> Result<Option<(Operation, PrimitiveQuery)>> {
897897
let primitive_query = create_primitive_query(query, words_limit);
898898
if !primitive_query.is_empty() {

milli/src/update/index_documents/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1575,11 +1575,11 @@ mod tests {
15751575
let rtxn = index.read_txn().unwrap();
15761576

15771577
// Only the first document should match.
1578-
let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
1578+
let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
15791579
assert_eq!(count, 1);
15801580

15811581
// Only the second document should match.
1582-
let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len();
1582+
let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
15831583
assert_eq!(count, 1);
15841584

15851585
let mut search = crate::Search::new(&rtxn, &index);

0 commit comments

Comments
 (0)