
Commit f166843

enhance: support use lindera tag filter (#40416)
relate: #39659

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
1 parent c5428c1 commit f166843

File tree

1 file changed: +240 -20 lines changed
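
In short, the change teaches the lindera tokenizer's JSON params to carry a "filter" list next to "dict_kind". Based on the test added in this diff, a config that enables a Japanese stop-tags filter looks like this (the surrounding Milvus analyzer wiring is omitted):

{
    "type": "lindera",
    "dict_kind": "ipadic",
    "filter": [{
        "kind": "japanese_stop_tags",
        "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
    }]
}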

internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs

Lines changed: 240 additions & 20 deletions
@@ -1,4 +1,5 @@
 use core::result::Result::Err;
+use std::collections::HashSet;
 
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 use lindera::mode::Mode;
@@ -7,6 +8,13 @@ use lindera::token::Token as LToken;
 use lindera::tokenizer::Tokenizer as LTokenizer;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
+use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
+use lindera::token_filter::japanese_keep_tags::JapaneseKeepTagsTokenFilter;
+use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
+use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
+use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
+use lindera::token_filter::BoxTokenFilter as LTokenFilter;
+
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 
@@ -15,6 +23,9 @@ pub struct LinderaTokenStream<'a> {
     pub token: &'a mut Token,
 }
 
+const DICTKINDKEY: &str = "dict_kind";
+const FILTERKEY: &str = "filter";
+
 impl<'a> TokenStream for LinderaTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if self.tokens.is_empty() {
@@ -47,17 +58,25 @@ pub struct LinderaTokenizer {
 
 impl LinderaTokenizer {
     /// Create a new `LinderaTokenizer`.
-    /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
+    /// This function will create a new `LinderaTokenizer` with json parameters.
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
         let kind = fetch_lindera_kind(params)?;
-        let dictionary = load_dictionary_from_kind(kind);
-        if dictionary.is_err() {
-            return Err(TantivyBindingError::InvalidArgument(format!(
+        let dictionary = load_dictionary_from_kind(kind.clone()).map_err(|_| {
+            TantivyBindingError::InvalidArgument(format!(
                 "lindera tokenizer with invalid dict_kind"
-            )));
+            ))
+        })?;
+
+        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
+
+        // append lindera filter
+        let filters = fetch_lindera_token_filters(&kind, params)?;
+        for filter in filters {
+            tokenizer.append_token_filter(filter)
         }
-        let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
-        Ok(LinderaTokenizer::from_segmenter(segmenter))
+
+        Ok(tokenizer)
     }
 
     /// Create a new `LinderaTokenizer`.
@@ -68,6 +87,10 @@ impl LinderaTokenizer {
             token: Default::default(),
         }
     }
+
+    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
+        self.tokenizer.append_token_filter(filter);
+    }
 }
 
 impl Tokenizer for LinderaTokenizer {
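
The hunk above exposes append_token_filter, so filters can also be attached programmatically rather than through JSON. A minimal sketch using only APIs visible in this diff (the helper name build_tokenizer and the unwraps are illustrative, not part of the change):

use std::collections::HashSet;

use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
use serde_json as json;

use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

fn build_tokenizer() -> LinderaTokenizer {
    // Start from a plain ipadic tokenizer with no filters configured.
    let params = json::from_str::<json::Map<String, json::Value>>(
        r#"{"type": "lindera", "dict_kind": "ipadic"}"#,
    )
    .unwrap();
    let mut tokenizer = LinderaTokenizer::from_json(&params).unwrap();

    // Drop particles (助詞) at the token-filter stage, just as a JSON
    // "japanese_stop_tags" filter entry would.
    let tags: HashSet<String> = HashSet::from(["助詞".to_string()]);
    tokenizer.append_token_filter(JapaneseStopTagsTokenFilter::new(tags).into());

    tokenizer
}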
@@ -103,40 +126,237 @@ impl DictionaryKindParser for &str {
 }
 
 fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
-    match params.get("dict_kind") {
-        Some(val) => {
-            if !val.is_string() {
-                return Err(TantivyBindingError::InvalidArgument(format!(
-                    "lindera tokenizer dict kind should be string"
-                )));
+    params
+        .get(DICTKINDKEY)
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!("lindera tokenizer dict_kind must be set"))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer dict kind should be string"
+            ))
+        })?
+        .into_dict_kind()
+}
+
+fn fetch_lindera_tags_from_params(
+    params: &json::Map<String, json::Value>,
+) -> Result<HashSet<String>> {
+    params
+        .get("tags")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tag filter tags must be set"
+            ))
+        })?
+        .as_array()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tags filter tags must be array"
+            ))
+        })?
+        .iter()
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese stop tags filter tags must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .collect::<Result<HashSet<String>>>()
+}
+
+fn fetch_japanese_compound_word_token_filter(
+    kind: &DictionaryKind,
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    let filter_param = params.ok_or_else(|| {
+        TantivyBindingError::InvalidArgument(format!(
+            "lindera japanese compound word filter must use with params"
+        ))
+    })?;
+
+    let tags: HashSet<String> = fetch_lindera_tags_from_params(filter_param)?;
+
+    let new_tag: Option<String> = filter_param
+        .get("new_tag")
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese compound word filter new_tag must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .transpose()?;
+    Ok(JapaneseCompoundWordTokenFilter::new(kind.clone(), tags, new_tag).into())
+}
+
+fn fetch_japanese_keep_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        JapaneseKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera japanese keep tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_japanese_stop_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        JapaneseStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera japanese stop tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_korean_keep_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        KoreanKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera korean keep tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_korean_stop_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        KoreanStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera korean stop tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_lindera_token_filter_params(
+    params: &json::Value,
+) -> Result<(&str, Option<&json::Map<String, json::Value>>)> {
+    if params.is_string() {
+        return Ok((params.as_str().unwrap(), None));
+    }
+
+    let kind = params
+        .as_object()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter params must be object"
+            ))
+        })?
+        .get("kind")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!("lindera tokenizer filter must have type"))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter type should be string"
+            ))
+        })?;
+
+    Ok((kind, Some(params.as_object().unwrap())))
+}
+
+fn fetch_lindera_token_filter(
+    type_name: &str,
+    kind: &DictionaryKind,
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    match type_name {
+        "japanese_compound_word" => fetch_japanese_compound_word_token_filter(kind, params),
+        "japanese_keep_tags" => fetch_japanese_keep_tags_token_filter(params),
+        "japanese_stop_tags" => fetch_japanese_stop_tags_token_filter(params),
+        "korean_keep_tags" => fetch_korean_keep_tags_token_filter(params),
+        "korean_stop_tags" => fetch_korean_stop_tags_token_filter(params),
+        _ => Err(TantivyBindingError::InvalidArgument(format!(
+            "unknown lindera filter type"
+        ))),
+    }
+}
+
+fn fetch_lindera_token_filters(
+    kind: &DictionaryKind,
+    params: &json::Map<String, json::Value>,
+) -> Result<Vec<LTokenFilter>> {
+    let mut result: Vec<LTokenFilter> = vec![];
+
+    match params.get(FILTERKEY) {
+        Some(v) => {
+            let filter_list = v.as_array().ok_or_else(|| {
+                TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
+            })?;
+
+            for filter_params in filter_list {
+                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
+                let filter = fetch_lindera_token_filter(name, kind, params)?;
+                result.push(filter);
             }
-            val.as_str().unwrap().into_dict_kind()
-        }
-        _ => {
-            return Err(TantivyBindingError::InvalidArgument(format!(
-                "lindera tokenizer dict_kind must be set"
-            )))
         }
+        _ => {}
     }
+
+    Ok(result)
 }
 
 #[cfg(test)]
 mod tests {
     use serde_json as json;
+    use tantivy::tokenizer::Tokenizer;
 
     use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;
 
     #[test]
     fn test_lindera_tokenizer() {
         let params = r#"{
             "type": "lindera",
-            "dict_kind": "ipadic"
+            "dict_kind": "ipadic",
+            "filter": [{
+                "kind": "japanese_stop_tags",
+                "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
+            }]
         }"#;
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
         assert!(json_param.is_ok());
 
         let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut binding = tokenizer.unwrap();
+        let stream =
+            binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
+        let mut results = Vec::<String>::new();
+        for token in stream.tokens {
+            results.push(token.text.to_string());
+        }
+
+        print!("test tokens :{:?}\n", results)
     }
 
     #[test]
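
For the record, the dispatcher near the bottom of the diff accepts five filter kinds: japanese_compound_word, japanese_keep_tags, japanese_stop_tags, korean_keep_tags, and korean_stop_tags. fetch_lindera_token_filter_params also lets a filter entry be a bare string naming the kind, but every kind added here requires a "tags" array (a string entry yields params = None and fails with "must use with params"), so the object form is the usable one today. For japanese_compound_word, which additionally accepts an optional "new_tag" string, a hypothetical entry (values illustrative only, not taken from this commit) would be:

{
    "kind": "japanese_compound_word",
    "tags": ["名詞,数"],
    "new_tag": "名詞,数"
}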

0 commit comments