
Commit ae62bae

support use lindera tag filter
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
1 parent 276a8d3 commit ae62bae
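
With this change the lindera tokenizer accepts an optional "filter" array in its analyzer params; each entry names a lindera token filter via "kind" and carries that filter's own params (a "tags" list, plus "new_tag" for japanese_compound_word). A minimal sketch of the accepted shape, mirroring the updated test further down (the stop-tag values are just the test's sample data):

    // Sketch only: params shape accepted by LinderaTokenizer::from_json after this commit.
    let params = r#"{
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [{
            "kind": "japanese_stop_tags",
            "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
        }]
    }"#;
    let map = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(params).unwrap();
    let tokenizer = LinderaTokenizer::from_json(&map).unwrap();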

File tree

1 file changed: +202 -20 lines changed


internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs

Lines changed: 202 additions & 20 deletions
@@ -1,4 +1,5 @@
 use core::result::Result::Err;
+use std::collections::HashSet;
 
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 use lindera::mode::Mode;
@@ -7,6 +8,14 @@ use lindera::token::Token as LToken;
 use lindera::tokenizer::Tokenizer as LTokenizer;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
+use lindera::token_filter::BoxTokenFilter as LTokenFilter;
+use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
+use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
+use lindera::token_filter::japanese_keep_tags::JapaneseKeepTagsTokenFilter;
+use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
+use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
+
+
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 
@@ -15,6 +24,9 @@ pub struct LinderaTokenStream<'a> {
     pub token: &'a mut Token,
 }
 
+const DICTKINDKEY: &str = "dict_kind";
+const FILTERKEY: &str = "filter";
+
 impl<'a> TokenStream for LinderaTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if self.tokens.is_empty() {
@@ -50,14 +62,22 @@ impl LinderaTokenizer {
     /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
         let kind = fetch_lindera_kind(params)?;
-        let dictionary = load_dictionary_from_kind(kind);
-        if dictionary.is_err() {
-            return Err(TantivyBindingError::InvalidArgument(format!(
+        let dictionary = load_dictionary_from_kind(kind.clone()).map_err(|_| {
+            TantivyBindingError::InvalidArgument(format!(
                 "lindera tokenizer with invalid dict_kind"
-            )));
+            ))
+        })?;
+
+        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
+
+        // append lindera filter
+        let filters = fetch_lindera_token_filters(&kind, params)?;
+        for filter in filters {
+            tokenizer.append_token_filter(filter)
         }
-        let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
-        Ok(LinderaTokenizer::from_segmenter(segmenter))
+
+        Ok(tokenizer)
     }
 
     /// Create a new `LinderaTokenizer`.
@@ -68,6 +88,10 @@ impl LinderaTokenizer {
             token: Default::default(),
         }
     }
+
+    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
+        self.tokenizer.append_token_filter(filter);
+    }
 }
 
 impl Tokenizer for LinderaTokenizer {
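
The new append_token_filter method also makes it possible to attach a filter from Rust code rather than through the JSON "filter" list. A minimal sketch, reusing the constructor-plus-.into() pattern the diff itself uses; the helper name and the stop-tag value are illustrative, not part of the commit:

    use std::collections::HashSet;

    // Hypothetical helper (not in the commit): attach a stop-tags filter to an
    // already-constructed LinderaTokenizer, e.g. one built via from_json.
    fn add_stop_tags_filter(tokenizer: &mut LinderaTokenizer) {
        let mut tags: HashSet<String> = HashSet::new();
        tags.insert("助詞".to_string());
        // Same pattern as fetch_japanese_stop_tags_token_filter in the next hunk.
        tokenizer.append_token_filter(JapaneseStopTagsTokenFilter::new(tags).into());
    }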
@@ -102,41 +126,199 @@ impl DictionaryKindParser for &str {
     }
 }
 
-fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
-    match params.get("dict_kind") {
-        Some(val) => {
-            if !val.is_string() {
-                return Err(TantivyBindingError::InvalidArgument(format!(
-                    "lindera tokenizer dict kind should be string"
-                )));
-            }
-            val.as_str().unwrap().into_dict_kind()
-        }
-        _ => {
-            return Err(TantivyBindingError::InvalidArgument(format!(
+fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
+    params.get(DICTKINDKEY)
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
                 "lindera tokenizer dict_kind must be set"
+            ))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer dict kind should be string"
+            ))
+        })?
+        .into_dict_kind()
+}
+
+fn fetch_lindera_tags_from_params(params: &json::Map<String, json::Value>) -> Result<HashSet<String>> {
+    params
+        .get("tags")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tag filter tags must be set"
+            ))
+        })?
+        .as_array()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tags filter tags must be array"
+            ))
+        })?
+        .iter()
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese stop tags filter tags must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .collect::<Result<HashSet<String>>>()
+}
+
+fn fetch_japanese_compound_word_token_filter(kind: &DictionaryKind, params: Option<&json::Map<String, json::Value>>) -> Result<LTokenFilter> {
+    let filter_param = params.ok_or_else(|| {
+        TantivyBindingError::InvalidArgument(format!(
+            "lindera japanese compound word filter must use with params"
+        ))
+    })?;
+
+    let tags: HashSet<String> = fetch_lindera_tags_from_params(filter_param)?;
+
+    let new_tag: Option<String> = filter_param
+        .get("new_tag")
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese compound word filter new_tag must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .transpose()?;
+
+    Ok(JapaneseCompoundWordTokenFilter::new(kind.clone(), tags, new_tag).into())
+}
+
+fn fetch_japanese_keep_tags_token_filter(params: Option<&json::Map<String, json::Value>>) -> Result<LTokenFilter> {
+    Ok(JapaneseKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(
+        params.ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese keep tags filter must use with params"
+            ))
+        })?)?).into())
+}
+
+fn fetch_japanese_stop_tags_token_filter(params: Option<&json::Map<String, json::Value>>) -> Result<LTokenFilter> {
+    Ok(JapaneseStopTagsTokenFilter::new(fetch_lindera_tags_from_params(
+        params.ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tags filter must use with params"
+            ))
+        })?)?).into())
+}
+
+fn fetch_korean_keep_tags_token_filter(params: Option<&json::Map<String, json::Value>>) -> Result<LTokenFilter> {
+    Ok(KoreanKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(
+        params.ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera korean keep tags filter must use with params"
+            ))
+        })?)?).into())
+}
+
+fn fetch_korean_stop_tags_token_filter(params: Option<&json::Map<String, json::Value>>) -> Result<LTokenFilter> {
+    Ok(KoreanStopTagsTokenFilter::new(fetch_lindera_tags_from_params(
+        params.ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera korean stop tags filter must use with params"
+            ))
+        })?)?).into())
+}
+
+fn fetch_lindera_token_filter_params(params: &json::Value) -> Result<(&str, Option<&json::Map<String, json::Value>>)> {
+    if params.is_string() {
+        return Ok((params.as_str().unwrap(), None));
+    }
+
+    let kind = params.as_object()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter params must be object"
+            ))
+        })?
+        .get("kind")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter must have type"
+            ))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter type should be string"
+            ))
+        })?;
+
+    Ok((kind, Some(params.as_object().unwrap())))
+}
+
+fn fetch_lindera_token_filter(type_name: &str, kind: &DictionaryKind, params: Option<&json::Map<String, json::Value>>) -> Result<LTokenFilter> {
+    match type_name {
+        "japanese_compound_word" => fetch_japanese_compound_word_token_filter(kind, params),
+        "japanese_keep_tags" => fetch_japanese_keep_tags_token_filter(params),
+        "japanese_stop_tags" => fetch_japanese_stop_tags_token_filter(params),
+        "korean_keep_tags" => fetch_korean_keep_tags_token_filter(params),
+        "korean_stop_tags" => fetch_korean_stop_tags_token_filter(params),
+        _ => {
+            Err(TantivyBindingError::InvalidArgument(format!(
+                "unknown lindera filter type"
             )))
         }
     }
 }
 
+fn fetch_lindera_token_filters(kind: &DictionaryKind, params: &json::Map<String, json::Value>) -> Result<Vec<LTokenFilter>> {
+    let mut result: Vec<LTokenFilter> = vec![];
+
+    match params.get(FILTERKEY) {
+        Some(v) => {
+            let filter_list = v
+                .as_array()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera filters should be array"
+                    ))
+                })?;
+
+            for filter_params in filter_list {
+                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
+                let filter = fetch_lindera_token_filter(name, kind, params)?;
+                result.push(filter);
+            }
+        },
+        _ => {}
+    }
+
+    Ok(result)
+}
+
 #[cfg(test)]
 mod tests {
     use serde_json as json;
+    use tantivy::tokenizer::Tokenizer;
 
     use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;
 
     #[test]
     fn test_lindera_tokenizer() {
         let params = r#"{
             "type": "lindera",
-            "dict_kind": "ipadic"
+            "dict_kind": "ipadic",
+            "filter": [{
+                "kind": "japanese_stop_tags",
+                "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
+            }]
         }"#;
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
         assert!(json_param.is_ok());
 
         let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
-        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut binding = tokenizer.unwrap();
+        let stream = binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
+        let mut results = Vec::<String>::new();
+        for token in stream.tokens {
+            results.push(token.text.to_string());
+        }
+
+        print!("test tokens :{:?}\n", results)
     }
 
     #[test]

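For the japanese_compound_word filter, the per-filter params carry a "tags" array and an optional "new_tag" string (see fetch_japanese_compound_word_token_filter in the diff above). A hedged example of such a filter entry; the tag values and the new_tag are illustrative only, not taken from the commit:

    // Illustrative only: field names come from fetch_japanese_compound_word_token_filter,
    // values are made up for the example.
    let params = r#"{
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [{
            "kind": "japanese_compound_word",
            "tags": ["名詞,数"],
            "new_tag": "複合語"
        }]
    }"#;
    let map = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(params).unwrap();
    let tokenizer = LinderaTokenizer::from_json(&map).unwrap();
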
0 commit comments
