use core::result::Result::Err;
+ use std::collections::HashSet;

use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
@@ -7,6 +8,14 @@ use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

+ use lindera::token_filter::BoxTokenFilter as LTokenFilter;
+ use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
+ use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
+ use lindera::token_filter::japanese_keep_tags::JapaneseKeepTagsTokenFilter;
+ use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
+ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
+
+
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
@@ -15,6 +24,9 @@ pub struct LinderaTokenStream<'a> {
    pub token: &'a mut Token,
}

+ const DICTKINDKEY: &str = "dict_kind";
+ const FILTERKEY: &str = "filter";
+
impl<'a> TokenStream for LinderaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        if self.tokens.is_empty() {
@@ -50,14 +62,22 @@ impl LinderaTokenizer {
    /// Create a new `LinderaTokenizer` from the given JSON parameters (`dict_kind`, plus an optional `filter` list).
    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
        let kind = fetch_lindera_kind(params)?;
-         let dictionary = load_dictionary_from_kind(kind);
-         if dictionary.is_err() {
-             return Err(TantivyBindingError::InvalidArgument(format!(
+         let dictionary = load_dictionary_from_kind(kind.clone()).map_err(|_| {
+             TantivyBindingError::InvalidArgument(format!(
                "lindera tokenizer with invalid dict_kind"
-             )));
+             ))
+         })?;
+
+         let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+         let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
+
+         // Append the lindera token filters configured in params.
+         let filters = fetch_lindera_token_filters(&kind, params)?;
+         for filter in filters {
+             tokenizer.append_token_filter(filter);
        }
-         let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
-         Ok(LinderaTokenizer::from_segmenter(segmenter))
+
+         Ok(tokenizer)
    }

    /// Create a new `LinderaTokenizer`.
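
For reference, the params shape `from_json` now accepts — a sketch mirroring the test at the bottom of this diff (the tag values are illustrative, not exhaustive):

    {
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [
            { "kind": "japanese_stop_tags", "tags": ["接続詞", "助詞"] }
        ]
    }
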
@@ -68,6 +88,10 @@ impl LinderaTokenizer {
            token: Default::default(),
        }
    }
+
+     pub fn append_token_filter(&mut self, filter: LTokenFilter) {
+         self.tokenizer.append_token_filter(filter);
+     }
}

impl Tokenizer for LinderaTokenizer {
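
Filters need not come from JSON: the new `append_token_filter` method also allows attaching one programmatically. A minimal sketch with a hypothetical helper, assuming the imports added in this diff are in scope:

    // Hypothetical helper: build the tokenizer from JSON config, then
    // bolt on one more stop-tags filter in code.
    fn tokenizer_with_stop_tags(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
        let mut tokenizer = LinderaTokenizer::from_json(params)?;
        let tags: HashSet<String> = ["助詞".to_string()].into_iter().collect();
        tokenizer.append_token_filter(JapaneseStopTagsTokenFilter::new(tags).into());
        Ok(tokenizer)
    }
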
@@ -102,41 +126,199 @@ impl DictionaryKindParser for &str {
    }
}

- fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
-     match params.get("dict_kind") {
-         Some(val) => {
-             if !val.is_string() {
-                 return Err(TantivyBindingError::InvalidArgument(format!(
-                     "lindera tokenizer dict kind should be string"
-                 )));
-             }
-             val.as_str().unwrap().into_dict_kind()
-         }
-         _ => {
-             return Err(TantivyBindingError::InvalidArgument(format!(
+ fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
+     params
+         .get(DICTKINDKEY)
+         .ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
                "lindera tokenizer dict_kind must be set"
+             ))
+         })?
+         .as_str()
+         .ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
+                 "lindera tokenizer dict_kind should be a string"
+             ))
+         })?
+         .into_dict_kind()
+ }
+
+ fn fetch_lindera_tags_from_params(
+     params: &json::Map<String, json::Value>,
+ ) -> Result<HashSet<String>> {
+     params
+         .get("tags")
+         .ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
+                 "lindera filter tags must be set"
+             ))
+         })?
+         .as_array()
+         .ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
+                 "lindera filter tags must be an array"
+             ))
+         })?
+         .iter()
+         .map(|v| {
+             v.as_str()
+                 .ok_or_else(|| {
+                     TantivyBindingError::InvalidArgument(format!(
+                         "lindera filter tags must be strings"
+                     ))
+                 })
+                 .map(|s| s.to_string())
+         })
+         .collect::<Result<HashSet<String>>>()
+ }
+
+ fn fetch_japanese_compound_word_token_filter(
+     kind: &DictionaryKind,
+     params: Option<&json::Map<String, json::Value>>,
+ ) -> Result<LTokenFilter> {
+     let filter_param = params.ok_or_else(|| {
+         TantivyBindingError::InvalidArgument(format!(
+             "lindera japanese compound word filter must be used with params"
+         ))
+     })?;
+
+     let tags: HashSet<String> = fetch_lindera_tags_from_params(filter_param)?;
+
+     let new_tag: Option<String> = filter_param
+         .get("new_tag")
+         .map(|v| {
+             v.as_str()
+                 .ok_or_else(|| {
+                     TantivyBindingError::InvalidArgument(format!(
+                         "lindera japanese compound word filter new_tag must be a string"
+                     ))
+                 })
+                 .map(|s| s.to_string())
+         })
+         .transpose()?;
+
+     Ok(JapaneseCompoundWordTokenFilter::new(kind.clone(), tags, new_tag).into())
+ }
+
+ fn fetch_japanese_keep_tags_token_filter(
+     params: Option<&json::Map<String, json::Value>>,
+ ) -> Result<LTokenFilter> {
+     let params = params.ok_or_else(|| {
+         TantivyBindingError::InvalidArgument(format!(
+             "lindera japanese keep tags filter must be used with params"
+         ))
+     })?;
+     Ok(JapaneseKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params)?).into())
+ }
+
+ fn fetch_japanese_stop_tags_token_filter(
+     params: Option<&json::Map<String, json::Value>>,
+ ) -> Result<LTokenFilter> {
+     let params = params.ok_or_else(|| {
+         TantivyBindingError::InvalidArgument(format!(
+             "lindera japanese stop tags filter must be used with params"
+         ))
+     })?;
+     Ok(JapaneseStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params)?).into())
+ }
+
+ fn fetch_korean_keep_tags_token_filter(
+     params: Option<&json::Map<String, json::Value>>,
+ ) -> Result<LTokenFilter> {
+     let params = params.ok_or_else(|| {
+         TantivyBindingError::InvalidArgument(format!(
+             "lindera korean keep tags filter must be used with params"
+         ))
+     })?;
+     Ok(KoreanKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params)?).into())
+ }
+
+ fn fetch_korean_stop_tags_token_filter(
+     params: Option<&json::Map<String, json::Value>>,
+ ) -> Result<LTokenFilter> {
+     let params = params.ok_or_else(|| {
+         TantivyBindingError::InvalidArgument(format!(
+             "lindera korean stop tags filter must be used with params"
+         ))
+     })?;
+     Ok(KoreanStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params)?).into())
+ }
+
+ fn fetch_lindera_token_filter_params(
+     params: &json::Value,
+ ) -> Result<(&str, Option<&json::Map<String, json::Value>>)> {
+     if let Some(name) = params.as_str() {
+         return Ok((name, None));
+     }
+
+     let obj = params.as_object().ok_or_else(|| {
+         TantivyBindingError::InvalidArgument(format!(
+             "lindera tokenizer filter params must be an object"
+         ))
+     })?;
+
+     let kind = obj
+         .get("kind")
+         .ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
+                 "lindera tokenizer filter must have a kind"
+             ))
+         })?
+         .as_str()
+         .ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
+                 "lindera tokenizer filter kind should be a string"
+             ))
+         })?;
+
+     Ok((kind, Some(obj)))
+ }
+
+ fn fetch_lindera_token_filter(
+     type_name: &str,
+     kind: &DictionaryKind,
+     params: Option<&json::Map<String, json::Value>>,
+ ) -> Result<LTokenFilter> {
+     match type_name {
+         "japanese_compound_word" => fetch_japanese_compound_word_token_filter(kind, params),
+         "japanese_keep_tags" => fetch_japanese_keep_tags_token_filter(params),
+         "japanese_stop_tags" => fetch_japanese_stop_tags_token_filter(params),
+         "korean_keep_tags" => fetch_korean_keep_tags_token_filter(params),
+         "korean_stop_tags" => fetch_korean_stop_tags_token_filter(params),
+         _ => {
+             Err(TantivyBindingError::InvalidArgument(format!(
+                 "unknown lindera filter type: {}", type_name
            )))
        }
    }
}

+ fn fetch_lindera_token_filters(
+     kind: &DictionaryKind,
+     params: &json::Map<String, json::Value>,
+ ) -> Result<Vec<LTokenFilter>> {
+     let mut result: Vec<LTokenFilter> = vec![];
+
+     if let Some(v) = params.get(FILTERKEY) {
+         let filter_list = v.as_array().ok_or_else(|| {
+             TantivyBindingError::InvalidArgument(format!(
+                 "lindera filters should be an array"
+             ))
+         })?;
+
+         for filter_params in filter_list {
+             let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
+             let filter = fetch_lindera_token_filter(name, kind, params)?;
+             result.push(filter);
+         }
+     }
+
+     Ok(result)
+ }
+

#[cfg(test)]
mod tests {
    use serde_json as json;
+     use tantivy::tokenizer::Tokenizer;

    use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

    #[test]
    fn test_lindera_tokenizer() {
        let params = r#"{
            "type": "lindera",
-             "dict_kind": "ipadic"
+             "dict_kind": "ipadic",
+             "filter": [{
+                 "kind": "japanese_stop_tags",
+                 "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
+             }]
        }"#;
        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
        assert!(json_param.is_ok());

        let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+         let mut binding = tokenizer.unwrap();
+         let stream = binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
+         let mut results = Vec::<String>::new();
+         for token in stream.tokens {
+             results.push(token.text.to_string());
+         }
+
+         println!("test tokens: {:?}", results);
    }

    #[test]
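
The Korean filters ride the same plumbing. A hedged config sketch — the Korean `dict_kind` string (e.g. "ko-dic") and the POS tag values are assumptions, since only "ipadic" appears in this diff:

    {
        "type": "lindera",
        "dict_kind": "ko-dic",
        "filter": [
            { "kind": "korean_stop_tags", "tags": ["EC", "JKS"] }
        ]
    }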