Skip to content

Commit 63ac832

Browse files
authored
Merge pull request #24 from fetchadd/master
Add Chinese support
2 parents ccbf5bd + 9396bc5 commit 63ac832

File tree

11 files changed

+365
-8
lines changed

11 files changed

+365
-8
lines changed

.gitignore

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ Cargo.lock
88

99
# These are backup files generated by rustfmt
1010
**/*.rs.bk
11-
1211
examples/out.json
1312
out.json
14-
15-
**/node_modules/
13+
**/node_modules/
14+
.idea/

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ serde_derive = "1.0.34" # First version to support #[serde(flatten)]
2828
serde_json = "1"
2929
strum = "0.15"
3030
strum_macros = "0.15"
31+
jieba-rs = "0.4.10"
3132

3233
[features]
3334
default = ["languages"]
3435
nightly = ["bench"]
3536
bench = []
3637

37-
languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr"]
38+
languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr", "zh"]
3839
da = ["rust-stemmers"]
3940
de = ["rust-stemmers"]
4041
du = ["rust-stemmers"]
@@ -47,3 +48,4 @@ ro = ["rust-stemmers"]
4748
ru = ["rust-stemmers"]
4849
sv = ["rust-stemmers"]
4950
tr = ["rust-stemmers"]
51+
zh = ["rust-stemmers"]

src/lang/mod.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ macro_rules! make_stemmer {
5454
}
5555

5656
/// Used to configure the `Index` for a specific language.
57-
#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)]
57+
#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter, Serialize, Deserialize)]
5858
pub enum Language {
5959
English,
6060
#[cfg(feature = "da")]
@@ -81,6 +81,8 @@ pub enum Language {
8181
Swedish,
8282
#[cfg(feature = "tr")]
8383
Turkish,
84+
#[cfg(feature = "zh")]
85+
Chinese,
8486
#[doc(hidden)]
8587
#[strum(disabled = "true")]
8688
__NonExhaustive,
@@ -123,6 +125,8 @@ impl Language {
123125
"sv" => Some(Language::Swedish),
124126
#[cfg(feature = "tr")]
125127
"tr" => Some(Language::Turkish),
128+
#[cfg(feature = "zh")]
129+
"zh" => Some(Language::Chinese),
126130
_ => None,
127131
}
128132
}
@@ -162,6 +166,8 @@ impl Language {
162166
Language::Swedish => "sv",
163167
#[cfg(feature = "tr")]
164168
Language::Turkish => "tr",
169+
#[cfg(feature = "zh")]
170+
Language::Chinese => "zh",
165171
_ => panic!("Don't use the __NonExhaustive variant!"),
166172
}
167173
}
@@ -194,11 +200,14 @@ impl Language {
194200
Language::Swedish => ::lang::sv::make_pipeline(),
195201
#[cfg(feature = "tr")]
196202
Language::Turkish => ::lang::tr::make_pipeline(),
203+
#[cfg(feature = "zh")]
204+
Language::Chinese => ::lang::zh::make_pipeline(),
197205
_ => panic!("Dont use the `__NonExhaustive` variant!"),
198206
}
199207
}
200208
}
201209

210+
202211
pub mod en;
203212

204213
#[cfg(feature = "da")]
@@ -225,3 +234,5 @@ pub mod ru;
225234
pub mod sv;
226235
#[cfg(feature = "tr")]
227236
pub mod tr;
237+
#[cfg(feature = "zh")]
238+
pub mod zh;

src/lang/zh.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
use pipeline::Pipeline;
2+
3+
4+
pub fn make_pipeline() -> Pipeline {
5+
Pipeline {
6+
queue: vec![
7+
("trimmer-zh".into(), trimmer),
8+
("stopWordFilter-zh".into(), stop_word_filter),
9+
("stemmer-zh".into(), stemmer),
10+
],
11+
}
12+
}
13+
14+
15+
pub fn trimmer(token: String) -> Option<String> {
16+
let ret: String = token.
17+
trim_matches(|c: char| !is_valid_char(c) )
18+
.into();
19+
20+
if ret.eq("") {
21+
return None;
22+
}
23+
24+
Some(ret)
25+
}
26+
27+
// Stop words for Chinese. Presumably this expands to the `stop_word_filter`
// function that `make_pipeline` registers -- confirm against the definition of
// `make_stop_word_filter!`.
make_stop_word_filter!(["的", "了"]);
30+
31+
/// Pipeline step registered as `stemmer-zh`.
///
/// Performs no stemming: the token is handed back unchanged and is never
/// filtered out.
fn stemmer(token: String) -> Option<String> {
    Some(token)
}
34+
35+
/// Reports whether `c` is a character that may stay at the edge of a token.
///
/// Accepted characters are:
///
/// * ASCII letters `a`-`z` and `A`-`Z`;
/// * CJK Unified Ideographs Extension A, U+3400 to U+4DBF;
/// * CJK Unified Ideographs, U+4E00 to U+9FFF.
///
/// Everything else (digits, whitespace, punctuation, symbols) is rejected.
///
/// The ranges follow the Unicode block boundaries. The previous lower bound
/// of 19668 (U+4CD4) looked like a typo for 19968 (U+4E00): it started in the
/// middle of Extension A and also let the Yijing Hexagram Symbols
/// (U+4DC0 to U+4DFF), which are not ideographs, pass as valid. Every
/// ideograph that was accepted before is still accepted.
fn is_valid_char(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' => true,
        // CJK Unified Ideographs Extension A.
        '\u{3400}'..='\u{4DBF}' => true,
        // CJK Unified Ideographs (main block).
        '\u{4E00}'..='\u{9FFF}' => true,
        _ => false,
    }
}

src/lib.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ extern crate rust_stemmers;
4343
#[cfg(test)]
4444
#[macro_use]
4545
extern crate maplit;
46+
#[cfg(feature = "zh")]
47+
extern crate jieba_rs;
48+
4649

4750
/// The version of elasticlunr.js this library was designed for.
4851
pub const ELASTICLUNR_VERSION: &str = "0.9.5";
@@ -149,6 +152,7 @@ impl IndexBuilder {
149152
document_store: DocumentStore::new(self.save),
150153
pipeline: self.pipeline.unwrap_or_default(),
151154
version: ::ELASTICLUNR_VERSION,
155+
lang: Language::English,
152156
}
153157
}
154158
}
@@ -165,6 +169,7 @@ pub struct Index {
165169
pub version: &'static str,
166170
index: BTreeMap<String, InvertedIndex>,
167171
pub document_store: DocumentStore,
172+
lang: Language,
168173
}
169174

170175
impl Index {
@@ -226,6 +231,7 @@ impl Index {
226231
ref_field: "id".into(),
227232
version: ::ELASTICLUNR_VERSION,
228233
document_store: DocumentStore::new(true),
234+
lang: lang,
229235
}
230236
}
231237

@@ -256,7 +262,20 @@ impl Index {
256262
continue;
257263
}
258264

259-
let tokens = self.pipeline.run(pipeline::tokenize(value.as_ref()));
265+
let raw_tokens: Vec<String>;
266+
267+
match self.lang {
268+
#[cfg(feature = "zh")]
269+
Language::Chinese => {
270+
raw_tokens = pipeline::tokenize_chinese(value.as_ref());
271+
},
272+
_ => {
273+
raw_tokens = pipeline::tokenize(value.as_ref());
274+
}
275+
}
276+
277+
let tokens = self.pipeline.run(raw_tokens);
278+
260279
self.document_store
261280
.add_field_length(doc_ref, field, tokens.len());
262281

@@ -266,6 +285,7 @@ impl Index {
266285

267286
for (token, count) in &token_freq {
268287
let freq = (*count as f64).sqrt();
288+
269289
self.index
270290
.get_mut(field)
271291
.expect(&format!("InvertedIndex does not exist for field {}", field))

src/pipeline.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
//! Defines the pipeline which processes text for inclusion in the index. Most users do not need
22
//! to use this module directly.
33
4+
45
use serde::ser::{Serialize, SerializeSeq, Serializer};
6+
#[cfg(feature = "zh")]
7+
use jieba_rs::Jieba;
58

69
/// Splits a text string into a vector of individual tokens.
710
pub fn tokenize(text: &str) -> Vec<String> {
@@ -11,6 +14,16 @@ pub fn tokenize(text: &str) -> Vec<String> {
1114
.collect()
1215
}
1316

17+
/// Splits Chinese text into a vector of individual tokens.
///
/// Segmentation is delegated to jieba's `cut_for_search`, called with `false`
/// as its second argument (the HMM flag in jieba-rs). Each returned slice is
/// copied into an owned `String`.
#[cfg(feature = "zh")]
pub fn tokenize_chinese(text: &str) -> Vec<String> {
    // `Jieba::new()` builds the segmenter from its bundled dictionary, which
    // is expensive. This function may be called many times while indexing, so
    // the instance is created lazily once per thread and reused afterwards.
    thread_local! {
        static JIEBA: Jieba = Jieba::new();
    }

    JIEBA.with(|jieba| {
        jieba
            .cut_for_search(text, false)
            .into_iter()
            .map(String::from)
            .collect()
    })
}
26+
1427
/// The function type used for each step in a pipeline.
1528
pub type PipelineFn = fn(String) -> Option<String>;
1629

tests/data/zh.in.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
这条法国邮船白拉日隆子爵号(VicomtedeBragelonne)正向中国开来。早晨八点多钟,冲洗过的三等舱甲板湿意未干,但已坐满了人,法国人、德国流亡出来的犹太人、印度人、安南人,不用说还有中国人。海风里早含着燥热,胖人身体给炎风吹干了,上一层汗结的盐霜,仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨,人的兴致还没给太阳晒萎,烘懒,说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人,正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过,法国公使大使的特点,就是一句外国话不会讲;这几位警察并不懂德文,居然传情达意,引得犹太女人格格地笑,比他们的外交官强多了。这女人的漂亮丈夫,在旁顾而乐之,因为他几天来,香烟、啤酒、柠檬水沾光了不少。红海已过,不怕热极引火,所以等一会甲板上零星果皮、纸片、瓶塞之外,香烟头定又遍处皆是。法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。

0 commit comments

Comments
 (0)