Skip to content

Commit

Permalink
Bump whatlang + add support for Armenian, Georgian, Gujarati, Tagalog
Browse files Browse the repository at this point in the history
  • Loading branch information
valeriansaliou committed Jun 16, 2022
1 parent 1b55617 commit 0e8373b
Show file tree
Hide file tree
Showing 10 changed files with 830 additions and 32 deletions.
31 changes: 5 additions & 26 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ twox-hash = "1.5"
byteorder = "1.4"
hashbrown = "0.12"
linked_hash_set = "0.1"
whatlang = "0.12"
whatlang = "0.16"
regex = "1.5"

[target.'cfg(unix)'.dependencies]
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ Sonic supports a wide range of languages in its lexing system. If a language is

* 🇿🇦 Afrikaans
* 🇸🇦 Arabic
* 🇦🇲 Armenian
* 🇦🇿 Azerbaijani
* 🇧🇩 Bengali
* 🇧🇬 Bulgarian
Expand All @@ -208,8 +209,10 @@ Sonic supports a wide range of languages in its lexing system. If a language is
* 🇪🇪 Estonian
* 🇫🇮 Finnish
* 🇫🇷 French
* 🇬🇪 Georgian
* 🇩🇪 German
* 🇬🇷 Greek
* 🇮🇳 Gujarati
* 🇮🇱 Hebrew
* 🇮🇳 Hindi
* 🇭🇺 Hungarian
Expand All @@ -233,6 +236,7 @@ Sonic supports a wide range of languages in its lexing system. If a language is
* 🇸🇮 Slovene
* 🇪🇸 Spanish
* 🇸🇪 Swedish
* 🇵🇭 Tagalog
* 🇮🇳 Tamil
* 🇹🇭 Thai
* 🇹🇷 Turkish
Expand Down
2 changes: 2 additions & 0 deletions src/lexer/ranges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pub struct LexerRegexRange(&'static [(char, char)]);
// Each constant lists inclusive (start, end) code-point pairs for one script.

// Basic Latin through Latin Extended-B (note: starts at U+0000, so it also
// spans ASCII controls, digits and punctuation).
const RANGE_LATIN: &[(char, char)] = &[('\u{0000}', '\u{024F}')];
// Cyrillic + Cyrillic Supplement blocks.
const RANGE_CYRILLIC: &[(char, char)] = &[('\u{0400}', '\u{052F}')];
// Arabic + Arabic Supplement blocks (two disjoint ranges).
const RANGE_ARABIC: &[(char, char)] = &[('\u{0600}', '\u{06FF}'), ('\u{0750}', '\u{077F}')];
// Armenian block.
const RANGE_ARMENIAN: &[(char, char)] = &[('\u{0530}', '\u{058F}')];
// Devanagari block.
const RANGE_DEVANAGARI: &[(char, char)] = &[('\u{0900}', '\u{097F}')];
// Hiragana block.
const RANGE_HIRAGANA: &[(char, char)] = &[('\u{3040}', '\u{309F}')];
// Katakana + Katakana Phonetic Extensions blocks (two disjoint ranges).
const RANGE_KATAKANA: &[(char, char)] = &[('\u{30A0}', '\u{30FF}'), ('\u{31F0}', '\u{31FF}')];
Expand Down Expand Up @@ -48,6 +49,7 @@ impl LexerRange {
Script::Latin => RANGE_LATIN,
Script::Cyrillic => RANGE_CYRILLIC,
Script::Arabic => RANGE_ARABIC,
Script::Armenian => RANGE_ARMENIAN,
Script::Devanagari => RANGE_DEVANAGARI,
Script::Hiragana => RANGE_HIRAGANA,
Script::Katakana => RANGE_KATAKANA,
Expand Down
8 changes: 7 additions & 1 deletion src/lexer/stopwords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ lazy_static! {
static ref STOPWORDS_TUK: HashSet<&'static str> = make(tuk::STOPWORDS_TUK);
}

// Recursion group #7 (7 items)
// Recursion group #7 (9 items)
lazy_static! {
static ref STOPWORDS_AKA: HashSet<&'static str> = make(aka::STOPWORDS_AKA);
static ref STOPWORDS_ZUL: HashSet<&'static str> = make(zul::STOPWORDS_ZUL);
Expand All @@ -104,6 +104,8 @@ lazy_static! {
static ref STOPWORDS_LAT: HashSet<&'static str> = make(lat::STOPWORDS_LAT);
static ref STOPWORDS_SLK: HashSet<&'static str> = make(slk::STOPWORDS_SLK);
static ref STOPWORDS_CAT: HashSet<&'static str> = make(cat::STOPWORDS_CAT);
static ref STOPWORDS_TGL: HashSet<&'static str> = make(tgl::STOPWORDS_TGL);
static ref STOPWORDS_HYE: HashSet<&'static str> = make(hye::STOPWORDS_HYE);
}

fn make<'a>(words: &[&'a str]) -> HashSet<&'a str> {
Expand Down Expand Up @@ -246,6 +248,8 @@ impl LexerStopWord {
Lang::Lat => &*STOPWORDS_LAT,
Lang::Slk => &*STOPWORDS_SLK,
Lang::Cat => &*STOPWORDS_CAT,
Lang::Tgl => &*STOPWORDS_TGL,
Lang::Hye => &*STOPWORDS_HYE,
}
}

Expand Down Expand Up @@ -287,6 +291,7 @@ impl LexerStopWord {
Lang::Lat,
Lang::Slk,
Lang::Cat,
Lang::Tgl,
],
Script::Cyrillic => &[
Lang::Rus,
Expand All @@ -299,6 +304,7 @@ impl LexerStopWord {
Lang::Mkd,
],
Script::Arabic => &[Lang::Ara, Lang::Urd, Lang::Pes],
Script::Armenian => &[Lang::Hye],
Script::Devanagari => &[Lang::Hin, Lang::Mar, Lang::Nep],
Script::Ethiopic => &[Lang::Amh],
Script::Hebrew => &[Lang::Heb, Lang::Yid],
Expand Down
228 changes: 226 additions & 2 deletions src/stopwords/guj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,229 @@
// Copyright: 2019, Valerian Saliou <valerian@valeriansaliou.name>
// License: Mozilla Public License v2.0 (MPL v2.0)

// Notice: we do not have stopwords for this language yet.
pub static STOPWORDS_GUJ: &[&str] = &[];
/// Stopwords for the Gujarati language (`guj`), written in Gujarati script.
///
/// Entries appear to be kept in ascending code-point order; keep new entries
/// sorted the same way so duplicates are easy to spot.
///
/// NOTE(review): two entries carry a trailing period (`"નં."`, `"રૂ."`). If the
/// lexer strips punctuation before the stopword lookup, these can presumably
/// never match — confirm against the tokenizer.
pub static STOPWORDS_GUJ: &[&str] = &[
"અંગે",
"અંદર",
"અથવા",
"અને",
"અમને",
"અમારું",
"અમે",
"અહીં",
"આ",
"આગળ",
"આથી",
"આનું",
"આને",
"આપણને",
"આપણું",
"આપણે",
"આપી",
"આર",
"આવી",
"આવે",
"ઉપર",
"ઉભા",
"ઊંચે",
"ઊભું",
"એ",
"એક",
"એન",
"એના",
"એનાં",
"એની",
"એનું",
"એને",
"એનો",
"એમ",
"એવા",
"એવાં",
"એવી",
"એવું",
"એવો",
"ઓછું",
"કંઈક",
"કઈ",
"કયું",
"કયો",
"કરતાં",
"કરવું",
"કરી",
"કરીએ",
"કરું",
"કરે",
"કરેલું",
"કર્યા",
"કર્યાં",
"કર્યું",
"કર્યો",
"કાંઈ",
"કે",
"કેટલું",
"કેમ",
"કેવી",
"કેવું",
"કોઈ",
"કોઈક",
"કોણ",
"કોણે",
"કોને",
"ક્યાં",
"ક્યારે",
"ખૂબ",
"ગઈ",
"ગયા",
"ગયાં",
"ગયું",
"ગયો",
"ઘણું",
"છ",
"છતાં",
"છીએ",
"છું",
"છે",
"છેક",
"છો",
"જ",
"જાય",
"જી",
"જે",
"જેટલું",
"જેને",
"જેમ",
"જેવી",
"જેવું",
"જેવો",
"જો",
"જોઈએ",
"જ્યાં",
"જ્યારે",
"ઝાઝું",
"તને",
"તમને",
"તમારું",
"તમે",
"તા",
"તારાથી",
"તારામાં",
"તારું",
"તું",
"તે",
"તેં",
"તેઓ",
"તેણે",
"તેથી",
"તેના",
"તેની",
"તેનું",
"તેને",
"તેમ",
"તેમનું",
"તેમને",
"તેવી",
"તેવું",
"તો",
"ત્યાં",
"ત્યારે",
"થઇ",
"થઈ",
"થઈએ",
"થતા",
"થતાં",
"થતી",
"થતું",
"થતો",
"થયા",
"થયાં",
"થયું",
"થયેલું",
"થયો",
"થવું",
"થાઉં",
"થાઓ",
"થાય",
"થી",
"થોડું",
"દરેક",
"ન",
"નં",
"નં.",
"નથી",
"નહિ",
"નહી",
"નહીં",
"ના",
"ની",
"નીચે",
"નું",
"ને",
"નો",
"પછી",
"પણ",
"પર",
"પરંતુ",
"પહેલાં",
"પાછળ",
"પાસે",
"પોતાનું",
"પ્રત્યેક",
"ફક્ત",
"ફરી",
"ફરીથી",
"બંને",
"બધા",
"બધું",
"બની",
"બહાર",
"બહુ",
"બાદ",
"બે",
"મને",
"મા",
"માં",
"માટે",
"માત્ર",
"મારું",
"મી",
"મૂકવું",
"મૂકી",
"મૂક્યા",
"મૂક્યાં",
"મૂક્યું",
"મેં",
"રહી",
"રહે",
"રહેવું",
"રહ્યા",
"રહ્યાં",
"રહ્યો",
"રીતે",
"રૂ.",
"રૂા",
"લેતા",
"લેતું",
"લેવા",
"વગેરે",
"વધુ",
"શકે",
"શા",
"શું",
"સરખું",
"સામે",
"સુધી",
"હતા",
"હતાં",
"હતી",
"હતું",
"હવે",
"હશે",
"હશો",
"હા",
"હું",
"હો",
"હોઈ",
"હોઈશ",
"હોઈશું",
"હોય",
"હોવા",
];
Loading

0 comments on commit 0e8373b

Please sign in to comment.