Skip to content

Commit 020805c

Browse files
authored
Merge pull request #135 from bact/dev
Fix MetaSound + Adjust tokenizer selector + More documentation + clean code
2 parents 55fcff7 + fb229b2 commit 020805c

File tree

19 files changed

+299
-271
lines changed

19 files changed

+299
-271
lines changed

docs/conf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
# import sys
1717
# sys.path.insert(0, os.path.abspath('.'))
1818
from datetime import datetime
19-
import sys, os
2019

2120
# -- Project information -----------------------------------------------------
2221

docs/pythainlp-dev-thai.md

Lines changed: 119 additions & 120 deletions
Large diffs are not rendered by default.

examples/tokenize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@
2020
print(word_tokenize(text2))
2121
# ['กฎหมายแรงงาน']
2222

23-
print(word_tokenize(text2, engine="longest-matching"))
23+
print(word_tokenize(text2, engine="longest"))
2424
# ['กฎหมาย', 'แรงงาน']

pythainlp/MetaSound.py

Lines changed: 0 additions & 64 deletions
This file was deleted.

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pythainlp.collation import collation
44
from pythainlp.date import now
55
from pythainlp.keywords import find_keyword
6-
from pythainlp.MetaSound import MetaSound
6+
from pythainlp.metasound import metasound
77
from pythainlp.rank import rank
88
from pythainlp.romanization import romanize
99
from pythainlp.sentiment import sentiment

pythainlp/metasound.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
MetaSound - Thai soundex system
4+
5+
References:
6+
Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
7+
Ontology for Analysing Names Given in Accordance with Thai Astrology.
8+
https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
9+
"""
10+
11+
_THANTHAKHAT = "\u0e4c"  # Thai character thanthakhat (silencing mark)
_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" + _THANTHAKHAT
_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
_C3 = "ฟฝพผภบป"  # B -> 3
_C4 = "ง"  # NG -> 4
_C5 = "ลฬรนณฦญ"  # N -> 5
_C6 = "ม"  # M -> 6
_C7 = "ย"  # Y -> 7
_C8 = "ว"  # W -> 8

# Lookup table: consonant -> its one-digit sound code ("1".."8").
_CODE_BY_CONSONANT = {
    ch: str(code)
    for code, group in enumerate((_C1, _C2, _C3, _C4, _C5, _C6, _C7, _C8), start=1)
    for ch in group
}


def metasound(text, length=4):
    """
    Thai MetaSound

    Only consonants take part in the encoding. A karan (a consonant
    silenced by a thanthakhat, together with the thanthakhat itself) is
    dropped entirely, so it neither yields a digit nor uses up a position.
    The first remaining consonant is kept as is, each following consonant
    is replaced by the digit of its sound group ("0" if it belongs to no
    group), and the result is cut or right-padded with "0" to ``length``.

    :param str text: Thai text
    :param int length: preferred length of the MetaSound (default is 4)
    :return: MetaSound for the text
    **Example**::
        from pythainlp.metasound import metasound
        metasound("ลัก")  # 'ล100'
        metasound("รัก")  # 'ร100'
        metasound("รักษ์")  # 'ร100'
        metasound("ฟิล์ม")  # 'ฟ600'
        metasound("บูรณการ", 5)  # 'บ5515'
    """
    # keep only consonants and thanthakhat
    filtered = [ch for ch in text if ch in _CONS_THANTHAKHAT]

    # remove karan (thanthakhat and a consonant before it); the characters
    # are really dropped, not blanked, so a silenced consonant in the middle
    # of a word cannot turn into a spurious "0" or a leading space
    sounded = []
    for pos, ch in enumerate(filtered):
        if ch == _THANTHAKHAT:
            continue
        if pos + 1 < len(filtered) and filtered[pos + 1] == _THANTHAKHAT:
            continue
        sounded.append(ch)

    # retain first consonant, encode the rest
    sounded = sounded[:length]
    encoded = sounded[:1] + [_CODE_BY_CONSONANT.get(ch, "0") for ch in sounded[1:]]

    # pad short results with "0" up to the requested length
    return "".join(encoded).ljust(length, "0")
80+
81+
82+
if __name__ == "__main__":
    # Demo run: words whose expected code is known (noted beside each one).
    _known = (
        ("บูรณะ",),  # บ550 (an example from the original paper [Figure 4])
        ("บูรณการ", 5),  # บ5515
        ("ลักษณะ",),  # ล125
        ("ลัก",),  # ล100
        ("รัก",),  # ร100
        ("รักษ์",),  # ร100
        ("",),  # 0000
    )
    for _args in _known:
        print(metasound(*_args))

    # Further samples, printed without a stated expectation.
    for _word in ("คน", "คนA", "ดา", "ปา", "งา", "ลา", "มา", "วา"):
        print(metasound(_word))

pythainlp/romanization/pyicu.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
# -*- coding: utf-8 -*-
22

3-
import sys
4-
53
try:
64
import icu
75
except ImportError:

pythainlp/sentiment/ulmfit_sent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
Sentiment analyzer based on thai2vec ("ulmfit" engine)
44
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
55
"""
6-
import sys
76
from collections import defaultdict
87

98
from pythainlp.corpus import download, get_file
@@ -85,7 +84,8 @@ def about():
8584
return """
8685
Sentiment analyzer based on thai2vec
8786
Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge.
88-
89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline.
87+
89% accuracy based on 15% validation set compared to
88+
72% of fastText and 52% most-frequent-class baseline.
8989
9090
Development: Charin Polpanumas
9191
GitHub: https://github.com/cstorm125/thai2vec

pythainlp/tag/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
"""
33
Part-Of-Speech tagger
44
"""
5-
import sys
65

7-
ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
6+
_ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
87

98

109
def pos_tag(words, engine="unigram", corpus="orchid"):
@@ -31,11 +30,11 @@ def _tag(text, corpus=None):
3130
except ImportError:
3231
from pythainlp.tools import install_package
3332

34-
install_package(ARTAGGER_URL)
33+
install_package(_ARTAGGER_URL)
3534
try:
3635
from artagger import Tagger
3736
except ImportError:
38-
raise ImportError("Error: Try 'pip install " + ARTAGGER_URL + "'")
37+
raise ImportError("Error: Try 'pip install " + _ARTAGGER_URL + "'")
3938

4039
words = Tagger().tag(" ".join(text))
4140

pythainlp/tag/perceptron.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def pud_data():
2424
return model
2525

2626

27-
def tag(text, corpus):
27+
def tag(text, corpus="pud"):
2828
"""
2929
รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
3030
if corpus == "orchid":

0 commit comments

Comments
 (0)