Skip to content

Commit ba39115

Browse files
authored
Merge pull request #133 from bact/dev
Code cleaning + small optimization
2 parents 439cc4d + f45afa1 commit ba39115

File tree

20 files changed

+926
-681
lines changed

20 files changed

+926
-681
lines changed

examples/romanization.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
22

3-
from pythainlp.romanization import romanization
3+
from pythainlp.romanization import romanize
44

5-
print(romanization("แมว"))
5+
print(romanize("แมว"))

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
__version__ = 1.7
44
from pythainlp.sentiment import sentiment
55
from pythainlp.spell import spell
6-
from pythainlp.romanization import romanization
6+
from pythainlp.romanization import romanize
77
from pythainlp.tokenize import word_tokenize,sent_tokenize,tcc,etcc
88
from pythainlp.rank import rank
99
from pythainlp.change import texttothai,texttoeng

pythainlp/chunk/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
#from __future__ import absolute_import,unicode_literals
3-
# TODO
2+
3+
# from __future__ import absolute_import, unicode_literals
4+
5+
# TODO: Chunking

pythainlp/collation/__init__.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,27 @@
11
# -*- coding: utf-8 -*-
2+
"""
3+
Thai collation (sort according to dictionary order)
4+
For Unicode collation, please refer to Unicode Common Locale Data Repository (CLDR)
5+
https://unicode.org/cldr/charts/latest/collation/th.html
6+
"""
27
from __future__ import absolute_import, unicode_literals, print_function
38
import re
49

10+
RE_TONE = re.compile(r"[็-์]")
11+
RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
12+
513
try:
614
import icu
7-
thkey = icu.Collator.createInstance(icu.Locale('th_TH')).getSortKey
15+
16+
thkey = icu.Collator.createInstance(icu.Locale("th_TH")).getSortKey
817
except ImportError:
18+
919
def thkey(word):
10-
cv = re.sub('[็-์]', '', word,re.U) # remove tone
11-
cv = re.sub('([เ-ไ])([ก-ฮ])', '\\2\\1', cv,re.U) # switch lead vowel
12-
tone = re.sub('[^็-์]', ' ', word,re.U) # just tone
13-
return cv+tone
20+
cv = RE_TONE.sub("", word) # remove tone
21+
cv = RE_LV_C.sub("\\2\\1", cv) # switch lead vowel
22+
tone = RE_TONE.sub(" ", word) # just tone
23+
return cv + tone
24+
1425

1526
def collation(data):
1627
"""
@@ -23,8 +34,9 @@ def collation(data):
2334
"""
2435
return sorted(data, key=thkey)
2536

37+
2638
if __name__ == "__main__":
27-
a=collation(['ไก่','ไข่','ก','ฮา'])==['ก', 'ไก่', 'ไข่', 'ฮา']
28-
print(a)
29-
print(collation(['หลาย','หญิง'])==['หญิง','หลาย'])
30-
print(collation(['ไก่', 'เป็ด', 'หมู', 'วัว'])==['ไก่', 'เป็ด', 'วัว', 'หมู'])
39+
a = collation(["ไก่", "ไข่", "ก", "ฮา"]) == ["ก", "ไก่", "ไข่", "ฮา"]
40+
print(a)
41+
print(collation(["หลาย", "หญิง"]) == ["หญิง", "หลาย"])
42+
print(collation(["ไก่", "เป็ด", "หมู", "วัว"]) == ["ไก่", "เป็ด", "วัว", "หมู"])

pythainlp/romanization/__init__.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
2+
3+
from __future__ import absolute_import, unicode_literals
34
from pythainlp.tokenize import word_tokenize
4-
# ถอดเสียงภาษาไทยเป็น Latin
5-
def romanization(data,engine='royin'):
6-
"""
7-
:param str data: Thai text to be romanized
8-
:param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
9-
:return: English (more or less) text that spells out how the Thai text should read.
10-
"""
11-
word_list=word_tokenize(data)
12-
listword=[]
13-
i=0
14-
if engine=='royin':
15-
from .royin import romanization
16-
elif engine=='pyicu':
17-
from .pyicu import romanization
18-
elif engine=='thai2rom':
19-
from pythainlp.romanization.thai2rom import thai2rom
20-
thai=thai2rom()
21-
return thai.romanization(data)
22-
else:
23-
raise Exception("error no have engine.")
24-
while i<len(word_list):
25-
listword.append(romanization(word_list[i]))
26-
i+=1
27-
return ''.join(listword)
5+
6+
7+
# ถอดเสียงภาษาไทยเป็นอักษรละติน
8+
def romanize(text, engine="royin"):
9+
"""
10+
:param str text: Thai text to be romanized
11+
:param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the International Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization.
12+
:return: English (more or less) text that spells out how the Thai text should read.
13+
"""
14+
if engine == "pyicu":
15+
from .pyicu import romanize
16+
elif engine == "thai2rom":
17+
from .thai2rom import ThaiTransliterator
18+
19+
thai2rom = ThaiTransliterator()
20+
return thai2rom.romanize(text)
21+
else: # use default engine "royin"
22+
from .royin import romanize
23+
24+
words = word_tokenize(text)
25+
romanized_words = [romanize(word) for word in words]
26+
27+
return "".join(romanized_words)

pythainlp/romanization/pyicu.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
2+
3+
from __future__ import absolute_import, unicode_literals
34
import sys
5+
46
try:
5-
import icu
7+
import icu
68
except ImportError:
7-
from pythainlp.tools import install_package
8-
install_package('pyicu')
9-
try:
10-
import icu
11-
except ImportError:
12-
sys.exit('Error ! using pip install pyicu')
9+
from pythainlp.tools import install_package
10+
11+
install_package("pyicu")
12+
try:
13+
import icu
14+
except ImportError:
15+
sys.exit("Error: please pip install pyicu")
16+
1317

14-
# ถอดเสียงภาษาไทยเป็น Latin
15-
def romanization(data):
16-
"""เป็นคำสั่ง ถอดเสียงภาษาไทยเป็น Latin รับค่า ''str'' ข้อความ คืนค่าเป็น ''str'' ข้อความ Latin"""
17-
thai2latin = icu.Transliterator.createInstance('Thai-Latin')
18-
return thai2latin.transliterate(data)
18+
# ถอดเสียงภาษาไทยเป็นอักษรละติน
19+
def romanize(data):
20+
"""ถอดเสียงภาษาไทยเป็นอักษรละติน รับค่า ''str'' ข้อความ คืนค่า ''str'' อักษรละติน"""
21+
thai2latin = icu.Transliterator.createInstance("Thai-Latin")
22+
return thai2latin.transliterate(data)

0 commit comments

Comments
 (0)