Skip to content

Commit 8035ea9

Browse files
committed
Remove duplicated code in tcc.py and newmm.py.
newmm.py now calls tcc_gen() (previously named tcc1()) from tcc.py. Note that this may change the tokenization behaviour slightly, because the TCC patterns in tcc.py and newmm.py differed by three lines. I kept tcc.py's version, which has these three additional lines: ---- ๆ ฯลฯ ฯ ----
1 parent aa80273 commit 8035ea9

File tree

2 files changed

+44
-70
lines changed

2 files changed

+44
-70
lines changed

pythainlp/tokenize/newmm.py

Lines changed: 4 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3-
"""ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ TCC
3+
"""ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ Thai Character Cluster (TCC)
44
พัฒนาโดยคุณ Korakot Chaovavanich
55
Notebooks:
66
https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
@@ -14,6 +14,8 @@
1414

1515
from pythainlp.tokenize import DEFAULT_DICT_TRIE
1616

17+
from .tcc import tcc_gen
18+
1719
# ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
1820
PAT_ENG = re.compile(
1921
r"""(?x)
@@ -24,60 +26,13 @@
2426
"""
2527
)
2628

27-
# TCC
28-
re_tcc = (
29-
"""\
30-
เc็c
31-
เcctาะ
32-
เccีtยะ
33-
เccีtย(?=[เ-ไก-ฮ]|$)
34-
เcc็c
35-
เcิc์c
36-
เcิtc
37-
เcีtยะ?
38-
เcืtอะ?
39-
เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
40-
เctา?ะ?
41-
cัtวะ
42-
c[ัื]tc[ุิะ]?
43-
c[ิุู]์
44-
c[ะ-ู]t
45-
c็
46-
ct[ะาำ]?
47-
แc็c
48-
แcc์
49-
แctะ
50-
แcc็c
51-
แccc์
52-
โctะ
53-
[เ-ไ]ct
54-
""".replace(
55-
"c", "[ก-ฮ]"
56-
)
57-
.replace("t", "[่-๋]?")
58-
.split()
59-
)
60-
61-
PAT_TCC = re.compile("|".join(re_tcc))
6229
PAT_TWOCHARS = re.compile("[ก-ฮ]{,2}$")
6330

6431

65-
def tcc(w):
66-
p = 0
67-
while p < len(w):
68-
m = PAT_TCC.match(w[p:])
69-
if m:
70-
n = m.span()[1]
71-
else:
72-
n = 1
73-
yield w[p : p + n]
74-
p += n
75-
76-
7732
def tcc_pos(text):
7833
p_set = set()
7934
p = 0
80-
for w in tcc(text):
35+
for w in tcc_gen(text):
8136
p += len(w)
8237
p_set.add(p)
8338
return p_set

pythainlp/tokenize/tcc.py

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,division,unicode_literals,print_function
2+
from __future__ import absolute_import, division, print_function, unicode_literals
3+
4+
import re
5+
36
"""
4-
โปรแกรม TCC ภาษาไทย
5-
เดติด
6-
TCC : Mr.Jakkrit TeCho
7-
grammar : คุณ Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
8-
โค้ด : คุณ Korakot Chaovavanich
7+
Separate Thai text into Thai Character Cluster (TCC).
8+
Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002)
9+
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548
10+
11+
Credits:
12+
- TCC: Jakkrit TeCho
13+
- Grammar: Wittawat Jitkrittum https://github.com/wittawatj/jtcc/blob/master/TCC.g
14+
- Python code: Korakot Chaovavanich
915
"""
10-
import re
11-
pat_list = """\
16+
17+
RE_TCC = (
18+
"""\
1219
เc็c
1320
เcctาะ
1421
เccีtยะ
@@ -36,22 +43,34 @@
3643
3744
ฯลฯ
3845
39-
""".replace('c','[ก-ฮ]').replace('t', '[่-๋]?').split()
40-
def tcc1(w):
46+
""".replace(
47+
"c", "[ก-ฮ]"
48+
)
49+
.replace("t", "[่-๋]?")
50+
.split()
51+
)
52+
53+
PAT_TCC = re.compile("|".join(RE_TCC))
54+
55+
56+
def tcc_gen(w):
4157
p = 0
42-
pat = re.compile("|".join(pat_list))
43-
while p<len(w):
44-
m = pat.match(w[p:])
58+
while p < len(w):
59+
m = PAT_TCC.match(w[p:])
4560
if m:
4661
n = m.span()[1]
4762
else:
4863
n = 1
49-
yield w[p:p+n]
64+
yield w[p : p + n]
5065
p += n
51-
def tcc(w, sep='/'):
52-
return sep.join(tcc1(w))
53-
if __name__ == '__main__':
54-
print(tcc('แมวกิน'))
55-
print(tcc('ประชาชน'))
56-
print(tcc('ขุดหลุม'))
57-
print(tcc('ยินดี'))
66+
67+
68+
def tcc(w, sep="/"):
69+
return sep.join(tcc_gen(w))
70+
71+
72+
if __name__ == "__main__":
73+
print(tcc("แมวกิน"))
74+
print(tcc("ประชาชน"))
75+
print(tcc("ขุดหลุม"))
76+
print(tcc("ยินดี"))

0 commit comments

Comments
 (0)