File tree Expand file tree Collapse file tree 2 files changed +44
-70
lines changed
Expand file tree Collapse file tree 2 files changed +44
-70
lines changed Original file line number Diff line number Diff line change 11# -*- coding: utf-8 -*-
22
3- """ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ TCC
3+ """ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ Thai Character Cluster (TCC)
44พัฒนาโดยคุณ Korakot Chaovavanich
55Notebooks:
66https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
1414
1515from pythainlp .tokenize import DEFAULT_DICT_TRIE
1616
17+ from .tcc import tcc_gen
18+
1719# ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
1820PAT_ENG = re .compile (
1921 r"""(?x)
2426"""
2527)
2628
27- # TCC
28- re_tcc = (
29- """\
30- เc็c
31- เcctาะ
32- เccีtยะ
33- เccีtย(?=[เ-ไก-ฮ]|$)
34- เcc็c
35- เcิc์c
36- เcิtc
37- เcีtยะ?
38- เcืtอะ?
39- เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
40- เctา?ะ?
41- cัtวะ
42- c[ัื]tc[ุิะ]?
43- c[ิุู]์
44- c[ะ-ู]t
45- c็
46- ct[ะาำ]?
47- แc็c
48- แcc์
49- แctะ
50- แcc็c
51- แccc์
52- โctะ
53- [เ-ไ]ct
54- """ .replace (
55- "c" , "[ก-ฮ]"
56- )
57- .replace ("t" , "[่-๋]?" )
58- .split ()
59- )
60-
61- PAT_TCC = re .compile ("|" .join (re_tcc ))
6229PAT_TWOCHARS = re .compile ("[ก-ฮ]{,2}$" )
6330
6431
65- def tcc (w ):
66- p = 0
67- while p < len (w ):
68- m = PAT_TCC .match (w [p :])
69- if m :
70- n = m .span ()[1 ]
71- else :
72- n = 1
73- yield w [p : p + n ]
74- p += n
75-
76-
def tcc_pos(text):
    """Return the set of end offsets of the character clusters in *text*.

    Each offset is the running total of the lengths of the clusters
    produced by ``tcc_gen(text)``, i.e. the index just past each cluster.
    """
    boundaries = set()
    offset = 0
    for cluster in tcc_gen(text):
        # Advance to the end of this cluster and record that position.
        offset += len(cluster)
        boundaries.add(offset)
    return boundaries
Original file line number Diff line number Diff line change 11# -*- coding: utf-8 -*-
2- from __future__ import absolute_import ,division ,unicode_literals ,print_function
2+ from __future__ import absolute_import , division , print_function , unicode_literals
3+
4+ import re
5+
36"""
4- โปรแกรม TCC ภาษาไทย
5- เดติด
6- TCC : Mr.Jakkrit TeCho
7- grammar : คุณ Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
8- โค้ด : คุณ Korakot Chaovavanich
7+ Separate Thai text into Thai Character Cluster (TCC).
8+ Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002)
9+ http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548
10+
11+ Credits:
12+ - TCC: Jakkrit TeCho
13+ - Grammar: Wittawat Jitkrittum https://github.com/wittawatj/jtcc/blob/master/TCC.g
14+ - Python code: Korakot Chaovavanich
915"""
10- import re
11- pat_list = """\
16+
17+ RE_TCC = (
18+ """\
1219 เc็c
1320เcctาะ
1421เccีtยะ
3643ๆ
3744ฯลฯ
3845ฯ
39- """ .replace ('c' ,'[ก-ฮ]' ).replace ('t' , '[่-๋]?' ).split ()
40- def tcc1 (w ):
46+ """ .replace (
47+ "c" , "[ก-ฮ]"
48+ )
49+ .replace ("t" , "[่-๋]?" )
50+ .split ()
51+ )
52+
53+ PAT_TCC = re .compile ("|" .join (RE_TCC ))
54+
55+
def tcc_gen(w):
    """Yield the Thai Character Clusters (TCC) of *w*, in order.

    At each position the longest-first alternation in ``PAT_TCC`` is tried;
    when it matches, the matched span is yielded as one cluster, otherwise
    a single character is yielded. Concatenating the yielded pieces
    reproduces *w*.

    :param w: text to split
    :return: generator of substrings of *w*
    """
    p = 0
    length = len(w)
    while p < length:
        # Match in place at offset p rather than against a fresh copy
        # ``w[p:]``; slicing the tail on every step makes the loop
        # quadratic. The two forms agree as long as the pattern has no
        # ``^``, ``\b`` or look-behind, which the alternatives here do not.
        m = PAT_TCC.match(w, p)
        # No match (or a zero-width one) falls back to a single character
        # so the loop always makes progress.
        end = m.end() if m and m.end() > p else p + 1
        yield w[p:end]
        p = end
51- def tcc (w , sep = '/' ):
52- return sep .join (tcc1 (w ))
53- if __name__ == '__main__' :
54- print (tcc ('แมวกิน' ))
55- print (tcc ('ประชาชน' ))
56- print (tcc ('ขุดหลุม' ))
57- print (tcc ('ยินดี' ))
66+
67+
def tcc(w, sep="/"):
    """Return *w* with its character clusters joined by *sep*.

    The clusters are those produced by ``tcc_gen(w)``.
    """
    clusters = tcc_gen(w)
    return sep.join(clusters)
70+
71+
if __name__ == "__main__":
    # Small manual demo: print the cluster segmentation of a few sample words.
    for sample in ("แมวกิน", "ประชาชน", "ขุดหลุม", "ยินดี"):
        print(tcc(sample))
You can’t perform that action at this time.
0 commit comments