Remove duplicated code in tcc.py and newmm.py.

bact · bact · commit 8035ea9c3d4c · 2018-10-12T17:46:04.000+07:00
Now newmm.py will call tcc_gen() [previously named tcc1()] from tcc.py.

Note that this may change the tokenization behaviour slightly, as the TCC patterns in tcc.py and newmm.py differ by three lines.

I am sticking with tcc.py's version, which has these three additional lines:
----
ๆ
ฯลฯ
ฯ
----
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-"""ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ TCC
+"""ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ Thai Character Cluster (TCC)
 พัฒนาโดยคุณ Korakot Chaovavanich
 Notebooks:
 https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
@@ -14,6 +14,8 @@
 
 from pythainlp.tokenize import DEFAULT_DICT_TRIE
 
+from .tcc import tcc_gen
+
 # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
 PAT_ENG = re.compile(
     r"""(?x)
@@ -24,60 +26,13 @@
 """
 )
 
-# TCC
-re_tcc = (
-    """\
-เc็c
-เcctาะ
-เccีtยะ
-เccีtย(?=[เ-ไก-ฮ]|$)
-เcc็c
-เcิc์c
-เcิtc
-เcีtยะ?
-เcืtอะ?
-เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
-เctา?ะ?
-cัtวะ
-c[ัื]tc[ุิะ]?
-c[ิุู]์
-c[ะ-ู]t
-c็
-ct[ะาำ]?
-แc็c
-แcc์
-แctะ
-แcc็c
-แccc์
-โctะ
-[เ-ไ]ct
-""".replace(
-        "c", "[ก-ฮ]"
-    )
-    .replace("t", "[่-๋]?")
-    .split()
-)
-
-PAT_TCC = re.compile("|".join(re_tcc))
 PAT_TWOCHARS = re.compile("[ก-ฮ]{,2}$")
 
 
-def tcc(w):
-    p = 0
-    while p < len(w):
-        m = PAT_TCC.match(w[p:])
-        if m:
-            n = m.span()[1]
-        else:
-            n = 1
-        yield w[p : p + n]
-        p += n
-
-
 def tcc_pos(text):
     p_set = set()
     p = 0
-    for w in tcc(text):
+    for w in tcc_gen(text):
         p += len(w)
         p_set.add(p)
     return p_set
diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py
@@ -1,14 +1,21 @@
 ﻿# -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals,print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+
 """
-โปรแกรม TCC ภาษาไทย
-เดติด
-TCC : Mr.Jakkrit TeCho
-grammar : คุณ Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
-โค้ด : คุณ Korakot Chaovavanich
+Separate Thai text into Thai Character Cluster (TCC).
+Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002)
+http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548
+
+Credits:
+- TCC: Jakkrit TeCho
+- Grammar: Wittawat Jitkrittum https://github.com/wittawatj/jtcc/blob/master/TCC.g
+- Python code: Korakot Chaovavanich
 """
-import re
-pat_list = """\
+
+RE_TCC = (
+    """\
 เc็c
 เcctาะ
 เccีtยะ
@@ -36,22 +43,34 @@
 ๆ
 ฯลฯ
 ฯ
-""".replace('c','[ก-ฮ]').replace('t', '[่-๋]?').split()
-def tcc1(w):
+""".replace(
+        "c", "[ก-ฮ]"
+    )
+    .replace("t", "[่-๋]?")
+    .split()
+)
+
+PAT_TCC = re.compile("|".join(RE_TCC))
+
+
+def tcc_gen(w):
     p = 0
-    pat = re.compile("|".join(pat_list))
-    while p<len(w):
-        m = pat.match(w[p:])
+    while p < len(w):
+        m = PAT_TCC.match(w[p:])
         if m:
             n = m.span()[1]
         else:
             n = 1
-        yield w[p:p+n]
+        yield w[p : p + n]
         p += n
-def tcc(w, sep='/'):
-    return sep.join(tcc1(w))
-if __name__ == '__main__':
-    print(tcc('แมวกิน'))
-    print(tcc('ประชาชน'))
-    print(tcc('ขุดหลุม'))
-    print(tcc('ยินดี'))
+
+
+def tcc(w, sep="/"):
+    return sep.join(tcc_gen(w))
+
+
+if __name__ == "__main__":
+    print(tcc("แมวกิน"))
+    print(tcc("ประชาชน"))
+    print(tcc("ขุดหลุม"))
+    print(tcc("ยินดี"))