PyThaiNLP · bact · Jun 4, 2021 · May 17, 2021 · May 17, 2021 · May 17, 2021
diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
@@ -55,4 +55,4 @@ jobs:
         COVERALLS_SERVICE_NAME: github
       run: |
         coverage run -m unittest discover
-        CI_BRANCH=${GITHUB_REF#"ref/heads"} COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS_REPO_TOKEN }} coveralls
+        coveralls
diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt
@@ -16243,9 +16243,9 @@
 ซ้วยสวย
 ซ่อก ๆ
 ซ็อกเก็ต
-ซ็อกเกอร์
 ซ่อก
 ซอก
+ซอกเกอร์
 ซอกซอน
 ซอกซอย
 ซอกซัง
@@ -16396,7 +16396,6 @@
 ซับพระพักตร์
 ซับพรามส์
 ซับพลายเออร์
-ซับพอร์ท
 ซับไพร์ม
 ซับไพรม์
 ซับเหงื่อ
@@ -16594,7 +16593,6 @@
 ซีเกมส์
 ซีคอน
 ซีคอนสแควร์
-ซีคอนแสควร์
 ซีเครเทส
 ซีเคียวริไทเซชั่น
 ซี่โครง
@@ -16653,7 +16651,6 @@
 ซีเรียส
 ซีโรโทนิน
 ซีไรต์
-ซีไรท์
 ซีล
 ซีลีเนียม
 ซีโลส
@@ -16774,6 +16771,7 @@
 ซูเปอร์โบว์ล
 ซูเปอร์เฟเธอร์เวท
 ซูเปอร์สตาร์
+ซูเปอร์มาร์เก็ต
 ซูพรีโม
 ซูม
 ซูโม่
@@ -16903,7 +16901,6 @@
 เซอร์โคเนียม
 เซอร์เบีย
 เซอร์ไพรซ์
-เซอร์ไพร์ส
 เซอร์ไพรส์
 เซอร์รัส
 เซอร์รารี่
@@ -16992,7 +16989,7 @@
 โซเดียมซัลเฟต
 โซเดียมไซคลาเมต
 โซเดียมไนเตรท
-โซเดียมไฮดรอกไซค์
+โซเดียมไฮดรอกไซด์
 โซเดียมไฮโดรเจนกลูทาเมต
 โซเดียมไฮโดรเจนคาร์บอเนต
 โซเดียมไฮโดรเจนซัลเฟต
@@ -17102,7 +17099,6 @@
 ญิบ
 ญี่
 ญี่ปุ่น
-ญีปุ่น
 เญยธรรม
 ไญยธรรม
 ฎีกา
@@ -17129,7 +17125,6 @@
 ฐานะ
 ฐานะทางการเงิน
 ฐานันดร
-ฐานันดรที่
 ฐานันดรที่สี่
 ฐานันดรศักดิ์
 ฐานานุกรม
@@ -20941,7 +20936,6 @@
 เตลุคู
 เตว็ด
 เตหะราน
-เตอร์กเมนิสถาน
 เต๊อะ
 เตอะ
 เต่อ
@@ -25158,7 +25152,6 @@
 นอร์ฟล็อกซาซิน
 นอร์วีเจียน
 นอร์เวย์
-นอรเวย์
 นอร์เอพิเนฟริน
 นอสตราดามุส
 นะคะ
@@ -26198,7 +26191,6 @@
 นิ
 นิบาตชาดก
 นิบาต
-นิปปอนซุปเปอร์พรีซิชั่น
 นิปปอน
 นิปริยาย
 นิปัจการ
@@ -48706,11 +48698,9 @@
 เวิร์ค
 เวิร์ด
 เวิร์ม
-เวิร์ลด์
-เวิร์ลไวด์เว็บ
+เวิลด์ไวด์เว็บ
 เวิลด์เทรดเซ็นเตอร์
 เวิลด์เทรด
-เวิล์ด
 เวิลด์
 เวียงจันทน์
 เวียงวัง
@@ -48800,7 +48790,6 @@
 โว้ย
 โวย
 โวลต์
-โวลล์แมชชีน
 โว้เว้
 โวสาน
 โวหาร
@@ -48819,7 +48808,7 @@
 ไวฑูรย์
 ไว้ตัว
 ไวตามิน
-ไวตามิลค์
+ไวตามิ้ลค์
 ไวท์เทนนิ่ง
 ไวท์เทนเนอร์
 ไวทย์
@@ -48856,7 +48845,6 @@
 ไวอะกร้า
 ไวอากร้า
 ไวอาก้า
-ไวอาร์ตีลเอส
 ไว้อาลัย
 ไวโอมิง
 ไวโอลิน
@@ -52274,8 +52262,6 @@
 สารกัมมันตรังสี
 สารกึ่งตัวนำ
 สารขัณฑ์
-สารขันฑ์
-สารขันธ์
 สารขัน
 สารคดี
 สารเคมี
@@ -61793,13 +61779,11 @@
 ฮิรางานะ
 ฮิโรชิม่า
 ฮิโรชิมา
-ฮิลตันอะคาเดีย
 ฮิลตัน
 ฮิลล์
 ฮิลลารี
 ฮิวมัส
-ฮิวแมนเพ็บพิโลม่าไทป์
-ฮิวแมนแพบพิลโลมา
+ฮิวแมนแพปพิลโลมา
 ฮิวแมนไรต์วอตซ์
 ฮิวแมน
 ฮิสตามีน
@@ -61845,7 +61829,7 @@
 ฮื่อ
 ฮือ
 ฮุด
-ฮุนเซ็น
+ฮุน เซน
 ฮุนเซน
 ฮุนได
 ฮุบ

diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.
 
 :See Also:
     * `Facebook post \
@@ -12,16 +12,14 @@
 
 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List
 
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie
 
 
 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""
 
     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
                 self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # whether the word is in the dictionary
 
 
 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+|   # Latin
-\d[\d,\.]*|   # number
-[ \t]+|       # space
-\r?\n         # newline
+[-a-zA-Z]+|         # Latin characters
+\d+(?:[,\.]\d+)*|   # number, e.g. 1,234.56 (group is non-capturing)
+[ \t]+|             # space
+\r?\n               # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)
 
 
-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Yield the text as LatticeString chunks"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
@@ -100,15 +98,15 @@ def serialize(p, p2):  # helper function
             q.add(i)
 
 
-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:  # fewest-split candidate per chunk
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
         res.extend(mm.split("/"))
     return res
 
 
-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,15 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +141,15 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
@@ -25,10 +25,10 @@
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(
     r"""(?x)
-[-a-zA-Z]+|   # Latin characters
-\d[\d,\.]*|   # number
-[ \t]+|       # space
-\r?\n         # newline
+[-a-zA-Z]+|          # Latin characters
+\d+(?:[,\.]\d+)*|    # number, e.g. 1,234.56 (group is non-capturing)
+[ \t]+|              # space
+\r?\n                # newline
 """
 )
 
@@ -138,16 +138,23 @@ def segment(
     custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
     safe_mode: bool = False,
 ) -> List[str]:
-    """
-    Dictionary-based maximal matching word segmentation, constrained with
-    Thai Character Cluster boundaries.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :param bool safe_mode: True to help avoid long wait for text with long\
-        and continuous ambiguous breaking points. Long wait may still able\
-        to occur. Default is False.
-    :return: list of words, tokenized from the text
+    """Maximal-matching word segmentation, Thai Character Cluster constrained.
+
+    A dictionary-based word segmentation using maximal matching algorithm,
+    constrained to Thai Character Cluster boundaries.
+
+    A custom dictionary can be supplied.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :param safe_mode: reduce the chance of long processing time for long\
+        text with many ambiguous breaking points, defaults to False
+    :type safe_mode: bool, optional
+    :return: list of tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []