Commit cbf9360

Merge pull request #562 from PyThaiNLP/fix-461
Tokenize repeating dots and commas from numbers (fix #461)
2 parents 3e4b585 + 86eae1c commit cbf9360

File tree

  pythainlp/tokenize/multi_cut.py
  pythainlp/tokenize/newmm.py
  tests/test_tokenize.py

3 files changed: +94 -42 lines changed

pythainlp/tokenize/multi_cut.py

Lines changed: 33 additions & 28 deletions
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Multi cut -- Thai word segmentation with maximum matching. The original source
-code is from Korakot Chaovavanich.
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.

 :See Also:
     * `Facebook post \
@@ -12,16 +12,14 @@

 import re
 from collections import defaultdict
-from typing import List
+from typing import Iterator, List

 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
 from pythainlp.util import Trie


 class LatticeString(str):
-    """
-    String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี
-    """
+    """String that keeps possible tokenizations"""

     def __new__(cls, value, multi=None, in_dict=True):
         return str.__new__(cls, value)
@@ -34,22 +32,22 @@ def __init__(self, value, multi=None, in_dict=True):
             self.unique = False
         else:
             self.multi = [value]
-        self.in_dict = in_dict  # บอกว่าเป็นคำมีในดิกหรือเปล่า
+        self.in_dict = in_dict  # if in dictionary


 _RE_NONTHAI = r"""(?x)
-[-a-zA-Z]+| # Latin
-\d[\d,\.]*| # number
-[ \t]+| # space
-\r?\n # newline
+[-a-zA-Z]+|     # Latin characters
+\d+([,\.]\d+)*| # number
+[ \t]+|         # space
+\r?\n           # newline
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)


-def _multicut(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE):
-    """
-    ส่งคืน LatticeString คืนมาเป็นก้อนๆ
-    """
+def _multicut(
+    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
+) -> Iterator[LatticeString]:
+    """Return LatticeString"""
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
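
The heart of the fix is the number alternative: the old pattern \d[\d,\.]* lets dots and commas trail the digits, while the new \d+([,\.]\d+)* accepts a separator only when more digits follow it. A minimal standalone sketch (not part of the commit) comparing the two patterns:

import re

# Old and new "number" alternatives from _RE_NONTHAI (verbose-mode regexes).
OLD_NUMBER = re.compile(r"(?x) \d[\d,\.]*")      # dots/commas may trail the digits
NEW_NUMBER = re.compile(r"(?x) \d+([,\.]\d+)*")  # separator must be followed by digits

for s in ["19...", "19.", "19.84", "127.0.0.1", "1,984.42"]:
    print(s, "->", OLD_NUMBER.match(s).group(), "vs", NEW_NUMBER.match(s).group())

# The old pattern swallows trailing punctuation ("19..." -> "19..."),
# the new one stops at the last digit ("19..." -> "19"), while decimals,
# IP-like strings, and thousand separators still match in full.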

@@ -100,15 +98,15 @@ def serialize(p, p2):  # helper function
             q.add(i)


-def mmcut(text: str):
+def mmcut(text: str) -> List[str]:
     res = []
     for w in _multicut(text):
         mm = min(w.multi, key=lambda x: x.count("/"))
         res.extend(mm.split("/"))
     return res


-def _combine(ww: str):
+def _combine(ww: List[LatticeString]) -> Iterator[str]:
     if ww == []:
         yield ""
     else:
@@ -124,12 +122,15 @@ def _combine(ww: str):
 def segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Dictionary-based maximum matching word segmentation.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :return: list of words, tokenized from the text
+    """Dictionary-based maximum matching word segmentation.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segmented tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
@@ -140,11 +141,15 @@ def segment(
 def find_all_segment(
     text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
 ) -> List[str]:
-    """
-    Get all possible segment variations
-
-    :param str text: input string to be tokenized
-    :return: returns list of segment variations
+    """Get all possible segment variations.
+
+    :param text: input string to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :return: list of segment variations
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
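
For reference, a short usage sketch of the three public functions touched in this file (illustrative, not part of the commit; actual output depends on the dictionary in use):

from pythainlp.tokenize import multi_cut

text = "ทดสอบ19.84"
print(multi_cut.segment(text))           # maximum-matching segmentation
print(multi_cut.mmcut(text))             # variant that keeps the fewest tokens per chunk
print(multi_cut.find_all_segment(text))  # all segmentation variations the lattice allows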

pythainlp/tokenize/newmm.py

Lines changed: 21 additions & 14 deletions
@@ -25,10 +25,10 @@
 # match non-Thai tokens
 _PAT_NONTHAI = re.compile(
     r"""(?x)
-    [-a-zA-Z]+| # Latin characters
-    \d[\d,\.]*| # number
-    [ \t]+| # space
-    \r?\n # newline
+    [-a-zA-Z]+|     # Latin characters
+    \d+([,\.]\d+)*| # number
+    [ \t]+|         # space
+    \r?\n           # newline
     """
 )

@@ -138,16 +138,23 @@ def segment(
     custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
     safe_mode: bool = False,
 ) -> List[str]:
-    """
-    Dictionary-based maximal matching word segmentation, constrained with
-    Thai Character Cluster boundaries.
-
-    :param str text: text to be tokenized to words
-    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
-    :param bool safe_mode: True to help avoid long wait for text with long\
-        and continuous ambiguous breaking points. Long wait may still able\
-        to occur. Default is False.
-    :return: list of words, tokenized from the text
+    """Maximal-matching word segmentation, Thai Character Cluster constrained.
+
+    A dictionary-based word segmentation using maximal matching algorithm,
+    constrained to Thai Character Cluster boundaries.
+
+    A custom dictionary can be supplied.
+
+    :param text: text to be tokenized
+    :type text: str
+    :param custom_dict: tokenization dictionary,\
+        defaults to DEFAULT_WORD_DICT_TRIE
+    :type custom_dict: Trie, optional
+    :param safe_mode: reduce chance for long processing time in long text\
+        with many ambiguous breaking points, defaults to False
+    :type safe_mode: bool, optional
+    :return: list of tokens
+    :rtype: List[str]
     """
     if not text or not isinstance(text, str):
         return []
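
The newmm engine shares the same number pattern, so the behaviour change carries over. A usage sketch of segment (illustrative, not part of the commit; expected outputs taken from the new tests below):

from pythainlp.tokenize import newmm

print(newmm.segment("19..."))        # expected: ['19', '...']
print(newmm.segment("USD1,984.42"))  # expected: ['USD', '1,984.42']

# safe_mode=True reduces the chance of long processing time on long text
# with many ambiguous break points, as described in the docstring above.
print(newmm.segment("ทดสอบ" * 1000, safe_mode=True))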

tests/test_tokenize.py

Lines changed: 40 additions & 0 deletions
@@ -450,6 +450,26 @@ def test_mm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="mm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="mm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="mm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="mm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="mm"),
+            ['USD', '1,984.42'],
+        )

         self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))

@@ -465,6 +485,26 @@ def test_newmm(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertEqual(
+            word_tokenize("19...", engine="newmm"),
+            ['19', '...'],
+        )
+        self.assertEqual(
+            word_tokenize("19.", engine="newmm"),
+            ['19', '.'],
+        )
+        self.assertEqual(
+            word_tokenize("19.84", engine="newmm"),
+            ['19.84'],
+        )
+        self.assertEqual(
+            word_tokenize("127.0.0.1", engine="newmm"),
+            ["127.0.0.1"],
+        )
+        self.assertEqual(
+            word_tokenize("USD1,984.42", engine="newmm"),
+            ['USD', '1,984.42'],
+        )
         self.assertEqual(
             word_tokenize(
                 "สวัสดีครับ สบายดีไหมครับ",
