Commit 3e622d7

wannaphong and bact authored

Add NERCut tokenization engine (#503)

* Add NERCut
* More readable variable names (#504)
* Update nercut docs and update nercut code
* Update tokenize.rst
* Update nercut.py
* Update test_tokenize.py
* Update test_tokenize.py
* Update core.py

Co-authored-by: Arthit Suriyawongkul <arthit@gmail.com>
1 parent e52b8c9 commit 3e622d7

4 files changed (+302, -180 lines)

docs/api/tokenize.rst

Lines changed: 34 additions & 14 deletions
@@ -19,34 +19,54 @@ Modules
 Tokenization Engines
 --------------------
 
-newmm
-+++++
-.. automodule:: pythainlp.tokenize.newmm
-.. autofunction:: pythainlp.tokenize.newmm.segment
+Word level
+----------
 
+attacut
++++++++
+.. automodule:: pythainlp.tokenize.attacut
 
-longest
+.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer
+    :members:
+
+deepcut
 +++++++
-.. automodule:: pythainlp.tokenize.longest
+.. automodule:: pythainlp.tokenize.deepcut
 
 multi_cut
 +++++++++
 .. automodule:: pythainlp.tokenize.multi_cut
 
+.. autofunction:: pythainlp.tokenize.multi_cut.segment
+.. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment
+
+longest
++++++++
+.. automodule:: pythainlp.tokenize.longest
+
+.. autofunction:: pythainlp.tokenize.longest.segment
+
 pyicu
 +++++
 .. automodule:: pythainlp.tokenize.pyicu
 
-deepcut
-+++++++
-.. automodule:: pythainlp.tokenize.deepcut
+nercut
+++++++
+.. automodule:: pythainlp.tokenize.nercut
 
-attacut
-+++++++
-.. automodule:: pythainlp.tokenize.attacut
+.. autofunction:: pythainlp.tokenize.nercut.segment
 
-.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer
-    :members:
+newmm
++++++
+
+The default word tokenization engine.
+
+.. automodule:: pythainlp.tokenize.newmm
+
+.. autofunction:: pythainlp.tokenize.newmm.segment
+
+Subword level
+-------------
 
 tcc
 +++

pythainlp/tokenize/core.py

Lines changed: 10 additions & 0 deletions
@@ -36,6 +36,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
         ['และ', 'คุณ', 'เล่น', 'มือถือ'],
         ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
+    if not doc or not isinstance(doc, list):
+        return []
+
     from .crfcls import segment
 
     return segment(doc)
@@ -74,6 +77,9 @@ def word_tokenize(
     * *deepcut* - wrapper for
       `DeepCut <https://github.com/rkcosmos/deepcut>`_,
       learning-based approach
+    * *nercut* - Dictionary-based maximal matching word segmentation,
+      constrained with Thai Character Cluster (TCC) boundaries,
+      and combining tokens that are parts of the same named-entity.
 
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
@@ -162,6 +168,10 @@ def word_tokenize(
     elif engine == "icu":
         from .pyicu import segment
 
+        segments = segment(text)
+    elif engine == "nercut":
+        from .nercut import segment
+
         segments = segment(text)
     else:
         raise ValueError(
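With the dispatch above in place, the new engine is selected through the regular word_tokenize API. A minimal usage sketch (the input sentence is arbitrary and the printed result is illustrative only; actual tokens depend on the ThaiNER model):

    from pythainlp.tokenize import word_tokenize

    # "nercut" matches the new elif branch in word_tokenize above.
    words = word_tokenize("วันที่ 5 มกราคม ผมไปประชุม", engine="nercut")
    print(words)
    # With DATE in the default taglist, date tokens such as
    # "5 มกราคม" may come back merged into a single token.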

pythainlp/tokenize/nercut.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
"""
nercut 0.1

Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named-entity.

Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List

from pythainlp.tag.named_entity import ThaiNameTagger

_thainer = ThaiNameTagger()


def segment(
    text: str,
    taglist: Iterable[str] = [
        "ORGANIZATION",
        "PERSON",
        "PHONE",
        "EMAIL",
        "DATE",
        "TIME",
    ],
) -> List[str]:
    """
    Dictionary-based maximal matching word segmentation, constrained with
    Thai Character Cluster (TCC) boundaries, and combining tokens that are
    parts of the same named-entity.

    :param str text: text to be tokenized into words
    :param list taglist: a list of named-entity tags to be used
    :return: list of words, tokenized from the text
    """
    if not text or not isinstance(text, str):
        return []

    global _thainer
    tagged_words = _thainer.get_ner(text, pos=False)

    words = []
    combining_word = ""
    for curr_word, curr_tag in tagged_words:
        # "B-PERSON" -> "PERSON"; the plain "O" tag stays "O".
        if curr_tag != "O":
            tag = curr_tag[2:]
        else:
            tag = "O"

        if curr_tag.startswith("B-") and tag in taglist:
            # Start of a new entity: flush any entity in progress.
            if combining_word != "":
                words.append(combining_word)
            combining_word = curr_word
        elif (
            curr_tag.startswith("I-")
            and combining_word != ""
            and tag in taglist
        ):
            # Continuation of the current entity: keep combining.
            combining_word += curr_word
        else:
            # Outside any tracked entity: flush the entity in
            # progress (if any), then emit the current token.
            if combining_word != "":
                words.append(combining_word)
            combining_word = ""
            words.append(curr_word)

    if combining_word != "":
        words.append(combining_word)

    return words
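The merging loop can be exercised without loading the tagger by replaying it on a hand-written (word, BIO-tag) sequence. A minimal sketch: merge_entities below is a hypothetical standalone copy of the loop, and the tagged pairs stand in for ThaiNameTagger.get_ner(text, pos=False) output:

    from typing import Iterable, List, Tuple

    def merge_entities(
        tagged_words: Iterable[Tuple[str, str]],
        taglist: Iterable[str] = ("PERSON",),
    ) -> List[str]:
        # Same B-/I-/O merging logic as segment(), minus the tagger.
        words: List[str] = []
        combining_word = ""
        for curr_word, curr_tag in tagged_words:
            tag = curr_tag[2:] if curr_tag != "O" else "O"
            if curr_tag.startswith("B-") and tag in taglist:
                if combining_word:
                    words.append(combining_word)
                combining_word = curr_word
            elif curr_tag.startswith("I-") and combining_word and tag in taglist:
                combining_word += curr_word
            else:
                if combining_word:
                    words.append(combining_word)
                combining_word = ""
                words.append(curr_word)
        if combining_word:
            words.append(combining_word)
        return words

    # Hand-written tags standing in for real tagger output:
    tagged = [
        ("ผม", "O"),
        ("ชื่อ", "O"),
        ("ทดสอบ", "B-PERSON"),
        (" ", "I-PERSON"),
        ("ระบบ", "I-PERSON"),
    ]
    print(merge_entities(tagged))
    # -> ['ผม', 'ชื่อ', 'ทดสอบ ระบบ']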
