
Commit 7a41b7b

Merge pull request #622 from PyThaiNLP/add-nlpo3
[WIP] Add nlpo3
2 parents d8a23ee + 8ef30c5 commit 7a41b7b

File tree

6 files changed: +94 -1 lines changed

docker_requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -27,3 +27,4 @@ tensorflow==2.5.1
 pandas==0.24
 tltk==1.3.8
 OSKut==1.3
+nlpo3==1.2.1

docs/api/tokenize.rst

Lines changed: 8 additions & 1 deletion

@@ -50,6 +50,13 @@ multi_cut
 .. autofunction:: pythainlp.tokenize.multi_cut.segment
 .. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment

+nlpo3
++++++
+.. automodule:: pythainlp.tokenize.nlpo3
+
+.. autofunction:: pythainlp.tokenize.nlpo3.load_dict
+.. autofunction:: pythainlp.tokenize.nlpo3.segment
+
 longest
 +++++++
 .. automodule:: pythainlp.tokenize.longest
@@ -98,4 +105,4 @@ etcc
 ++++
 .. automodule:: pythainlp.tokenize.etcc

-.. autofunction:: pythainlp.tokenize.etcc.segment
+.. autofunction:: pythainlp.tokenize.etcc.segment

pythainlp/tokenize/core.py

Lines changed: 14 additions & 0 deletions

@@ -69,6 +69,7 @@ def word_tokenize(
           Thai Character Cluster
         * *newmm-safe* - newmm, with a mechanism to help avoid long
           processing time for text with continuous ambiguous breaking points
+        * *nlpo3* - Python binding for nlpO3. It is the newmm engine in Rust.
         * *longest* - dictionary-based, Longest Matching
         * *icu* - wrapper for ICU (International Components for Unicode,
           using PyICU), dictionary-based
@@ -192,6 +193,19 @@ def word_tokenize(
         from pythainlp.tokenize.oskut import segment

         segments = segment(text)
+    elif engine == "nlpo3":
+        from pythainlp.tokenize.nlpo3 import segment
+        if isinstance(custom_dict, str):
+            segments = segment(text, custom_dict=custom_dict)
+        elif custom_dict is not None:
+            raise ValueError(
+                f"""Tokenizer \"{engine}\":
+                custom_dict must be a str.
+                It is a dictionary name as assigned with load_dict().
+                See pythainlp.tokenize.nlpo3.load_dict()"""
+            )
+        else:
+            segments = segment(text)
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.

pythainlp/tokenize/nlpo3.py

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+from sys import stderr
+from typing import List
+
+from nlpo3 import segment as nlpo3_segment
+from nlpo3 import load_dict as nlpo3_load_dict
+from pythainlp.corpus.common import _THAI_WORDS_FILENAME
+from pythainlp.corpus import path_pythainlp_corpus
+
+_NLPO3_DEFAULT_DICT_NAME = "_67a47bf9"
+_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
+    path_pythainlp_corpus(_THAI_WORDS_FILENAME),
+    _NLPO3_DEFAULT_DICT_NAME
+)
+
+
+def load_dict(file_path: str, dict_name: str) -> bool:
+    """Load a dictionary file into an in-memory dictionary collection.
+
+    The loaded dictionary will be accessible through the assigned dict_name.
+    *** This function does not override an existing dict name. ***
+
+    :param file_path: Path to a dictionary file
+    :type file_path: str
+    :param dict_name: A unique dictionary name, used for reference.
+    :type dict_name: str
+    :return: True if the dictionary was loaded successfully, False otherwise
+
+    :See Also:
+        * \
+            https://github.com/PyThaiNLP/nlpo3
+    """
+    msg, success = nlpo3_load_dict(file_path=file_path, dict_name=dict_name)
+    if not success:
+        print(msg, file=stderr)
+    return success
+
+
+def segment(
+    text: str,
+    custom_dict: str = _NLPO3_DEFAULT_DICT_NAME,
+    safe_mode: bool = False,
+    parallel_mode: bool = False
+) -> List[str]:
+    """Break text into tokens.
+
+    Python binding for nlpO3. It is the newmm engine in Rust.
+
+    :param str text: text to be tokenized
+    :param str custom_dict: dictionary name, as assigned with load_dict(),\
+        defaults to pythainlp/corpus/common/words_th.txt
+    :param bool safe_mode: reduce chance for long processing time in long\
+        text with many ambiguous breaking points, defaults to False
+    :param bool parallel_mode: Use multithread mode, defaults to False
+
+    :return: list of tokens
+    :rtype: List[str]
+
+    :See Also:
+        * \
+            https://github.com/PyThaiNLP/nlpo3
+    """
+    return nlpo3_segment(
+        text=text,
+        dict_name=custom_dict,
+        safe=safe_mode,
+        parallel=parallel_mode
+    )
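A short sketch of using this wrapper module directly (the file path and the dictionary name "custom" are hypothetical):

    from pythainlp.tokenize.nlpo3 import load_dict, segment

    # Tokenize with the default dictionary (words_th.txt, loaded at import time).
    print(segment("ฉันรักภาษาไทย"))

    # load_dict returns a bool and will not overwrite an existing dict_name.
    if load_dict("/path/to/custom_words.txt", "custom"):  # hypothetical
        print(segment("ฉันรักภาษาไทย", custom_dict="custom", safe_mode=True))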

setup.py

Lines changed: 2 additions & 0 deletions

@@ -75,6 +75,7 @@
     ],
     "tltk": ["tltk>=1.3.8"],
     "oskut": ["oskut>=1.3"],
+    "nlpo3": ["nlpo3>=1.2.1"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -99,6 +100,7 @@
         "symspellpy>=6.7.0",
         "tltk>=1.3.8",
         "oskut>=1.3",
+        "nlpo3>=1.2.1",
     ],
 }
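With the new extras key, the Rust binding becomes an optional dependency; it should be installable with, for example, pip install pythainlp[nlpo3], and it is also pulled in by the existing "full" extra.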

tests/test_tokenize.py

Lines changed: 1 addition & 0 deletions

@@ -372,6 +372,7 @@ def test_word_tokenize(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertIsNotNone(word_tokenize(self.text_1, engine="nlpo3"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="icu"))
