1 change: 1 addition & 0 deletions docker_requirements.txt
@@ -27,3 +27,4 @@ tensorflow==2.5.1
pandas==0.24
tltk==1.3.8
OSKut==1.3
nlpo3==1.2.1
9 changes: 8 additions & 1 deletion docs/api/tokenize.rst
@@ -50,6 +50,13 @@ multi_cut
.. autofunction:: pythainlp.tokenize.multi_cut.segment
.. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment

nlpo3
+++++
.. automodule:: pythainlp.tokenize.nlpo3

.. autofunction:: pythainlp.tokenize.nlpo3.load_dict
.. autofunction:: pythainlp.tokenize.nlpo3.segment

longest
+++++++
.. automodule:: pythainlp.tokenize.longest
@@ -98,4 +105,4 @@ etcc
++++
.. automodule:: pythainlp.tokenize.etcc

.. autofunction:: pythainlp.tokenize.etcc.segment
.. autofunction:: pythainlp.tokenize.etcc.segment
14 changes: 14 additions & 0 deletions pythainlp/tokenize/core.py
@@ -69,6 +69,7 @@ def word_tokenize(
Thai Character Cluster
* *newmm-safe* - newmm, with a mechanism to help avoid long
processing time for text with continuous ambiguous breaking points
* *nlpo3* - Python binding for nlpO3, a Rust implementation of the newmm engine
* *longest* - dictionary-based, Longest Matching
* *icu* - wrapper for ICU (International Components for Unicode,
using PyICU), dictionary-based
@@ -192,6 +193,19 @@ def word_tokenize(
        from pythainlp.tokenize.oskut import segment

        segments = segment(text)
    elif engine == "nlpo3":
        from pythainlp.tokenize.nlpo3 import segment

        if isinstance(custom_dict, str):
            segments = segment(text, custom_dict=custom_dict)
        elif custom_dict is not None:
            raise ValueError(
                f"""Tokenizer \"{engine}\":
                custom_dict must be a str.
                It is a dictionary name as assigned with load_dict().
                See pythainlp.tokenize.nlpo3.load_dict()"""
            )
        else:
            segments = segment(text)
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
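
For context, a minimal usage sketch of the new engine option (illustrative only, not part of the diff): the word-list path and the dictionary name "my_dict" below are hypothetical, and load_dict() comes from the new pythainlp.tokenize.nlpo3 module added further down.

from pythainlp.tokenize import word_tokenize
from pythainlp.tokenize.nlpo3 import load_dict

# Tokenize with the bundled default dictionary via the nlpO3 engine
tokens = word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="nlpo3")

# With engine="nlpo3", custom_dict is a dictionary *name* registered
# through load_dict(); the path and the name here are placeholders.
load_dict("/path/to/my_wordlist.txt", "my_dict")
tokens = word_tokenize("ฉันรักภาษาไทย", engine="nlpo3", custom_dict="my_dict")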
68 changes: 68 additions & 0 deletions pythainlp/tokenize/nlpo3.py
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
from sys import stderr
from typing import List

from nlpo3 import segment as nlpo3_segment
from nlpo3 import load_dict as nlpo3_load_dict
from pythainlp.corpus.common import _THAI_WORDS_FILENAME
from pythainlp.corpus import path_pythainlp_corpus

_NLPO3_DEFAULT_DICT_NAME = "_67a47bf9"
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
    path_pythainlp_corpus(_THAI_WORDS_FILENAME),
    _NLPO3_DEFAULT_DICT_NAME
)


def load_dict(file_path: str, dict_name: str) -> bool:
    """Load a dictionary file into an in-memory dictionary collection.

    The loaded dictionary will be accessible through the assigned dict_name.
    *** This function does not overwrite an existing dict name. ***

    :param file_path: path to a dictionary file
    :type file_path: str
    :param dict_name: a unique dictionary name, used for reference
    :type dict_name: str
    :return: True if the dictionary was loaded successfully, False otherwise
    :rtype: bool

    :See Also:
        * \
            https://github.com/PyThaiNLP/nlpo3
    """
    msg, success = nlpo3_load_dict(file_path=file_path, dict_name=dict_name)
    if not success:
        print(msg, file=stderr)
    return success


def segment(
    text: str,
    custom_dict: str = _NLPO3_DEFAULT_DICT_NAME,
    safe_mode: bool = False,
    parallel_mode: bool = False
) -> List[str]:
    """Break text into tokens.

    Python binding for nlpO3, a Rust implementation of the newmm engine.

    :param str text: text to be tokenized
    :param str custom_dict: dictionary name, as assigned with load_dict(),\
        defaults to the standard dictionary (pythainlp/corpus/common/words_th.txt)
    :param bool safe_mode: reduce the chance of long processing time for text\
        with many ambiguous breaking points, defaults to False
    :param bool parallel_mode: use multithreaded mode, defaults to False

    :return: list of tokens
    :rtype: List[str]

    :See Also:
        * \
            https://github.com/PyThaiNLP/nlpo3
    """
    return nlpo3_segment(
        text=text,
        dict_name=custom_dict,
        safe=safe_mode,
        parallel=parallel_mode
    )
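
As a rough sketch of how the new module itself is meant to be used (the custom word-list path and the name "my_dict" are placeholder assumptions):

from pythainlp.tokenize.nlpo3 import load_dict, segment

# Tokenize with the default dictionary (words_th.txt), loaded at import time
print(segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"))

# Register a custom word list under a name, then tokenize with it
if load_dict("/path/to/custom_words.txt", "my_dict"):
    print(segment("ฉันรักภาษาไทย", custom_dict="my_dict", parallel_mode=True))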
2 changes: 2 additions & 0 deletions setup.py
@@ -75,6 +75,7 @@
],
"tltk": ["tltk>=1.3.8"],
"oskut": ["oskut>=1.3"],
"nlpo3": ["nlpo3>=1.2.1"],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
@@ -99,6 +100,7 @@
"symspellpy>=6.7.0",
"tltk>=1.3.8",
"oskut>=1.3",
"nlpo3>=1.2.1",
],
}

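
With this extras entry in place, the binding should be installable as an optional dependency, e.g. pip install pythainlp[nlpo3], and it is also pulled in by the full extra.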
1 change: 1 addition & 0 deletions tests/test_tokenize.py
@@ -372,6 +372,7 @@ def test_word_tokenize(self):
word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
)
self.assertIsNotNone(word_tokenize(self.text_1, engine="nlpo3"))
self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut"))
self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut"))
self.assertIsNotNone(word_tokenize(self.text_1, engine="icu"))