
Commit 7a41b7b

Merge pull request #622 from PyThaiNLP/add-nlpo3
[WIP] Add nlpo3
2 parents d8a23ee + 8ef30c5 commit 7a41b7b

File tree

6 files changed: +94 -1 lines changed

docker_requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -27,3 +27,4 @@ tensorflow==2.5.1
 pandas==0.24
 tltk==1.3.8
 OSKut==1.3
+nlpo3==1.2.1

docs/api/tokenize.rst

Lines changed: 8 additions & 1 deletion

@@ -50,6 +50,13 @@ multi_cut
 .. autofunction:: pythainlp.tokenize.multi_cut.segment
 .. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment

+nlpo3
++++++
+.. automodule:: pythainlp.tokenize.nlpo3
+
+.. autofunction:: pythainlp.tokenize.nlpo3.load_dict
+.. autofunction:: pythainlp.tokenize.nlpo3.segment
+
 longest
 +++++++
 .. automodule:: pythainlp.tokenize.longest
@@ -98,4 +105,4 @@ etcc
 ++++
 .. automodule:: pythainlp.tokenize.etcc

-.. autofunction:: pythainlp.tokenize.etcc.segment
+.. autofunction:: pythainlp.tokenize.etcc.segment

pythainlp/tokenize/core.py

Lines changed: 14 additions & 0 deletions

@@ -69,6 +69,7 @@ def word_tokenize(
           Thai Character Cluster
         * *newmm-safe* - newmm, with a mechanism to help avoid long
           processing time for text with continuous ambiguous breaking points
+        * *nlpo3* - Python binding for nlpO3. It is the newmm engine in Rust.
         * *longest* - dictionary-based, Longest Matching
         * *icu* - wrapper for ICU (International Components for Unicode,
           using PyICU), dictionary-based
@@ -192,6 +193,19 @@ def word_tokenize(
         from pythainlp.tokenize.oskut import segment

         segments = segment(text)
+    elif engine == "nlpo3":
+        from pythainlp.tokenize.nlpo3 import segment
+        if isinstance(custom_dict, str):
+            segments = segment(text, custom_dict=custom_dict)
+        elif custom_dict is not None:
+            raise ValueError(
+                f"""Tokenizer \"{engine}\":
+                custom_dict must be a str.
+                It is a dictionary name as assigned with load_dict().
+                See pythainlp.tokenize.nlpo3.load_dict()"""
+            )
+        else:
+            segments = segment(text)
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.

pythainlp/tokenize/nlpo3.py

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+from sys import stderr
+from typing import List
+
+from nlpo3 import segment as nlpo3_segment
+from nlpo3 import load_dict as nlpo3_load_dict
+from pythainlp.corpus.common import _THAI_WORDS_FILENAME
+from pythainlp.corpus import path_pythainlp_corpus
+
+_NLPO3_DEFAULT_DICT_NAME = "_67a47bf9"
+_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
+    path_pythainlp_corpus(_THAI_WORDS_FILENAME),
+    _NLPO3_DEFAULT_DICT_NAME
+)
+
+
+def load_dict(file_path: str, dict_name: str) -> bool:
+    """Load a dictionary file into an in-memory dictionary collection.
+
+    The loaded dictionary will be accessible through the assigned dict_name.
+    *** This function does not override an existing dict name. ***
+
+    :param file_path: Path to a dictionary file
+    :type file_path: str
+    :param dict_name: A unique dictionary name, used for reference.
+    :type dict_name: str
+    :return: True if the dictionary was loaded successfully, False otherwise
+
+    :See Also:
+        * \
+            https://github.com/PyThaiNLP/nlpo3
+    """
+    msg, success = nlpo3_load_dict(file_path=file_path, dict_name=dict_name)
+    if not success:
+        print(msg, file=stderr)
+    return success
+
+
+def segment(
+    text: str,
+    custom_dict: str = _NLPO3_DEFAULT_DICT_NAME,
+    safe_mode: bool = False,
+    parallel_mode: bool = False
+) -> List[str]:
+    """Break text into tokens.
+
+    Python binding for nlpO3. It is the newmm engine in Rust.
+
+    :param str text: text to be tokenized
+    :param str custom_dict: dictionary name, as assigned with load_dict(),\
+        defaults to pythainlp/corpus/common/words_th.txt
+    :param bool safe_mode: reduce chance for long processing time in long\
+        text with many ambiguous breaking points, defaults to False
+    :param bool parallel_mode: Use multithread mode, defaults to False
+
+    :return: list of tokens
+    :rtype: List[str]
+
+    :See Also:
+        * \
+            https://github.com/PyThaiNLP/nlpo3
+    """
+    return nlpo3_segment(
+        text=text,
+        dict_name=custom_dict,
+        safe=safe_mode,
+        parallel=parallel_mode
+    )
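A short sketch of using this wrapper module directly (the file path and the dictionary name "custom" are hypothetical):

    from pythainlp.tokenize.nlpo3 import load_dict, segment

    # Tokenize with the default dictionary (words_th.txt, loaded at import time).
    print(segment("ฉันรักภาษาไทย"))

    # load_dict returns a bool and will not overwrite an existing dict_name.
    if load_dict("/path/to/custom_words.txt", "custom"):  # hypothetical
        print(segment("ฉันรักภาษาไทย", custom_dict="custom", safe_mode=True))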

setup.py

Lines changed: 2 additions & 0 deletions

@@ -75,6 +75,7 @@
     ],
     "tltk": ["tltk>=1.3.8"],
     "oskut": ["oskut>=1.3"],
+    "nlpo3": ["nlpo3>=1.2.1"],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -99,6 +100,7 @@
         "symspellpy>=6.7.0",
         "tltk>=1.3.8",
         "oskut>=1.3",
+        "nlpo3>=1.2.1",
     ],
 }
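With the new extras key, the Rust binding becomes an optional dependency; it should be installable with, for example, pip install pythainlp[nlpo3], and it is also pulled in by the existing "full" extra.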

tests/test_tokenize.py

Lines changed: 1 addition & 0 deletions

@@ -372,6 +372,7 @@ def test_word_tokenize(self):
             word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
+        self.assertIsNotNone(word_tokenize(self.text_1, engine="nlpo3"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut"))
         self.assertIsNotNone(word_tokenize(self.text_1, engine="icu"))
