Commit 3e622d7

wannaphong and bact authored

Add NERCut tokenization engine (#503)

* Add NERCut
* More readable variable names (#504)
* Update nercut docs and update nercut code
* Update tokenize.rst
* Update nercut.py
* Update test_tokenize.py
* Update test_tokenize.py
* Update core.py

Co-authored-by: Arthit Suriyawongkul <arthit@gmail.com>
1 parent e52b8c9 commit 3e622d7

4 files changed (+302, -180 lines)

docs/api/tokenize.rst

Lines changed: 34 additions & 14 deletions
@@ -19,34 +19,54 @@ Modules
 Tokenization Engines
 --------------------
 
-newmm
-+++++
-.. automodule:: pythainlp.tokenize.newmm
-.. autofunction:: pythainlp.tokenize.newmm.segment
+Word level
+----------
 
+attacut
++++++++
+.. automodule:: pythainlp.tokenize.attacut
 
-longest
+.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer
+    :members:
+
+deepcut
 +++++++
-.. automodule:: pythainlp.tokenize.longest
+.. automodule:: pythainlp.tokenize.deepcut
 
 multi_cut
 +++++++++
 .. automodule:: pythainlp.tokenize.multi_cut
 
+.. autofunction:: pythainlp.tokenize.multi_cut.segment
+.. autofunction:: pythainlp.tokenize.multi_cut.find_all_segment
+
+longest
++++++++
+.. automodule:: pythainlp.tokenize.longest
+
+.. autofunction:: pythainlp.tokenize.longest.segment
+
 pyicu
 +++++
 .. automodule:: pythainlp.tokenize.pyicu
 
-deepcut
-+++++++
-.. automodule:: pythainlp.tokenize.deepcut
+nercut
+++++++
+.. automodule:: pythainlp.tokenize.nercut
 
-attacut
-+++++++
-.. automodule:: pythainlp.tokenize.attacut
+.. autofunction:: pythainlp.tokenize.nercut.segment
 
-.. autoclass:: pythainlp.tokenize.attacut.AttacutTokenizer
-    :members:
+newmm
++++++
+
+The default word tokenization engine.
+
+.. automodule:: pythainlp.tokenize.newmm
+
+.. autofunction:: pythainlp.tokenize.newmm.segment
+
+Subword level
+-------------
 
 tcc
 +++

pythainlp/tokenize/core.py

Lines changed: 10 additions & 0 deletions
@@ -36,6 +36,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
         ['และ', 'คุณ', 'เล่น', 'มือถือ'],
         ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
+    if not doc or not isinstance(doc, list):
+        return []
+
     from .crfcls import segment
 
     return segment(doc)
@@ -74,6 +77,9 @@ def word_tokenize(
     * *deepcut* - wrapper for
       `DeepCut <https://github.com/rkcosmos/deepcut>`_,
       learning-based approach
+    * *nercut* - Dictionary-based maximal matching word segmentation,
+      constrained with Thai Character Cluster (TCC) boundaries,
+      and combining tokens that are parts of the same named-entity.
 
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
@@ -162,6 +168,10 @@ def word_tokenize(
     elif engine == "icu":
         from .pyicu import segment
 
+        segments = segment(text)
+    elif engine == "nercut":
+        from .nercut import segment
+
         segments = segment(text)
     else:
         raise ValueError(
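With the dispatch above in place, the new engine is selected through the regular word_tokenize API. A minimal usage sketch (the input sentence is arbitrary and the printed result is illustrative only; actual tokens depend on the ThaiNER model):

    from pythainlp.tokenize import word_tokenize

    # "nercut" matches the new elif branch in word_tokenize above.
    words = word_tokenize("วันที่ 5 มกราคม ผมไปประชุม", engine="nercut")
    print(words)
    # With DATE in the default taglist, date tokens such as
    # "5 มกราคม" may come back merged into a single token.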

pythainlp/tokenize/nercut.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
"""
nercut 0.1

Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named-entity.

Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List

from pythainlp.tag.named_entity import ThaiNameTagger

_thainer = ThaiNameTagger()


def segment(
    text: str,
    taglist: Iterable[str] = [
        "ORGANIZATION",
        "PERSON",
        "PHONE",
        "EMAIL",
        "DATE",
        "TIME",
    ],
) -> List[str]:
    """
    Dictionary-based maximal matching word segmentation, constrained with
    Thai Character Cluster (TCC) boundaries, and combining tokens that are
    parts of the same named-entity.

    :param str text: text to be tokenized into words
    :param list taglist: a list of named-entity tags to be used
    :return: list of words, tokenized from the text
    """
    if not text or not isinstance(text, str):
        return []

    global _thainer
    tagged_words = _thainer.get_ner(text, pos=False)

    words = []
    combining_word = ""
    for curr_word, curr_tag in tagged_words:
        # "B-PERSON" -> "PERSON"; the plain "O" tag stays "O".
        if curr_tag != "O":
            tag = curr_tag[2:]
        else:
            tag = "O"

        if curr_tag.startswith("B-") and tag in taglist:
            # Start of a new entity: flush any entity in progress.
            if combining_word != "":
                words.append(combining_word)
            combining_word = curr_word
        elif (
            curr_tag.startswith("I-")
            and combining_word != ""
            and tag in taglist
        ):
            # Continuation of the current entity: keep combining.
            combining_word += curr_word
        else:
            # Outside any tracked entity: flush the entity in
            # progress (if any), then emit the current token.
            if combining_word != "":
                words.append(combining_word)
            combining_word = ""
            words.append(curr_word)

    if combining_word != "":
        words.append(combining_word)

    return words
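The merging loop can be exercised without loading the tagger by replaying it on a hand-written (word, BIO-tag) sequence. A minimal sketch: merge_entities below is a hypothetical standalone copy of the loop, and the tagged pairs stand in for ThaiNameTagger.get_ner(text, pos=False) output:

    from typing import Iterable, List, Tuple

    def merge_entities(
        tagged_words: Iterable[Tuple[str, str]],
        taglist: Iterable[str] = ("PERSON",),
    ) -> List[str]:
        # Same B-/I-/O merging logic as segment(), minus the tagger.
        words: List[str] = []
        combining_word = ""
        for curr_word, curr_tag in tagged_words:
            tag = curr_tag[2:] if curr_tag != "O" else "O"
            if curr_tag.startswith("B-") and tag in taglist:
                if combining_word:
                    words.append(combining_word)
                combining_word = curr_word
            elif curr_tag.startswith("I-") and combining_word and tag in taglist:
                combining_word += curr_word
            else:
                if combining_word:
                    words.append(combining_word)
                combining_word = ""
                words.append(curr_word)
        if combining_word:
            words.append(combining_word)
        return words

    # Hand-written tags standing in for real tagger output:
    tagged = [
        ("ผม", "O"),
        ("ชื่อ", "O"),
        ("ทดสอบ", "B-PERSON"),
        (" ", "I-PERSON"),
        ("ระบบ", "I-PERSON"),
    ]
    print(merge_entities(tagged))
    # -> ['ผม', 'ชื่อ', 'ทดสอบ ระบบ']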
