Skip to content

Commit eee7a65

Browse files
committed
Add NERCut
1 parent 82e71e4 commit eee7a65

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed

pythainlp/tokenize/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,10 @@ def word_tokenize(
162162
elif engine == "icu":
163163
from .pyicu import segment
164164

165+
segments = segment(text)
166+
elif engine == "nercut":
167+
from .nercut import segment
168+
165169
segments = segment(text)
166170
else:
167171
raise ValueError(

pythainlp/tokenize/nercut.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# -*- coding: utf-8 -*-
"""
nercut 0.1

Dictionary-less word segmentation that merges the tokens recognized by
ThaiNER as one named entity (person, organization, phone, email, date,
time) into a single "word".

Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List

from pythainlp.tag.named_entity import ThaiNameTagger

# Loaded once at import time; reused across calls (model load is expensive).
_thainer = ThaiNameTagger()

# Entity categories that are merged into a single token by default.
# A frozenset avoids the mutable-default-argument pitfall and gives O(1)
# membership tests.
_DEFAULT_TAGS = frozenset(
    ["ORGANIZATION", "PERSON", "PHONE", "EMAIL", "DATE", "TIME"]
)


def segment(text: str, tag: Iterable[str] = _DEFAULT_TAGS) -> List[str]:
    """
    newmm + ThaiNER word segmentation.

    Tokens tagged by ThaiNER with a category listed in *tag* are joined
    into one output word; all other tokens are emitted as-is.

    :param str text: text to be tokenized to words
    :param tag: ThaiNER entity categories to merge (default: ORGANIZATION,
        PERSON, PHONE, EMAIL, DATE, TIME)
    :return: list of words, tokenized from the text
    """
    if not text or not isinstance(text, str):
        return []

    tags = frozenset(tag)
    words: List[str] = []
    entity = ""  # surface text accumulated for the entity in progress
    entity_tag = ""  # category of the entity in progress ("" = none)

    for word, ner in _thainer.get_ner(text, pos=False):
        # Category part of the IOB label, e.g. "B-PERSON" -> "PERSON".
        category = "O" if ner == "O" else ner.split("-", 1)[1]

        if ner.startswith("B-") and category in tags:
            # A new entity starts: flush any entity already in progress.
            if entity:
                words.append(entity)
            entity = word
            entity_tag = category
        elif ner.startswith("I-") and category == entity_tag and category in tags:
            # Continuation of the current entity.
            entity += word
        else:
            # Either a plain token ("O") or a token of an unmerged
            # category: flush the pending entity first so its tokens are
            # not silently dropped, then emit this token on its own.
            if entity:
                words.append(entity)
                entity = ""
            entity_tag = ""
            words.append(word)

    # Flush a trailing entity at end of text.
    if entity:
        words.append(entity)
    return words

tests/test_tokenize.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
from pythainlp.tokenize.ssg import segment as ssg_segment
2222
from pythainlp.util import dict_trie
23+
from pythainlp.tokenize import nercut as tokenize_nercut
2324

2425

2526
class TestTokenizePackage(unittest.TestCase):
@@ -230,6 +231,7 @@ def test_word_tokenize(self):
230231
self.assertIsNotNone(word_tokenize(self.text_1, engine="icu"))
231232
self.assertIsNotNone(word_tokenize(self.text_1, engine="deepcut"))
232233
self.assertIsNotNone(word_tokenize(self.text_1, engine="attacut"))
234+
self.assertIsNotNone(word_tokenize(self.text_1, engine="nercut"))
233235
with self.assertRaises(ValueError):
234236
word_tokenize("หมอนทอง", engine="XX") # engine does not exist
235237

@@ -364,6 +366,15 @@ def test_word_tokenize_attacut(self):
364366
["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
365367
)
366368

369+
def test_word_tokenize_nercut(self):
    # Empty / non-string input yields an empty token list.
    self.assertEqual(tokenize_nercut.segment(None), [])
    self.assertEqual(tokenize_nercut.segment(""), [])
    # Plain text segments to a non-None result, both via the module API
    # and via the word_tokenize engine dispatch.
    # (The original asserted segment("ทดสอบ") twice; the duplicate is removed.)
    self.assertIsNotNone(tokenize_nercut.segment("ทดสอบ"))
    self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
377+
367378
def test_sent_tokenize(self):
368379
self.assertEqual(sent_tokenize(None), [])
369380
self.assertEqual(sent_tokenize(""), [])

0 commit comments

Comments
 (0)