Skip to content

Commit 36e46d9

Browse files
committed
More readable variable names
1 parent eee7a65 commit 36e46d9

File tree

1 file changed

+53
-33
lines changed

1 file changed

+53
-33
lines changed

pythainlp/tokenize/nercut.py

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,65 +2,85 @@
22
"""
33
nercut 0.1
44
5+
Dictionary-based maximal matching word segmentation, constrained with
6+
Thai Character Cluster (TCC) boundaries, and combining tokens that are
7+
parts of the same named-entity.
8+
59
Code by Wannaphong Phatthiyaphaibun
610
"""
711
from typing import List
12+
813
from pythainlp.tag.named_entity import ThaiNameTagger
914

1015
_thainer = ThaiNameTagger()
1116

17+
1218
def segment(
1319
text: str,
14-
tag:List[str] = [
20+
taglist: List[str] = [
1521
"ORGANIZATION",
1622
"PERSON",
1723
"PHONE",
1824
"EMAIL",
1925
"DATE",
20-
"TIME"
21-
]
26+
"TIME",
27+
],
2228
) -> List[str]:
2329
"""
2430
nercut 0.1
2531
2632
Code by Wannaphong Phatthiyaphaibun
2733
28-
neww+thainer word segmentation.
34+
Dictionary-based maximal matching word segmentation, constrained with
35+
Thai Character Cluster (TCC) boundaries, and combining tokens that are
36+
parts of the same named-entity.
2937
3038
:param str text: text to be tokenized to words
31-
:parm list tag: ThaiNER tag
39+
:param list taglist: a list of named-entity tags to be used
3240
:return: list of words, tokenized from the text
3341
"""
34-
global _thainer
3542
if not text or not isinstance(text, str):
3643
return []
3744

38-
_ws = _thainer.get_ner(text, pos = False)
39-
_list_w = []
40-
_bi = ""
41-
_tag = ""
42-
for i,t in _ws:
43-
if t != "O":
44-
_tag_temp = t.split('-')[1]
45+
global _thainer
46+
tagged_words = _thainer.get_ner(text, pos=False)
47+
48+
words = []
49+
combining_word = ""
50+
combining_word = ""
51+
for curr_word, curr_tag in tagged_words:
52+
if curr_tag != "O":
53+
tag = curr_tag[2:]
4554
else:
46-
_tag_temp = "O"
47-
if t.startswith('B-') and _tag_temp in tag:
48-
if _bi!="" and _tag in tag:
49-
_list_w.append(_bi)
50-
_bi=""
51-
_bi += i
52-
_tag = t.replace('B-','')
53-
elif t.startswith('I-') and t.replace('I-','') == _tag and _tag_temp in tag:
54-
_bi += i
55-
elif t == "O" and _tag != "" and _tag in tag:
56-
_list_w.append(_bi)
57-
_bi=""
58-
_tag = ""
59-
_list_w.append(i)
55+
tag = "O"
56+
57+
if curr_tag.startswith("B-") and tag in taglist:
58+
if combining_word != "" and combining_word in taglist:
59+
words.append(combining_word)
60+
combining_word = ""
61+
combining_word += curr_word
62+
combining_word = curr_tag[2:]
63+
elif (
64+
curr_tag.startswith("I-")
65+
and curr_tag[2:] == combining_word
66+
and tag in taglist
67+
):
68+
combining_word += curr_word
69+
elif (
70+
curr_tag == "O"
71+
and combining_word != ""
72+
and combining_word in taglist
73+
):
74+
words.append(combining_word)
75+
combining_word = ""
76+
combining_word = ""
77+
words.append(curr_word)
6078
else:
61-
_bi=""
62-
_tag = ""
63-
_list_w.append(i)
64-
if _bi!="":
65-
_list_w.append(_bi)
66-
return _list_w
79+
combining_word = ""
80+
combining_word = ""
81+
words.append(curr_word)
82+
83+
if combining_word != "":
84+
words.append(combining_word)
85+
86+
return words

0 commit comments

Comments
 (0)