"""
nercut 0.1

Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named-entity.

Code by Wannaphong Phatthiyaphaibun
"""
from typing import List

from pythainlp.tag.named_entity import ThaiNameTagger

# Module-level tagger instance: constructed once when this module is
# imported and reused by every call to segment().
_thainer = ThaiNameTagger()


def _merge_entity_tokens(tagged_words, taglist):
    """
    Join consecutive tokens that belong to the same named entity.

    :param tagged_words: iterable of (word, tag) pairs, where a tag is
        either "O" or a two-character "B-"/"I-" prefix followed by the
        entity type (for example "B-PERSON")
    :param taglist: entity types whose tokens should be joined
    :return: list of words in input order, with each run of tokens from a
        selected entity concatenated into a single word
    """
    words = []
    combining_word = ""  # text of the entity currently being assembled
    combining_tag = ""  # entity type that combining_word belongs to

    for curr_word, curr_tag in tagged_words:
        # Strip the "B-"/"I-" prefix to get the bare entity type.
        entity = curr_tag[2:] if curr_tag != "O" else "O"
        selected = entity in taglist

        if curr_tag.startswith("B-") and selected:
            # A new selected entity starts here; emit the previous one first.
            if combining_word:
                words.append(combining_word)
            combining_word = curr_word
            combining_tag = entity
        elif (
            curr_tag.startswith("I-")
            and selected
            and entity == combining_tag
        ):
            # Continuation of the entity currently being assembled.
            combining_word += curr_word
        else:
            # Any other token ends the current entity. Emit the pending
            # text before the token itself so that no input is dropped.
            if combining_word:
                words.append(combining_word)
            combining_word = ""
            combining_tag = ""
            words.append(curr_word)

    # Emit an entity that runs up to the end of the input.
    if combining_word:
        words.append(combining_word)

    return words


def segment(
    text: str,
    # NOTE: this default list is only read, never mutated, so sharing it
    # between calls is safe.
    taglist: List[str] = [
        "ORGANIZATION",
        "PERSON",
        "PHONE",
        "EMAIL",
        "DATE",
        "TIME",
    ],
) -> List[str]:
    """
    nercut 0.1

    Code by Wannaphong Phatthiyaphaibun

    Dictionary-based maximal matching word segmentation, constrained with
    Thai Character Cluster (TCC) boundaries, and combining tokens that are
    parts of the same named-entity.

    The text is tagged with the module-level named-entity tagger, then
    consecutive tokens that belong to the same entity of a type listed in
    ``taglist`` are joined into one word.

    :param str text: text to be tokenized to words
    :param list taglist: a list of named-entity tags to be used
    :return: list of words, tokenized from the text; an empty list if
        ``text`` is empty or not a string
    """
    if not text or not isinstance(text, str):
        return []

    tagged_words = _thainer.get_ner(text, pos=False)

    return _merge_entity_tokens(tagged_words, taglist)