66such as its Part-of-Speech (POS) tag, and Named Entity Recognition (NER) tag.
77"""
88
9- from typing import List , Tuple
10-
119__all__ = ["pos_tag" , "pos_tag_sents" , "tag_provinces" ]
1210from .locations import tag_provinces
13-
14- # tag map for orchid to Universal Dependencies
15- # from Korakot Chaovavanich
16- _TAG_MAP_UD = {
17- # NOUN
18- "NOUN" : "NOUN" ,
19- "NCMN" : "NOUN" ,
20- "NTTL" : "NOUN" ,
21- "CNIT" : "NOUN" ,
22- "CLTV" : "NOUN" ,
23- "CMTR" : "NOUN" ,
24- "CFQC" : "NOUN" ,
25- "CVBL" : "NOUN" ,
26- # VERB
27- "VACT" : "VERB" ,
28- "VSTA" : "VERB" ,
29- # PROPN
30- "PROPN" : "PROPN" ,
31- "NPRP" : "PROPN" ,
32- # ADJ
33- "ADJ" : "ADJ" ,
34- "NONM" : "ADJ" ,
35- "VATT" : "ADJ" ,
36- "DONM" : "ADJ" ,
37- # ADV
38- "ADV" : "ADV" ,
39- "ADVN" : "ADV" ,
40- "ADVI" : "ADV" ,
41- "ADVP" : "ADV" ,
42- "ADVS" : "ADV" ,
43- # INT
44- "INT" : "INTJ" ,
45- # PRON
46- "PRON" : "PRON" ,
47- "PPRS" : "PRON" ,
48- "PDMN" : "PRON" ,
49- "PNTR" : "PRON" ,
50- # DET
51- "DET" : "DET" ,
52- "DDAN" : "DET" ,
53- "DDAC" : "DET" ,
54- "DDBQ" : "DET" ,
55- "DDAQ" : "DET" ,
56- "DIAC" : "DET" ,
57- "DIBQ" : "DET" ,
58- "DIAQ" : "DET" ,
59- # NUM
60- "NUM" : "NUM" ,
61- "NCNM" : "NUM" ,
62- "NLBL" : "NUM" ,
63- "DCNM" : "NUM" ,
64- # AUX
65- "AUX" : "AUX" ,
66- "XVBM" : "AUX" ,
67- "XVAM" : "AUX" ,
68- "XVMM" : "AUX" ,
69- "XVBB" : "AUX" ,
70- "XVAE" : "AUX" ,
71- # ADP
72- "ADP" : "ADP" ,
73- "RPRE" : "ADP" ,
74- # CCONJ
75- "CCONJ" : "CCONJ" ,
76- "JCRG" : "CCONJ" ,
77- # SCONJ
78- "SCONJ" : "SCONJ" ,
79- "PREL" : "SCONJ" ,
80- "JSBR" : "SCONJ" ,
81- "JCMP" : "SCONJ" ,
82- # PART
83- "PART" : "PART" ,
84- "FIXN" : "PART" ,
85- "FIXV" : "PART" ,
86- "EAFF" : "PART" ,
87- "EITT" : "PART" ,
88- "AITT" : "PART" ,
89- "NEG" : "PART" ,
90- # PUNCT
91- "PUNCT" : "PUNCT" ,
92- "PUNC" : "PUNCT" ,
93- }
94-
95-
96- def _UD_Exception (w : str , tag : str ) -> str :
97- if w == "การ" or w == "ความ" :
98- return "NOUN"
99-
100- return tag
101-
102-
def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
    """
    Convert Orchid-tagged words to Universal Dependencies (UD) tags.

    Each tag is looked up in ``_TAG_MAP_UD`` and the result is then passed
    through ``_UD_Exception`` for word-specific overrides.

    A tag that has no entry in ``_TAG_MAP_UD`` is kept as it is instead of
    raising ``KeyError``. This includes ``None``, which a tagger may emit
    for a word it does not know.

    :param tag: (word, tag) pairs as returned by a tagger; only the first
                two items of each pair are read
    :return: a new list of (word, UD tag) pairs, in the same order
    :rtype: list[tuple[str, str]]
    """
    return [
        (
            pair[0],
            _UD_Exception(pair[0], _TAG_MAP_UD.get(pair[1], pair[1])),
        )
        for pair in tag
    ]
113-
114-
def pos_tag(
    words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
    """
    Tag a list of tokenized words with Part-of-Speech (POS) tags
    such as 'NOUN', 'VERB', 'ADJ', and 'DET'.

    :param list words: a list of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger; also used for any other value
    :param str corpus:
        * *orchid* - annotated Thai academic articles, namely
          `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_ (default)
        * *orchid_ud* - annotated Thai academic articles *Orchid* but the
          POS tags are mapped to comply with
          `Universal Dependencies <https://universaldependencies.org/u/pos>`_
          POS Tags
        * *pud* - `Parallel Universal Dependencies (PUD)
          <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ treebanks
    :return: a list of (word, POS tag) pairs, one per input word;
             an empty list if *words* is empty
    :rtype: list[tuple[str, str]]

    :Example:

    Tag words with corpus `orchid` (default)::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ',
                 'นายก', 'เชอร์ชิล']
        pos_tag(words)
        # output:
        # [('ฉัน', 'PPRS'), ('มี', 'VSTA'), ('ชีวิต', 'NCMN'), ('รอด', 'NCMN'),
        #  ('ใน', 'RPRE'), ('อาคาร', 'NCMN'), ('หลบภัย', 'NCMN'),
        #  ('ของ', 'RPRE'), ('นายก', 'NCMN'), ('เชอร์ชิล', 'NCMN')]

    Tag words with corpus `orchid_ud`::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ',
                 'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='orchid_ud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'),
        #  ('รอด', 'NOUN'), ('ใน', 'ADP'), ('อาคาร', 'NOUN'),
        #  ('หลบภัย', 'NOUN'), ('ของ', 'ADP'), ('นายก', 'NOUN'),
        #  ('เชอร์ชิล', 'NOUN')]

    Tag words with corpus `pud`::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ',
                 'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='pud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), ('รอด', 'VERB'),
        #  ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
        #  ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]

    Tag words with different engines including *perceptron* and *unigram*::

        from pythainlp.tag import pos_tag

        words = ['เก้าอี้','มี','จำนวน','ขา', ' ', '=', '3']

        pos_tag(words, engine='perceptron', corpus='orchid')
        # output:
        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
        #  ('ขา', 'NCMN'), (' ', 'PUNC'),
        #  ('=', 'PUNC'), ('3', 'NCNM')]

        pos_tag(words, engine='unigram', corpus='pud')
        # output:
        # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
        #  ('<space>', None), ('<equal>', None), ('3', 'NUM')]
    """
    # Nothing to tag: return before importing any tagger module.
    if not words:
        return []

    # "orchid_ud" is not passed to the taggers: the words are tagged with
    # the "orchid" corpus and the resulting tags are converted afterwards.
    convert_to_ud = corpus == "orchid_ud"
    if convert_to_ud:
        corpus = "orchid"

    # Tagger modules are imported only when they are actually needed.
    if engine == "perceptron":
        from .perceptron import tag as tag_
    else:  # any other engine name falls back to the unigram tagger
        from .unigram import tag as tag_

    word_tags = tag_(words, corpus=corpus)

    if convert_to_ud:
        word_tags = _orchid_to_ud(word_tags)

    return word_tags
212-
213-
def pos_tag_sents(
    sentences: List[List[str]],
    engine: str = "perceptron",
    corpus: str = "orchid",
) -> List[List[Tuple[str, str]]]:
    """
    Tag several tokenized sentences with Part-of-Speech (POS) tags.

    Every sentence is tagged on its own by calling :func:`pos_tag` with
    the same *engine* and *corpus*.

    :param list sentences: a list of lists of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
    :param str corpus:
        * *orchid* - annotated Thai academic articles, namely
          `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_
          (default)
        * *orchid_ud* - annotated Thai academic articles using
          `Universal Dependencies <https://universaldependencies.org/>`_ Tags
        * *pud* - `Parallel Universal Dependencies (PUD)
          <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ treebanks
    :return: for each sentence given, a list of (word, POS tag) pairs;
             an empty list if *sentences* is empty
    :rtype: list[list[tuple[str, str]]]

    :Example:

    Labels POS for two sentences::

        from pythainlp.tag import pos_tag_sents

        sentences = [['เก้าอี้','มี','3','ขา'],
                     ['นก', 'บิน', 'กลับ', 'รัง']]
        pos_tag_sents(sentences, corpus='pud')
        # output:
        # [[('เก้าอี้', 'PROPN'), ('มี', 'VERB'), ('3', 'NUM'),
        #   ('ขา', 'NOUN')], [('นก', 'NOUN'), ('บิน', 'VERB'),
        #   ('กลับ', 'VERB'), ('รัง', 'NOUN')]]
    """
    # A falsy input (empty list, None) is treated as "no sentences".
    return [
        pos_tag(tokens, engine=engine, corpus=corpus)
        for tokens in (sentences or [])
    ]
11+ from .pos_tag import pos_tag
12+ from .pos_tag import pos_tag_sents
0 commit comments