Skip to content

Commit 21e81a7

Browse files
committed
move pos_tag from pythainlp.tag to pythainlp.tag.pos_tag (build and deploy docs)
1 parent 7ac34c3 commit 21e81a7

File tree

5 files changed

+252
-247
lines changed

5 files changed

+252
-247
lines changed

pythainlp/tag/__init__.py

Lines changed: 2 additions & 246 deletions
Original file line numberDiff line numberDiff line change
@@ -6,251 +6,7 @@
66
such as its Part-of-Speech (POS) tag, and Named Entity Recognition (NER) tag.
77
"""
88

9-
from typing import List, Tuple
10-
119
__all__ = ["pos_tag", "pos_tag_sents", "tag_provinces"]
1210
from .locations import tag_provinces
13-
14-
# Orchid -> Universal Dependencies (UD) part-of-speech tag map.
# Mapping contributed by Korakot Chaovavanich.
#
# Each row pairs one UD tag with every source tag that collapses into it.
# Most rows also list the UD tag itself as a key (identity mapping); the
# "VERB" and "INTJ" rows do not.
_TAG_MAP_UD = {
    source_tag: ud_tag
    for ud_tag, source_tags in (
        (
            "NOUN",
            ("NOUN", "NCMN", "NTTL", "CNIT", "CLTV", "CMTR", "CFQC", "CVBL"),
        ),
        ("VERB", ("VACT", "VSTA")),
        ("PROPN", ("PROPN", "NPRP")),
        ("ADJ", ("ADJ", "NONM", "VATT", "DONM")),
        ("ADV", ("ADV", "ADVN", "ADVI", "ADVP", "ADVS")),
        ("INTJ", ("INT",)),
        ("PRON", ("PRON", "PPRS", "PDMN", "PNTR")),
        (
            "DET",
            ("DET", "DDAN", "DDAC", "DDBQ", "DDAQ", "DIAC", "DIBQ", "DIAQ"),
        ),
        ("NUM", ("NUM", "NCNM", "NLBL", "DCNM")),
        ("AUX", ("AUX", "XVBM", "XVAM", "XVMM", "XVBB", "XVAE")),
        ("ADP", ("ADP", "RPRE")),
        ("CCONJ", ("CCONJ", "JCRG")),
        ("SCONJ", ("SCONJ", "PREL", "JSBR", "JCMP")),
        (
            "PART",
            ("PART", "FIXN", "FIXV", "EAFF", "EITT", "AITT", "NEG"),
        ),
        ("PUNCT", ("PUNCT", "PUNC")),
    )
    for source_tag in source_tags
}
94-
95-
96-
def _UD_Exception(w: str, tag: str) -> str:
    """
    Apply word-specific overrides to an already-mapped tag.

    :param str w: the word that was tagged
    :param str tag: the tag assigned to the word so far
    :return: "NOUN" when the word is "การ" or "ความ", otherwise `tag`
             unchanged
    :rtype: str
    """
    if w in ("การ", "ความ"):
        return "NOUN"

    return tag
101-
102-
103-
def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
    """
    Map the tags of a tagged sentence through `_TAG_MAP_UD`.

    Every pair is looked up in `_TAG_MAP_UD` and then passed through
    `_UD_Exception` for word-specific overrides.

    A tag with no entry in the map (including `None`) is kept as it is
    instead of raising `KeyError`, so one unmapped tag no longer aborts
    the conversion of the whole sentence.

    :param tag: tagged words, each item indexable as (word, tag)
    :return: a list of (word, mapped tag) tuples, in input order
    :rtype: list[tuple[str, str]]
    """
    return [
        (pair[0], _UD_Exception(pair[0], _TAG_MAP_UD.get(pair[1], pair[1])))
        for pair in tag
    ]
113-
114-
115-
def pos_tag(
    words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
    """
    Tag a list of tokenized words with Part-of-Speech (POS) tags
    such as 'NOUN', 'VERB', 'ADJ', and 'DET'.

    :param list words: a list of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger (also used for any other value)
    :param str corpus:
        * *orchid* - annotated Thai academic articles, namely
          `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_ (default)
        * *orchid_ud* - annotated Thai academic articles *Orchid* but the
          POS tags are mapped to comply with
          `Universal Dependencies <https://universaldependencies.org/u/pos>`_
          POS Tags
        * *pud* - `Parallel Universal Dependencies (PUD)
          <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ treebanks
    :return: a list of (word, POS tag) tuples, one per input word;
             an empty list if `words` is empty
    :rtype: list[tuple[str, str]]

    :Example:

    Tag words with corpus `orchid` (default)::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\
            'นายก', 'เชอร์ชิล']
        pos_tag(words)
        # output:
        # [('ฉัน', 'PPRS'), ('มี', 'VSTA'), ('ชีวิต', 'NCMN'), ('รอด', 'NCMN'),
        #   ('ใน', 'RPRE'), ('อาคาร', 'NCMN'), ('หลบภัย', 'NCMN'),
        #   ('ของ', 'RPRE'), ('นายก', 'NCMN'), ('เชอร์ชิล', 'NCMN')]

    Tag words with corpus `orchid_ud`::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\
            'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='orchid_ud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'),
        #   ('รอด', 'NOUN'), ('ใน', 'ADP'),  ('อาคาร', 'NOUN'),
        #   ('หลบภัย', 'NOUN'), ('ของ', 'ADP'), ('นายก', 'NOUN'),
        #   ('เชอร์ชิล', 'NOUN')]

    Tag words with corpus `pud`::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\
            'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='pud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), ('รอด', 'VERB'),
        #   ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
        #   ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]

    Tag words with different engines including *perceptron* and *unigram*::

        from pythainlp.tag import pos_tag

        words = ['เก้าอี้','มี','จำนวน','ขา', ' ', '=', '3']

        pos_tag(words, engine='perceptron', corpus='orchid')
        # output:
        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
        #   ('ขา', 'NCMN'), (' ', 'PUNC'),
        #   ('=', 'PUNC'), ('3', 'NCNM')]

        pos_tag(words, engine='unigram', corpus='pud')
        # output:
        # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
        #   ('<space>', None), ('<equal>', None), ('3', 'NUM')]
    """
    # Nothing to tag: return before importing any tagger engine.
    if not words:
        return []

    # "orchid_ud" is requested from the engines as plain "orchid"; the
    # resulting tags are converted with _orchid_to_ud() afterwards.
    to_ud = corpus == "orchid_ud"
    if to_ud:
        corpus = "orchid"

    # Engines are imported lazily so only the selected one is loaded.
    if engine == "perceptron":
        from .perceptron import tag as tag_
    else:  # default, use "unigram" ("old") engine
        from .unigram import tag as tag_

    word_tags = tag_(words, corpus=corpus)

    if to_ud:
        word_tags = _orchid_to_ud(word_tags)

    return word_tags
212-
213-
214-
def pos_tag_sents(
    sentences: List[List[str]],
    engine: str = "perceptron",
    corpus: str = "orchid",
) -> List[List[Tuple[str, str]]]:
    """
    Tag multiple lists of tokenized words with Part-of-Speech (POS) tags.

    Each sentence is tagged independently by calling `pos_tag` with the
    same `engine` and `corpus`.

    :param list sentences: a list of lists of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
    :param str corpus:
        * *orchid* - annotated Thai academic articles, namely
          `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_
          (default)
        * *orchid_ud* - annotated Thai academic articles using
          `Universal Dependencies <https://universaldependencies.org/>`_ Tags
        * *pud* - `Parallel Universal Dependencies (PUD)
          <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ treebanks
    :return: for each sentence given, a list of (word, POS tag) tuples;
             an empty list if `sentences` is empty
    :rtype: list[list[tuple[str, str]]]

    :Example:

    Label POS for two sentences::

        from pythainlp.tag import pos_tag_sents

        sentences = [['เก้าอี้','มี','3','ขา'], \\
                        ['นก', 'บิน', 'กลับ', 'รัง']]
        pos_tag_sents(sentences, corpus='pud')
        # output:
        # [[('เก้าอี้', 'PROPN'), ('มี', 'VERB'), ('3', 'NUM'),
        #   ('ขา', 'NOUN')], [('นก', 'NOUN'), ('บิน', 'VERB'),
        #   ('กลับ', 'VERB'), ('รัง', 'NOUN')]]
    """
    if not sentences:
        return []

    return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
11+
from .pos_tag import pos_tag
12+
from .pos_tag import pos_tag_sents

0 commit comments

Comments
 (0)