|
1 | 1 | # -*- coding: utf-8 -*- |
2 | | -from __future__ import absolute_import,division,print_function,unicode_literals |
| 2 | +""" |
| 3 | +Part-Of-Speech Tagging |
| 4 | +""" |
| 5 | +from __future__ import absolute_import, division, print_function, unicode_literals |
| 6 | + |
3 | 7 | import sys |
4 | | -def pos_tag(list_text,engine='unigram',corpus='orchid'): |
| 8 | + |
| 9 | +ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip" |
| 10 | + |
| 11 | + |
| 12 | +def pos_tag(texts, engine="unigram", corpus="orchid"): |
5 | 13 | """ |
6 | 14 | Part of Speech tagging function. |
7 | 15 |
|
8 | | - :param list list_text: takes in a list of tokenized words (put differently, a list of string) |
| 16 | + :param list texts: takes in a list of tokenized words (put differently, a list of strings) |
9 | 17 | :param str engine: |
10 | | - * unigram - unigram tagger |
| 18 | + * unigram - unigram tagger (default) |
11 | 19 | * perceptron - perceptron tagger |
12 | 20 | * artagger - RDR POS tagger |
13 | 21 | :param str corpus: |
14 | 22 | * orchid - annotated Thai academic articles |
15 | 23 | * pud - Parallel Universal Dependencies (PUD) treebanks |
16 | 24 | :return: returns a list of labels regarding which part of speech it is |
17 | 25 | """ |
18 | | - if engine=='old' or engine=='unigram': |
19 | | - from .old import tag |
20 | | - elif engine=='perceptron': |
| 26 | + if engine == "perceptron": |
21 | 27 | from .perceptron import tag |
22 | | - elif engine=='artagger': |
23 | | - def tag(text1): |
| 28 | + elif engine == "artagger": |
| 29 | + |
| 30 | + def tag(text): |
24 | 31 | try: |
25 | 32 | from artagger import Tagger |
26 | 33 | except ImportError: |
27 | 34 | from pythainlp.tools import install_package |
28 | | - install_package('https://github.com/wannaphongcom/artagger/archive/master.zip') |
| 35 | + |
| 36 | + install_package(ARTAGGER_URL) |
29 | 37 | try: |
30 | 38 | from artagger import Tagger |
31 | 39 | except ImportError: |
32 | | - print("Error ! using 'pip install https://github.com/wannaphongcom/artagger/archive/master.zip'") |
| 40 | + print("Error: Try 'pip install " + ARTAGGER_URL + "'") |
33 | 41 | sys.exit(0) |
34 | | - words = Tagger().tag(' '.join(text1)) |
35 | | - totag=[] |
| 42 | + words = Tagger().tag(" ".join(text)) |
| 43 | + totag = [] |
36 | 44 | for word in words: |
37 | 45 | totag.append((word.word, word.tag)) |
38 | 46 | return totag |
39 | | - return tag(list_text) |
40 | | - return tag(list_text,corpus=corpus) |
41 | 47 |
|
42 | | -def pos_tag_sents(sentences,engine='unigram',corpus='orchid'): |
43 | | - return [pos_tag(i,engine=engine,corpus=corpus) for i in sentences] |
| 48 | + return tag(texts) |
| 49 | + else: # default, use "unigram" ("old") engine |
| 50 | + from .old import tag |
| 51 | + |
| 52 | + return tag(texts, corpus=corpus) |
| 53 | + |
| 54 | + |
| 55 | +def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): |
| 56 | + return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences] |
0 commit comments