
Commit 6f2d256

- Remove Python 2 condition check
- Fix indentation
- Make sure the POS tagger will always return something (falls back to "unigram" and "pud" as defaults)
1 parent: fc0bc9f

10 files changed: +288, -226 lines


pythainlp/romanization/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 def romanize(text, engine="royin"):
     """
     :param str data: Thai text to be romanized
-    :param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
+    :param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization.
     :return: English (more or less) text that spells out how the Thai text should read.
     """
     if engine == "pyicu":
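The hunk above only clarifies the engine list in the docstring. A minimal usage sketch (the Thai input string is just an illustration; the non-default engines need their extra dependencies, such as PyICU or the thai2rom model):

    # A minimal usage sketch; assumes PyThaiNLP is installed and, for the
    # non-default engines, that PyICU or the thai2rom model is available.
    from pythainlp.romanization import romanize

    print(romanize("สวัสดี"))                  # default engine, "royin"
    print(romanize("สวัสดี", engine="pyicu"))  # IPA-style romanization, needs PyICU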

pythainlp/sentiment/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 def sentiment(text, engine="old"):
     """
     :param str text: thai text
-    :param str engine: sentiment analysis engine (old or ulmfit)
+    :param str engine: sentiment analysis engine ("old" [default] or "ulmfit")
     :return: pos or neg
 
     **Example**::
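Per the docstring, the function returns either "pos" or "neg". A minimal usage sketch (the Thai sample string is only an illustration; both engines assume their model data is installed, and "ulmfit" additionally needs the fastai/dill stack mentioned in the ulmfit/utils.py hunk below):

    # A minimal usage sketch; the sample text and availability of model data are assumptions.
    from pythainlp.sentiment import sentiment

    print(sentiment("ดีมาก"))                   # default engine, "old"; returns "pos" or "neg"
    print(sentiment("ดีมาก", engine="ulmfit"))  # ULMFit-based classifier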

pythainlp/spell/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ def spell(word, engine="pn"):
     """
     :param str word: word to check spelling
     :param str engine:
-        * pn - Peter Norvig's algorithm
+        * pn - Peter Norvig's algorithm (default)
        * hunspell - uses hunspell's algorithm, which should already exist in Linux
     :return: list of words
     """

pythainlp/spell/hunspell.py
Lines changed: 2 additions & 2 deletions

@@ -35,8 +35,8 @@ def spell(word, lang="th_TH"):
     except subprocess.CalledProcessError:
         print("Error: Please install hunspell.")
         return None
-    except BaseException:
-        print("Errr: Other error.")
+    except BaseException as exception:
+        print("Errr: Other error: {}".format(exception))
         return None
 
 
pythainlp/tag/__init__.py
Lines changed: 30 additions & 17 deletions

@@ -1,43 +1,56 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,print_function,unicode_literals
+"""
+Part-Of-Speech Tagging
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 import sys
-def pos_tag(list_text,engine='unigram',corpus='orchid'):
+
+ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
+
+
+def pos_tag(texts, engine="unigram", corpus="orchid"):
     """
     Part of Speech tagging function.
 
-    :param list list_text: takes in a list of tokenized words (put differently, a list of string)
+    :param list texts: takes in a list of tokenized words (put differently, a list of strings)
     :param str engine:
-        * unigram - unigram tagger
+        * unigram - unigram tagger (default)
         * perceptron - perceptron tagger
         * artagger - RDR POS tagger
     :param str corpus:
         * orchid - annotated Thai academic articles
         * pud - Parallel Universal Dependencies (PUD) treebanks
     :return: returns a list of labels regarding which part of speech it is
     """
-    if engine=='old' or engine=='unigram':
-        from .old import tag
-    elif engine=='perceptron':
+    if engine == "perceptron":
         from .perceptron import tag
-    elif engine=='artagger':
-        def tag(text1):
+    elif engine == "artagger":
+
+        def tag(text):
             try:
                 from artagger import Tagger
             except ImportError:
                 from pythainlp.tools import install_package
-                install_package('https://github.com/wannaphongcom/artagger/archive/master.zip')
+
+                install_package(ARTAGGER_URL)
                 try:
                     from artagger import Tagger
                 except ImportError:
-                    print("Error ! using 'pip install https://github.com/wannaphongcom/artagger/archive/master.zip'")
+                    print("Error: Try 'pip install " + ARTAGGER_URL + "'")
                     sys.exit(0)
-        words = Tagger().tag(' '.join(text1))
-        totag=[]
+            words = Tagger().tag(" ".join(text))
+            totag = []
             for word in words:
                 totag.append((word.word, word.tag))
             return totag
-        return tag(list_text)
-    return tag(list_text,corpus=corpus)
 
-def pos_tag_sents(sentences,engine='unigram',corpus='orchid'):
-    return [pos_tag(i,engine=engine,corpus=corpus) for i in sentences]
+        return tag(texts)
+    else:  # default, use "unigram" ("old") engine
+        from .old import tag
+
+    return tag(texts, corpus=corpus)
+
+
+def pos_tag_sents(sentences, engine="unigram", corpus="orchid"):
+    return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences]
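With this change, any engine value other than "perceptron" or "artagger" falls back to the unigram ("old") tagger, so pos_tag() always returns a result. A minimal usage sketch (the tokenized Thai words are only an illustration, and the corpus/model files for "orchid" and "pud" are assumed to ship with the installed package):

    # A minimal usage sketch; assumes the PyThaiNLP corpus data is installed.
    from pythainlp.tag import pos_tag, pos_tag_sents

    words = ["ผม", "รัก", "ภาษา", "ไทย"]               # already-tokenized words
    print(pos_tag(words))                               # default: unigram engine, orchid corpus
    print(pos_tag(words, corpus="pud"))                 # unigram engine, PUD model
    print(pos_tag_sents([words], engine="perceptron"))  # one tagged list per sentence
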
pythainlp/tag/old.py
Lines changed: 34 additions & 22 deletions

@@ -1,28 +1,40 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals
+"""
+Unigram Part-Of-Speech Tagger
+"""
+from __future__ import absolute_import, division, unicode_literals
+
 import codecs
-import os
 import json
-import pythainlp
-import nltk.tag
+import os
+
 import dill
-templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus')
+import nltk.tag
+import pythainlp
+
+templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus")
+
+
 def orchid_data():
-    template_file = os.path.join(templates_dir, 'thaipos.json')
-    with codecs.open(template_file,'r',encoding='utf-8-sig') as handle:
-        model = json.load(handle)
-    return model
+    template_file = os.path.join(templates_dir, "thaipos.json")
+    with codecs.open(template_file, "r", encoding="utf-8-sig") as handle:
+        model = json.load(handle)
+    return model
+
+
 def pud_data():
-    template_file = os.path.join(templates_dir, 'ud_thai-pud_unigram_tagger.dill')
-    with open(template_file,'rb') as handle:
-        model = dill.load(handle)
-    return model
-def tag(text,corpus):
-    """
-    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
-    if corpus=='orchid':
-        tagger = nltk.tag.UnigramTagger(model=orchid_data())# backoff=default_tagger)
-        return tagger.tag(text)
-    elif corpus=='pud':
-        tagger = pud_data()
-        return tagger.tag(text)
+    template_file = os.path.join(templates_dir, "ud_thai-pud_unigram_tagger.dill")
+    with open(template_file, "rb") as handle:
+        model = dill.load(handle)
+    return model
+
+
+def tag(text, corpus):
+    """
+    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
+    if corpus == "orchid":
+        tagger = nltk.tag.UnigramTagger(model=orchid_data())  # backoff=default_tagger)
+        return tagger.tag(text)
+    else:  # default, use "pud" as a corpus
+        tagger = pud_data()
+        return tagger.tag(text)
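The (unchanged) Thai docstring says the function takes a list and returns a list, e.g. [('text', 'part-of-speech')]. After this change any corpus value other than "orchid" falls through to the PUD unigram model instead of returning None. A minimal sketch of calling the module directly, which is normally reached via pythainlp.tag.pos_tag (the token list is an illustration, and thaipos.json plus the PUD .dill model are assumed to be present in the installed corpus directory):

    # A minimal direct-call sketch; assumes the PyThaiNLP corpus files are installed.
    from pythainlp.tag import old

    tokens = ["ผม", "รัก", "ภาษา", "ไทย"]
    print(old.tag(tokens, corpus="orchid"))  # ORCHID unigram model
    print(old.tag(tokens, corpus="pud"))     # any other value now falls into the else branch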

pythainlp/tag/perceptron.py
Lines changed: 32 additions & 22 deletions

@@ -1,27 +1,37 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals
-import sys
+"""
+Perceptron Part-Of-Speech Tagger
+"""
+from __future__ import absolute_import, division, unicode_literals
+
 import os
-import pythainlp
-import nltk.tag
+
 import dill
-templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus')
+import pythainlp
+
+templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus")
+
+
 def orchid_data():
-    template_file = os.path.join(templates_dir, 'pt_tagger_1.dill')
-    with open(template_file,'rb') as handle:
-        model = dill.load(handle)
-    return model
+    template_file = os.path.join(templates_dir, "pt_tagger_1.dill")
+    with open(template_file, "rb") as handle:
+        model = dill.load(handle)
+    return model
+
+
 def pud_data():
-    template_file = os.path.join(templates_dir, 'ud_thai-pud_pt_tagger.dill')
-    with open(template_file,'rb') as handle:
-        model = dill.load(handle)
-    return model
-def tag(text,corpus):
-    """
-    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
-    if corpus=='orchid':
-        tagger = orchid_data()
-        return tagger.tag(text)
-    elif corpus=='pud':
-        tagger = pud_data()
-        return tagger.tag(text)
+    template_file = os.path.join(templates_dir, "ud_thai-pud_pt_tagger.dill")
+    with open(template_file, "rb") as handle:
+        model = dill.load(handle)
+    return model
+
+
+def tag(text, corpus):
+    """
+    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
+    if corpus == "orchid":
+        tagger = orchid_data()
+        return tagger.tag(text)
+    else:  # default, use "pud" as a corpus
+        tagger = pud_data()
+        return tagger.tag(text)
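Same structure as old.py: any corpus other than "orchid" now uses the PUD perceptron model, so tag() always returns something. A minimal direct-call sketch (the tokens are an illustration; pt_tagger_1.dill and ud_thai-pud_pt_tagger.dill are assumed to be available in the corpus directory):

    # A minimal direct-call sketch; assumes the dill model files are installed.
    from pythainlp.tag import perceptron

    tokens = ["ผม", "รัก", "ภาษา", "ไทย"]
    print(perceptron.tag(tokens, corpus="orchid"))
    print(perceptron.tag(tokens, corpus="pud"))  # the new default (else) branch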

pythainlp/ulmfit/utils.py
Lines changed: 3 additions & 3 deletions

@@ -28,7 +28,7 @@
     from fastai.text import *
     import dill as pickle
 except ImportError:
-    print("Error installing using 'pip install fastai numpy dill'")
+    print("Error: Try 'pip install fastai numpy dill'")
     sys.exit(0)
 
 # import torch

@@ -281,6 +281,6 @@ def about():
     State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
     Created as part of PyThaiNLP with ULMFit implementation from fast.ai
 
-    Development : Charin Polpanumas
-    GitHub : https://github.com/cstorm125/thai2vec
+    Development: Charin Polpanumas
+    GitHub: https://github.com/cstorm125/thai2vec
     """
