
Commit 6f2d256

- Remove Python 2 condition check
- Fix indentation
- Make sure the POS tagger will always return something (falls back to "unigram" and "pud" as defaults)
1 parent: fc0bc9f

10 files changed: +288, -226 lines


pythainlp/romanization/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 def romanize(text, engine="royin"):
     """
     :param str data: Thai text to be romanized
-    :param str engine: choose between 'royin' , 'pyicu' and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning thai romanization.
+    :param str engine: choose between 'royin' (default), 'pyicu', and 'thai2rom'. 'royin' will romanize according to the standard of Thai Royal Institute. 'pyicu' will romanize according to the Internaitonal Phonetic Alphabet. 'thai2rom' is deep learning Thai romanization.
     :return: English (more or less) text that spells out how the Thai text should read.
     """
     if engine == "pyicu":
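The hunk above only clarifies the engine list in the docstring. A minimal usage sketch (the Thai input string is just an illustration; the non-default engines need their extra dependencies, such as PyICU or the thai2rom model):

    # A minimal usage sketch; assumes PyThaiNLP is installed and, for the
    # non-default engines, that PyICU or the thai2rom model is available.
    from pythainlp.romanization import romanize

    print(romanize("สวัสดี"))                  # default engine, "royin"
    print(romanize("สวัสดี", engine="pyicu"))  # IPA-style romanization, needs PyICU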

pythainlp/sentiment/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 def sentiment(text, engine="old"):
     """
     :param str text: thai text
-    :param str engine: sentiment analysis engine (old or ulmfit)
+    :param str engine: sentiment analysis engine ("old" [default] or "ulmfit")
     :return: pos or neg
 
     **Example**::
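Per the docstring, the function returns either "pos" or "neg". A minimal usage sketch (the Thai sample string is only an illustration; both engines assume their model data is installed, and "ulmfit" additionally needs the fastai/dill stack mentioned in the ulmfit/utils.py hunk below):

    # A minimal usage sketch; the sample text and availability of model data are assumptions.
    from pythainlp.sentiment import sentiment

    print(sentiment("ดีมาก"))                   # default engine, "old"; returns "pos" or "neg"
    print(sentiment("ดีมาก", engine="ulmfit"))  # ULMFit-based classifier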

pythainlp/spell/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ def spell(word, engine="pn"):
     """
     :param str word: word to check spelling
     :param str engine:
-        * pn - Peter Norvig's algorithm
+        * pn - Peter Norvig's algorithm (default)
        * hunspell - uses hunspell's algorithm, which should already exist in Linux
     :return: list of words
     """

pythainlp/spell/hunspell.py
Lines changed: 2 additions & 2 deletions

@@ -35,8 +35,8 @@ def spell(word, lang="th_TH"):
     except subprocess.CalledProcessError:
         print("Error: Please install hunspell.")
         return None
-    except BaseException:
-        print("Errr: Other error.")
+    except BaseException as exception:
+        print("Errr: Other error: {}".format(exception))
         return None
 
 
pythainlp/tag/__init__.py
Lines changed: 30 additions & 17 deletions

@@ -1,43 +1,56 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,print_function,unicode_literals
+"""
+Part-Of-Speech Tagging
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 import sys
-def pos_tag(list_text,engine='unigram',corpus='orchid'):
+
+ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
+
+
+def pos_tag(texts, engine="unigram", corpus="orchid"):
     """
     Part of Speech tagging function.
 
-    :param list list_text: takes in a list of tokenized words (put differently, a list of string)
+    :param list texts: takes in a list of tokenized words (put differently, a list of strings)
     :param str engine:
-        * unigram - unigram tagger
+        * unigram - unigram tagger (default)
         * perceptron - perceptron tagger
         * artagger - RDR POS tagger
     :param str corpus:
         * orchid - annotated Thai academic articles
         * pud - Parallel Universal Dependencies (PUD) treebanks
     :return: returns a list of labels regarding which part of speech it is
     """
-    if engine=='old' or engine=='unigram':
-        from .old import tag
-    elif engine=='perceptron':
+    if engine == "perceptron":
         from .perceptron import tag
-    elif engine=='artagger':
-        def tag(text1):
+    elif engine == "artagger":
+
+        def tag(text):
             try:
                 from artagger import Tagger
             except ImportError:
                 from pythainlp.tools import install_package
-                install_package('https://github.com/wannaphongcom/artagger/archive/master.zip')
+
+                install_package(ARTAGGER_URL)
                 try:
                     from artagger import Tagger
                 except ImportError:
-                    print("Error ! using 'pip install https://github.com/wannaphongcom/artagger/archive/master.zip'")
+                    print("Error: Try 'pip install " + ARTAGGER_URL + "'")
                     sys.exit(0)
-        words = Tagger().tag(' '.join(text1))
-        totag=[]
+            words = Tagger().tag(" ".join(text))
+            totag = []
             for word in words:
                 totag.append((word.word, word.tag))
             return totag
-        return tag(list_text)
-    return tag(list_text,corpus=corpus)
 
-def pos_tag_sents(sentences,engine='unigram',corpus='orchid'):
-    return [pos_tag(i,engine=engine,corpus=corpus) for i in sentences]
+        return tag(texts)
+    else:  # default, use "unigram" ("old") engine
+        from .old import tag
+
+    return tag(texts, corpus=corpus)
+
+
+def pos_tag_sents(sentences, engine="unigram", corpus="orchid"):
+    return [pos_tag(i, engine=engine, corpus=corpus) for i in sentences]
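With this change, any engine value other than "perceptron" or "artagger" falls back to the unigram ("old") tagger, so pos_tag() always returns a result. A minimal usage sketch (the tokenized Thai words are only an illustration, and the corpus/model files for "orchid" and "pud" are assumed to ship with the installed package):

    # A minimal usage sketch; assumes the PyThaiNLP corpus data is installed.
    from pythainlp.tag import pos_tag, pos_tag_sents

    words = ["ผม", "รัก", "ภาษา", "ไทย"]               # already-tokenized words
    print(pos_tag(words))                               # default: unigram engine, orchid corpus
    print(pos_tag(words, corpus="pud"))                 # unigram engine, PUD model
    print(pos_tag_sents([words], engine="perceptron"))  # one tagged list per sentence
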
pythainlp/tag/old.py
Lines changed: 34 additions & 22 deletions

@@ -1,28 +1,40 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals
+"""
+Unigram Part-Of-Speech Tagger
+"""
+from __future__ import absolute_import, division, unicode_literals
+
 import codecs
-import os
 import json
-import pythainlp
-import nltk.tag
+import os
+
 import dill
-templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus')
+import nltk.tag
+import pythainlp
+
+templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus")
+
+
 def orchid_data():
-    template_file = os.path.join(templates_dir, 'thaipos.json')
-    with codecs.open(template_file,'r',encoding='utf-8-sig') as handle:
-        model = json.load(handle)
-    return model
+    template_file = os.path.join(templates_dir, "thaipos.json")
+    with codecs.open(template_file, "r", encoding="utf-8-sig") as handle:
+        model = json.load(handle)
+    return model
+
+
 def pud_data():
-    template_file = os.path.join(templates_dir, 'ud_thai-pud_unigram_tagger.dill')
-    with open(template_file,'rb') as handle:
-        model = dill.load(handle)
-    return model
-def tag(text,corpus):
-    """
-    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
-    if corpus=='orchid':
-        tagger = nltk.tag.UnigramTagger(model=orchid_data())# backoff=default_tagger)
-        return tagger.tag(text)
-    elif corpus=='pud':
-        tagger = pud_data()
-        return tagger.tag(text)
+    template_file = os.path.join(templates_dir, "ud_thai-pud_unigram_tagger.dill")
+    with open(template_file, "rb") as handle:
+        model = dill.load(handle)
+    return model
+
+
+def tag(text, corpus):
+    """
+    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
+    if corpus == "orchid":
+        tagger = nltk.tag.UnigramTagger(model=orchid_data())  # backoff=default_tagger)
+        return tagger.tag(text)
+    else:  # default, use "pud" as a corpus
+        tagger = pud_data()
+        return tagger.tag(text)
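The (unchanged) Thai docstring says the function takes a list and returns a list, e.g. [('text', 'part-of-speech')]. After this change any corpus value other than "orchid" falls through to the PUD unigram model instead of returning None. A minimal sketch of calling the module directly, which is normally reached via pythainlp.tag.pos_tag (the token list is an illustration, and thaipos.json plus the PUD .dill model are assumed to be present in the installed corpus directory):

    # A minimal direct-call sketch; assumes the PyThaiNLP corpus files are installed.
    from pythainlp.tag import old

    tokens = ["ผม", "รัก", "ภาษา", "ไทย"]
    print(old.tag(tokens, corpus="orchid"))  # ORCHID unigram model
    print(old.tag(tokens, corpus="pud"))     # any other value now falls into the else branch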

pythainlp/tag/perceptron.py
Lines changed: 32 additions & 22 deletions

@@ -1,27 +1,37 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals
-import sys
+"""
+Perceptron Part-Of-Speech Tagger
+"""
+from __future__ import absolute_import, division, unicode_literals
+
 import os
-import pythainlp
-import nltk.tag
+
 import dill
-templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), 'corpus')
+import pythainlp
+
+templates_dir = os.path.join(os.path.dirname(pythainlp.__file__), "corpus")
+
+
 def orchid_data():
-    template_file = os.path.join(templates_dir, 'pt_tagger_1.dill')
-    with open(template_file,'rb') as handle:
-        model = dill.load(handle)
-    return model
+    template_file = os.path.join(templates_dir, "pt_tagger_1.dill")
+    with open(template_file, "rb") as handle:
+        model = dill.load(handle)
+    return model
+
+
 def pud_data():
-    template_file = os.path.join(templates_dir, 'ud_thai-pud_pt_tagger.dill')
-    with open(template_file,'rb') as handle:
-        model = dill.load(handle)
-    return model
-def tag(text,corpus):
-    """
-    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
-    if corpus=='orchid':
-        tagger = orchid_data()
-        return tagger.tag(text)
-    elif corpus=='pud':
-        tagger = pud_data()
-        return tagger.tag(text)
+    template_file = os.path.join(templates_dir, "ud_thai-pud_pt_tagger.dill")
+    with open(template_file, "rb") as handle:
+        model = dill.load(handle)
+    return model
+
+
+def tag(text, corpus):
+    """
+    รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
+    if corpus == "orchid":
+        tagger = orchid_data()
+        return tagger.tag(text)
+    else:  # default, use "pud" as a corpus
+        tagger = pud_data()
+        return tagger.tag(text)
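Same structure as old.py: any corpus other than "orchid" now uses the PUD perceptron model, so tag() always returns something. A minimal direct-call sketch (the tokens are an illustration; pt_tagger_1.dill and ud_thai-pud_pt_tagger.dill are assumed to be available in the corpus directory):

    # A minimal direct-call sketch; assumes the dill model files are installed.
    from pythainlp.tag import perceptron

    tokens = ["ผม", "รัก", "ภาษา", "ไทย"]
    print(perceptron.tag(tokens, corpus="orchid"))
    print(perceptron.tag(tokens, corpus="pud"))  # the new default (else) branch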

pythainlp/ulmfit/utils.py
Lines changed: 3 additions & 3 deletions

@@ -28,7 +28,7 @@
     from fastai.text import *
     import dill as pickle
 except ImportError:
-    print("Error installing using 'pip install fastai numpy dill'")
+    print("Error: Try 'pip install fastai numpy dill'")
     sys.exit(0)
 
 # import torch

@@ -281,6 +281,6 @@ def about():
     State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
     Created as part of PyThaiNLP with ULMFit implementation from fast.ai
 
-    Development : Charin Polpanumas
-    GitHub : https://github.com/cstorm125/thai2vec
+    Development: Charin Polpanumas
+    GitHub: https://github.com/cstorm125/thai2vec
     """
