Skip to content

Properly check if download() is needed in get_corpus_path() #414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
May 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Using PyThaiNLP:
- [PyThaiNLP Get Started](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html)
- More tutorials at [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/)
- See full documentation at [https://thainlp.org/pythainlp/docs/2.1/](https://thainlp.org/pythainlp/docs/2.1/)
- Some additional data (like word lists and language models) maybe automatically downloaded by the library during runtime and it will be kept under the directory `~/pythainlp-data` by default.
- Some additional data (like word lists and language models) may be automatically downloaded during runtime, and it will be kept under the directory `~/pythainlp-data` by default. See corpus catalog at [https://github.com/PyThaiNLP/pythainlp-corpus](https://github.com/PyThaiNLP/pythainlp-corpus).
- The data location can be changed, using `PYTHAINLP_DATA_DIR` environment variable.
- For PyThaiNLP tokenization performance and measurement methods, see [tokenization benchmark](tokenization-benchmark.md)
- 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page
Expand Down
10 changes: 7 additions & 3 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,20 @@
_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# remote corpus catalog URL
_CORPUS_DB_URL = (
"https://raw.githubusercontent.com/"
+ "PyThaiNLP/pythainlp-corpus/"
+ "2.2/db.json"
"PyThaiNLP/pythainlp-corpus/"
"2.2/db.json"
)

# local corpus catalog filename
_CORPUS_DB_FILENAME = "db.json"

# local corpus catalog full path
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Create a local corpus database if it does not already exist
# create a local corpus database if it does not already exist
if not os.path.exists(_CORPUS_DB_PATH):
TinyDB(_CORPUS_DB_PATH)

Expand Down
49 changes: 28 additions & 21 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
from urllib.request import urlopen

import requests
from requests.exceptions import HTTPError
from tinydb import Query, TinyDB

from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
from pythainlp.tools import get_full_data_path
from requests.exceptions import HTTPError
from tinydb import Query, TinyDB


def get_corpus_db(url: str) -> requests.Response:
"""
Get corpus catalog from server.
"""
corpus_db = None
try:
corpus_db = requests.get(url)
Expand All @@ -29,20 +31,23 @@ def get_corpus_db(url: str) -> requests.Response:


def get_corpus_db_detail(name: str) -> dict:
    """
    Get details about a corpus, using information from local catalog.

    :param str name: name of the corpus to look up
    :return: the catalog record for the corpus, or an empty dict
        if the corpus is not in the local catalog
    :rtype: dict
    """
    local_db = TinyDB(corpus_db_path())
    query = Query()
    res = local_db.search(query.name == name)
    local_db.close()

    if res:
        return res[0]

    # corpus not found in the local catalog
    return dict()


def get_corpus(filename: str) -> frozenset:
"""
Read corpus from file and return a frozenset.
Read corpus data from file and return a frozenset.

(Please see the filename from
`this file
Expand Down Expand Up @@ -82,7 +87,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
Get corpus path.

:param str name: corpus name
:return: path to the corpus or **None** of the corpus doesn't
:return: path to the corpus or **None** if the corpus doesn't \
exist in the device
:rtype: str

Expand Down Expand Up @@ -112,18 +117,22 @@ def get_corpus_path(name: str) -> Union[str, None]:
print(get_corpus_path('wiki_lm_lstm'))
# output: /root/pythainlp-data/thwiki_model_lstm.pth
"""
db = TinyDB(corpus_db_path())
query = Query()
path = None

if db.search(query.name == name):
path = get_full_data_path(db.search(query.name == name)[0]["file"])

# check if the corpus is in local catalog, download if not
corpus_db_detail = get_corpus_db_detail(name)
if not corpus_db_detail or not corpus_db_detail.get("file_name"):
download(name)
corpus_db_detail = get_corpus_db_detail(name)

if corpus_db_detail and corpus_db_detail.get("file_name"):
# corpus is in the local catalog, get full path to the file
path = get_full_data_path(corpus_db_detail.get("file_name"))
# check if the corpus file actually exists, download if not
if not os.path.exists(path):
download(name)
if os.path.exists(path):
return path

db.close()
return path
return None


def _download(url: str, dst: str) -> int:
Expand Down Expand Up @@ -174,9 +183,7 @@ def _check_hash(dst: str, md5: str) -> None:
raise Exception("Hash does not match expected.")


def download(
name: str, force: bool = False, url: str = None
) -> bool:
def download(name: str, force: bool = False, url: str = None) -> bool:
"""
Download corpus.

Expand Down Expand Up @@ -215,7 +222,7 @@ def download(

corpus_db = corpus_db.json()

# Check if corpus is available
# check if corpus is available
if name in list(corpus_db.keys()):
local_db = TinyDB(corpus_db_path())
query = Query()
Expand All @@ -239,7 +246,7 @@ def download(
{
"name": name,
"version": corpus["version"],
"file": corpus["file_name"],
"file_name": corpus["file_name"],
}
)
else:
Expand Down
81 changes: 39 additions & 42 deletions pythainlp/tag/named_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@

from typing import List, Tuple, Union

import pycrfsuite
from pythainlp.corpus import download, get_corpus_path, thai_stopwords
from pycrfsuite import Tagger as CRFTagger
from pythainlp.corpus import get_corpus_path, thai_stopwords
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.util import isthai

_WORD_TOKENIZER = "newmm" # ตัวตัดคำ
_CORPUS_NAME = "thainer-1-4"
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data


def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย
Expand Down Expand Up @@ -74,14 +75,10 @@ def _doc2features(doc, i) -> dict:
class ThaiNameTagger:
def __init__(self):
"""
Thai named-entity recognizer
Thai named-entity recognizer.
"""
self.__data_path = get_corpus_path("thainer-1-4")
if not self.__data_path:
download("thainer-1-4")
self.__data_path = get_corpus_path("thainer-1-4")
self.crf = pycrfsuite.Tagger()
self.crf.open(self.__data_path)
self.crf = CRFTagger()
self.crf.open(get_corpus_path(_CORPUS_NAME))

def get_ner(
self, text: str, pos: bool = True, tag: bool = False
Expand Down Expand Up @@ -137,41 +134,41 @@ def get_ner(
tag=True)
'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
"""
self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
self.__pos_tags = pos_tag(
self.__tokens, engine="perceptron", corpus="orchid_ud"
)
self.__x_test = self.__extract_features(self.__pos_tags)
self.__y = self.crf.tag(self.__x_test)

self.sent_ner = [
(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)
]
tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
x_test = ThaiNameTagger.__extract_features(pos_tags)
y = self.crf.tag(x_test)

sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]

if tag:
self.temp = ""
self.sent = ""
for idx, (word, ner) in enumerate(self.sent_ner):
if "B-" in ner and self.temp != "":
self.sent += "</" + self.temp + ">"
self.temp = ner.replace("B-", "")
self.sent += "<" + self.temp + ">"
elif "B-" in ner:
self.temp = ner.replace("B-", "")
self.sent += "<" + self.temp + ">"
elif "O" == ner and self.temp != "":
self.sent += "</" + self.temp + ">"
self.temp = ""
self.sent += word
if idx == len(self.sent_ner) - 1 and self.temp != "":
self.sent += "</" + self.temp + ">"
return self.sent
elif pos:
temp = ""
sent = ""
for idx, (word, ner) in enumerate(sent_ner):
if ner.startswith("B-") and temp != "":
sent += "</" + temp + ">"
temp = ner[2:]
sent += "<" + temp + ">"
elif ner.startswith("B-"):
temp = ner[2:]
sent += "<" + temp + ">"
elif ner == "O" and temp != "":
sent += "</" + temp + ">"
temp = ""
sent += word

if idx == len(sent_ner) - 1 and temp != "":
sent += "</" + temp + ">"

return sent

if pos:
return [
(self.__pos_tags[i][0], self.__pos_tags[i][1], data)
for i, data in enumerate(self.__y)
(pos_tags[i][0], pos_tags[i][1], data)
for i, data in enumerate(y)
]
else:
return self.sent_ner

return sent_ner

@staticmethod
def __extract_features(doc):
Expand Down
7 changes: 3 additions & 4 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,14 +180,13 @@ def pos_tag(
# [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
# ('<space>', None), ('<equal>', None), ('3', 'NUM')]
"""
if not words:
return []

# NOTE:
_corpus = corpus
_tag = []
if corpus == "orchid_ud":
corpus = "orchid"
if not words:
return []

if engine == "perceptron":
from .perceptron import tag as tag_
Expand Down Expand Up @@ -243,4 +242,4 @@ def pos_tag_sents(
if not sentences:
return []

return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
4 changes: 2 additions & 2 deletions pythainlp/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
__all__ = [
"PYTHAINLP_DEFAULT_DATA_DIR",
"get_full_data_path",
"get_pythainlp_data_path",
"get_pythainlp_path",
"PYTHAINLP_DATA_DIR",
]

from pythainlp.tools.path import (
PYTHAINLP_DEFAULT_DATA_DIR,
get_full_data_path,
get_pythainlp_data_path,
get_pythainlp_path,
PYTHAINLP_DATA_DIR,
)
12 changes: 6 additions & 6 deletions pythainlp/tools/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
"""
import os

import pythainlp
from pythainlp import __file__ as pythainlp_file

PYTHAINLP_DATA_DIR = "pythainlp-data"
PYTHAINLP_DEFAULT_DATA_DIR = "pythainlp-data"


def get_full_data_path(path: str) -> str:
Expand Down Expand Up @@ -49,10 +49,10 @@ def get_pythainlp_data_path() -> str:
get_pythainlp_data_path()
# output: '/root/pythainlp-data'
"""
path = os.getenv(
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
pythainlp_data_dir = os.getenv(
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DEFAULT_DATA_DIR)
)
path = os.path.expanduser(path)
path = os.path.expanduser(pythainlp_data_dir)
os.makedirs(path, exist_ok=True)
return path

Expand All @@ -72,4 +72,4 @@ def get_pythainlp_path() -> str:
get_pythainlp_path()
# output: '/usr/local/lib/python3.6/dist-packages/pythainlp'
"""
return os.path.dirname(pythainlp.__file__)
return os.path.dirname(pythainlp_file)
Loading