Skip to content

Properly check if download() is needed in get_corpus_path() #414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
May 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Using PyThaiNLP:
- [PyThaiNLP Get Started](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html)
- More tutorials at [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/)
- See full documentation at [https://thainlp.org/pythainlp/docs/2.1/](https://thainlp.org/pythainlp/docs/2.1/)
- Some additional data (like word lists and language models) maybe automatically downloaded by the library during runtime and it will be kept under the directory `~/pythainlp-data` by default.
- Some additional data (like word lists and language models) may be automatically downloaded during runtime, and it will be kept under the directory `~/pythainlp-data` by default. See corpus catalog at [https://github.com/PyThaiNLP/pythainlp-corpus](https://github.com/PyThaiNLP/pythainlp-corpus).
- The data location can be changed, using `PYTHAINLP_DATA_DIR` environment variable.
- For PyThaiNLP tokenization performance and measurement methods, see [tokenization benchmark](tokenization-benchmark.md)
- 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page
Expand Down
10 changes: 7 additions & 3 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,20 @@
_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# remote corpus catalog URL
_CORPUS_DB_URL = (
"https://raw.githubusercontent.com/"
+ "PyThaiNLP/pythainlp-corpus/"
+ "2.2/db.json"
"PyThaiNLP/pythainlp-corpus/"
"2.2/db.json"
)

# local corpus catalog filename
_CORPUS_DB_FILENAME = "db.json"

# local corpus catalog full path
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Create a local corpus database if it does not already exist
# create a local corpus database if it does not already exist
if not os.path.exists(_CORPUS_DB_PATH):
TinyDB(_CORPUS_DB_PATH)

Expand Down
49 changes: 28 additions & 21 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
from urllib.request import urlopen

import requests
from requests.exceptions import HTTPError
from tinydb import Query, TinyDB

from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
from pythainlp.tools import get_full_data_path
from requests.exceptions import HTTPError
from tinydb import Query, TinyDB


def get_corpus_db(url: str) -> requests.Response:
"""
Get corpus catalog from server.
"""
corpus_db = None
try:
corpus_db = requests.get(url)
Expand All @@ -29,20 +31,23 @@ def get_corpus_db(url: str) -> requests.Response:


def get_corpus_db_detail(name: str) -> dict:
    """
    Get details about a corpus, using information from local catalog.

    :param str name: name of the corpus to look up
    :return: the catalog record for the corpus, or an empty dict
        if the corpus is not in the local catalog
    :rtype: dict
    """
    local_db = TinyDB(corpus_db_path())
    query = Query()
    res = local_db.search(query.name == name)
    local_db.close()

    if res:
        return res[0]

    # corpus not found in the local catalog
    return dict()


def get_corpus(filename: str) -> frozenset:
"""
Read corpus from file and return a frozenset.
Read corpus data from file and return a frozenset.

(Please see the filename from
`this file
Expand Down Expand Up @@ -82,7 +87,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
Get corpus path.

:param str name: corpus name
:return: path to the corpus or **None** of the corpus doesn't
:return: path to the corpus or **None** if the corpus doesn't \
exist in the device
:rtype: str

Expand Down Expand Up @@ -112,18 +117,22 @@ def get_corpus_path(name: str) -> Union[str, None]:
print(get_corpus_path('wiki_lm_lstm'))
# output: /root/pythainlp-data/thwiki_model_lstm.pth
"""
db = TinyDB(corpus_db_path())
query = Query()
path = None

if db.search(query.name == name):
path = get_full_data_path(db.search(query.name == name)[0]["file"])

# check if the corpus is in local catalog, download if not
corpus_db_detail = get_corpus_db_detail(name)
if not corpus_db_detail or not corpus_db_detail.get("file_name"):
download(name)
corpus_db_detail = get_corpus_db_detail(name)

if corpus_db_detail and corpus_db_detail.get("file_name"):
# corpus is in the local catalog, get full path to the file
path = get_full_data_path(corpus_db_detail.get("file_name"))
# check if the corpus file actually exists, download if not
if not os.path.exists(path):
download(name)
if os.path.exists(path):
return path

db.close()
return path
return None


def _download(url: str, dst: str) -> int:
Expand Down Expand Up @@ -174,9 +183,7 @@ def _check_hash(dst: str, md5: str) -> None:
raise Exception("Hash does not match expected.")


def download(
name: str, force: bool = False, url: str = None
) -> bool:
def download(name: str, force: bool = False, url: str = None) -> bool:
"""
Download corpus.

Expand Down Expand Up @@ -215,7 +222,7 @@ def download(

corpus_db = corpus_db.json()

# Check if corpus is available
# check if corpus is available
if name in list(corpus_db.keys()):
local_db = TinyDB(corpus_db_path())
query = Query()
Expand All @@ -239,7 +246,7 @@ def download(
{
"name": name,
"version": corpus["version"],
"file": corpus["file_name"],
"file_name": corpus["file_name"],
}
)
else:
Expand Down
81 changes: 39 additions & 42 deletions pythainlp/tag/named_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@

from typing import List, Tuple, Union

import pycrfsuite
from pythainlp.corpus import download, get_corpus_path, thai_stopwords
from pycrfsuite import Tagger as CRFTagger
from pythainlp.corpus import get_corpus_path, thai_stopwords
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.util import isthai

_WORD_TOKENIZER = "newmm" # ตัวตัดคำ
_CORPUS_NAME = "thainer-1-4"
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data


def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย
Expand Down Expand Up @@ -74,14 +75,10 @@ def _doc2features(doc, i) -> dict:
class ThaiNameTagger:
def __init__(self):
"""
Thai named-entity recognizer
Thai named-entity recognizer.
"""
self.__data_path = get_corpus_path("thainer-1-4")
if not self.__data_path:
download("thainer-1-4")
self.__data_path = get_corpus_path("thainer-1-4")
self.crf = pycrfsuite.Tagger()
self.crf.open(self.__data_path)
self.crf = CRFTagger()
self.crf.open(get_corpus_path(_CORPUS_NAME))

def get_ner(
self, text: str, pos: bool = True, tag: bool = False
Expand Down Expand Up @@ -137,41 +134,41 @@ def get_ner(
tag=True)
'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
"""
self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
self.__pos_tags = pos_tag(
self.__tokens, engine="perceptron", corpus="orchid_ud"
)
self.__x_test = self.__extract_features(self.__pos_tags)
self.__y = self.crf.tag(self.__x_test)

self.sent_ner = [
(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)
]
tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
x_test = ThaiNameTagger.__extract_features(pos_tags)
y = self.crf.tag(x_test)

sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]

if tag:
self.temp = ""
self.sent = ""
for idx, (word, ner) in enumerate(self.sent_ner):
if "B-" in ner and self.temp != "":
self.sent += "</" + self.temp + ">"
self.temp = ner.replace("B-", "")
self.sent += "<" + self.temp + ">"
elif "B-" in ner:
self.temp = ner.replace("B-", "")
self.sent += "<" + self.temp + ">"
elif "O" == ner and self.temp != "":
self.sent += "</" + self.temp + ">"
self.temp = ""
self.sent += word
if idx == len(self.sent_ner) - 1 and self.temp != "":
self.sent += "</" + self.temp + ">"
return self.sent
elif pos:
temp = ""
sent = ""
for idx, (word, ner) in enumerate(sent_ner):
if ner.startswith("B-") and temp != "":
sent += "</" + temp + ">"
temp = ner[2:]
sent += "<" + temp + ">"
elif ner.startswith("B-"):
temp = ner[2:]
sent += "<" + temp + ">"
elif ner == "O" and temp != "":
sent += "</" + temp + ">"
temp = ""
sent += word

if idx == len(sent_ner) - 1 and temp != "":
sent += "</" + temp + ">"

return sent

if pos:
return [
(self.__pos_tags[i][0], self.__pos_tags[i][1], data)
for i, data in enumerate(self.__y)
(pos_tags[i][0], pos_tags[i][1], data)
for i, data in enumerate(y)
]
else:
return self.sent_ner

return sent_ner

@staticmethod
def __extract_features(doc):
Expand Down
7 changes: 3 additions & 4 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,14 +180,13 @@ def pos_tag(
# [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
# ('<space>', None), ('<equal>', None), ('3', 'NUM')]
"""
if not words:
return []

# NOTE:
_corpus = corpus
_tag = []
if corpus == "orchid_ud":
corpus = "orchid"
if not words:
return []

if engine == "perceptron":
from .perceptron import tag as tag_
Expand Down Expand Up @@ -243,4 +242,4 @@ def pos_tag_sents(
if not sentences:
return []

return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
4 changes: 2 additions & 2 deletions pythainlp/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
__all__ = [
"PYTHAINLP_DEFAULT_DATA_DIR",
"get_full_data_path",
"get_pythainlp_data_path",
"get_pythainlp_path",
"PYTHAINLP_DATA_DIR",
]

from pythainlp.tools.path import (
PYTHAINLP_DEFAULT_DATA_DIR,
get_full_data_path,
get_pythainlp_data_path,
get_pythainlp_path,
PYTHAINLP_DATA_DIR,
)
12 changes: 6 additions & 6 deletions pythainlp/tools/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
"""
import os

import pythainlp
from pythainlp import __file__ as pythainlp_file

PYTHAINLP_DATA_DIR = "pythainlp-data"
PYTHAINLP_DEFAULT_DATA_DIR = "pythainlp-data"


def get_full_data_path(path: str) -> str:
Expand Down Expand Up @@ -49,10 +49,10 @@ def get_pythainlp_data_path() -> str:
get_pythainlp_data_path()
# output: '/root/pythainlp-data'
"""
path = os.getenv(
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
pythainlp_data_dir = os.getenv(
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DEFAULT_DATA_DIR)
)
path = os.path.expanduser(path)
path = os.path.expanduser(pythainlp_data_dir)
os.makedirs(path, exist_ok=True)
return path

Expand All @@ -72,4 +72,4 @@ def get_pythainlp_path() -> str:
get_pythainlp_path()
# output: '/usr/local/lib/python3.6/dist-packages/pythainlp'
"""
return os.path.dirname(pythainlp.__file__)
return os.path.dirname(pythainlp_file)
Loading