Skip to content

Commit 5bcda6c

Browse files
committed
fix force=True behavior
1 parent 973a6e6 commit 5bcda6c

File tree

1 file changed

+36
-56
lines changed

1 file changed

+36
-56
lines changed

pythainlp/corpus/__init__.py

Lines changed: 36 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import hashlib
33
import os
4-
import queue
5-
import threading
64
from typing import NoReturn, Union
75
from urllib.request import urlopen
86

@@ -26,6 +24,7 @@
2624
_CORPUS_DB_FILENAME = "db.json"
2725
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)
2826

27+
# Create a local corpus database if it does not already exist
2928
if not os.path.exists(_CORPUS_DB_PATH):
3029
TinyDB(_CORPUS_DB_PATH)
3130

@@ -45,14 +44,12 @@ def corpus_db_path() -> str:
4544
def get_corpus_db_detail(name: str) -> dict:
4645
db = TinyDB(corpus_db_path())
4746
query = Query()
48-
return db.search(query.name == name)[0]
4947

50-
51-
def read_text_corpus(path: str) -> list:
52-
lines = []
53-
with open(path, "r", encoding="utf-8-sig") as fh:
54-
lines = fh.read().splitlines()
55-
return lines
48+
res = db.search(query.name == name)
49+
if res:
50+
return res[0]
51+
else:
52+
return dict()
5653

5754

5855
def get_corpus(filename: str) -> frozenset:
@@ -71,6 +68,10 @@ def get_corpus(filename: str) -> frozenset:
7168
7269
from pythainlp.corpus import get_corpus
7370
71+
get_corpus('negations_th.txt')
72+
# output:
73+
# frozenset({'แต่', 'ไม่'})
74+
7475
get_corpus('ttc_freq.txt')
7576
# output:
7677
# frozenset({'โดยนัยนี้\\t1',
@@ -81,12 +82,11 @@ def get_corpus(filename: str) -> frozenset:
8182
# 'เหนี่ยง\\t3',
8283
# 'ชงฆ์\\t3',
8384
# ...})
84-
85-
get_corpus('negations_th.txt')
86-
# output:
87-
# frozenset({'แต่', 'ไม่'})
8885
"""
89-
lines = read_text_corpus(os.path.join(corpus_path(), filename))
86+
path = os.path.join(corpus_path(), filename)
87+
lines = []
88+
with open(path, "r", encoding="utf-8-sig") as fh:
89+
lines = fh.read().splitlines()
9090

9191
return frozenset(lines)
9292

@@ -140,25 +140,6 @@ def get_corpus_path(name: str) -> Union[str, None]:
140140
return path
141141

142142

143-
def _get_input(message, channel):
144-
response = input(message)
145-
channel.put(response)
146-
147-
148-
def _input_with_timeout(message, timeout, default_response):
149-
channel = queue.Queue()
150-
thread = threading.Thread(target=_get_input, args=(message, channel))
151-
thread.daemon = True
152-
thread.start()
153-
154-
try:
155-
response = channel.get(True, timeout)
156-
return response
157-
except queue.Empty:
158-
pass
159-
return default_response
160-
161-
162143
def _download(url: str, dst: str) -> int:
163144
"""
164145
@param: url to download file
@@ -199,7 +180,7 @@ def download(name: str, force: bool = False) -> NoReturn:
199180
https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
200181
201182
:param string name: corpus name
202-
:param bool force: force install
183+
:param bool force: force download
203184
204185
:Example:
205186
::
@@ -234,41 +215,40 @@ def download(name: str, force: bool = False) -> NoReturn:
234215
if name in list(corpus_data.keys()):
235216
corpus = corpus_data[name]
236217
print("Corpus:", name)
218+
found = local_db.search(query.name == name)
237219

238220
# If forced, or if not found in the local database, download
239-
if not local_db.search(query.name == name):
221+
if force or not found:
240222
print(f"- Downloading: {name} {corpus['version']}")
241223
_download(corpus["download"], corpus["file_name"])
242224
_check_hash(corpus["file_name"], corpus["md5"])
243-
local_db.insert(
244-
{
245-
"name": name,
246-
"version": corpus["version"],
247-
"file": corpus["file_name"],
248-
}
249-
)
225+
226+
if found:
227+
local_db.update(
228+
{"version": corpus["version"]}, query.name == name
229+
)
230+
else:
231+
local_db.insert(
232+
{
233+
"name": name,
234+
"version": corpus["version"],
235+
"file": corpus["file_name"],
236+
}
237+
)
250238
else:
251239
if local_db.search(
252240
query.name == name and query.version == corpus["version"]
253241
):
254242
# Already has the same version
255243
print("- Already up to date.")
256244
else:
257-
# Has the corpus but different version, update
245+
# Has the corpus but different version
258246
current_ver = local_db.search(query.name == name)[0]["version"]
259-
message = f"- Update from {current_ver} to {corpus['version']} [y/n]?"
260-
response = _input_with_timeout(message, 10, "n")
261-
response = response.lower()
262-
263-
if force or response == "y":
264-
print(f"- Downloading: {name} {corpus['version']}")
265-
_download(corpus["download"], corpus["file_name"])
266-
_check_hash(corpus["file_name"], corpus["md5"])
267-
local_db.update(
268-
{"version": corpus["version"]}, query.name == name
269-
)
270-
else:
271-
print("- Not update.")
247+
print(f"- Existing version: {current_ver}")
248+
print(f"- New version available: {corpus['version']}")
249+
print("- Use download(data_name, force=True) to update")
250+
else:
251+
print("Corpus not found:", name)
272252

273253
local_db.close()
274254

0 commit comments

Comments
 (0)