Skip to content

Commit e52b8c9

Browse files
authored
Merge pull request #505 from PyThaiNLP/check-pythainlp-version
Add check versions for pythainlp.corpus
2 parents 01cbfe8 + 5f4db2f commit e52b8c9

File tree

3 files changed

+95
-33
lines changed

3 files changed

+95
-33
lines changed

pythainlp/corpus/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,7 @@
3939

4040
# remote corpus catalog URL
4141
_CORPUS_DB_URL = (
42-
"https://raw.githubusercontent.com/"
43-
"PyThaiNLP/pythainlp-corpus/"
44-
"2.2/db.json"
42+
"https://pythainlp.github.io/pythainlp-corpus/db.json"
4543
)
4644

4745
# local corpus catalog filename

pythainlp/corpus/core.py

Lines changed: 73 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pythainlp.tools import get_full_data_path
1414
from requests.exceptions import HTTPError
1515
from tinydb import Query, TinyDB
16+
from pythainlp import __version__
1617

1718

1819
def get_corpus_db(url: str) -> requests.Response:
@@ -68,7 +69,7 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
6869
6970
(Please see the filename from
7071
`this file
71-
<https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_
72+
<https://pythainlp.github.io/pythainlp-corpus/db.json>`_
7273
7374
:param str filename: filename of the corpus to be read
7475
@@ -103,22 +104,6 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
103104
return frozenset(filter(None, lines))
104105

105106

106-
def _update_all():
107-
print("Update Corpus...")
108-
with TinyDB(corpus_db_path()) as local_db:
109-
item_all = local_db.all()
110-
query = Query()
111-
for item in item_all:
112-
name = item["name"]
113-
if "file_name" in item.keys():
114-
local_db.update(
115-
{"filename": item["file_name"]}, query.name == name
116-
)
117-
elif "file" in item.keys():
118-
local_db.update({"filename": item["file"]}, query.name == name)
119-
local_db.close()
120-
121-
122107
def get_corpus_path(name: str, version : str = None) -> Union[str, None]:
123108
"""
124109
Get corpus path.
@@ -163,16 +148,6 @@ def get_corpus_path(name: str, version : str = None) -> Union[str, None]:
163148

164149
# check if the corpus is in local catalog, download if not
165150
corpus_db_detail = get_corpus_db_detail(name)
166-
if (
167-
corpus_db_detail.get("file_name") is not None
168-
and corpus_db_detail.get("filename") is None
169-
):
170-
_update_all()
171-
elif (
172-
corpus_db_detail.get("file") is not None
173-
and corpus_db_detail.get("filename") is None
174-
):
175-
_update_all()
176151

177152
if not corpus_db_detail or not corpus_db_detail.get("filename"):
178153
download(name, version = version)
@@ -238,14 +213,70 @@ def _check_hash(dst: str, md5: str) -> None:
238213
raise Exception("Hash does not match expected.")
239214

240215

216+
def _version2int(v: str) -> int:
217+
"""
218+
X.X.X => X0X0X
219+
"""
220+
if v.endswith(".*"):
221+
v = v.replace(".*", ".0") # X.X.* => X.X.0
222+
v_list = v.split(".")
223+
if len(v_list) < 3:
224+
v_list.append('0')
225+
v_new = ""
226+
for i, value in enumerate(v_list):
227+
if i != 0:
228+
if len(value) < 2:
229+
v_new += "0"+value
230+
else:
231+
v_new += value
232+
else:
233+
v_new += value
234+
return int(v_new)
235+
236+
237+
def _check_version(cause: str) -> bool:
    """
    Check whether the installed PyThaiNLP version satisfies a constraint.

    The installed version (``__version__``) and the versions named in the
    constraint are converted with ``_version2int`` and compared as integers.

    Supported constraint forms:

    - ``*`` : any version
    - ``==X`` : exactly X
    - ``>=X`` / ``>X`` : lower bound only
    - ``>=X<Y`` / ``>=X<=Y`` / ``>X<Y`` / ``>X<=Y`` : lower and upper bound
    - ``<=X`` / ``<X`` : upper bound only

    Any other form is treated as not satisfied.

    :param str cause: version constraint, written without spaces
    :return: **True** if the installed version satisfies the constraint,
             otherwise **False**
    :rtype: bool
    """
    if cause == "*":
        return True

    v = _version2int(__version__)

    if cause.startswith("=="):
        # "==" cannot be combined with a range operator.
        if ">" in cause or "<" in cause:
            return False
        return v == _version2int(cause.replace("==", ""))

    if cause.startswith(">"):
        lower_inclusive = cause.startswith(">=")
        rest = cause[2:] if lower_inclusive else cause[1:]
        lower, has_upper, upper = rest.partition("<")
        lower_int = _version2int(lower)
        lower_ok = v >= lower_int if lower_inclusive else v > lower_int
        if not has_upper:
            return lower_ok
        if not lower_ok:
            return False
        # After splitting on "<", a leading "=" means the bound was "<=".
        upper_inclusive = upper.startswith("=")
        if upper_inclusive:
            upper = upper[1:]
        upper_int = _version2int(upper)
        return v <= upper_int if upper_inclusive else v < upper_int

    if cause.startswith("<"):
        upper_inclusive = cause.startswith("<=")
        # Convert the whole remaining string, not just its first character.
        upper_int = _version2int(cause[2:] if upper_inclusive else cause[1:])
        return v <= upper_int if upper_inclusive else v < upper_int

    return False
270+
271+
241272
def download(
242273
name: str, force: bool = False, url: str = None, version: str = None
243274
) -> bool:
244275
"""
245276
Download corpus.
246277
247278
The available corpus names can be seen in this file:
248-
https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
279+
https://pythainlp.github.io/pythainlp-corpus/db.json
249280
250281
:param str name: corpus name
251282
:param bool force: force download
@@ -288,7 +319,20 @@ def download(
288319
corpus = corpus_db[name.lower()]
289320
print("Corpus:", name)
290321
if version is None:
291-
version = corpus["latest_version"]
322+
for v in corpus["versions"]:
323+
if _check_version(corpus["versions"][v]["pythainlp_version"]):
324+
version = v
325+
else:
326+
if version not in list(corpus["versions"].keys()):
327+
print("Not found corpus")
328+
local_db.close()
329+
return False
330+
elif _check_version(
331+
corpus["versions"][version]["pythainlp_version"]
332+
) is False:
333+
print("Versions Corpus not support")
334+
local_db.close()
335+
return False
292336
corpus_versions = corpus["versions"][version]
293337
file_name = corpus_versions["filename"]
294338
found = local_db.search(

tests/test_corpus.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,27 @@ def test_corpus(self):
7474
self.assertTrue(remove("test")) # remove existing
7575
self.assertFalse(remove("test")) # remove non-existing
7676
self.assertIsNone(get_corpus_path("XXXkdjfBzc")) # query non-existing
77-
self.assertTrue(download(name="test", version="0.1"))
77+
self.assertFalse(download(name="test", version="0.0"))
78+
self.assertFalse(download(name="test", version="0.0.0"))
79+
self.assertFalse(download(name="test", version="0.0.1"))
80+
self.assertFalse(download(name="test", version="0.0.2"))
81+
self.assertFalse(download(name="test", version="0.0.3"))
82+
self.assertFalse(download(name="test", version="0.0.4"))
83+
self.assertTrue(download(name="test", version="0.0.5"))
84+
self.assertTrue(remove("test")) # remove existing
85+
self.assertIsNotNone(download(name="test", version="0.0.6"))
86+
self.assertIsNotNone(download(name="test", version="0.0.7"))
87+
self.assertIsNotNone(download(name="test", version="0.0.8"))
88+
self.assertIsNotNone(download(name="test", version="0.0.9"))
89+
self.assertIsNotNone(download(name="test", version="0.0.10"))
90+
with self.assertRaises(Exception) as context:
91+
self.assertIsNotNone(download(name="test", version="0.0.11"))
92+
self.assertTrue(
93+
"Hash does not match expected."
94+
in
95+
str(context.exception)
96+
)
97+
self.assertIsNotNone(download(name="test", version="0.1"))
7898
self.assertTrue(remove("test"))
7999

80100
def test_tnc(self):

0 commit comments

Comments
 (0)