|
13 | 13 | from pythainlp.tools import get_full_data_path |
14 | 14 | from requests.exceptions import HTTPError |
15 | 15 | from tinydb import Query, TinyDB |
| 16 | +from pythainlp import __version__ |
16 | 17 |
|
17 | 18 |
|
18 | 19 | def get_corpus_db(url: str) -> requests.Response: |
@@ -68,7 +69,7 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: |
68 | 69 |
|
69 | 70 | (Please see the filename from |
70 | 71 | `this file |
71 | | - <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_ |
| 72 | + <https://pythainlp.github.io/pythainlp-corpus/db.json>`_ |
72 | 73 |
|
73 | 74 | :param str filename: filename of the corpus to be read |
74 | 75 |
|
@@ -103,22 +104,6 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: |
103 | 104 | return frozenset(filter(None, lines)) |
104 | 105 |
|
105 | 106 |
|
106 | | -def _update_all(): |
107 | | - print("Update Corpus...") |
108 | | - with TinyDB(corpus_db_path()) as local_db: |
109 | | - item_all = local_db.all() |
110 | | - query = Query() |
111 | | - for item in item_all: |
112 | | - name = item["name"] |
113 | | - if "file_name" in item.keys(): |
114 | | - local_db.update( |
115 | | - {"filename": item["file_name"]}, query.name == name |
116 | | - ) |
117 | | - elif "file" in item.keys(): |
118 | | - local_db.update({"filename": item["file"]}, query.name == name) |
119 | | - local_db.close() |
120 | | - |
121 | | - |
122 | 107 | def get_corpus_path(name: str, version : str = None) -> Union[str, None]: |
123 | 108 | """ |
124 | 109 | Get corpus path. |
@@ -163,16 +148,6 @@ def get_corpus_path(name: str, version : str = None) -> Union[str, None]: |
163 | 148 |
|
164 | 149 | # check if the corpus is in local catalog, download if not |
165 | 150 | corpus_db_detail = get_corpus_db_detail(name) |
166 | | - if ( |
167 | | - corpus_db_detail.get("file_name") is not None |
168 | | - and corpus_db_detail.get("filename") is None |
169 | | - ): |
170 | | - _update_all() |
171 | | - elif ( |
172 | | - corpus_db_detail.get("file") is not None |
173 | | - and corpus_db_detail.get("filename") is None |
174 | | - ): |
175 | | - _update_all() |
176 | 151 |
|
177 | 152 | if not corpus_db_detail or not corpus_db_detail.get("filename"): |
178 | 153 | download(name, version = version) |
@@ -238,14 +213,70 @@ def _check_hash(dst: str, md5: str) -> None: |
238 | 213 | raise Exception("Hash does not match expected.") |
239 | 214 |
|
240 | 215 |
|
| 216 | +def _version2int(v: str) -> int: |
| 217 | + """ |
| 218 | + X.X.X => X0X0X |
| 219 | + """ |
| 220 | + if v.endswith(".*"): |
| 221 | + v = v.replace(".*", ".0") # X.X.* => X.X.0 |
| 222 | + v_list = v.split(".") |
| 223 | + if len(v_list) < 3: |
| 224 | + v_list.append('0') |
| 225 | + v_new = "" |
| 226 | + for i, value in enumerate(v_list): |
| 227 | + if i != 0: |
| 228 | + if len(value) < 2: |
| 229 | + v_new += "0"+value |
| 230 | + else: |
| 231 | + v_new += value |
| 232 | + else: |
| 233 | + v_new += value |
| 234 | + return int(v_new) |
| 235 | + |
| 236 | + |
def _check_version(cause: str) -> bool:
    """
    Check whether the running PyThaiNLP version satisfies a version clause.

    The installed version (``__version__``) is converted with
    :func:`_version2int` and compared against the bound(s) in ``cause``.

    Supported clause forms:

    * ``*`` -- any version
    * ``==X.Y.Z``
    * ``>=X.Y.Z`` and ``>X.Y.Z``
    * ``<=X.Y.Z`` and ``<X.Y.Z``
    * a lower bound immediately followed by an upper bound, e.g.
      ``>=A<B``, ``>=A<=B``, ``>A<B`` or ``>A<=B``

    Any other form is treated as "not satisfied".

    :param str cause: version clause, e.g. ``">=2.3"`` or ``">=2.2<3.0"``

    :return: **True** if the installed version satisfies the clause,
             **False** otherwise
    :rtype: bool
    """
    cause = cause.strip()
    if cause == "*":
        return True

    v = _version2int(__version__)

    if cause.startswith("=="):
        # An equality clause mixed with range operators is not supported.
        if ">" in cause or "<" in cause:
            return False
        return v == _version2int(cause[2:].strip())

    if cause.startswith(">"):
        lower_inclusive = cause.startswith(">=")
        rest = cause[2:] if lower_inclusive else cause[1:]
        lower, has_upper, upper = rest.partition("<")

        low = _version2int(lower.strip())
        above = v >= low if lower_inclusive else v > low
        if not has_upper:
            return above

        # What follows "<" is either "=X.Y.Z" (inclusive) or "X.Y.Z".
        upper_inclusive = upper.startswith("=")
        if upper_inclusive:
            upper = upper[1:]
        high = _version2int(upper.strip())
        below = v <= high if upper_inclusive else v < high
        return above and below

    # Compare against the whole bound; indexing the stripped string with
    # [0] would keep only its first character.
    if cause.startswith("<="):
        return v <= _version2int(cause[2:].strip())
    if cause.startswith("<"):
        return v < _version2int(cause[1:].strip())

    return False
| 270 | + |
| 271 | + |
241 | 272 | def download( |
242 | 273 | name: str, force: bool = False, url: str = None, version: str = None |
243 | 274 | ) -> bool: |
244 | 275 | """ |
245 | 276 | Download corpus. |
246 | 277 |
|
247 | 278 | The available corpus names can be seen in this file: |
248 | | - https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json |
| 279 | + https://pythainlp.github.io/pythainlp-corpus/db.json |
249 | 280 |
|
250 | 281 | :param str name: corpus name |
251 | 282 | :param bool force: force download |
@@ -288,7 +319,20 @@ def download( |
288 | 319 | corpus = corpus_db[name.lower()] |
289 | 320 | print("Corpus:", name) |
290 | 321 | if version is None: |
291 | | - version = corpus["latest_version"] |
| 322 | + for v in corpus["versions"]: |
| 323 | + if _check_version(corpus["versions"][v]["pythainlp_version"]): |
| 324 | + version = v |
| 325 | + else: |
| 326 | + if version not in list(corpus["versions"].keys()): |
| 327 | + print("Not found corpus") |
| 328 | + local_db.close() |
| 329 | + return False |
| 330 | + elif _check_version( |
| 331 | + corpus["versions"][version]["pythainlp_version"] |
| 332 | + ) is False: |
| 333 | + print("Versions Corpus not support") |
| 334 | + local_db.close() |
| 335 | + return False |
292 | 336 | corpus_versions = corpus["versions"][version] |
293 | 337 | file_name = corpus_versions["filename"] |
294 | 338 | found = local_db.search( |
|
0 commit comments