44"""
55Corpus related functions.
66"""
7+
78import json
89import os
910from typing import Union
@@ -25,16 +26,16 @@ def get_corpus_db(url: str):
2526
2627 corpus_db = None
2728 try :
28- corpus_db = requests .get (url )
29+ corpus_db = requests .get (url , timeout = 10 )
2930 except requests .exceptions .HTTPError as http_err :
3031 print (f"HTTP error occurred: { http_err } " )
31- except Exception as err :
32+ except requests . exceptions . RequestException as err :
3233 print (f"Non-HTTP error occurred: { err } " )
3334
3435 return corpus_db
3536
3637
37- def get_corpus_db_detail (name : str , version : str = '' ) -> dict :
38+ def get_corpus_db_detail (name : str , version : str = "" ) -> dict :
3839 """
3940 Get details about a corpus, using information from local catalog.
4041
@@ -176,7 +177,7 @@ def get_corpus_as_is(filename: str) -> list:
176177 return lines
177178
178179
179- def get_corpus_default_db (name : str , version : str = '' ) -> Union [str , None ]:
180+ def get_corpus_default_db (name : str , version : str = "" ) -> Union [str , None ]:
180181 """
181182 Get model path from default_db.json
182183
@@ -207,7 +208,7 @@ def get_corpus_default_db(name: str, version: str = '') -> Union[str, None]:
207208
208209
209210def get_corpus_path (
210- name : str , version : str = '' , force : bool = False
211+ name : str , version : str = "" , force : bool = False
211212) -> Union [str , None ]:
212213 """
213214 Get corpus path.
@@ -251,11 +252,11 @@ def get_corpus_path(
251252 """
252253 from typing import Dict
253254
254- _CUSTOMIZE : Dict [str , str ] = {
255+ CUSTOMIZE : Dict [str , str ] = {
255256 # "the corpus name":"path"
256257 }
257- if name in list (_CUSTOMIZE ):
258- return _CUSTOMIZE [name ]
258+ if name in list (CUSTOMIZE ):
259+ return CUSTOMIZE [name ]
259260
260261 default_path = get_corpus_default_db (name = name , version = version )
261262 if default_path is not None :
@@ -290,14 +291,14 @@ def _download(url: str, dst: str) -> int:
290291 @param: URL for downloading file
291292 @param: dst place to put the file into
292293 """
293- _CHUNK_SIZE = 64 * 1024 # 64 KiB
294+ CHUNK_SIZE = 64 * 1024 # 64 KiB
294295
295296 from urllib .request import urlopen
296297
297298 import requests
298299
299300 file_size = int (urlopen (url ).info ().get ("Content-Length" , - 1 ))
300- r = requests .get (url , stream = True )
301+ r = requests .get (url , stream = True , timeout = 10 )
301302 with open (get_full_data_path (dst ), "wb" ) as f :
302303 pbar = None
303304 try :
@@ -307,7 +308,7 @@ def _download(url: str, dst: str) -> int:
307308 except ImportError :
308309 pbar = None
309310
310- for chunk in r .iter_content (chunk_size = _CHUNK_SIZE ):
311+ for chunk in r .iter_content (chunk_size = CHUNK_SIZE ):
311312 if chunk :
312313 f .write (chunk )
313314 if pbar :
@@ -334,7 +335,7 @@ def _check_hash(dst: str, md5: str) -> None:
334335 file_md5 = hashlib .md5 (content ).hexdigest ()
335336
336337 if md5 != file_md5 :
337- raise Exception ("Hash does not match expected." )
338+ raise ValueError ("Hash does not match expected." )
338339
339340
340341def _version2int (v : str ) -> int :
@@ -401,7 +402,7 @@ def _check_version(cause: str) -> bool:
401402
402403
403404def download (
404- name : str , force : bool = False , url : str = '' , version : str = ''
405+ name : str , force : bool = False , url : str = "" , version : str = ""
405406) -> bool :
406407 """
407408 Download corpus.
@@ -422,7 +423,7 @@ def download(
422423
423424 from pythainlp.corpus import download
424425
425- download(' wiki_lm_lstm' , force=True)
426+ download(" wiki_lm_lstm" , force=True)
426427 # output:
427428 # Corpus: wiki_lm_lstm
428429 # - Downloading: wiki_lm_lstm 0.1
@@ -459,10 +460,13 @@ def download(
459460
460461 # version may still be None here
461462 if version not in corpus ["versions" ]:
462- print ("Not found corpus " )
463+ print ("Corpus not found. " )
463464 return False
464- elif _check_version (corpus ["versions" ][version ]["pythainlp_version" ]) is False :
465- print ("Versions Corpus not support" )
465+ elif (
466+ _check_version (corpus ["versions" ][version ]["pythainlp_version" ])
467+ is False
468+ ):
469+ print ("Corpus version not supported." )
466470 return False
467471 corpus_versions = corpus ["versions" ][version ]
468472 file_name = corpus_versions ["filename" ]
@@ -505,8 +509,10 @@ def download(
505509 foldername = name + "_" + str (version )
506510 if not os .path .exists (get_full_data_path (foldername )):
507511 os .mkdir (get_full_data_path (foldername ))
508- with zipfile .ZipFile (get_full_data_path (file_name ), "r" ) as zip :
509- zip .extractall (path = get_full_data_path (foldername ))
512+ with zipfile .ZipFile (
513+ get_full_data_path (file_name ), "r"
514+ ) as zip_file :
515+ zip_file .extractall (path = get_full_data_path (foldername ))
510516
511517 if found :
512518 local_db ["_default" ][found ]["version" ] = version
@@ -517,7 +523,9 @@ def download(
517523 # This awkward behavior is for backward-compatibility with
518524 # database files generated previously using TinyDB
519525 if local_db ["_default" ]:
520- corpus_no = max ((int (no ) for no in local_db ["_default" ])) + 1
526+ corpus_no = (
527+ max ((int (no ) for no in local_db ["_default" ])) + 1
528+ )
521529 else :
522530 corpus_no = 1
523531 local_db ["_default" ][str (corpus_no )] = {
@@ -564,13 +572,13 @@ def remove(name: str) -> bool:
564572
565573 from pythainlp.corpus import remove, get_corpus_path, get_corpus
566574
567- print(remove(' ttc' ))
575+ print(remove(" ttc" ))
568576 # output: True
569577
570- print(get_corpus_path(' ttc' ))
578+ print(get_corpus_path(" ttc" ))
571579 # output: None
572580
573- get_corpus(' ttc' )
581+ get_corpus(" ttc" )
574582 # output:
575583 # FileNotFoundError: [Errno 2] No such file or directory:
576584 # '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
@@ -580,7 +588,9 @@ def remove(name: str) -> bool:
580588 return False
581589 with open (corpus_db_path (), "r" , encoding = "utf-8-sig" ) as f :
582590 db = json .load (f )
583- data = [corpus for corpus in db ["_default" ].values () if corpus ["name" ] == name ]
591+ data = [
592+ corpus for corpus in db ["_default" ].values () if corpus ["name" ] == name
593+ ]
584594
585595 if data :
586596 path = get_corpus_path (name )
0 commit comments