@@ -51,18 +51,26 @@ def get_corpus_db_detail(name: str) -> dict:
5151 return dict ()
5252
5353
54- def get_corpus (filename : str ) -> frozenset :
54+ def get_corpus (filename : str , as_is : bool = False ) -> Union [ frozenset , list ] :
5555 """
56- Read corpus data from file and return a frozenset.
56+ Read corpus data from file and return a frozenset or a list.
57+
58+ Each line in the file will be a member of the set or the list.
59+
60+ By default, a frozenset will be returned, with whitespace stripped, and
61+ empty values and duplicates removed.
62+
63+ If as_is is True, a list will be returned, with no modifications
64+ to member values or their order.
5765
5866 (Please see the filename from
5967 `this file
6068 <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_)
6169
6270 :param str filename: filename of the corpus to be read
6371
64- :return: :mod :`frozenset` consist of lines in the file
65- :rtype: :mod :`frozenset`
72+ :return: :class:`frozenset` or :class:`list` consisting of the lines in the file
73+ :rtype: :class:`frozenset` or :class:`list`
6674
6775 :Example:
6876 ::
@@ -85,7 +93,11 @@ def get_corpus(filename: str) -> frozenset:
8593 with open (path , "r" , encoding = "utf-8-sig" ) as fh :
8694 lines = fh .read ().splitlines ()
8795
88- return frozenset (lines )
96+ if as_is :
97+ return lines
98+
99+ lines = [line .strip () for line in lines ]
100+ return frozenset (filter (None , lines ))
89101
90102
91103def _update_all ():
@@ -96,7 +108,9 @@ def _update_all():
96108 for item in item_all :
97109 name = item ["name" ]
98110 if "file_name" in item .keys ():
99- local_db .update ({"filename" : item ["file_name" ]}, query .name == name )
111+ local_db .update (
112+ {"filename" : item ["file_name" ]}, query .name == name
113+ )
100114 elif "file" in item .keys ():
101115 local_db .update ({"filename" : item ["file" ]}, query .name == name )
102116 local_db .close ()
@@ -139,9 +153,15 @@ def get_corpus_path(name: str) -> Union[str, None]:
139153 """
140154 # check if the corpus is in local catalog, download if not
141155 corpus_db_detail = get_corpus_db_detail (name )
142- if corpus_db_detail .get ("file_name" ) is not None and corpus_db_detail .get ("filename" ) is None :
156+ if (
157+ corpus_db_detail .get ("file_name" ) is not None
158+ and corpus_db_detail .get ("filename" ) is None
159+ ):
143160 _update_all ()
144- elif corpus_db_detail .get ("file" ) is not None and corpus_db_detail .get ("filename" ) is None :
161+ elif (
162+ corpus_db_detail .get ("file" ) is not None
163+ and corpus_db_detail .get ("filename" ) is None
164+ ):
145165 _update_all ()
146166
147167 if not corpus_db_detail or not corpus_db_detail .get ("filename" ):
@@ -208,7 +228,9 @@ def _check_hash(dst: str, md5: str) -> None:
208228 raise Exception ("Hash does not match expected." )
209229
210230
211- def download (name : str , force : bool = False , url : str = None , version : str = None ) -> bool :
231+ def download (
232+ name : str , force : bool = False , url : str = None , version : str = None
233+ ) -> bool :
212234 """
213235 Download corpus.
214236
@@ -256,34 +278,28 @@ def download(name: str, force: bool = False, url: str = None, version: str = Non
256278 corpus = corpus_db [name .lower ()]
257279 print ("Corpus:" , name )
258280 if version is None :
259- version = corpus [' latest_version' ]
281+ version = corpus [" latest_version" ]
260282 corpus_versions = corpus ["versions" ][version ]
261283 file_name = corpus_versions ["filename" ]
262- found = local_db .search ((query .name == name ) & (query .version == version ))
284+ found = local_db .search (
285+ (query .name == name ) & (query .version == version )
286+ )
263287
264288 # If not found in local, download
265289 if force or not found :
266290 print (f"- Downloading: { name } { version } " )
267291 _download (
268- corpus_versions ["download_url" ],
269- file_name ,
292+ corpus_versions ["download_url" ], file_name ,
270293 )
271294 _check_hash (
272- file_name ,
273- corpus_versions ["md5" ],
295+ file_name , corpus_versions ["md5" ],
274296 )
275297
276298 if found :
277- local_db .update (
278- {"version" : version }, query .name == name
279- )
299+ local_db .update ({"version" : version }, query .name == name )
280300 else :
281301 local_db .insert (
282- {
283- "name" : name ,
284- "version" : version ,
285- "filename" : file_name ,
286- }
302+ {"name" : name , "version" : version , "filename" : file_name }
287303 )
288304 else :
289305 if local_db .search (
0 commit comments