1
1
# -*- coding: utf-8 -*-
2
2
import hashlib
3
3
import os
4
- import queue
5
- import threading
6
4
from typing import NoReturn , Union
7
5
from urllib .request import urlopen
8
6
26
24
# File name of the local corpus catalog and its location as resolved by
# get_full_data_path().
_CORPUS_DB_FILENAME = "db.json"
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Create a local corpus database if it does not already exist.
# The TinyDB instance is only needed for that side effect, so close it
# immediately instead of leaving an open handle behind at import time.
if not os.path.exists(_CORPUS_DB_PATH):
    TinyDB(_CORPUS_DB_PATH).close()
31
30
@@ -45,14 +44,12 @@ def corpus_db_path() -> str:
45
44
def get_corpus_db_detail(name: str) -> dict:
    """
    Look up the local catalog record of a corpus.

    :param str name: corpus name
    :return: the first record in the local corpus database whose ``name``
             field equals `name`, or an empty dict if there is no such record
    :rtype: dict
    """
    db = TinyDB(corpus_db_path())
    try:
        res = db.search(Query().name == name)
    finally:
        # Always release the database handle, even if the search fails.
        db.close()

    # An unknown corpus yields an empty dict rather than an IndexError.
    return res[0] if res else {}
56
53
57
54
58
55
def get_corpus (filename : str ) -> frozenset :
@@ -71,6 +68,10 @@ def get_corpus(filename: str) -> frozenset:
71
68
72
69
from pythainlp.corpus import get_corpus
73
70
71
+ get_corpus('negations_th.txt')
72
+ # output:
73
+ # frozenset({'แต่', 'ไม่'})
74
+
74
75
get_corpus('ttc_freq.txt')
75
76
# output:
76
77
# frozenset({'โดยนัยนี้\\ t1',
@@ -81,12 +82,11 @@ def get_corpus(filename: str) -> frozenset:
81
82
# 'เหนี่ยง\\ t3',
82
83
# 'ชงฆ์\\ t3',
83
84
# ...})
84
-
85
- get_corpus('negations_th.txt')
86
- # output:
87
- # frozenset({'แต่', 'ไม่'})
88
85
"""
89
- lines = read_text_corpus (os .path .join (corpus_path (), filename ))
86
+ path = os .path .join (corpus_path (), filename )
87
+ lines = []
88
+ with open (path , "r" , encoding = "utf-8-sig" ) as fh :
89
+ lines = fh .read ().splitlines ()
90
90
91
91
return frozenset (lines )
92
92
@@ -140,25 +140,6 @@ def get_corpus_path(name: str) -> Union[str, None]:
140
140
return path
141
141
142
142
143
- def _get_input (message , channel ):
144
- response = input (message )
145
- channel .put (response )
146
-
147
-
148
- def _input_with_timeout (message , timeout , default_response ):
149
- channel = queue .Queue ()
150
- thread = threading .Thread (target = _get_input , args = (message , channel ))
151
- thread .daemon = True
152
- thread .start ()
153
-
154
- try :
155
- response = channel .get (True , timeout )
156
- return response
157
- except queue .Empty :
158
- pass
159
- return default_response
160
-
161
-
162
143
def _download (url : str , dst : str ) -> int :
163
144
"""
164
145
@param: url to download file
@@ -199,7 +180,7 @@ def download(name: str, force: bool = False) -> NoReturn:
199
180
https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
200
181
201
182
:param string name: corpus name
202
- :param bool force: force install
183
+ :param bool force: force download
203
184
204
185
:Example:
205
186
::
@@ -234,41 +215,40 @@ def download(name: str, force: bool = False) -> NoReturn:
234
215
if name in corpus_data:
    corpus = corpus_data[name]
    print("Corpus:", name)
    # Local catalog records for this corpus (empty list if never downloaded)
    found = local_db.search(query.name == name)

    if force or not found:
        # Not available locally, or a re-download was explicitly requested
        print(f"- Downloading: {name} {corpus['version']}")
        _download(corpus["download"], corpus["file_name"])
        _check_hash(corpus["file_name"], corpus["md5"])

        if found:
            # Existing record: only the version changes
            local_db.update(
                {"version": corpus["version"]}, query.name == name
            )
        else:
            local_db.insert(
                {
                    "name": name,
                    "version": corpus["version"],
                    "file": corpus["file_name"],
                }
            )
    else:
        # Compare against the record that was already fetched. Combining two
        # TinyDB queries with Python's `and` would silently keep only the
        # second one and match any corpus that has the same version string.
        current_ver = found[0]["version"]
        if current_ver == corpus["version"]:
            # Already has the same version
            print("- Already up to date.")
        else:
            # Has the corpus but different version
            print(f"- Existing version: {current_ver}")
            print(f"- New version available: {corpus['version']}")
            print("- Use download(data_name, force=True) to update")
else:
    print("Corpus not found:", name)

local_db.close()
274
254
0 commit comments