Skip to content

Commit 8f171ca

Browse files
authored
Merge pull request #132 from bact/dev
Clean code (/corpus, /tools, /ulmfit)
2 parents 64519a5 + 7d71561 commit 8f171ca

File tree

4 files changed

+239
-157
lines changed

4 files changed

+239
-157
lines changed

pythainlp/corpus/__init__.py

Lines changed: 93 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,40 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
3-
from pythainlp.tools import get_path_db,get_path_data
4-
from tinydb import TinyDB,Query
5-
from future.moves.urllib.request import urlopen
6-
from tqdm import tqdm
7-
import requests
2+
3+
from __future__ import absolute_import, unicode_literals
4+
85
import os
6+
97
import requests
10-
#__all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
11-
path_db_=get_path_db()
8+
from future.moves.urllib.request import urlopen
9+
from pythainlp.tools import get_path_data, get_path_db
10+
from tinydb import Query, TinyDB
11+
from tqdm import tqdm
12+
13+
CORPUS_DB_URL = (
14+
"https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
15+
)
16+
17+
# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
18+
path_db_ = get_path_db()
19+
20+
1221
def get_file(name):
13-
db=TinyDB(path_db_)
22+
db = TinyDB(path_db_)
1423
temp = Query()
15-
if len(db.search(temp.name==name))>0:
16-
path= get_path_data(db.search(temp.name==name)[0]['file'])
24+
if len(db.search(temp.name == name)) > 0:
25+
path = get_path_data(db.search(temp.name == name)[0]["file"])
1726
db.close()
1827
if not os.path.exists(path):
1928
download(name)
2029
return path
30+
31+
2132
def download_(url, dst):
2233
"""
2334
@param: url to download file
2435
@param: dst place to put the file
2536
"""
26-
file_size = int(urlopen(url).info().get('Content-Length', -1))
37+
file_size = int(urlopen(url).info().get("Content-Length", -1))
2738
if os.path.exists(dst):
2839
first_byte = os.path.getsize(dst)
2940
else:
@@ -32,55 +43,90 @@ def download_(url, dst):
3243
return file_size
3344
header = {"Range": "bytes=%s-%s" % (first_byte, file_size)}
3445
pbar = tqdm(
35-
total=file_size, initial=first_byte,
36-
unit='B', unit_scale=True, desc=url.split('/')[-1])
46+
total=file_size,
47+
initial=first_byte,
48+
unit="B",
49+
unit_scale=True,
50+
desc=url.split("/")[-1],
51+
)
3752
req = requests.get(url, headers=header, stream=True)
38-
with(open(get_path_data(dst), 'wb')) as f:
53+
with (open(get_path_data(dst), "wb")) as f:
3954
for chunk in req.iter_content(chunk_size=1024):
4055
if chunk:
4156
f.write(chunk)
4257
pbar.update(1024)
4358
pbar.close()
44-
#return file_size
45-
def download(name,force=False):
46-
db=TinyDB(path_db_)
59+
# return file_size
60+
61+
62+
def download(name, force=False):
63+
db = TinyDB(path_db_)
4764
temp = Query()
48-
data=requests.get("https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json")
49-
data_json=data.json()
65+
data = requests.get(CORPUS_DB_URL)
66+
data_json = data.json()
5067
if name in list(data_json.keys()):
51-
temp_name=data_json[name]
52-
print("Download : "+name)
53-
if len(db.search(temp.name==name))==0:
54-
print(name+" "+temp_name['version'])
55-
download_(temp_name['download'],temp_name['file_name'])
56-
db.insert({'name': name, 'version': temp_name['version'],'file':temp_name['file_name']})
68+
temp_name = data_json[name]
69+
print("Download : " + name)
70+
71+
if not db.search(temp.name == name):
72+
print(name + " " + temp_name["version"])
73+
download_(temp_name["download"], temp_name["file_name"])
74+
db.insert(
75+
{
76+
"name": name,
77+
"version": temp_name["version"],
78+
"file": temp_name["file_name"],
79+
}
80+
)
5781
else:
58-
if len(db.search(temp.name==name and temp.version==temp_name['version']))==0:
82+
if not db.search(
83+
temp.name == name and temp.version == temp_name["version"]
84+
):
5985
print("have update")
60-
print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version'])
61-
yes_no="y"
62-
if force==False:
63-
yes_no=str(input("y or n : ")).lower()
64-
if "y"==yes_no:
65-
download_(temp_name['download'],temp_name['file_name'])
66-
db.update({'version':temp_name['version']},temp.name==name)
86+
print(
87+
"from "
88+
+ name
89+
+ " "
90+
+ db.search(temp.name == name)[0]["version"]
91+
+ " update to "
92+
+ name
93+
+ " "
94+
+ temp_name["version"]
95+
)
96+
yes_no = "y"
97+
if not force:
98+
yes_no = str(input("y or n : ")).lower()
99+
if "y" == yes_no:
100+
download_(temp_name["download"], temp_name["file_name"])
101+
db.update({"version": temp_name["version"]}, temp.name == name)
67102
else:
68103
print("re-download")
69-
print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version'])
70-
yes_no="y"
71-
if force==False:
72-
yes_no=str(input("y or n : ")).lower()
73-
if "y"==yes_no:
74-
download_(temp_name['download'],temp_name['file_name'])
75-
db.update({'version':temp_name['version']},temp.name==name)
104+
print(
105+
"from "
106+
+ name
107+
+ " "
108+
+ db.search(temp.name == name)[0]["version"]
109+
+ " update to "
110+
+ name
111+
+ " "
112+
+ temp_name["version"]
113+
)
114+
yes_no = "y"
115+
if not force:
116+
yes_no = str(input("y or n : ")).lower()
117+
if "y" == yes_no:
118+
download_(temp_name["download"], temp_name["file_name"])
119+
db.update({"version": temp_name["version"]}, temp.name == name)
76120
db.close()
121+
122+
77123
def remove(name):
78-
db=TinyDB(path_db_)
124+
db = TinyDB(path_db_)
79125
temp = Query()
80-
data=db.search(temp.name==name)
81-
if len(data)>0:
82-
path=get_file(name)
126+
data = db.search(temp.name == name)
127+
if len(data) > 0:
128+
path = get_file(name)
83129
os.remove(path)
84-
db.remove(temp.name==name)
130+
db.remove(temp.name == name)
85131
return True
86-
return False
132+
return False

pythainlp/tools/__init__.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,32 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
2+
3+
from __future__ import absolute_import, unicode_literals
4+
35
import os
4-
import dill
5-
from pythainlp.tokenize import tcc
6-
import marisa_trie
76
import subprocess
87
import sys
98

9+
1010
def install_package(package):
1111
subprocess.call([sys.executable, "-m", "pip", "install", package])
12+
13+
1214
def get_path_db():
13-
path = os.path.join(get_path_pythainlp_data(), "db.json")
14-
if not os.path.exists(path):
15-
from tinydb import TinyDB
16-
db=TinyDB(path)
17-
#db.insert({'name': 'hi', 'version': '0.1','file':''})
18-
return path
15+
path = os.path.join(get_path_pythainlp_data(), "db.json")
16+
if not os.path.exists(path):
17+
from tinydb import TinyDB
18+
19+
db = TinyDB(path)
20+
# db.insert({'name': 'hi', 'version': '0.1','file':''})
21+
return path
22+
23+
1924
def get_path_data(filename):
20-
return os.path.join(get_path_pythainlp_data(), filename)
25+
return os.path.join(get_path_pythainlp_data(), filename)
26+
27+
2128
def get_path_pythainlp_data():
22-
path= os.path.join(os.path.expanduser("~"), 'pythainlp-data')
23-
if not os.path.exists(path):
24-
os.makedirs(path)
25-
return path
29+
path = os.path.join(os.path.expanduser("~"), "pythainlp-data")
30+
if not os.path.exists(path):
31+
os.makedirs(path)
32+
return path

pythainlp/ulmfit/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
# -*- coding: utf-8 -*-
2-
from __future__ import absolute_import,unicode_literals
2+
3+
from __future__ import absolute_import, unicode_literals

0 commit comments

Comments
 (0)