# main.py (forked from YoongiKim/AutoCrawler)
"""
Copyright 2018 YoongiKim
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import requests
import shutil
from multiprocessing import Pool
import signal
import argparse
from collect_links import CollectLinks
import imghdr
import base64
from pathlib import Path
import random
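

# Sites enumerates the supported search engines. The *_FULL variants use the same
# engine but collect links to full-resolution images instead of thumbnails.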
class Sites:
    GOOGLE = 1
    NAVER = 2
    GOOGLE_FULL = 3
    NAVER_FULL = 4

    @staticmethod
    def get_text(code):
        if code == Sites.GOOGLE:
            return 'google'
        elif code == Sites.NAVER:
            return 'naver'
        elif code == Sites.GOOGLE_FULL:
            return 'google'
        elif code == Sites.NAVER_FULL:
            return 'naver'
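
    # Returns the query-string fragment that restricts results to face images
    # (Google's tbs=itp:face filter, Naver's face=1 parameter).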
    @staticmethod
    def get_face_url(code):
        if code == Sites.GOOGLE or code == Sites.GOOGLE_FULL:
            return "&tbs=itp:face"
        if code == Sites.NAVER or code == Sites.NAVER_FULL:
            return "&face=1"


class AutoCrawler:
    def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download',
                 full_resolution=False, face=False, no_gui=False, limit=0, proxy_list=None):
        """
        :param skip_already_exist: Skips keywords that were already downloaded. Needed when re-downloading.
        :param n_threads: Number of threads to download with.
        :param do_google: Download from google.com (boolean)
        :param do_naver: Download from naver.com (boolean)
        :param download_path: Download folder path
        :param full_resolution: Download full-resolution images instead of thumbnails (slow)
        :param face: Face search mode
        :param no_gui: No GUI mode. Acceleration for full_resolution mode.
        :param limit: Maximum count of images to download. (0: infinite)
        :param proxy_list: The proxy list. Every thread will randomly choose one from the list.
        """
        self.skip = skip_already_exist
        self.n_threads = n_threads
        self.do_google = do_google
        self.do_naver = do_naver
        self.download_path = download_path
        self.full_resolution = full_resolution
        self.face = face
        self.no_gui = no_gui
        self.limit = limit
        self.proxy_list = proxy_list if proxy_list and len(proxy_list) > 0 else None

        os.makedirs('./{}'.format(self.download_path), exist_ok=True)
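
    # Programmatic usage (a minimal sketch; the parameter values below are
    # illustrative only, the keyword file and download folder are the defaults):
    #   crawler = AutoCrawler(n_threads=8, full_resolution=True, limit=100)
    #   crawler.do_crawling()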

    @staticmethod
    def all_dirs(path):
        paths = []
        for dir in os.listdir(path):
            if os.path.isdir(path + '/' + dir):
                paths.append(path + '/' + dir)

        return paths

    @staticmethod
    def all_files(path):
        paths = []
        for root, dirs, files in os.walk(path):
            for file in files:
                # Join against the walked root (not the top-level path) so files
                # inside subdirectories are counted as well.
                file_path = os.path.join(root, file)
                if os.path.isfile(file_path):
                    paths.append(file_path)

        return paths
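
    # Guesses a file extension from the tail of the URL; anything unrecognized
    # falls back to the default ('jpg') and is corrected later by validate_image().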
    @staticmethod
    def get_extension_from_link(link, default='jpg'):
        splits = str(link).split('.')
        if len(splits) == 0:
            return default
        ext = splits[-1].lower()
        if ext == 'jpg' or ext == 'jpeg':
            return 'jpg'
        elif ext == 'gif':
            return 'gif'
        elif ext == 'png':
            return 'png'
        else:
            return default
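
    # Sniffs the real image type from the file contents. Note that the imghdr
    # module is deprecated as of Python 3.11 and removed in 3.13.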
    @staticmethod
    def validate_image(path):
        ext = imghdr.what(path)
        if ext == 'jpeg':
            ext = 'jpg'
        return ext  # returns None if not valid

    @staticmethod
    def make_dir(dirname):
        current_path = os.getcwd()
        path = os.path.join(current_path, dirname)
        if not os.path.exists(path):
            os.makedirs(path)
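
    # Reads one search keyword per line from the keyword file, drops empty lines,
    # de-duplicates, sorts, and rewrites the file in sorted order.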
    @staticmethod
    def get_keywords(keywords_file='keywords.txt'):
        # read search keywords from file
        with open(keywords_file, 'r', encoding='utf-8-sig') as f:
            text = f.read()
            lines = text.split('\n')
            lines = filter(lambda x: x != '' and x is not None, lines)
            keywords = sorted(set(lines))

        print('{} keywords found: {}'.format(len(keywords), keywords))

        # re-save sorted keywords
        with open(keywords_file, 'w+', encoding='utf-8') as f:
            for keyword in keywords:
                f.write('{}\n'.format(keyword))

        return keywords
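
    # Writes a downloaded image to disk: raw bytes for base64 (data: URI) images,
    # otherwise the streamed body of a requests response.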
    @staticmethod
    def save_object_to_file(object, file_path, is_base64=False):
        try:
            with open('{}'.format(file_path), 'wb') as file:
                if is_base64:
                    file.write(object)
                else:
                    shutil.copyfileobj(object.raw, file)
        except Exception as e:
            print('Save failed - {}'.format(e))
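
    # Decodes the payload of a "data:image/...;base64,..." URI into raw bytes.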
    @staticmethod
    def base64_to_object(src):
        header, encoded = str(src).split(',', 1)
        data = base64.decodebytes(bytes(encoded, encoding='utf-8'))
        return data
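
    # Downloads up to max_count images for one keyword: inline base64 links are
    # decoded directly, other links are fetched over HTTP. Each saved file is then
    # validated with imghdr and renamed if its real extension differs.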
    def download_images(self, keyword, links, site_name, max_count=0):
        self.make_dir('{}/{}'.format(self.download_path, keyword.replace('"', '')))
        total = len(links)
        success_count = 0

        if max_count == 0:
            max_count = total

        for index, link in enumerate(links):
            if success_count >= max_count:
                break

            try:
                print('Downloading {} from {}: {} / {}'.format(keyword, site_name, success_count + 1, max_count))

                if str(link).startswith('data:image/jpeg;base64'):
                    response = self.base64_to_object(link)
                    ext = 'jpg'
                    is_base64 = True
                elif str(link).startswith('data:image/png;base64'):
                    response = self.base64_to_object(link)
                    ext = 'png'
                    is_base64 = True
                else:
                    response = requests.get(link, stream=True, timeout=10)
                    ext = self.get_extension_from_link(link)
                    is_base64 = False

                # Strip quotes from the keyword as well, so the save path matches the
                # directory created by make_dir() above.
                no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword.replace('"', ''),
                                                   site_name, str(index).zfill(4))
                path = no_ext_path + '.' + ext
                self.save_object_to_file(response, path, is_base64=is_base64)

                success_count += 1
                del response

                ext2 = self.validate_image(path)
                if ext2 is None:
                    print('Unreadable file - {}'.format(link))
                    os.remove(path)
                    success_count -= 1
                else:
                    if ext != ext2:
                        path2 = no_ext_path + '.' + ext2
                        os.rename(path, path2)
                        print('Renamed extension {} -> {}'.format(ext, ext2))

            except KeyboardInterrupt:
                break

            except Exception as e:
                print('Download failed - ', e)
                continue
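
    # One crawl task: start a Chrome driver (optionally behind a randomly chosen
    # proxy), collect image links for the keyword, download them, and touch a
    # "<site>_done" marker file so the keyword can be skipped on the next run.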
    def download_from_site(self, keyword, site_code):
        site_name = Sites.get_text(site_code)
        add_url = Sites.get_face_url(site_code) if self.face else ""

        try:
            proxy = None
            if self.proxy_list:
                proxy = random.choice(self.proxy_list)
            collect = CollectLinks(no_gui=self.no_gui, proxy=proxy)  # initialize chrome driver
        except Exception as e:
            print('Error occurred while initializing chromedriver - {}'.format(e))
            return

        try:
            print('Collecting links... {} from {}'.format(keyword, site_name))

            if site_code == Sites.GOOGLE:
                links = collect.google(keyword, add_url)

            elif site_code == Sites.NAVER:
                links = collect.naver(keyword, add_url)

            elif site_code == Sites.GOOGLE_FULL:
                links = collect.google_full(keyword, add_url, self.limit)

            elif site_code == Sites.NAVER_FULL:
                links = collect.naver_full(keyword, add_url)

            else:
                print('Invalid Site Code')
                links = []

            print('Downloading images from collected links... {} from {}'.format(keyword, site_name))
            self.download_images(keyword, links, site_name, max_count=self.limit)
            Path('{}/{}/{}_done'.format(self.download_path, keyword.replace('"', ''), site_name)).touch()
            print('Done {} : {}'.format(site_name, keyword))

        except Exception as e:
            print('Exception {}:{} - {}'.format(site_name, keyword, e))
            return

    def download(self, args):
        self.download_from_site(keyword=args[0], site_code=args[1])
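
    # Worker processes ignore SIGINT so that Ctrl+C is handled once by the parent,
    # which then terminates the pool cleanly.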
    def init_worker(self):
        signal.signal(signal.SIGINT, signal.SIG_IGN)
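
    # Builds (keyword, site) task pairs, skipping keywords whose "*_done" marker
    # files already exist, and runs them on a multiprocessing pool.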
    def do_crawling(self):
        keywords = self.get_keywords()

        tasks = []

        for keyword in keywords:
            dir_name = '{}/{}'.format(self.download_path, keyword)
            google_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'google_done'))
            naver_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'naver_done'))
            if google_done and naver_done and self.skip:
                print('Skipping done task {}'.format(dir_name))
                continue

            if self.do_google and not google_done:
                if self.full_resolution:
                    tasks.append([keyword, Sites.GOOGLE_FULL])
                else:
                    tasks.append([keyword, Sites.GOOGLE])

            if self.do_naver and not naver_done:
                if self.full_resolution:
                    tasks.append([keyword, Sites.NAVER_FULL])
                else:
                    tasks.append([keyword, Sites.NAVER])

        try:
            pool = Pool(self.n_threads, initializer=self.init_worker)
            pool.map(self.download, tasks)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
        else:
            pool.terminate()
            pool.join()

        print('Task ended. Pool join.')

        self.imbalance_check()

        print('End Program')
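
    # Compares per-keyword file counts and flags directories holding fewer than
    # half the average; the user may delete them and re-crawl those keywords.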
    def imbalance_check(self):
        print('Data imbalance checking...')

        dict_num_files = {}

        for dir in self.all_dirs(self.download_path):
            n_files = len(self.all_files(dir))
            dict_num_files[dir] = n_files

        avg = 0
        for dir, n_files in dict_num_files.items():
            avg += n_files / len(dict_num_files)
            print('dir: {}, file_count: {}'.format(dir, n_files))

        dict_too_small = {}

        for dir, n_files in dict_num_files.items():
            if n_files < avg * 0.5:
                dict_too_small[dir] = n_files

        if len(dict_too_small) >= 1:
            print('Data imbalance detected.')
            print('The keywords below have less than 50% of the average file count.')
            print('It is recommended to remove these directories and re-download those keywords.')
            print('_________________________________')
            print('Too small file count directories:')
            for dir, n_files in dict_too_small.items():
                print('dir: {}, file_count: {}'.format(dir, n_files))

            print("Remove directories above? (y/n)")
            answer = input()

            if answer == 'y':
                # remove the directories with too few files
                print("Removing directories with too small file counts...")
                for dir, n_files in dict_too_small.items():
                    shutil.rmtree(dir)
                    print('Removed {}'.format(dir))

                print('Now re-run this program to re-download the removed keywords. (with skip_already_exist=True)')

        else:
            print('Data imbalance not detected.')
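

# Example invocation (values are illustrative only; keywords.txt must already
# list one search keyword per line):
#   python main.py --threads 8 --google true --naver true --full true --limit 100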
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--skip', type=str, default='true',
                        help='Skips keywords already downloaded before. This is needed when re-downloading.')
    parser.add_argument('--threads', type=int, default=4, help='Number of threads to download with.')
    parser.add_argument('--google', type=str, default='true', help='Download from google.com (boolean)')
    parser.add_argument('--naver', type=str, default='true', help='Download from naver.com (boolean)')
    parser.add_argument('--full', type=str, default='false',
                        help='Download full resolution images instead of thumbnails (slow)')
    parser.add_argument('--face', type=str, default='false', help='Face search mode')
    parser.add_argument('--no_gui', type=str, default='auto',
                        help='No GUI mode. Acceleration for full_resolution mode, '
                             'but unstable in thumbnail mode. '
                             'Default: "auto" - false if full=false, true if full=true')
    parser.add_argument('--limit', type=int, default=0,
                        help='Maximum count of images to download per site. (0: infinite)')
    parser.add_argument('--proxy-list', type=str, default='',
                        help='The comma separated proxy list like: "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
                             'Every thread will randomly choose one from the list.')
    args = parser.parse_args()

    _skip = False if str(args.skip).lower() == 'false' else True
    _threads = args.threads
    _google = False if str(args.google).lower() == 'false' else True
    _naver = False if str(args.naver).lower() == 'false' else True
    _full = False if str(args.full).lower() == 'false' else True
    _face = False if str(args.face).lower() == 'false' else True
    _limit = int(args.limit)
    # Drop empty entries so an omitted --proxy-list does not produce [''].
    _proxy_list = [proxy for proxy in args.proxy_list.split(',') if proxy]

    no_gui_input = str(args.no_gui).lower()
    if no_gui_input == 'auto':
        _no_gui = _full
    elif no_gui_input == 'true':
        _no_gui = True
    else:
        _no_gui = False

    print(
        'Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, proxy_list:{}'
        .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit, _proxy_list))

    crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads,
                          do_google=_google, do_naver=_naver, full_resolution=_full,
                          face=_face, no_gui=_no_gui, limit=_limit, proxy_list=_proxy_list)
    crawler.do_crawling()