Skip to content

Commit

Permalink
filter corrupted images
Browse files: browse the repository at this point in the history
  • Loading branch information
Yoongi Kim committed Feb 26, 2019
1 parent d9bdef6 commit 5573539
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 11 deletions.
18 changes: 10 additions & 8 deletions collect_links.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -168,7 +168,7 @@ def google_full(self, keyword, add_url=""):
time.sleep(1)

links = []
# count = 1
count = 1

last_scroll = 0
scroll_patience = 0
Expand All @@ -182,11 +182,12 @@ def google_full(self, keyword, add_url=""):

if src not in links and src is not None:
links.append(src)
# print('%d: %s'%(count, src))
# count += 1
print('%d: %s'%(count, src))
count += 1

except StaleElementReferenceException:
print('[Expected Exception - StaleElementReferenceException]')
# print('[Expected Exception - StaleElementReferenceException]')
pass
except Exception as e:
print('[Exception occurred while collecting links from google_full] {}'.format(e))

Expand Down Expand Up @@ -223,7 +224,7 @@ def naver_full(self, keyword, add_url=""):
time.sleep(1)

links = []
# count = 1
count = 1

last_scroll = 0
scroll_patience = 0
Expand All @@ -238,11 +239,12 @@ def naver_full(self, keyword, add_url=""):

if src not in links and src is not None:
links.append(src)
# print('%d: %s' % (count, src))
# count += 1
print('%d: %s' % (count, src))
count += 1

except StaleElementReferenceException:
print('[Expected Exception - StaleElementReferenceException]')
# print('[Expected Exception - StaleElementReferenceException]')
pass
except Exception as e:
print('[Exception occurred while collecting links from naver_full] {}'.format(e))

Expand Down
29 changes: 26 additions & 3 deletions main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,6 +21,7 @@
from multiprocessing import Pool
import argparse
from collect_links import CollectLinks
import imghdr


class Sites:
Expand Down Expand Up @@ -105,6 +106,13 @@ def get_extension_from_link(link, default='jpg'):
else:
return default

@staticmethod
def validate_image(path):
    """Return the image type detected for the file at ``path``.

    The type is whatever ``imghdr.what`` reports for the file, except that
    'jpeg' is reported as 'jpg'. Returns None when ``imghdr.what`` does not
    recognise the file as an image.
    """
    detected = imghdr.what(path)
    # Report 'jpeg' under the shorter 'jpg' spelling; pass everything else
    # (including None) through unchanged.
    return 'jpg' if detected == 'jpeg' else detected

@staticmethod
def make_dir(dirname):
current_path = os.getcwd()
Expand All @@ -130,10 +138,11 @@ def get_keywords(keywords_file='keywords.txt'):

return keywords

def save_image_to_file(self, image, file_path):
@staticmethod
def save_object_to_file(object, file_path):
try:
with open('{}'.format(file_path), 'wb') as file:
shutil.copyfileobj(image.raw, file)
shutil.copyfileobj(object.raw, file)
except Exception as e:
print('Save failed - {}'.format(e))

Expand All @@ -146,9 +155,23 @@ def download_images(self, keyword, links, site_name):
print('Downloading {} from {}: {} / {}'.format(keyword, site_name, index + 1, total))
response = requests.get(link, stream=True)
ext = self.get_extension_from_link(link)
self.save_image_to_file(response, '{}/{}/{}_{}.{}'.format(self.download_path, keyword, site_name, str(index).zfill(4), ext))

no_ext_path = '{}/{}/{}_{}'.format(self.download_path, keyword, site_name, str(index).zfill(4))
path = no_ext_path + '.' + ext
self.save_object_to_file(response, path)

del response

ext2 = self.validate_image(path)
if ext2 is None:
print('Unreadable file - {}'.format(link))
os.remove(path)
else:
if ext != ext2:
path2 = no_ext_path + '.' + ext2
os.rename(path, path2)
print('Renamed extension {} -> {}'.format(ext, ext2))

except Exception as e:
print('Download failed - ', e)
continue
Expand Down

0 comments on commit 5573539

Please sign in to comment.