diff --git a/collect_links.py b/collect_links.py index 8929892..bdd7c6f 100644 --- a/collect_links.py +++ b/collect_links.py @@ -168,7 +168,7 @@ def google_full(self, keyword, add_url=""): time.sleep(1) links = [] - # count = 1 + count = 1 last_scroll = 0 scroll_patience = 0 @@ -182,11 +182,12 @@ def google_full(self, keyword, add_url=""): if src not in links and src is not None: links.append(src) - # print('%d: %s'%(count, src)) - # count += 1 + print('%d: %s'%(count, src)) + count += 1 except StaleElementReferenceException: - print('[Expected Exception - StaleElementReferenceException]') + # print('[Expected Exception - StaleElementReferenceException]') + pass except Exception as e: print('[Exception occurred while collecting links from google_full] {}'.format(e)) @@ -223,7 +224,7 @@ def naver_full(self, keyword, add_url=""): time.sleep(1) links = [] - # count = 1 + count = 1 last_scroll = 0 scroll_patience = 0 @@ -238,11 +239,12 @@ def naver_full(self, keyword, add_url=""): if src not in links and src is not None: links.append(src) - # print('%d: %s' % (count, src)) - # count += 1 + print('%d: %s' % (count, src)) + count += 1 except StaleElementReferenceException: - print('[Expected Exception - StaleElementReferenceException]') + # print('[Expected Exception - StaleElementReferenceException]') + pass except Exception as e: print('[Exception occurred while collecting links from naver_full] {}'.format(e)) diff --git a/main.py b/main.py index 969becc..6eead12 100644 --- a/main.py +++ b/main.py @@ -21,6 +21,7 @@ from multiprocessing import Pool import argparse from collect_links import CollectLinks +import imghdr class Sites: @@ -105,6 +106,13 @@ def get_extension_from_link(link, default='jpg'): else: return default + @staticmethod + def validate_image(path): + ext = imghdr.what(path) + if ext == 'jpeg': + ext = 'jpg' + return ext # returns None if not valid + @staticmethod def make_dir(dirname): current_path = os.getcwd() @@ -130,10 +138,11 @@ def get_keywords(keywords_file='keywords.txt'): return keywords - def save_image_to_file(self, image, file_path): + @staticmethod + def save_object_to_file(object, file_path): try: with open('{}'.format(file_path), 'wb') as file: - shutil.copyfileobj(image.raw, file) + shutil.copyfileobj(object.raw, file) except Exception as e: print('Save failed - {}'.format(e)) @@ -146,9 +155,23 @@ def download_images(self, keyword, links, site_name): print('Downloading {} from {}: {} / {}'.format(keyword, site_name, index + 1, total)) response = requests.get(link, stream=True) ext = self.get_extension_from_link(link) - self.save_image_to_file(response, '{}/{}/{}_{}.{}'.format(self.download_path, keyword, site_name, str(index).zfill(4), ext)) + + no_ext_path = '{}/{}/{}_{}'.format(self.download_path, keyword, site_name, str(index).zfill(4)) + path = no_ext_path + '.' + ext + self.save_object_to_file(response, path) + del response + ext2 = self.validate_image(path) + if ext2 is None: + print('Unreadable file - {}'.format(link)) + os.remove(path) + else: + if ext != ext2: + path2 = no_ext_path + '.' + ext2 + os.rename(path, path2) + print('Renamed extension {} -> {}'.format(ext, ext2)) + except Exception as e: print('Download failed - ', e) continue