diff --git a/README.md b/README.md
index 3864723..b701627 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,8 @@ python3 main.py [--skip true] [--threads 4] [--google true] [--naver true] [--fu
                    (can be used for docker linux system)
 --limit 0          Maximum count of images to download per site. (0: infinite)
+--proxy-list ''    Comma-separated proxy list, e.g. "socks://127.0.0.1:1080,http://127.0.0.1:1081".
+                   Every thread will randomly choose one from the list.
 ```
diff --git a/collect_links.py b/collect_links.py
index c5d1d9a..11981c0 100644
--- a/collect_links.py
+++ b/collect_links.py
@@ -14,7 +14,6 @@ limitations under the License.
 """
-
 import time
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
@@ -29,7 +28,7 @@ class CollectLinks:
-    def __init__(self, no_gui=False):
+    def __init__(self, no_gui=False, proxy=None):
         executable = ''
 
         if platform.system() == 'Windows':
@@ -52,6 +51,8 @@ def __init__(self, no_gui=False):
         chrome_options.add_argument('--disable-dev-shm-usage')
         if no_gui:
             chrome_options.add_argument('--headless')
+        if proxy:
+            chrome_options.add_argument("--proxy-server={}".format(proxy))
         self.browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
 
         browser_version = 'Failed to detect version'
@@ -73,7 +74,8 @@ def __init__(self, no_gui=False):
         print('Current chrome-driver version:\t{}'.format(chromedriver_version))
         if major_version_different:
             print('warning: Version different')
-            print('Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
+            print(
+                'Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
         print('_________________________________')
 
     def get_scroll(self):
@@ -97,7 +99,8 @@ def wait_and_click(self, xpath):
         return elem
 
     def highlight(self, element):
-        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")
+        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element,
+                                    "background: yellow; border: 2px solid red;")
 
     @staticmethod
     def remove_duplicates(_list):
@@ -159,7 +162,8 @@ def google(self, keyword, add_url=""):
         return links
 
     def naver(self, keyword, add_url=""):
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
         time.sleep(1)
@@ -171,7 +175,8 @@ def naver(self, keyword, add_url=""):
             elem.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.2)
 
-        imgs = self.browser.find_elements(By.XPATH, '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
+        imgs = self.browser.find_elements(By.XPATH,
+                                          '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
 
         print('Scraping links')
@@ -263,7 +268,8 @@ def google_full(self, keyword, add_url=""):
 
     def naver_full(self, keyword, add_url=""):
         print('[Full Resolution Mode]')
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
         time.sleep(1)
 
         elem = self.browser.find_element_by_tag_name("body")
diff --git a/main.py b/main.py
index a62f918..9ce1a0b 100644
--- a/main.py
+++ b/main.py
@@ -14,7 +14,6 @@ limitations under the License.
""" - import os import requests import shutil @@ -23,6 +22,8 @@ from collect_links import CollectLinks import imghdr import base64 +from pathlib import Path +import random class Sites: @@ -52,7 +53,7 @@ def get_face_url(code): class AutoCrawler: def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download', - full_resolution=False, face=False, no_gui=False, limit=0): + full_resolution=False, face=False, no_gui=False, limit=0, proxy_list=None): """ :param skip_already_exist: Skips keyword already downloaded before. This is needed when re-downloading. :param n_threads: Number of threads to download. @@ -63,6 +64,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave :param face: Face search mode :param no_gui: No GUI mode. Acceleration for full_resolution mode. :param limit: Maximum count of images to download. (0: infinite) + :param proxy_list: The proxy list. Every thread will randomly choose one from the list. """ self.skip = skip_already_exist @@ -74,6 +76,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave self.face = face self.no_gui = no_gui self.limit = limit + self.proxy_list = proxy_list if proxy_list and len(proxy_list) > 0 else None os.makedirs('./{}'.format(self.download_path), exist_ok=True) @@ -188,7 +191,8 @@ def download_images(self, keyword, links, site_name, max_count=0): ext = self.get_extension_from_link(link) is_base64 = False - no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword, site_name, str(index).zfill(4)) + no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword, site_name, + str(index).zfill(4)) path = no_ext_path + '.' + ext self.save_object_to_file(response, path, is_base64=is_base64) @@ -215,7 +219,10 @@ def download_from_site(self, keyword, site_code): add_url = Sites.get_face_url(site_code) if self.face else "" try: - collect = CollectLinks(no_gui=self.no_gui) # initialize chrome driver + proxy = None + if self.proxy_list: + proxy = random.choice(self.proxy_list) + collect = CollectLinks(no_gui=self.no_gui, proxy=proxy) # initialize chrome driver except Exception as e: print('Error occurred while initializing chromedriver - {}'.format(e)) return @@ -241,6 +248,7 @@ def download_from_site(self, keyword, site_code): print('Downloading images from collected links... 
             print('Downloading images from collected links... {} from {}'.format(keyword, site_name))
             self.download_images(keyword, links, site_name, max_count=self.limit)
+            Path('{}/{}/{}_done'.format(self.download_path, keyword.replace('"', ''), site_name)).touch()
 
             print('Done {} : {}'.format(site_name, keyword))
@@ -257,17 +265,19 @@ def do_crawling(self):
 
         for keyword in keywords:
             dir_name = '{}/{}'.format(self.download_path, keyword)
-            if os.path.exists(os.path.join(os.getcwd(), dir_name)) and self.skip:
-                print('Skipping already existing directory {}'.format(dir_name))
+            google_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'google_done'))
+            naver_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'naver_done'))
+            if google_done and naver_done and self.skip:
+                print('Skipping done task {}'.format(dir_name))
                 continue
 
-            if self.do_google:
+            if self.do_google and not google_done:
                 if self.full_resolution:
                     tasks.append([keyword, Sites.GOOGLE_FULL])
                 else:
                     tasks.append([keyword, Sites.GOOGLE])
 
-            if self.do_naver:
+            if self.do_naver and not naver_done:
                 if self.full_resolution:
                     tasks.append([keyword, Sites.NAVER_FULL])
                 else:
                     tasks.append([keyword, Sites.NAVER])
@@ -334,12 +344,18 @@ def imbalance_check(self):
     parser.add_argument('--threads', type=int, default=4, help='Number of threads to download.')
     parser.add_argument('--google', type=str, default='true', help='Download from google.com (boolean)')
     parser.add_argument('--naver', type=str, default='true', help='Download from naver.com (boolean)')
-    parser.add_argument('--full', type=str, default='false', help='Download full resolution image instead of thumbnails (slow)')
+    parser.add_argument('--full', type=str, default='false',
+                        help='Download full resolution image instead of thumbnails (slow)')
     parser.add_argument('--face', type=str, default='false', help='Face search mode')
-    parser.add_argument('--no_gui', type=str, default='auto', help='No GUI mode. Acceleration for full_resolution mode. '
-                                                                   'But unstable on thumbnail mode. '
-                                                                   'Default: "auto" - false if full=false, true if full=true')
-    parser.add_argument('--limit', type=int, default=0, help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--no_gui', type=str, default='auto',
+                        help='No GUI mode. Acceleration for full_resolution mode. '
+                             'But unstable on thumbnail mode. '
+                             'Default: "auto" - false if full=false, true if full=true')
+    parser.add_argument('--limit', type=int, default=0,
+                        help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--proxy-list', type=str, default='',
+                        help='Comma-separated proxy list, e.g. "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
+                             'Every thread will randomly choose one from the list.')
 
     args = parser.parse_args()
     _skip = False if str(args.skip).lower() == 'false' else True
@@ -349,6 +365,7 @@ def imbalance_check(self):
     _full = False if str(args.full).lower() == 'false' else True
     _face = False if str(args.face).lower() == 'false' else True
     _limit = int(args.limit)
+    _proxy_list = [proxy for proxy in args.proxy_list.split(',') if proxy]
 
     no_gui_input = str(args.no_gui).lower()
     if no_gui_input == 'auto':
@@ -358,10 +375,11 @@ def imbalance_check(self):
     else:
         _no_gui = False
 
-    print('Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}'
-          .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit))
+    print(
+        'Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, proxy_list:{}'
+        .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit, _proxy_list))
 
     crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads, do_google=_google, do_naver=_naver, full_resolution=_full,
-                          face=_face, no_gui=_no_gui, limit=_limit)
+                          face=_face, no_gui=_no_gui, limit=_limit, proxy_list=_proxy_list)
     crawler.do_crawling()
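
Usage note: each worker thread draws one proxy at random from the parsed list before it launches chromedriver, so concurrent threads can exit through different proxies:

```
python3 main.py --proxy-list 'socks://127.0.0.1:1080,http://127.0.0.1:1081'
```

A minimal standalone sketch of the selection flow in this patch (names mirror the patch; `raw` and `chrome_args` are illustrative stand-ins, not part of the code):

```
import random

# Mirrors main.py: the empty default parses to an empty list, i.e. no proxy.
raw = 'socks://127.0.0.1:1080,http://127.0.0.1:1081'
proxy_list = [proxy for proxy in raw.split(',') if proxy]

# Mirrors download_from_site(): one random pick per chromedriver instance.
proxy = random.choice(proxy_list) if proxy_list else None

# Mirrors CollectLinks.__init__(): the flag is added only when a proxy is set.
chrome_args = ['--proxy-server={}'.format(proxy)] if proxy else []
print(chrome_args)
```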
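
Resume note: the `<site>_done` marker files make `--skip true` resume per site instead of per keyword directory, so a run interrupted after Google finished will redo only Naver. A sketch of the check and the marker write, assuming the default `download` root and an illustrative keyword `cat`:

```
import os
from pathlib import Path

download_path = 'download'  # default download_path in AutoCrawler
keyword = 'cat'             # illustrative keyword
dir_name = '{}/{}'.format(download_path, keyword)

# Mirrors do_crawling(): a site is queued again only if its marker is missing.
google_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'google_done'))
naver_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'naver_done'))
print('google_done: {}, naver_done: {}'.format(google_done, naver_done))

# Mirrors download_from_site(): touch the marker once a site finishes.
os.makedirs(dir_name, exist_ok=True)
Path('{}/{}_done'.format(dir_name, 'google')).touch()
```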