From a9d1b64f2963904c91116ec90f2b680ddcf13378 Mon Sep 17 00:00:00 2001
From: crazcell
Date: Sun, 8 Aug 2021 14:08:42 +0800
Subject: [PATCH] feat: allow use of a proxy list

---
 collect_links.py | 20 +++++++++++++-------
 main.py          | 21 ++++++++++++++++-----
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/collect_links.py b/collect_links.py
index c5d1d9a..11981c0 100644
--- a/collect_links.py
+++ b/collect_links.py
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-
 import time
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
@@ -29,7 +28,7 @@
 
 
 class CollectLinks:
-    def __init__(self, no_gui=False):
+    def __init__(self, no_gui=False, proxy=None):
         executable = ''
 
         if platform.system() == 'Windows':
@@ -52,6 +51,8 @@ def __init__(self, no_gui=False):
         chrome_options.add_argument('--disable-dev-shm-usage')
         if no_gui:
             chrome_options.add_argument('--headless')
+        if proxy:
+            chrome_options.add_argument("--proxy-server={}".format(proxy))
         self.browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
 
         browser_version = 'Failed to detect version'
@@ -73,7 +74,8 @@ def __init__(self, no_gui=False):
         print('Current chrome-driver version:\t{}'.format(chromedriver_version))
         if major_version_different:
             print('warning: Version different')
-            print('Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
+            print(
+                'Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
         print('_________________________________')
 
     def get_scroll(self):
@@ -97,7 +99,8 @@ def wait_and_click(self, xpath):
         return elem
 
     def highlight(self, element):
-        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")
+        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element,
+                                    "background: yellow; border: 2px solid red;")
 
     @staticmethod
     def remove_duplicates(_list):
@@ -159,7 +162,8 @@ def google(self, keyword, add_url=""):
         return links
 
     def naver(self, keyword, add_url=""):
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
         time.sleep(1)
 
@@ -171,7 +175,8 @@ def naver(self, keyword, add_url=""):
             elem.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.2)
 
-        imgs = self.browser.find_elements(By.XPATH, '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
+        imgs = self.browser.find_elements(By.XPATH,
+                                          '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
 
         print('Scraping links')
 
@@ -263,7 +268,8 @@ def google_full(self, keyword, add_url=""):
 
     def naver_full(self, keyword, add_url=""):
         print('[Full Resolution Mode]')
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
         time.sleep(1)
 
         elem = self.browser.find_element_by_tag_name("body")
diff --git a/main.py b/main.py
index 2b18bb4..9ce1a0b 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@
 import imghdr
 import base64
 from pathlib import Path
+import random
 
 
 class Sites:
@@ -52,7 +53,7 @@ def get_face_url(code):
 
 class AutoCrawler:
     def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download',
-                 full_resolution=False, face=False, no_gui=False, limit=0):
+                 full_resolution=False, face=False, no_gui=False, limit=0, proxy_list=None):
         """
         :param skip_already_exist: Skips keyword already downloaded before. This is needed when re-downloading.
         :param n_threads: Number of threads to download.
@@ -63,6 +64,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
         :param face: Face search mode
         :param no_gui: No GUI mode. Acceleration for full_resolution mode.
         :param limit: Maximum count of images to download. (0: infinite)
+        :param proxy_list: Proxy list. Every thread will randomly choose one proxy from the list.
         """
 
         self.skip = skip_already_exist
@@ -74,6 +76,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
         self.face = face
         self.no_gui = no_gui
         self.limit = limit
+        self.proxy_list = proxy_list if proxy_list and len(proxy_list) > 0 else None
 
         os.makedirs('./{}'.format(self.download_path), exist_ok=True)
 
@@ -216,7 +219,10 @@ def download_from_site(self, keyword, site_code):
         add_url = Sites.get_face_url(site_code) if self.face else ""
 
         try:
-            collect = CollectLinks(no_gui=self.no_gui)  # initialize chrome driver
+            proxy = None
+            if self.proxy_list:
+                proxy = random.choice(self.proxy_list)
+            collect = CollectLinks(no_gui=self.no_gui, proxy=proxy)  # initialize chrome driver
         except Exception as e:
             print('Error occurred while initializing chromedriver - {}'.format(e))
             return
@@ -347,6 +353,9 @@ def imbalance_check(self):
                              'Default: "auto" - false if full=false, true if full=true')
     parser.add_argument('--limit', type=int, default=0,
                         help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--proxy-list', type=str, default='',
+                        help='Comma-separated proxy list, e.g. "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
+                             'Every thread will randomly choose one from the list.')
     args = parser.parse_args()
 
     _skip = False if str(args.skip).lower() == 'false' else True
@@ -356,6 +365,7 @@ def imbalance_check(self):
     _full = False if str(args.full).lower() == 'false' else True
     _face = False if str(args.face).lower() == 'false' else True
     _limit = int(args.limit)
+    _proxy_list = args.proxy_list.split(',') if args.proxy_list else None
 
     no_gui_input = str(args.no_gui).lower()
     if no_gui_input == 'auto':
@@ -365,10 +375,11 @@ def imbalance_check(self):
     else:
         _no_gui = False
 
-    print('Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}'
-          .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit))
+    print(
+        'Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, proxy_list:{}'
+        .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit, _proxy_list))
 
     crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads, do_google=_google, do_naver=_naver, full_resolution=_full,
-                          face=_face, no_gui=_no_gui, limit=_limit)
+                          face=_face, no_gui=_no_gui, limit=_limit, proxy_list=_proxy_list)
     crawler.do_crawling()
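
Usage note: with this patch applied, a comma-separated proxy list can be passed
on the command line. The example proxies below come from the --help text and
are placeholders, not real endpoints:

    python main.py --proxy-list "socks://127.0.0.1:1080,http://127.0.0.1:1081"

A minimal standalone sketch of the per-thread selection the patch introduces.
The pick_proxy helper is illustrative only; in the patch itself the
random.choice() call sits inline in AutoCrawler.download_from_site, so each
worker thread gets an independently chosen proxy for its Chrome instance:

    import random

    # Parsed the same way main.py parses --proxy-list.
    proxy_list = "socks://127.0.0.1:1080,http://127.0.0.1:1081".split(',')

    def pick_proxy(proxies):
        # One random draw per download thread, made just before the thread
        # creates its CollectLinks (and thus its Chrome driver) instance.
        return random.choice(proxies) if proxies else None

    for worker_id in range(4):  # default n_threads=4
        print('worker {} -> {}'.format(worker_id, pick_proxy(proxy_list)))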