Merge pull request YoongiKim#37 from hstable/master
Better skip-check strategy and allow use of a proxy list
YoongiKim authored Aug 8, 2021
2 parents dc0982b + 37e1c2c commit 3209f79
Showing 3 changed files with 49 additions and 23 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -40,6 +40,8 @@ python3 main.py [--skip true] [--threads 4] [--google true] [--naver true] [--fu
 (can be used for docker linux system)
 --limit 0        Maximum count of images to download per site. (0: infinite)
+--proxy-list ''  The comma separated proxy list like: "socks://127.0.0.1:1080,http://127.0.0.1:1081".
+                 Every thread will randomly choose one from the list.
 ```


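A note on the new option: `main.py` splits the flag value on commas and each worker thread draws one entry at random for its browser session. A minimal sketch of that behavior using only the standard library (variable names here are illustrative, not from the repository):

```python
import random

# The flag value as typed on the command line.
raw = "socks://127.0.0.1:1080,http://127.0.0.1:1081"

# main.py does args.proxy_list.split(','); the default '' yields [''],
# and the empty entry is later treated as "no proxy".
proxies = raw.split(',')

# Each thread independently picks one proxy before launching Chrome.
proxy = random.choice(proxies)
print(proxy)  # e.g. socks://127.0.0.1:1080
```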
20 changes: 13 additions & 7 deletions collect_links.py
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-
 import time
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
@@ -29,7 +28,7 @@
 
 
 class CollectLinks:
-    def __init__(self, no_gui=False):
+    def __init__(self, no_gui=False, proxy=None):
         executable = ''
 
         if platform.system() == 'Windows':
@@ -52,6 +51,8 @@ def __init__(self, no_gui=False):
         chrome_options.add_argument('--disable-dev-shm-usage')
         if no_gui:
             chrome_options.add_argument('--headless')
+        if proxy:
+            chrome_options.add_argument("--proxy-server={}".format(proxy))
         self.browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
 
         browser_version = 'Failed to detect version'
@@ -73,7 +74,8 @@ def __init__(self, no_gui=False):
         print('Current chrome-driver version:\t{}'.format(chromedriver_version))
         if major_version_different:
             print('warning: Version different')
-            print('Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
+            print(
+                'Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
         print('_________________________________')
 
     def get_scroll(self):
@@ -97,7 +99,8 @@ def wait_and_click(self, xpath):
         return elem
 
     def highlight(self, element):
-        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")
+        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element,
+                                    "background: yellow; border: 2px solid red;")
 
     @staticmethod
     def remove_duplicates(_list):
@@ -159,7 +162,8 @@ def google(self, keyword, add_url=""):
         return links
 
     def naver(self, keyword, add_url=""):
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
         time.sleep(1)
 
@@ -171,7 +175,8 @@ def naver(self, keyword, add_url=""):
             elem.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.2)
 
-        imgs = self.browser.find_elements(By.XPATH, '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
+        imgs = self.browser.find_elements(By.XPATH,
+                                          '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
 
         print('Scraping links')
 
@@ -263,7 +268,8 @@ def google_full(self, keyword, add_url=""):
     def naver_full(self, keyword, add_url=""):
         print('[Full Resolution Mode]')
 
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
         time.sleep(1)
 
         elem = self.browser.find_element_by_tag_name("body")
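The only behavioral change in `collect_links.py` is the optional `proxy` argument; the remaining hunks are line wrapping. A self-contained sketch of the same Chrome wiring, assuming the `selenium` and `webdriver_manager` packages imported above (the `chrome_options=` keyword matches the older Selenium API this repository pins, not Selenium 4):

```python
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


def make_browser(no_gui=False, proxy=None):
    # Mirrors CollectLinks.__init__: all flags are applied before launch.
    chrome_options = webdriver.ChromeOptions()
    if no_gui:
        chrome_options.add_argument('--headless')
    if proxy:
        # Chrome routes all traffic through the given proxy,
        # e.g. "socks://127.0.0.1:1080" or "http://127.0.0.1:1081".
        chrome_options.add_argument('--proxy-server={}'.format(proxy))
    return webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
```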
50 changes: 34 additions & 16 deletions main.py
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-
 import os
 import requests
 import shutil
@@ -23,6 +22,8 @@
 from collect_links import CollectLinks
 import imghdr
 import base64
+from pathlib import Path
+import random
 
 
 class Sites:
@@ -52,7 +53,7 @@ def get_face_url(code):
 
 class AutoCrawler:
     def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download',
-                 full_resolution=False, face=False, no_gui=False, limit=0):
+                 full_resolution=False, face=False, no_gui=False, limit=0, proxy_list=None):
         """
         :param skip_already_exist: Skips keyword already downloaded before. This is needed when re-downloading.
         :param n_threads: Number of threads to download.
@@ -63,6 +64,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
         :param face: Face search mode
         :param no_gui: No GUI mode. Acceleration for full_resolution mode.
         :param limit: Maximum count of images to download. (0: infinite)
+        :param proxy_list: The proxy list. Every thread will randomly choose one from the list.
         """
 
         self.skip = skip_already_exist
@@ -74,6 +76,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
         self.face = face
         self.no_gui = no_gui
         self.limit = limit
+        self.proxy_list = proxy_list if proxy_list and len(proxy_list) > 0 else None
 
         os.makedirs('./{}'.format(self.download_path), exist_ok=True)
 
@@ -188,7 +191,8 @@ def download_images(self, keyword, links, site_name, max_count=0):
                 ext = self.get_extension_from_link(link)
                 is_base64 = False
 
-                no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword, site_name, str(index).zfill(4))
+                no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword, site_name,
+                                                   str(index).zfill(4))
                 path = no_ext_path + '.' + ext
                 self.save_object_to_file(response, path, is_base64=is_base64)
 
@@ -215,7 +219,10 @@ def download_from_site(self, keyword, site_code):
         add_url = Sites.get_face_url(site_code) if self.face else ""
 
         try:
-            collect = CollectLinks(no_gui=self.no_gui)  # initialize chrome driver
+            proxy = None
+            if self.proxy_list:
+                proxy = random.choice(self.proxy_list)
+            collect = CollectLinks(no_gui=self.no_gui, proxy=proxy)  # initialize chrome driver
         except Exception as e:
             print('Error occurred while initializing chromedriver - {}'.format(e))
             return
@@ -241,6 +248,7 @@ def download_from_site(self, keyword, site_code):
 
         print('Downloading images from collected links... {} from {}'.format(keyword, site_name))
         self.download_images(keyword, links, site_name, max_count=self.limit)
+        Path('{}/{}/{}_done'.format(self.download_path, keyword.replace('"', ''), site_name)).touch()
 
         print('Done {} : {}'.format(site_name, keyword))
 
@@ -257,17 +265,19 @@ def do_crawling(self):
 
         for keyword in keywords:
             dir_name = '{}/{}'.format(self.download_path, keyword)
-            if os.path.exists(os.path.join(os.getcwd(), dir_name)) and self.skip:
-                print('Skipping already existing directory {}'.format(dir_name))
+            google_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'google_done'))
+            naver_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'naver_done'))
+            if google_done and naver_done and self.skip:
+                print('Skipping done task {}'.format(dir_name))
                 continue
 
-            if self.do_google:
+            if self.do_google and not google_done:
                 if self.full_resolution:
                     tasks.append([keyword, Sites.GOOGLE_FULL])
                 else:
                     tasks.append([keyword, Sites.GOOGLE])
 
-            if self.do_naver:
+            if self.do_naver and not naver_done:
                 if self.full_resolution:
                     tasks.append([keyword, Sites.NAVER_FULL])
                 else:
@@ -334,12 +344,18 @@ def imbalance_check(self):
     parser.add_argument('--threads', type=int, default=4, help='Number of threads to download.')
     parser.add_argument('--google', type=str, default='true', help='Download from google.com (boolean)')
    parser.add_argument('--naver', type=str, default='true', help='Download from naver.com (boolean)')
-    parser.add_argument('--full', type=str, default='false', help='Download full resolution image instead of thumbnails (slow)')
+    parser.add_argument('--full', type=str, default='false',
+                        help='Download full resolution image instead of thumbnails (slow)')
     parser.add_argument('--face', type=str, default='false', help='Face search mode')
-    parser.add_argument('--no_gui', type=str, default='auto', help='No GUI mode. Acceleration for full_resolution mode. '
-                                                                   'But unstable on thumbnail mode. '
-                                                                   'Default: "auto" - false if full=false, true if full=true')
-    parser.add_argument('--limit', type=int, default=0, help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--no_gui', type=str, default='auto',
+                        help='No GUI mode. Acceleration for full_resolution mode. '
+                             'But unstable on thumbnail mode. '
+                             'Default: "auto" - false if full=false, true if full=true')
+    parser.add_argument('--limit', type=int, default=0,
+                        help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--proxy-list', type=str, default='',
+                        help='The comma separated proxy list like: "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
+                             'Every thread will randomly choose one from the list.')
     args = parser.parse_args()
 
     _skip = False if str(args.skip).lower() == 'false' else True
@@ -349,6 +365,7 @@ def imbalance_check(self):
     _full = False if str(args.full).lower() == 'false' else True
     _face = False if str(args.face).lower() == 'false' else True
     _limit = int(args.limit)
+    _proxy_list = args.proxy_list.split(',')
 
     no_gui_input = str(args.no_gui).lower()
     if no_gui_input == 'auto':
@@ -358,10 +375,11 @@ def imbalance_check(self):
     else:
         _no_gui = False
 
-    print('Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}'
-          .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit))
+    print(
+        'Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, _proxy_list:{}'
+        .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit, _proxy_list))
 
     crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads,
                           do_google=_google, do_naver=_naver, full_resolution=_full,
-                          face=_face, no_gui=_no_gui, limit=_limit)
+                          face=_face, no_gui=_no_gui, limit=_limit, proxy_list=_proxy_list)
     crawler.do_crawling()
