Better skip-check strategy and allow using a proxy list #37

Merged 4 commits on Aug 8, 2021
README.md: 2 additions, 0 deletions
@@ -40,6 +40,8 @@ python3 main.py [--skip true] [--threads 4] [--google true] [--naver true] [--fu
                        (can be used for Docker Linux systems)
 
 --limit 0              Maximum count of images to download per site. (0: infinite)
+--proxy-list ''        The comma-separated proxy list, e.g. "socks://127.0.0.1:1080,http://127.0.0.1:1081".
+                       Every thread will randomly choose one from the list.
 ```
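
For reference, a typical invocation of the new flag might look like the following; the addresses are placeholders taken from the help text, not live endpoints:

```
python3 main.py --threads 4 --proxy-list "socks://127.0.0.1:1080,http://127.0.0.1:1081"
```

With four threads and two proxies, each worker draws one proxy at random when it starts, so both proxies are typically exercised.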


collect_links.py: 13 additions, 7 deletions
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-
 import time
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
@@ -29,7 +28,7 @@
 
 
 class CollectLinks:
-    def __init__(self, no_gui=False):
+    def __init__(self, no_gui=False, proxy=None):
         executable = ''
 
         if platform.system() == 'Windows':
@@ -52,6 +51,8 @@ def __init__(self, no_gui=False):
         chrome_options.add_argument('--disable-dev-shm-usage')
         if no_gui:
             chrome_options.add_argument('--headless')
+        if proxy:
+            chrome_options.add_argument("--proxy-server={}".format(proxy))
         self.browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
 
         browser_version = 'Failed to detect version'
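
For clarity, here is a minimal, self-contained sketch of what the new `proxy` parameter does: it simply forwards one proxy URL to Chrome via `--proxy-server`. The address is a placeholder, and the `chrome_options=` keyword matches the Selenium 3 style used in this repo (Selenium 4 renamed it to `options=`):

```
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

proxy = 'socks://127.0.0.1:1080'  # placeholder, not an endpoint from this PR

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # equivalent of no_gui=True
if proxy:
    # The line this PR adds: route all browser traffic through the proxy.
    chrome_options.add_argument('--proxy-server={}'.format(proxy))

browser = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)
browser.get('https://www.google.com')
browser.quit()
```
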
@@ -73,7 +74,8 @@ def __init__(self, no_gui=False):
         print('Current chrome-driver version:\t{}'.format(chromedriver_version))
         if major_version_different:
             print('warning: Version different')
-            print('Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
+            print(
+                'Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
         print('_________________________________')
 
     def get_scroll(self):
@@ -97,7 +99,8 @@ def wait_and_click(self, xpath):
         return elem
 
     def highlight(self, element):
-        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")
+        self.browser.execute_script("arguments[0].setAttribute('style', arguments[1]);", element,
+                                    "background: yellow; border: 2px solid red;")
 
     @staticmethod
     def remove_duplicates(_list):
@@ -159,7 +162,8 @@ def google(self, keyword, add_url=""):
         return links
 
     def naver(self, keyword, add_url=""):
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
         time.sleep(1)
 
@@ -171,7 +175,8 @@ def naver(self, keyword, add_url=""):
             elem.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.2)
 
-        imgs = self.browser.find_elements(By.XPATH, '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
+        imgs = self.browser.find_elements(By.XPATH,
+                                          '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
 
         print('Scraping links')
 
@@ -263,7 +268,8 @@ def google_full(self, keyword, add_url=""):
     def naver_full(self, keyword, add_url=""):
         print('[Full Resolution Mode]')
 
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
+        self.browser.get(
+            "https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
         time.sleep(1)
 
         elem = self.browser.find_element_by_tag_name("body")
main.py: 34 additions, 16 deletions
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-
 import os
 import requests
 import shutil
@@ -23,6 +22,8 @@
 from collect_links import CollectLinks
 import imghdr
 import base64
+from pathlib import Path
+import random
 
 
 class Sites:
@@ -52,7 +53,7 @@ def get_face_url(code):
 
 class AutoCrawler:
     def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download',
-                 full_resolution=False, face=False, no_gui=False, limit=0):
+                 full_resolution=False, face=False, no_gui=False, limit=0, proxy_list=None):
         """
         :param skip_already_exist: Skips keyword already downloaded before. This is needed when re-downloading.
         :param n_threads: Number of threads to download.
@@ -63,6 +64,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
         :param face: Face search mode
         :param no_gui: No GUI mode. Acceleration for full_resolution mode.
         :param limit: Maximum count of images to download. (0: infinite)
+        :param proxy_list: The proxy list. Every thread will randomly choose one from the list.
         """
 
         self.skip = skip_already_exist
@@ -74,6 +76,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
         self.face = face
         self.no_gui = no_gui
         self.limit = limit
+        self.proxy_list = proxy_list if proxy_list and len(proxy_list) > 0 else None
 
         os.makedirs('./{}'.format(self.download_path), exist_ok=True)
 
@@ -188,7 +191,8 @@ def download_images(self, keyword, links, site_name, max_count=0):
                 ext = self.get_extension_from_link(link)
                 is_base64 = False
 
-                no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword, site_name, str(index).zfill(4))
+                no_ext_path = '{}/{}/{}_{}'.format(self.download_path.replace('"', ''), keyword, site_name,
+                                                   str(index).zfill(4))
                 path = no_ext_path + '.' + ext
                 self.save_object_to_file(response, path, is_base64=is_base64)
 
@@ -215,7 +219,10 @@ def download_from_site(self, keyword, site_code):
         add_url = Sites.get_face_url(site_code) if self.face else ""
 
         try:
-            collect = CollectLinks(no_gui=self.no_gui)  # initialize chrome driver
+            proxy = None
+            if self.proxy_list:
+                proxy = random.choice(self.proxy_list)
+            collect = CollectLinks(no_gui=self.no_gui, proxy=proxy)  # initialize chrome driver
         except Exception as e:
             print('Error occurred while initializing chromedriver - {}'.format(e))
             return
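
The selection policy is a plain `random.choice` made once per worker, since every thread constructs its own CollectLinks. A sketch of that policy in isolation; `pick_proxy` is an illustrative helper, not a function in this PR:

```
import random

def pick_proxy(proxy_list):
    # One independent draw per worker: load spreads across proxies on
    # average, but is not balanced deterministically.
    if not proxy_list:
        return None
    return random.choice(proxy_list)

proxies = ['socks://127.0.0.1:1080', 'http://127.0.0.1:1081']  # placeholders
print(pick_proxy(proxies))  # e.g. 'http://127.0.0.1:1081'
```
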
@@ -241,6 +248,7 @@ def download_from_site(self, keyword, site_code):
 
             print('Downloading images from collected links... {} from {}'.format(keyword, site_name))
             self.download_images(keyword, links, site_name, max_count=self.limit)
+            Path('{}/{}/{}_done'.format(self.download_path, keyword.replace('"', ''), site_name)).touch()
 
             print('Done {} : {}'.format(site_name, keyword))
 
@@ -257,17 +265,19 @@ def do_crawling(self):
 
         for keyword in keywords:
             dir_name = '{}/{}'.format(self.download_path, keyword)
-            if os.path.exists(os.path.join(os.getcwd(), dir_name)) and self.skip:
-                print('Skipping already existing directory {}'.format(dir_name))
+            google_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'google_done'))
+            naver_done = os.path.exists(os.path.join(os.getcwd(), dir_name, 'naver_done'))
+            if google_done and naver_done and self.skip:
+                print('Skipping done task {}'.format(dir_name))
                 continue
 
-            if self.do_google:
+            if self.do_google and not google_done:
                 if self.full_resolution:
                     tasks.append([keyword, Sites.GOOGLE_FULL])
                 else:
                     tasks.append([keyword, Sites.GOOGLE])
 
-            if self.do_naver:
+            if self.do_naver and not naver_done:
                 if self.full_resolution:
                     tasks.append([keyword, Sites.NAVER_FULL])
                 else:
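
Taken together, the `Path(...).touch()` hunk above and this one replace the old directory-exists check with per-site sentinel files: `download_from_site` touches `google_done` or `naver_done` inside the keyword directory only after that site finishes, and `do_crawling` re-queues exactly the sites whose markers are missing, so an interrupted run resumes where it stopped. A runnable sketch of the protocol; the helper names and values are illustrative:

```
import os
from pathlib import Path

def mark_done(download_path, keyword, site_name):
    # Touched only after downloading returns, so a crashed or interrupted
    # run leaves no marker and the site is retried on the next run.
    Path('{}/{}/{}_done'.format(download_path, keyword, site_name)).touch()

def is_done(download_path, keyword, site_name):
    return os.path.exists(os.path.join(download_path, keyword, site_name + '_done'))

download_path, keyword = 'download', 'cat'  # illustrative values
os.makedirs(os.path.join(download_path, keyword), exist_ok=True)

mark_done(download_path, keyword, 'google')       # google pass finished
print(is_done(download_path, keyword, 'google'))  # True
print(is_done(download_path, keyword, 'naver'))   # False: naver is re-queued
```
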
@@ -334,12 +344,18 @@ def imbalance_check(self):
     parser.add_argument('--threads', type=int, default=4, help='Number of threads to download.')
     parser.add_argument('--google', type=str, default='true', help='Download from google.com (boolean)')
     parser.add_argument('--naver', type=str, default='true', help='Download from naver.com (boolean)')
-    parser.add_argument('--full', type=str, default='false', help='Download full resolution image instead of thumbnails (slow)')
+    parser.add_argument('--full', type=str, default='false',
+                        help='Download full resolution image instead of thumbnails (slow)')
     parser.add_argument('--face', type=str, default='false', help='Face search mode')
-    parser.add_argument('--no_gui', type=str, default='auto', help='No GUI mode. Acceleration for full_resolution mode. '
-                                                                   'But unstable on thumbnail mode. '
-                                                                   'Default: "auto" - false if full=false, true if full=true')
-    parser.add_argument('--limit', type=int, default=0, help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--no_gui', type=str, default='auto',
+                        help='No GUI mode. Acceleration for full_resolution mode. '
+                             'But unstable on thumbnail mode. '
+                             'Default: "auto" - false if full=false, true if full=true')
+    parser.add_argument('--limit', type=int, default=0,
+                        help='Maximum count of images to download per site. (0: infinite)')
+    parser.add_argument('--proxy-list', type=str, default='',
+                        help='The comma-separated proxy list, e.g. "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
+                             'Every thread will randomly choose one from the list.')
     args = parser.parse_args()
 
     _skip = False if str(args.skip).lower() == 'false' else True
@@ -349,6 +365,7 @@ def imbalance_check(self):
     _full = False if str(args.full).lower() == 'false' else True
     _face = False if str(args.face).lower() == 'false' else True
     _limit = int(args.limit)
+    _proxy_list = args.proxy_list.split(',')
 
     no_gui_input = str(args.no_gui).lower()
     if no_gui_input == 'auto':
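
One subtlety in the parsing above: with the default `--proxy-list ''`, `''.split(',')` returns `['']`, not `[]`, so the `len(proxy_list) > 0` guard in AutoCrawler still passes, and it is the later `if proxy:` check in CollectLinks that discards the empty string. A filtering parse along these lines would make the intent explicit; this is a sketch, not code from the PR:

```
def parse_proxy_list(raw):
    # Drop empty entries so '' maps to None instead of [''].
    proxies = [p.strip() for p in raw.split(',') if p.strip()]
    return proxies or None

assert parse_proxy_list('') is None
assert parse_proxy_list('socks://127.0.0.1:1080,http://127.0.0.1:1081') == [
    'socks://127.0.0.1:1080', 'http://127.0.0.1:1081']
```
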
@@ -358,10 +375,11 @@ def imbalance_check(self):
     else:
         _no_gui = False
 
-    print('Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}'
-          .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit))
+    print(
+        'Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, _proxy_list:{}'
+        .format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit, _proxy_list))
 
     crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads,
                           do_google=_google, do_naver=_naver, full_resolution=_full,
-                          face=_face, no_gui=_no_gui, limit=_limit)
+                          face=_face, no_gui=_no_gui, limit=_limit, proxy_list=_proxy_list)
     crawler.do_crawling()