Skip to content

Commit

Permalink
Chrome Driver Auto OS Detection
Browse files Browse the repository at this point in the history
  • Loading branch information
Yoongi Kim committed Nov 24, 2018
1 parent 83d2729 commit e926741
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 79 deletions.
12 changes: 4 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,13 @@ Google, Naver multiprocess image crawler

1. Install Chrome

2. Extract chromedriver.zip
2. pip install -r requirements.txt

3. Add the directory where you extracted chromedriver to your PATH.
3. Write search keywords in keywords.txt

4. pip install -r requirements.txt
4. **Run auto_crawler.py**

5. Write search keywords in keywords.txt

6. Run auto_crawler.py

7. Files will be downloaded to 'download' directory.
5. Files will be downloaded to the 'download' directory.


# Arguments
Expand Down
55 changes: 38 additions & 17 deletions auto_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,19 @@
import shutil
from multiprocessing import Pool
import argparse
import collect_links
from collect_links import CollectLinks


class Sites:
    """Numeric codes for the supported crawl targets."""

    GOOGLE = 1
    NAVER = 2

    @staticmethod
    def get_text(code):
        """Return the lowercase site name for *code*, or None if unknown."""
        names = {Sites.GOOGLE: 'google', Sites.NAVER: 'naver'}
        return names.get(code)


class AutoCrawler:
Expand Down Expand Up @@ -93,45 +105,54 @@ def save_image_to_file(self, image, file_path):
except Exception as e:
print('Save failed - {}'.format(e))

def download_images(self, keyword, links, site_name):
    """Download every image URL in *links* into '<download_path>/<keyword>/'.

    Files are named '<site_name>_<index>.jpg'. Each failure is logged and
    skipped so one bad link does not abort the whole keyword.
    """
    self.make_dir('{}/{}'.format(self.download_path, keyword))
    total = len(links)

    for index, link in enumerate(links):
        try:
            print('Downloading {} from {}: {} / {}'.format(keyword, site_name, index + 1, total))
            # stream=True avoids loading the whole image into memory at once
            response = requests.get(link, stream=True)
            self.save_image_to_file(response, '{}/{}/{}_{}.jpg'.format(self.download_path, keyword, site_name, index))
            del response

        except Exception as e:
            print('Download failed - ', e)
            continue

def download_from_site(self, keyword, site_code):
    """Collect image links for *keyword* from the site identified by
    *site_code* (Sites.GOOGLE / Sites.NAVER) and download them.

    When self.skip is set and the keyword's download directory already
    exists, the keyword is skipped entirely. Any exception is caught and
    logged so one failing keyword does not kill the worker process.
    """
    site_name = Sites.get_text(site_code)
    collect = CollectLinks()  # initialize chrome driver

    try:
        dirname = '{}/{}'.format(self.download_path, keyword)

        # Honor the skip option: do not re-crawl already-downloaded keywords.
        if os.path.exists(os.path.join(os.getcwd(), dirname)) and self.skip:
            print('Skipping already existing directory {}'.format(dirname))
            return

        print('Collecting links... {} from {}'.format(keyword, site_name))

        if site_code == Sites.GOOGLE:
            links = collect.google(keyword)

        elif site_code == Sites.NAVER:
            links = collect.naver(keyword)

        else:
            print('Invalid Site Code')
            links = []

        print('Downloading images from collected links... {} from {}'.format(keyword, site_name))
        self.download_images(keyword, links, site_name)

        print('Done {} : {}'.format(site_name, keyword))

    except Exception as e:
        print('Exception {}:{} - {}'.format(site_name, keyword, e))

def download(self, args):
    """Pool worker entry point.

    *args* is a two-item list: [keyword, site_code] (a Sites constant),
    as built by do_crawling and dispatched via Pool.map_async.
    """
    self.download_from_site(keyword=args[0], site_code=args[1])

def do_crawling(self):
keywords = self.get_keywords()
Expand All @@ -140,10 +161,10 @@ def do_crawling(self):

for keyword in keywords:
if self.do_google:
tasks.append([keyword, 'google'])
tasks.append([keyword, Sites.GOOGLE])

if self.do_naver:
tasks.append([keyword, 'naver'])
tasks.append([keyword, Sites.NAVER])

pool = Pool(self.n_threads)
pool.map_async(self.download, tasks)
Expand Down
Binary file added chromedriver/chromedriver_linux
Binary file not shown.
Binary file added chromedriver/chromedriver_mac
Binary file not shown.
Binary file added chromedriver/chromedriver_win.exe
Binary file not shown.
123 changes: 69 additions & 54 deletions collect_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,94 +20,109 @@
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotVisibleException
import platform

class CollectLinks:
    """Drives a Chrome browser to scrape image links from search sites."""

    def __init__(self):
        # Pick the bundled chromedriver binary matching the current OS.
        executable = ''

        if platform.system() == 'Windows':
            print('Detected OS : Windows')
            executable = './chromedriver/chromedriver_win.exe'
        elif platform.system() == 'Linux':
            print('Detected OS : Linux')
            executable = './chromedriver/chromedriver_linux'
        elif platform.system() == 'Darwin':
            print('Detected OS : Darwin')
            executable = './chromedriver/chromedriver_mac'
        else:
            assert False, 'Unknown OS Type'

        self.browser = webdriver.Chrome(executable)

    def google(self, keyword):
        """Return a list of image source URLs from Google image search.

        Scrolls the result page, clicks the "more results" button when
        visible, then scrapes img src attributes. Closes the browser
        before returning.
        """
        self.browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))

        time.sleep(1)

        print('Scrolling down')

        elem = self.browser.find_element_by_tag_name("body")

        for i in range(60):
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)

        try:
            # "결과 더보기" is Google's "More results" button (Korean locale).
            btn_more = self.browser.find_element(By.XPATH, '//input[@value="결과 더보기"]')
            btn_more.click()

            for i in range(60):
                elem.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.2)

        except ElementNotVisibleException:
            pass

        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')

        print('Scraping links')

        links = []

        for box in photo_grid_boxes:
            imgs = box.find_elements(By.TAG_NAME, 'img')

            for img in imgs:
                src = img.get_attribute("src")
                # NOTE(review): presumably filters inline 'data:' URIs — confirm
                if src[0] != 'd':
                    links.append(src)

        print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
        self.browser.close()

        return links

    def naver(self, keyword):
        """Return a list of image source URLs from Naver image search.

        Same scroll / "more" button / scrape flow as google(), with
        Naver-specific selectors. Closes the browser before returning.
        """
        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))

        time.sleep(1)

        print('Scrolling down')

        elem = self.browser.find_element_by_tag_name("body")

        for i in range(60):
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)

        try:
            btn_more = self.browser.find_element(By.XPATH, '//a[@class="btn_more _more"]')
            btn_more.click()

            for i in range(60):
                elem.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.2)

        except ElementNotVisibleException:
            pass

        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="photo_grid _box"]')

        print('Scraping links')

        links = []

        for box in photo_grid_boxes:
            imgs = box.find_elements(By.CLASS_NAME, '_img')

            for img in imgs:
                src = img.get_attribute("src")
                # NOTE(review): presumably filters inline 'data:' URIs — confirm
                if src[0] != 'd':
                    links.append(src)

        print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('naver', keyword, len(links)))
        self.browser.close()

        return links

0 comments on commit e926741

Please sign in to comment.