Skip to content

Commit

Permalink
multiprocess bug fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
Yoongi Kim committed Nov 22, 2018
1 parent 945459d commit 4d30b3d
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 74 deletions.
10 changes: 5 additions & 5 deletions auto_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@
import shutil
from multiprocessing import Pool
import argparse
import collect_links

import google
import naver

class AutoCrawler:
def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download'):
Expand Down Expand Up @@ -126,9 +125,9 @@ def download_from_site(self, keyword, site, collect_links_func):

def download(self, args):
if args[1] == 'google':
self.download_from_site(keyword=args[0], site=args[1], collect_links_func=google.collect_links)
self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.google)
elif args[1] == 'naver':
self.download_from_site(keyword=args[0], site=args[1], collect_links_func=naver.collect_links)
self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.naver)

def do_crawling(self):
keywords = self.get_keywords()
Expand All @@ -152,6 +151,7 @@ def do_crawling(self):

def integrity_check(self):
print('Integrity Checking...')
print('Data imbalance checking...')

dict_num_files = {}

Expand All @@ -172,7 +172,7 @@ def integrity_check(self):

if len(dict_too_small) >= 1:
for dir, n_files in dict_too_small.items():
print('_________________________________')
print('Data imbalance detected.')
print('Below keywords have smaller than 50% of average file count.')
print('I recommend you to remove these directories and re-download for that keyword.')
print('_________________________________')
Expand Down
47 changes: 46 additions & 1 deletion naver.py → collect_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,53 @@
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotVisibleException

def google(keyword):
    """Collect image source URLs from a Google Images search for *keyword*.

    Opens a Chrome WebDriver session, loads the image-search results page,
    scrolls down repeatedly so more results get loaded, clicks the
    "more results" button when it is present and visible, scrolls again,
    and finally gathers the ``src`` attribute of every ``<img>`` found
    inside the result grid boxes.

    Args:
        keyword: Search term; it is interpolated into the query URL as-is.

    Returns:
        A list of ``src`` strings in page order. Images with a missing or
        empty ``src`` and inline ``data:`` URIs are skipped.
    """
    browser = webdriver.Chrome()

    def scroll_down(body, presses=60, pause=0.2):
        # Send PAGE_DOWN repeatedly, pausing between presses so the page
        # has time to append further results.
        for _ in range(presses):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(pause)

    # Everything after the driver is created runs under try/finally so the
    # browser is always released, even when a lookup or click fails.
    try:
        browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))

        time.sleep(1)

        print('Scrolling down')

        body = browser.find_element(By.TAG_NAME, "body")
        scroll_down(body)

        # The button label is the Korean-locale "show more results" text.
        # find_elements returns an empty list instead of raising when the
        # button is not in the page at all.
        more_buttons = browser.find_elements(By.XPATH, '//input[@value="결과 더보기"]')
        if more_buttons:
            try:
                more_buttons[0].click()
                scroll_down(body)
            except ElementNotVisibleException:
                # Button exists but is hidden: there is nothing more to load.
                pass

        photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')

        print('Scraping links')

        links = []

        for box in photo_grid_boxes:
            for img in box.find_elements(By.TAG_NAME, 'img'):
                src = img.get_attribute("src")
                # get_attribute may return None (or '') for images that have
                # no src yet; inline data: URIs are not downloadable links.
                if src and not src.startswith('data:'):
                    links.append(src)

        print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))

        return links
    finally:
        # quit() ends the whole driver session (window and driver process),
        # so failed or finished crawls do not leave browsers behind.
        browser.quit()


def collect_links(keyword):
def naver(keyword):
browser = webdriver.Chrome()

browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))
Expand Down
68 changes: 0 additions & 68 deletions google.py

This file was deleted.

0 comments on commit 4d30b3d

Please sign in to comment.