diff --git a/README.md b/README.md
index df324dc..9d57107 100644
--- a/README.md
+++ b/README.md
@@ -7,17 +7,13 @@ Google, Naver multiprocess image crawler
 
 1. Install Chrome
 
-2. Extract chromedriver.zip
+2. pip install -r requirements.txt
 
-3. Add PATH where you extracted chromedriver.
+3. Write search keywords in keywords.txt
 
-4. pip install -r requirements.txt
+4. **Run auto_crawler.py**
 
-5. Write search keywords in keywords.txt
-
-6. Run auto_crawler.py
-
-7. Files will be downloaded to 'download' directory.
+5. Files will be downloaded to 'download' directory.
 
 # Arguments
 
diff --git a/auto_crawler.py b/auto_crawler.py
index 713036f..2c57051 100644
--- a/auto_crawler.py
+++ b/auto_crawler.py
@@ -20,7 +20,19 @@ import shutil
 from multiprocessing import Pool
 import argparse
 
-import collect_links
+from collect_links import CollectLinks
+
+
+class Sites:
+    GOOGLE = 1
+    NAVER = 2
+
+    @staticmethod
+    def get_text(code):
+        if code == Sites.GOOGLE:
+            return 'google'
+        elif code == Sites.NAVER:
+            return 'naver'
 
 
 class AutoCrawler:
@@ -93,22 +105,25 @@ def save_image_to_file(self, image, file_path):
         except Exception as e:
             print('Save failed - {}'.format(e))
 
-    def download_images(self, keyword, links, site):
+    def download_images(self, keyword, links, site_name):
         self.make_dir('{}/{}'.format(self.download_path, keyword))
         total = len(links)
 
         for index, link in enumerate(links):
             try:
-                print('Downloading {} from {}: {} / {}'.format(keyword, site, index+1, total))
+                print('Downloading {} from {}: {} / {}'.format(keyword, site_name, index + 1, total))
                 response = requests.get(link, stream=True)
-                self.save_image_to_file(response, '{}/{}/{}_{}.jpg'.format(self.download_path, keyword, site, index))
+                self.save_image_to_file(response, '{}/{}/{}_{}.jpg'.format(self.download_path, keyword, site_name, index))
                 del response
 
             except Exception as e:
                 print('Download failed - ', e)
                 continue
 
-    def download_from_site(self, keyword, site, collect_links_func):
+    def download_from_site(self, keyword, site_code):
+        site_name = Sites.get_text(site_code)
+        collect = CollectLinks()  # initialize chrome driver
+
         try:
             dirname = '{}/{}'.format(self.download_path, keyword)
 
@@ -116,22 +131,28 @@ def download_from_site(self, keyword, site, collect_links_func):
                 print('Skipping already existing directory {}'.format(dirname))
                 return
 
-            print('Collecting links... {} from {}'.format(keyword, site))
-            links = collect_links_func(keyword)
+            print('Collecting links... {} from {}'.format(keyword, site_name))
+
+            if site_code == Sites.GOOGLE:
+                links = collect.google(keyword)
+
+            elif site_code == Sites.NAVER:
+                links = collect.naver(keyword)
+
+            else:
+                print('Invalid Site Code')
+                links = []
 
-            print('Downloading images from collected links... {} from {}'.format(keyword, site))
-            self.download_images(keyword, links, site)
+            print('Downloading images from collected links... {} from {}'.format(keyword, site_name))
+            self.download_images(keyword, links, site_name)
 
-            print('Done {} : {}'.format(site, keyword))
+            print('Done {} : {}'.format(site_name, keyword))
 
         except Exception as e:
-            print('Exception {} - {}'.format(keyword, e))
+            print('Exception {}:{} - {}'.format(site_name, keyword, e))
 
     def download(self, args):
-        if args[1] == 'google':
-            self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.google)
-        elif args[1] == 'naver':
-            self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.naver)
+        self.download_from_site(keyword=args[0], site_code=args[1])
 
     def do_crawling(self):
         keywords = self.get_keywords()
@@ -140,10 +161,10 @@
 
         for keyword in keywords:
             if self.do_google:
-                tasks.append([keyword, 'google'])
+                tasks.append([keyword, Sites.GOOGLE])
 
             if self.do_naver:
-                tasks.append([keyword, 'naver'])
+                tasks.append([keyword, Sites.NAVER])
 
         pool = Pool(self.n_threads)
         pool.map_async(self.download, tasks)
diff --git a/chromedriver/chromedriver_linux b/chromedriver/chromedriver_linux
new file mode 100644
index 0000000..1b8ff86
Binary files /dev/null and b/chromedriver/chromedriver_linux differ
diff --git a/chromedriver/chromedriver_mac b/chromedriver/chromedriver_mac
new file mode 100644
index 0000000..eb5bd61
Binary files /dev/null and b/chromedriver/chromedriver_mac differ
diff --git a/chromedriver/chromedriver_win.exe b/chromedriver/chromedriver_win.exe
new file mode 100644
index 0000000..f74d9fe
Binary files /dev/null and b/chromedriver/chromedriver_win.exe differ
diff --git a/collect_links.py b/collect_links.py
index c108b32..b1aacf2 100644
--- a/collect_links.py
+++ b/collect_links.py
@@ -20,94 +20,109 @@ from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import ElementNotVisibleException
+import platform
 
 
-def google(keyword):
-    browser = webdriver.Chrome()
-    browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))
+class CollectLinks:
+    def __init__(self):
+        executable = ''
 
-    time.sleep(1)
+        if platform.system() == 'Windows':
+            print('Detected OS : Windows')
+            executable = './chromedriver/chromedriver_win.exe'
+        elif platform.system() == 'Linux':
+            print('Detected OS : Linux')
+            executable = './chromedriver/chromedriver_linux'
+        elif platform.system() == 'Darwin':
+            print('Detected OS : Darwin')
+            executable = './chromedriver/chromedriver_mac'
+        else:
+            assert False, 'Unknown OS Type'
 
-    print('Scrolling down')
+        self.browser = webdriver.Chrome(executable)
 
-    elem = browser.find_element_by_tag_name("body")
+    def google(self, keyword):
+        self.browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))
 
-    for i in range(60):
-        elem.send_keys(Keys.PAGE_DOWN)
-        time.sleep(0.2)
+        time.sleep(1)
 
-    try:
-        btn_more = browser.find_element(By.XPATH, '//input[@value="결과 더보기"]')
-        btn_more.click()
+        print('Scrolling down')
+
+        elem = self.browser.find_element_by_tag_name("body")
 
         for i in range(60):
             elem.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.2)
 
-    except ElementNotVisibleException:
-        pass
-
-    photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')
-
-    print('Scraping links')
+        try:
+            btn_more = self.browser.find_element(By.XPATH, '//input[@value="결과 더보기"]')
+            btn_more.click()
 
-    links = []
+            for i in range(60):
+                elem.send_keys(Keys.PAGE_DOWN)
+                time.sleep(0.2)
 
-    for box in photo_grid_boxes:
-        imgs = box.find_elements(By.TAG_NAME, 'img')
+        except ElementNotVisibleException:
+            pass
 
-        for img in imgs:
-            src = img.get_attribute("src")
-            if src[0] != 'd':
-                links.append(src)
+        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')
 
-    print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
-    browser.close()
+        print('Scraping links')
 
-    return links
+        links = []
 
+        for box in photo_grid_boxes:
+            imgs = box.find_elements(By.TAG_NAME, 'img')
 
-def naver(keyword):
-    browser = webdriver.Chrome()
+            for img in imgs:
+                src = img.get_attribute("src")
+                if src[0] != 'd':
+                    links.append(src)
 
-    browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))
+        print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
+        self.browser.close()
 
-    time.sleep(1)
+        return links
 
-    print('Scrolling down')
+    def naver(self, keyword):
+        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))
 
-    elem = browser.find_element_by_tag_name("body")
+        time.sleep(1)
 
-    for i in range(60):
-        elem.send_keys(Keys.PAGE_DOWN)
-        time.sleep(0.2)
+        print('Scrolling down')
 
-    try:
-        btn_more = browser.find_element(By.XPATH, '//a[@class="btn_more _more"]')
-        btn_more.click()
+        elem = self.browser.find_element_by_tag_name("body")
 
         for i in range(60):
             elem.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.2)
 
-    except ElementNotVisibleException:
-        pass
+        try:
+            btn_more = self.browser.find_element(By.XPATH, '//a[@class="btn_more _more"]')
+            btn_more.click()
+
+            for i in range(60):
+                elem.send_keys(Keys.PAGE_DOWN)
+                time.sleep(0.2)
+
+        except ElementNotVisibleException:
+            pass
 
-    photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="photo_grid _box"]')
+        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="photo_grid _box"]')
 
-    print('Scraping links')
+        print('Scraping links')
 
-    links = []
+        links = []
 
-    for box in photo_grid_boxes:
-        imgs = box.find_elements(By.CLASS_NAME, '_img')
+        for box in photo_grid_boxes:
+            imgs = box.find_elements(By.CLASS_NAME, '_img')
 
-        for img in imgs:
-            src = img.get_attribute("src")
-            if src[0] != 'd':
-                links.append(src)
+            for img in imgs:
+                src = img.get_attribute("src")
+                if src[0] != 'd':
+                    links.append(src)
 
-    print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('naver', keyword, len(links)))
-    browser.close()
+        print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('naver', keyword, len(links)))
+        self.browser.close()
 
-    return links
+        return links
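Reviewer note: the `//input[@value="결과 더보기"]` XPath matches the Korean-locale "More results" button on Google Images ("결과 더보기" means "More results"), so link collection assumes a Korean Google UI. Below is a minimal usage sketch of the interface this diff introduces; it is illustrative only and not part of the patch. It assumes Chrome is installed, the bundled chromedriver binaries sit under `./chromedriver/` as added by this commit, and `'cat'` stands in for a real keyword.

```python
# Minimal sketch (not part of the patch): exercising the refactored API.
# Assumes the chromedriver binaries from this commit are in ./chromedriver/
# and Chrome is installed; 'cat' is an example keyword.
from collect_links import CollectLinks

collect = CollectLinks()       # picks the chromedriver for the current OS and starts Chrome
links = collect.google('cat')  # scrolls, clicks "More results", scrapes img src links
print('Collected {} links'.format(len(links)))
```

In `auto_crawler.py` the same call now goes through `Sites`: tasks carry `Sites.GOOGLE` or `Sites.NAVER` instead of bare strings, and `download_from_site()` maps the code back to a display name via `Sites.get_text()` for log lines and file names.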