diff --git a/auto_crawler.py b/auto_crawler.py index 40c4fa6..c007140 100644 --- a/auto_crawler.py +++ b/auto_crawler.py @@ -20,9 +20,8 @@ import shutil from multiprocessing import Pool import argparse +import collect_links -import google -import naver class AutoCrawler: def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download'): @@ -126,9 +125,9 @@ def download_from_site(self, keyword, site, collect_links_func): def download(self, args): if args[1] == 'google': - self.download_from_site(keyword=args[0], site=args[1], collect_links_func=google.collect_links) + self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.google) elif args[1] == 'naver': - self.download_from_site(keyword=args[0], site=args[1], collect_links_func=naver.collect_links) + self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.naver) def do_crawling(self): keywords = self.get_keywords() @@ -152,6 +151,7 @@ def do_crawling(self): def integrity_check(self): print('Integrity Checking...') + print('Data imbalance checking...') dict_num_files = {} @@ -172,7 +172,7 @@ def integrity_check(self): if len(dict_too_small) >= 1: for dir, n_files in dict_too_small.items(): - print('_________________________________') + print('Data imbalance detected.') print('Below keywords have smaller than 50% of average file count.') print('I recommend you to remove these directories and re-download for that keyword.') print('_________________________________') diff --git a/naver.py b/collect_links.py similarity index 62% rename from naver.py rename to collect_links.py index 1f52b30..c108b32 100644 --- a/naver.py +++ b/collect_links.py @@ -21,8 +21,53 @@ from selenium.webdriver.common.by import By from selenium.common.exceptions import ElementNotVisibleException +def google(keyword): + browser = webdriver.Chrome() + + browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword)) + + time.sleep(1) + + print('Scrolling down') + + elem = browser.find_element_by_tag_name("body") + + for i in range(60): + elem.send_keys(Keys.PAGE_DOWN) + time.sleep(0.2) + + try: + btn_more = browser.find_element(By.XPATH, '//input[@value="결과 더보기"]') + btn_more.click() + + for i in range(60): + elem.send_keys(Keys.PAGE_DOWN) + time.sleep(0.2) + + except ElementNotVisibleException: + pass + + photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]') + + print('Scraping links') + + links = [] + + for box in photo_grid_boxes: + imgs = box.find_elements(By.TAG_NAME, 'img') + + for img in imgs: + src = img.get_attribute("src") + if src[0] != 'd': + links.append(src) + + print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links))) + browser.close() + + return links + -def collect_links(keyword): +def naver(keyword): browser = webdriver.Chrome() browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword)) diff --git a/google.py b/google.py deleted file mode 100644 index 5e6bad1..0000000 --- a/google.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Copyright 2018 YoongiKim - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - - -import time -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.common.by import By -from selenium.common.exceptions import ElementNotVisibleException - - -def collect_links(keyword): - browser = webdriver.Chrome() - - browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword)) - - time.sleep(1) - - print('Scrolling down') - - elem = browser.find_element_by_tag_name("body") - - for i in range(60): - elem.send_keys(Keys.PAGE_DOWN) - time.sleep(0.2) - - try: - btn_more = browser.find_element(By.XPATH, '//input[@value="결과 더보기"]') - btn_more.click() - - for i in range(60): - elem.send_keys(Keys.PAGE_DOWN) - time.sleep(0.2) - - except ElementNotVisibleException: - pass - - photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]') - - print('Scraping links') - - links = [] - - for box in photo_grid_boxes: - imgs = box.find_elements(By.TAG_NAME, 'img') - - for img in imgs: - src = img.get_attribute("src") - if src[0] != 'd': - links.append(src) - - print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links))) - browser.close() - - return links