diff --git a/collect_links.py b/collect_links.py
index 0ac804f..9e3fab9 100644
--- a/collect_links.py
+++ b/collect_links.py
@@ -40,8 +40,12 @@ def __init__(self):
         self.browser = webdriver.Chrome(executable)
 
-    def google(self, keyword):
-        self.browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))
+    def get_scroll(self):
+        pos = self.browser.execute_script("return window.pageYOffset;")  # current vertical scroll offset
+        return pos
+
+    def google(self, keyword, add_url=""):
+        self.browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch{}".format(keyword, add_url))
 
         time.sleep(1)
 
@@ -72,20 +76,24 @@ def google(self, keyword):
         links = []
 
         for box in photo_grid_boxes:
-            imgs = box.find_elements(By.TAG_NAME, 'img')
+            try:
+                imgs = box.find_elements(By.TAG_NAME, 'img')
+
+                for img in imgs:
+                    src = img.get_attribute("src")
+                    if src is not None and src[0] != 'd':  # skip missing sources and inline "data:" URIs
+                        links.append(src)
-
-            for img in imgs:
-                src = img.get_attribute("src")
-                if src[0] != 'd':
-                    links.append(src)
+            except Exception as e:
+                print('[Exception occurred while collecting links from google] {}'.format(e))
 
         print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
 
         self.browser.close()
 
-        return links
+        return set(links)
 
-    def naver(self, keyword):
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))
+    def naver(self, keyword, add_url=""):
+        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
         time.sleep(1)
 
@@ -115,125 +123,131 @@ def naver(self, keyword):
         links = []
 
         for box in photo_grid_boxes:
-            imgs = box.find_elements(By.CLASS_NAME, '_img')
+            try:
+                imgs = box.find_elements(By.CLASS_NAME, '_img')
 
-            for img in imgs:
-                src = img.get_attribute("src")
-                if src[0] != 'd':
-                    links.append(src)
+                for img in imgs:
+                    src = img.get_attribute("src")
+                    if src is not None and src[0] != 'd':  # skip missing sources and inline "data:" URIs
+                        links.append(src)
+            except Exception as e:
+                print('[Exception occurred while collecting links from naver] {}'.format(e))
 
         print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('naver', keyword, len(links)))
 
         self.browser.close()
 
-        return links
+        return set(links)
+
+    def google_full(self, keyword, add_url=""):
+        print('[Full Resolution Mode]')
 
-    def google_full(self, keyword):
-        self.browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))
+        self.browser.get("https://www.google.co.kr/search?q={}&tbm=isch{}".format(keyword, add_url))
 
         time.sleep(2)
 
-        first_photo_grid_boxes = self.browser.find_element(By.XPATH, '//img[@class="rg_ic rg_i"]')
-        print(first_photo_grid_boxes.get_attribute('id'))
+        elem = self.browser.find_element_by_tag_name("body")
+
+        print('Scraping links')
 
-        first_photo_grid_boxes.click()
+        boxes = self.browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')
+        boxes[0].click()
 
         time.sleep(1)
 
-        container = self.browser.find_element(By.XPATH, '//div[@class="irc_land irc_bg"]')
-        print(container.get_attribute('id'))
-
-        img = container.find_element_by_id("irc-mi")
-        print(img.get_attribute('src'))
-
-        next_button = container.find_element(By.XPATH, '//div[@class="WPyac" and @id="irc-rac"]')
-        print(next_button.get_attribute('id'))
-        next_button.click()
-
-        input()
-
-        # print('Scraping links')
-        #
-        # links = []
-        #
-        # for box in photo_grid_boxes:
-        #     imgs = box.find_elements(By.TAG_NAME, 'img')
-        #
-        #     for img in imgs:
-        #         src = img.get_attribute("src")
-        #         if src[0] != 'd':
-        #             links.append(src)
-        #
-        # print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
+        links = []
+        count = 1
+
+        last_scroll = 0
+        scroll_patience = 0
+
+        while True:
+            try:
+                imgs = self.browser.find_elements(By.XPATH, '//div[@class="irc_c i8187 immersive-container irc-rcd"]//img[@class="irc_mi"]')
+
+                for img in imgs:
+                    src = img.get_attribute('src')
+
+                    if src is not None and src not in links:
+                        links.append(src)
+                        print('%d: %s' % (count, src))
+                        count += 1
+
+            except Exception as e:
+                print('[Exception occurred while collecting links from google_full] {}'.format(e))
+
+            scroll = self.get_scroll()
+            if scroll == last_scroll:
+                scroll_patience += 1
+            else:
+                scroll_patience = 0
+                last_scroll = scroll
+
+            if scroll_patience >= 30:  # page stopped moving; assume we reached the last image
+                break
+
+            elem.send_keys(Keys.RIGHT)
+
+        self.browser.close()
 
-        return links
+        return set(links)
 
-    def naver_full(self, keyword):
-        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))
+    def naver_full(self, keyword, add_url=""):
+        print('[Full Resolution Mode]')
 
-        time.sleep(1)
+        self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}{}".format(keyword, add_url))
 
-        print('Scrolling down')
+        time.sleep(2)
 
         elem = self.browser.find_element_by_tag_name("body")
 
-        for i in range(60):
-            elem.send_keys(Keys.PAGE_DOWN)
-            time.sleep(0.2)
+        print('Scraping links')
 
-        try:
-            btn_more = self.browser.find_element(By.XPATH, '//a[@class="btn_more _more"]')
-            btn_more.click()
+        boxes = self.browser.find_elements(By.XPATH, '//div[@class="img_area _item"]')
 
-            for i in range(60):
-                elem.send_keys(Keys.PAGE_DOWN)
-                time.sleep(0.2)
+        boxes[0].click()
+        time.sleep(1)
 
-        except ElementNotVisibleException:
-            pass
+        links = []
+        count = 1
 
-        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="photo_grid _box"]')
+        last_scroll = 0
+        scroll_patience = 0
 
-        links = []
+        while True:
+            try:
+                imgs = self.browser.find_elements(By.XPATH,
+                                                  '//div[@class="image_viewer_wrap _sauImageViewer"]//img[@class="_image_source"]')
 
-        for box in photo_grid_boxes:
-            areas = box.find_elements(By.XPATH, '//div[@class="img_area _item"]')
-            for area in areas:
-                data_id = area.get_attribute('data-id')
-                print(data_id)
-                self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}#imgId={}&vType=rollout".format(keyword, data_id))
-                time.sleep(1)
-
-
-        # print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('naver', keyword, len(links)))
-        # self.browser.close()
-
-        return links
-    # def naver_full(self, keyword):
-    #     from selenium.webdriver.common.action_chains import ActionChains
-    #     mouse = webdriver.ActionChains(self.browser)
-    #
-    #     self.browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))
-    #     time.sleep(1)
-    #     elem = self.browser.find_element_by_tag_name("body")
-    #
-    #     first_photo_grid_boxes = elem.find_element(By.XPATH, '//span[@class="img_border"]')
-    #     first_photo_grid_boxes.click()
-    #
-    #     links = []
-    #
-    #     img = elem.find_element(By.XPATH, '//img[@class="_image_source"]')
-    #     link = img.get_attribute("src")
-    #     print(link)
-    #     links.append(link)
-    #
-    #     next_button = elem.find_element(By.XPATH, '//a[@class="btn_next _next"]')
-    #     mouse.move_to_element(next_button).click().perform()
-    #
-    #     time.sleep(1)
+                for img in imgs:
+                    src = img.get_attribute('src')
+
+                    if src is not None and src not in links:
+                        links.append(src)
+                        print('%d: %s' % (count, src))
+                        count += 1
+
+            except Exception as e:
+                print('[Exception occurred while collecting links from naver_full] {}'.format(e))
+
+            scroll = self.get_scroll()
+            if scroll == last_scroll:
+                scroll_patience += 1
+            else:
+                scroll_patience = 0
+                last_scroll = scroll
+
+            if scroll_patience >= 30:  # page stopped moving; assume we reached the last image
+                break
+
+            elem.send_keys(Keys.RIGHT)
+
+        self.browser.close()
+
+        return set(links)
 
 
 if __name__ == '__main__':
     collect = CollectLinks()
-    links = collect.naver_full('python')
-    print(links)
+    links = collect.naver_full('박보영')
+    print(len(links), links)
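Both full-resolution collectors added above share one traversal pattern: click the first thumbnail to open the image viewer, read the viewer's full-size img src, press the RIGHT arrow key to advance, and stop once window.pageYOffset has not moved for 30 consecutive presses (the "scroll patience" counter). A condensed, standalone sketch of that loop follows; the driver argument, the img_xpath value, and the patience default are illustrative placeholders, not part of the patch:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    def collect_viewer_links(driver, img_xpath, patience=30):
        # Assumes `driver` is already on a search result page with the viewer open.
        body = driver.find_element(By.TAG_NAME, 'body')
        links, last_scroll, stalls = [], 0, 0
        while stalls < patience:
            for img in driver.find_elements(By.XPATH, img_xpath):
                src = img.get_attribute('src')
                if src is not None and src not in links:
                    links.append(src)
            scroll = driver.execute_script('return window.pageYOffset;')
            stalls = stalls + 1 if scroll == last_scroll else 0  # reset whenever the page moves
            last_scroll = scroll
            body.send_keys(Keys.RIGHT)  # advance the viewer to the next image
        return set(links)

Keeping links as a list preserves discovery order for the numbered progress prints; converting to set() on return deduplicates, which is also why google() and naver() now return set(links).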
diff --git a/main.py b/main.py
index 0330f6a..da16397 100644
--- a/main.py
+++ b/main.py
@@ -26,6 +26,8 @@ class Sites:
     GOOGLE = 1
     NAVER = 2
+    GOOGLE_FULL = 3
+    NAVER_FULL = 4
 
     @staticmethod
     def get_text(code):
@@ -33,21 +35,39 @@
             return 'google'
         elif code == Sites.NAVER:
             return 'naver'
+        elif code == Sites.GOOGLE_FULL:
+            return 'google'
+        elif code == Sites.NAVER_FULL:
+            return 'naver'
+
+    @staticmethod
+    def get_face_url(code):
+        if code == Sites.GOOGLE or code == Sites.GOOGLE_FULL:
+            return "&tbs=itp:face"
+        if code == Sites.NAVER or code == Sites.NAVER_FULL:
+            return "&face=1"
 
 
 class AutoCrawler:
-    def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download'):
+    def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download',
+                 full_resolution=False, face=False):
         """
         :param skip_already_exist: Skips keyword already downloaded before. This is needed when re-downloading.
         :param n_threads: Number of threads to download.
         :param do_google: Download from google.com (boolean)
         :param do_naver: Download from naver.com (boolean)
+        :param download_path: Download folder path
+        :param full_resolution: Download full resolution images rather than thumbnails (slow)
+        :param face: Face search mode
         """
+
         self.skip = skip_already_exist
         self.n_threads = n_threads
         self.do_google = do_google
         self.do_naver = do_naver
         self.download_path = download_path
+        self.full_resolution = full_resolution
+        self.face = face
 
         os.makedirs('./{}'.format(self.download_path), exist_ok=True)
 
@@ -70,6 +90,21 @@ def all_files(path):
 
         return paths
 
+    @staticmethod
+    def get_extension_from_link(link, default='jpg'):
+        splits = str(link).split('.')
+        if len(splits) < 2:  # no '.' in the link at all
+            return default
+        ext = splits[-1].lower()
+        if ext == 'jpg' or ext == 'jpeg':
+            return 'jpg'
+        elif ext == 'gif':
+            return 'gif'
+        elif ext == 'png':
+            return 'png'
+        else:
+            return default
+
     @staticmethod
     def make_dir(dirname):
         current_path = os.getcwd()
@@ -111,7 +146,8 @@ def download_images(self, keyword, links, site_name):
             try:
                 print('Downloading {} from {}: {} / {}'.format(keyword, site_name, index + 1, total))
                 response = requests.get(link, stream=True)
-                self.save_image_to_file(response, '{}/{}/{}_{}.jpg'.format(self.download_path, keyword, site_name, index))
+                ext = self.get_extension_from_link(link)
+                self.save_image_to_file(response, '{}/{}/{}_{}.{}'.format(self.download_path, keyword, site_name, index, ext))
                 del response
 
             except Exception as e:
@@ -120,6 +156,7 @@
     def download_from_site(self, keyword, site_code):
         site_name = Sites.get_text(site_code)
+        add_url = Sites.get_face_url(site_code) if self.face else ""
 
         collect = CollectLinks()  # initialize chrome driver
 
         try:
@@ -132,10 +169,16 @@
             print('Collecting links... {} from {}'.format(keyword, site_name))
 
             if site_code == Sites.GOOGLE:
-                links = collect.google(keyword)
+                links = collect.google(keyword, add_url)
 
             elif site_code == Sites.NAVER:
-                links = collect.naver(keyword)
+                links = collect.naver(keyword, add_url)
+
+            elif site_code == Sites.GOOGLE_FULL:
+                links = collect.google_full(keyword, add_url)
+
+            elif site_code == Sites.NAVER_FULL:
+                links = collect.naver_full(keyword, add_url)
 
             else:
                 print('Invalid Site Code')
@@ -159,10 +202,16 @@ def do_crawling(self):
 
         for keyword in keywords:
             if self.do_google:
-                tasks.append([keyword, Sites.GOOGLE])
+                if self.full_resolution:
+                    tasks.append([keyword, Sites.GOOGLE_FULL])
+                else:
+                    tasks.append([keyword, Sites.GOOGLE])
 
             if self.do_naver:
-                tasks.append([keyword, Sites.NAVER])
+                if self.full_resolution:
+                    tasks.append([keyword, Sites.NAVER_FULL])
+                else:
+                    tasks.append([keyword, Sites.NAVER])
 
         pool = Pool(self.n_threads)
         pool.map_async(self.download, tasks)
@@ -225,14 +274,18 @@
     parser.add_argument('--threads', type=int, default=4, help='Number of threads to download.')
     parser.add_argument('--google', type=str, default='true', help='Download from google.com (boolean)')
    parser.add_argument('--naver', type=str, default='true', help='Download from naver.com (boolean)')
+    parser.add_argument('--full', type=str, default='false', help='Download full resolution images rather than thumbnails (slow)')
+    parser.add_argument('--face', type=str, default='false', help='Face search mode')
     args = parser.parse_args()
 
     _skip = False if str(args.skip).lower() == 'false' else True
     _threads = args.threads
     _google = False if str(args.google).lower() == 'false' else True
     _naver = False if str(args.naver).lower() == 'false' else True
+    _full = False if str(args.full).lower() == 'false' else True
+    _face = False if str(args.face).lower() == 'false' else True
 
-    print('Options - skip:{}, threads:{}, google:{}, naver:{}'.format(_skip, _threads, _google, _naver))
+    print('Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}'.format(_skip, _threads, _google, _naver, _full, _face))
 
-    crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads, do_google=_google, do_naver=_naver)
+    crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads, do_google=_google, do_naver=_naver, full_resolution=_full, face=_face)
    crawler.do_crawling()
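With the patch applied, the new modes are driven by the two added flags, e.g. python main.py --threads 4 --full true --face true. A minimal programmatic equivalent, assuming main.py is importable from the working directory and a keyword list is already set up as before:

    from main import AutoCrawler

    crawler = AutoCrawler(skip_already_exist=True, n_threads=4,
                          do_google=True, do_naver=True,
                          download_path='download',
                          full_resolution=True,  # route keywords to google_full / naver_full
                          face=True)             # append &tbs=itp:face / &face=1 to search URLs
    crawler.do_crawling()

Note that --full trades speed for quality: the full-resolution collectors step through results one image at a time in the viewer instead of scraping the thumbnail grid in bulk.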