multiprocess bug fixed

IrvingShu · Nov 22, 2018 · 4d30b3d · 4d30b3d
1 parent 945459d
commit 4d30b3d
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 74 deletions.
diff --git a/auto_crawler.py b/auto_crawler.py
@@ -20,9 +20,8 @@
 import shutil
 from multiprocessing import Pool
 import argparse
+import collect_links
 
-import google
-import naver
 
 class AutoCrawler:
     def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download'):
@@ -126,9 +125,9 @@ def download_from_site(self, keyword, site, collect_links_func):
 
     def download(self, args):
         if args[1] == 'google':
-            self.download_from_site(keyword=args[0], site=args[1], collect_links_func=google.collect_links)
+            self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.google)
         elif args[1] == 'naver':
-            self.download_from_site(keyword=args[0], site=args[1], collect_links_func=naver.collect_links)
+            self.download_from_site(keyword=args[0], site=args[1], collect_links_func=collect_links.naver)
 
     def do_crawling(self):
         keywords = self.get_keywords()
@@ -152,6 +151,7 @@ def do_crawling(self):
 
     def integrity_check(self):
         print('Integrity Checking...')
+        print('Data imbalance checking...')
 
         dict_num_files = {}
 
@@ -172,7 +172,7 @@ def integrity_check(self):
 
         if len(dict_too_small) >= 1:
             for dir, n_files in dict_too_small.items():
-                print('_________________________________')
+                print('Data imbalance detected.')
                 print('Below keywords have smaller than 50% of average file count.')
                 print('I recommend you to remove these directories and re-download for that keyword.')
                 print('_________________________________')

diff --git a/naver.py → collect_links.py b/naver.py → collect_links.py
@@ -21,8 +21,53 @@
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import ElementNotVisibleException
 
+def google(keyword):
+    """Collect image links from a Google image search for *keyword*.
+
+    Opens a Chrome window through Selenium, scrolls the result page down,
+    clicks the "more results" button when it can be clicked, scrolls again,
+    and gathers the ``src`` attribute of every ``<img>`` found inside the
+    result grid boxes.
+
+    Args:
+        keyword: Search term; it is inserted into the search URL as-is.
+
+    Returns:
+        A list of ``src`` strings. Missing/empty values and values that
+        start with ``'d'`` are left out.
+    """
+    # Imported here so the function does not depend on the module-level
+    # import list for this extra exception type.
+    from selenium.common.exceptions import NoSuchElementException
+
+    browser = webdriver.Chrome()
+
+    try:
+        browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))
+
+        time.sleep(1)
+
+        print('Scrolling down')
+
+        elem = browser.find_element(By.TAG_NAME, "body")
+
+        def scroll_down(times=60, pause=0.2):
+            # Send PAGE_DOWN repeatedly, pausing between key presses.
+            for _ in range(times):
+                elem.send_keys(Keys.PAGE_DOWN)
+                time.sleep(pause)
+
+        scroll_down()
+
+        try:
+            # The XPath matches the button by its Korean label
+            # ("show more results"), so on pages without that exact label
+            # the lookup raises NoSuchElementException.
+            btn_more = browser.find_element(By.XPATH, '//input[@value="결과 더보기"]')
+            btn_more.click()
+        except (NoSuchElementException, ElementNotVisibleException):
+            # No button to click: keep whatever has been loaded so far.
+            pass
+        else:
+            scroll_down()
+
+        photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')
+
+        print('Scraping links')
+
+        links = []
+
+        for box in photo_grid_boxes:
+            for img in box.find_elements(By.TAG_NAME, 'img'):
+                src = img.get_attribute("src")
+                # get_attribute returns None when the attribute is absent;
+                # guard before inspecting the first character.
+                # NOTE(review): the 'd' check presumably filters inline
+                # "data:" URIs - confirm.
+                if src and not src.startswith('d'):
+                    links.append(src)
+
+        print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
+    finally:
+        # Always shut the browser down, even when scraping fails part-way;
+        # quit() also ends the driver session instead of only closing the
+        # current window.
+        browser.quit()
+
+    return links
+
 
-def collect_links(keyword):
+def naver(keyword):
     browser = webdriver.Chrome()
 
     browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))

diff --git a/google.py b/google.py