Initial Commit
Yoongi Kim committed Nov 21, 2018
1 parent ee22cad commit f4070cf
Showing 9 changed files with 340 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

download
.idea
193 changes: 193 additions & 0 deletions auto_crawler.py
@@ -0,0 +1,193 @@
"""
Copyright 2018 YoongiKim
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


import os
import requests
import shutil
from multiprocessing import Pool

# site-specific link collectors (google.py and naver.py in this repository)
import google
import naver

class AutoCrawler:
    def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download'):
        """
        :param skip_already_exist: Skip keywords whose download directory already exists. Useful when re-running.
        :param n_threads: Number of worker processes used for downloading.
        :param do_google: Download from google.com (boolean)
        :param do_naver: Download from naver.com (boolean)
        :param download_path: Root directory where downloaded images are saved.
        """
        self.skip = skip_already_exist
        self.n_threads = n_threads
        self.do_google = do_google
        self.do_naver = do_naver
        self.download_path = download_path

    @staticmethod
    def all_dirs(path):
        paths = []
        for dir in os.listdir(path):
            if os.path.isdir(path + '/' + dir):
                paths.append(path + '/' + dir)

        return paths

    @staticmethod
    def all_files(path):
        paths = []
        for root, dirs, files in os.walk(path):
            for file in files:
                # join with root (not path) so files in subdirectories are found too
                file_path = os.path.join(root, file)
                if os.path.isfile(file_path):
                    paths.append(file_path)

        return paths

    @staticmethod
    def make_dir(dirname):
        current_path = os.getcwd()
        path = os.path.join(current_path, dirname)
        if not os.path.exists(path):
            os.makedirs(path)

    @staticmethod
    def get_keywords(keywords_file='keywords.txt'):
        # read search keywords from file
        with open(keywords_file, 'r', encoding='utf-8-sig') as f:
            text = f.read()
            lines = text.split('\n')
            # drop all empty lines, not just the first one
            lines = [line for line in lines if line.strip()]
            keywords = sorted(set(lines))

        print('{} keywords found: {}'.format(len(keywords), keywords))

        # re-save sorted keywords
        with open(keywords_file, 'w+', encoding='utf-8') as f:
            for keyword in keywords:
                f.write('{}\n'.format(keyword))

        return keywords

    def save_image_to_file(self, image, file_path):
        try:
            with open(file_path, 'wb') as file:
                # stream the raw response body straight to disk
                shutil.copyfileobj(image.raw, file)
        except Exception as e:
            print('Save failed - {}'.format(e))

    def download_images(self, keyword, links, site):
        self.make_dir('{}/{}'.format(self.download_path, keyword))
        total = len(links)

        for index, link in enumerate(links):
            try:
                print('Downloading {} from {}: {} / {}'.format(keyword, site, index + 1, total))
                response = requests.get(link, stream=True)
                self.save_image_to_file(response, '{}/{}/{}_{}.jpg'.format(self.download_path, keyword, site, index))
                del response

            except Exception as e:
                print('Download failed - ', e)
                continue

    def download_from_site(self, keyword, site, collect_links_func):
        try:
            dirname = '{}/{}'.format(self.download_path, keyword)

            if os.path.exists(os.path.join(os.getcwd(), dirname)) and self.skip:
                print('Skipping already existing directory {}'.format(dirname))
                return

            print('Collecting links... {} from {}'.format(keyword, site))
            links = collect_links_func(keyword)

            print('Downloading images from collected links... {} from {}'.format(keyword, site))
            self.download_images(keyword, links, site)

            print('Done {} : {}'.format(site, keyword))

        except Exception as e:
            print('Exception {} - {}'.format(keyword, e))

    def download(self, args):
        self.download_from_site(keyword=args[0], site=args[1], collect_links_func=args[2])

    def do_crawling(self):
        keywords = self.get_keywords()

        tasks = []

        for keyword in keywords:
            if self.do_google:
                tasks.append([keyword, 'google', google.collect_links])

            if self.do_naver:
                tasks.append([keyword, 'naver', naver.collect_links])

        # note: multiprocessing.Pool runs worker processes, despite the n_threads name
        pool = Pool(self.n_threads)
        pool.map_async(self.download, tasks)
        pool.close()
        pool.join()
        print('pool join')

        self.integrity_check()

    def integrity_check(self):
        print('Integrity Checking...')

        dict_num_files = {}

        for dir in self.all_dirs(self.download_path):
            n_files = len(self.all_files(dir))
            dict_num_files[dir] = n_files

        avg = 0
        for dir, n_files in dict_num_files.items():
            avg += n_files / len(dict_num_files)
            print('dir: {}, file_count: {}'.format(dir, n_files))

        dict_too_small = {}

        for dir, n_files in dict_num_files.items():
            if n_files < avg * 0.5:
                dict_too_small[dir] = n_files

        if len(dict_too_small) >= 1:
            # print the banner once, then list every undersized directory
            print('_________________________________')
            print('The directories below have fewer than 50% of the average file count.')
            print('We recommend removing them and re-downloading those keywords.')
            print('_________________________________')
            print('Too small file count directories:')
            for dir, n_files in dict_too_small.items():
                print('dir: {}, file_count: {}'.format(dir, n_files))

            print("Remove directories above? (y/n)")
            answer = input()

            if answer == 'y':
                # remove directories with too few files
                print("Removing directories with too few files...")
                for dir, n_files in dict_too_small.items():
                    shutil.rmtree(dir)
                    print('Removed {}'.format(dir))

                print('Now re-run this program to re-download the removed keywords. (with skip_already_exist=True)')


if __name__ == '__main__':
    crawler = AutoCrawler(skip_already_exist=True, n_threads=4, do_google=True, do_naver=True)
    crawler.do_crawling()
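
For reference, a minimal usage sketch of the class above (an illustrative snippet, not part of this commit; it assumes keywords.txt sits next to the script and a matching chromedriver is on PATH):

    from auto_crawler import AutoCrawler

    # crawl Google only, with 8 worker processes and a custom output directory
    crawler = AutoCrawler(skip_already_exist=True, n_threads=8,
                          do_google=True, do_naver=False,
                          download_path='my_images')
    crawler.do_crawling()

Images then land in my_images/<keyword>/<site>_<index>.jpg, matching the paths built in download_images.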
Binary file added chromedriver_linux64_v2.44.zip
Binary file added chromedriver_mac64_v2.44.zip
Binary file added chromedriver_win32_v2.44.zip
68 changes: 68 additions & 0 deletions google.py
@@ -0,0 +1,68 @@
"""
Copyright 2018 YoongiKim
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException


def collect_links(keyword):
    browser = webdriver.Chrome()

    browser.get("https://www.google.com/search?q={}&source=lnms&tbm=isch".format(keyword))

    time.sleep(1)

    print('Scrolling down')

    elem = browser.find_element_by_tag_name("body")

    for i in range(60):
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)

    try:
        # "결과 더보기" is the Korean-locale "Show more results" button
        btn_more = browser.find_element(By.XPATH, '//input[@value="결과 더보기"]')
        btn_more.click()

        for i in range(60):
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)

    except (ElementNotVisibleException, NoSuchElementException):
        # the button may be absent or hidden; keep the results loaded so far
        pass

    photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="rg_bx rg_di rg_el ivg-i"]')

    print('Scraping links')

    links = []

    for box in photo_grid_boxes:
        imgs = box.find_elements(By.TAG_NAME, 'img')

        for img in imgs:
            src = img.get_attribute("src")
            # skip inline base64 thumbnails ('data:image/...'); keep http(s) urls
            if src and src[0] != 'd':
                links.append(src)

    print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('google', keyword, len(links)))
    browser.close()

    return links
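
To smoke-test this collector in isolation (a hedged sketch; it assumes chromedriver is installed and that Google still serves the 2018 image-search markup the XPath targets):

    import google

    links = google.collect_links('cat')
    print('collected', len(links), 'links')
    print(links[:3])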
2 changes: 2 additions & 0 deletions keywords.txt
@@ -0,0 +1,2 @@
cat
dog
68 changes: 68 additions & 0 deletions naver.py
@@ -0,0 +1,68 @@
"""
Copyright 2018 YoongiKim
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException


def collect_links(keyword):
    browser = webdriver.Chrome()

    browser.get("https://search.naver.com/search.naver?where=image&sm=tab_jum&query={}".format(keyword))

    time.sleep(1)

    print('Scrolling down')

    elem = browser.find_element_by_tag_name("body")

    for i in range(60):
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)

    try:
        # the "show more" button on Naver image search
        btn_more = browser.find_element(By.XPATH, '//a[@class="btn_more _more"]')
        btn_more.click()

        for i in range(60):
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)

    except (ElementNotVisibleException, NoSuchElementException):
        # the button may be absent or hidden; keep the results loaded so far
        pass

    photo_grid_boxes = browser.find_elements(By.XPATH, '//div[@class="photo_grid _box"]')

    print('Scraping links')

    links = []

    for box in photo_grid_boxes:
        imgs = box.find_elements(By.CLASS_NAME, '_img')

        for img in imgs:
            src = img.get_attribute("src")
            # skip inline base64 thumbnails ('data:image/...'); keep http(s) urls
            if src and src[0] != 'd':
                links.append(src)

    print('Collect links done. Site: {}, Keyword: {}, Total: {}'.format('naver', keyword, len(links)))
    browser.close()

    return links
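
Both collectors expose the same collect_links(keyword) signature, which is why do_crawling can pass them around as plain function references. A sketch of driving them uniformly (illustrative only):

    import google
    import naver

    collectors = {'google': google.collect_links, 'naver': naver.collect_links}
    for site, collect in collectors.items():
        links = collect('dog')
        print(site, len(links))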
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
certifi==2018.10.15
chardet==3.0.4
idna==2.7
requests==2.20.1
selenium==3.141.0
urllib3==1.24.1
