Merge pull request YoongiKim#51 from HyeongminMoon/master
fixes some bugs, adds a transparent filter
YoongiKim authored May 23, 2023
2 parents de31723 + c9c9f94 commit f43b032
Showing 3 changed files with 71 additions and 22 deletions.
16 changes: 15 additions & 1 deletion README.md
@@ -1,3 +1,17 @@
# Changes on this Fork
updated: 2023.03.02 / ChromeDriver 110.0.5481.178

- Fixed a bug in `google` & `google_full`
- Added a `transparent` argument that filters for transparent images only (Google only)
- Fixed a bug where `limit` did not work in `google_full`
- Fixed a bug where multiprocessing workers did not exit when `Ctrl+C` was pressed

### Recommended usage of this Fork
```
python3 main.py --google true --transparent true --naver false --full true #[--no_gui false] [--limit 100]
```
You can now stop the program at any time by pressing `Ctrl+C`!
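The `--transparent true` flag in the usage line above works by appending Google's `tbs=ic:trans` query parameter to the image-search URL (see the `download_from_site` hunk in `main.py` below). A minimal sketch of the resulting request URL, with a hypothetical keyword; the `quote()` call is illustrative, the crawler itself inserts the raw keyword:
```
# Sketch: how --transparent maps onto the Google Images URL.
from urllib.parse import quote

keyword = "cat"            # hypothetical example keyword
add_url = "&tbs=ic:trans"  # appended when --transparent true
url = "https://www.google.com/search?q={}&tbm=isch{}".format(quote(keyword), add_url)
print(url)  # https://www.google.com/search?q=cat&tbm=isch&tbs=ic:trans
```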

# AutoCrawler
Google, Naver multiprocess image crawler (High Quality & Speed & Customizable)

@@ -19,7 +33,7 @@ Google, Naver multiprocess image crawler (High Quality & Speed & Customizable)
# Arguments
usage:
```
python3 main.py [--skip true] [--threads 4] [--google true] [--naver true] [--full false] [--face false] [--no_gui auto] [--limit 0]
python3 main.py [--skip true] [--threads 4] [--google true] [--transparent false] [--naver true] [--full false] [--face false] [--no_gui auto] [--limit 0]
```

33 changes: 24 additions & 9 deletions collect_links.py
@@ -132,13 +132,15 @@ def google(self, keyword, add_url=""):
except ElementNotVisibleException:
pass

photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="bRMDJf islir"]')
# photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="bRMDJf islir"]')
photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class=" bRMDJf islir"]')

print('Scraping links')

links = []

for box in photo_grid_boxes:
for idx, box in enumerate(photo_grid_boxes):
# print('Scraping', idx)
try:
imgs = box.find_elements(By.TAG_NAME, 'img')

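The selector fix above is needed because Google now renders the grid div's class attribute with a leading space (`" bRMDJf islir"`), so an exact `@class` match on `"bRMDJf islir"` returns nothing. A whitespace-tolerant alternative, not part of this commit, would match with XPath `contains()`:
```
# Sketch (not in this commit): contains() does substring matching,
# so it tolerates the leading space in the class attribute.
photo_grid_boxes = self.browser.find_elements(
    By.XPATH, '//div[contains(@class, "bRMDJf") and contains(@class, "islir")]')
```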
@@ -197,7 +199,7 @@ def naver(self, keyword, add_url=""):

return links

def google_full(self, keyword, add_url=""):
def google_full(self, keyword, add_url="", limit=100):
print('[Full Resolution Mode]')

self.browser.get("https://www.google.com/search?q={}&tbm=isch{}".format(keyword, add_url))
@@ -216,15 +218,24 @@ def google_full(self, keyword, add_url="", limit=100):
last_scroll = 0
scroll_patience = 0

while True:
NUM_MAX_RETRY = 30
NUM_MAX_SCROLL_PATIENCE = 100
# while True:
for _ in range(limit):
try:
xpath = '//div[@id="islsp"]//div[@class="v4dQwb"]'
div_box = self.browser.find_element(By.XPATH, xpath)
self.highlight(div_box)

xpath = '//img[@class="n3VNCb"]'
img = div_box.find_element(By.XPATH, xpath)
self.highlight(img)
for _ in range(NUM_MAX_RETRY):
try:
xpath = '//img[@class="n3VNCb pT0Scc KAlRDb"]'
img = div_box.find_element(By.XPATH, xpath)
self.highlight(img)
break
except:
time.sleep(0.1)
pass

xpath = '//div[@class="k7O2sd"]'
loading_bar = div_box.find_element(By.XPATH, xpath)
@@ -240,6 +251,9 @@
print('%d: %s' % (count, src))
count += 1

except KeyboardInterrupt:
break

except StaleElementReferenceException:
# print('[Expected Exception - StaleElementReferenceException]')
pass
@@ -253,8 +267,9 @@
scroll_patience = 0
last_scroll = scroll

if scroll_patience >= 30:
break
if scroll_patience >= NUM_MAX_SCROLL_PATIENCE:
elem.send_keys(Keys.RIGHT)
continue

elem.send_keys(Keys.RIGHT)

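Two patterns in the `google_full` changes are worth noting: the unbounded `while True` becomes `for _ in range(limit)`, and the full-resolution `<img>` is polled for up to `NUM_MAX_RETRY` attempts because it loads asynchronously. A standalone sketch of that bounded-poll idea, with hypothetical names (the commit inlines the loop rather than factoring it out):
```
import time

def poll_until(find, max_retry=30, delay=0.1):
    # Call `find` up to max_retry times, sleeping `delay` between attempts;
    # return the first result, or None if every attempt raised.
    for _ in range(max_retry):
        try:
            return find()
        except Exception:
            time.sleep(delay)
    return None

# Usage sketch: keep trying until the full-resolution image element appears.
# img = poll_until(lambda: div_box.find_element(
#     By.XPATH, '//img[@class="n3VNCb pT0Scc KAlRDb"]'))
```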
44 changes: 32 additions & 12 deletions main.py
@@ -18,6 +18,7 @@
import requests
import shutil
from multiprocessing import Pool
import signal
import argparse
from collect_links import CollectLinks
import imghdr
@@ -52,7 +53,7 @@ def get_face_url(code):


class AutoCrawler:
def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_naver=True, download_path='download',
def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, transparent=False, do_naver=True, download_path='download',
full_resolution=False, face=False, no_gui=False, limit=0, proxy_list=None):
"""
:param skip_already_exist: Skips keyword already downloaded before. This is needed when re-downloading.
@@ -70,6 +71,7 @@ def __init__(self, skip_already_exist=True, n_threads=4, do_google=True, do_nave
self.skip = skip_already_exist
self.n_threads = n_threads
self.do_google = do_google
self.transparent = transparent
self.do_naver = do_naver
self.download_path = download_path
self.full_resolution = full_resolution
@@ -187,7 +189,7 @@ def download_images(self, keyword, links, site_name, max_count=0):
ext = 'png'
is_base64 = True
else:
response = requests.get(link, stream=True)
response = requests.get(link, stream=True, timeout=10)
ext = self.get_extension_from_link(link)
is_base64 = False

@@ -210,6 +212,9 @@
os.rename(path, path2)
print('Renamed extension {} -> {}'.format(ext, ext2))

except KeyboardInterrupt:
break

except Exception as e:
print('Download failed - ', e)
continue
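The added `timeout=10` bounds how long a download can block on an unresponsive host; without it, `requests.get` can hang indefinitely and the worker never reaches the `KeyboardInterrupt` handler above. As a scalar, the value applies to both the connect and the read timeout. A minimal sketch with a placeholder URL:
```
import requests

try:
    # A hung host now raises Timeout instead of blocking the worker forever.
    response = requests.get("https://example.com/image.png",
                            stream=True, timeout=10)
except requests.exceptions.Timeout:
    print('Download failed - timed out')
```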
@@ -231,13 +236,17 @@ def download_from_site(self, keyword, site_code):
print('Collecting links... {} from {}'.format(keyword, site_name))

if site_code == Sites.GOOGLE:
if self.transparent:
add_url += '&tbs=ic:trans'
links = collect.google(keyword, add_url)

elif site_code == Sites.NAVER:
links = collect.naver(keyword, add_url)

elif site_code == Sites.GOOGLE_FULL:
links = collect.google_full(keyword, add_url)
if self.transparent:
add_url += '&tbs=ic:trans'
links = collect.google_full(keyword, add_url, self.limit)

elif site_code == Sites.NAVER_FULL:
links = collect.naver_full(keyword, add_url)
@@ -254,10 +263,14 @@

except Exception as e:
print('Exception {}:{} - {}'.format(site_name, keyword, e))
return

def download(self, args):
self.download_from_site(keyword=args[0], site_code=args[1])

def init_worker(self):
signal.signal(signal.SIGINT, signal.SIG_IGN)

def do_crawling(self):
keywords = self.get_keywords()

@@ -283,10 +296,15 @@ def do_crawling(self):
else:
tasks.append([keyword, Sites.NAVER])

pool = Pool(self.n_threads)
pool.map_async(self.download, tasks)
pool.close()
pool.join()
try:
pool = Pool(self.n_threads, initializer=self.init_worker)
pool.map(self.download, tasks)
except KeyboardInterrupt:
pool.terminate()
pool.join()
else:
pool.terminate()
pool.join()
print('Task ended. Pool join.')

self.imbalance_check()
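The clean-shutdown fix combines two pieces: each worker installs `SIG_IGN` for `SIGINT` via the pool initializer, so `Ctrl+C` is delivered only to the parent, and the parent switches from `map_async` to the blocking `map`, so the `KeyboardInterrupt` surfaces inside the `try` block where the pool can be terminated. A self-contained sketch of the same pattern (it uses `close()` on the normal path, where the commit calls `terminate()` in both branches):
```
import signal
import time
from multiprocessing import Pool

def init_worker():
    # Workers ignore SIGINT; only the parent process handles Ctrl+C.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

def work(n):
    time.sleep(1)
    return n * n

if __name__ == '__main__':
    pool = Pool(4, initializer=init_worker)
    try:
        print(pool.map(work, range(8)))  # blocking, so Ctrl+C lands here
    except KeyboardInterrupt:
        pool.terminate()  # stop workers immediately
    else:
        pool.close()      # normal shutdown: no more tasks
    pool.join()
```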
@@ -343,6 +361,7 @@ def imbalance_check(self):
help='Skips keyword already downloaded before. This is needed when re-downloading.')
parser.add_argument('--threads', type=int, default=4, help='Number of threads to download.')
parser.add_argument('--google', type=str, default='true', help='Download from google.com (boolean)')
parser.add_argument('--transparent', type=str, default='false', help='Filter for transparent background images(for google)')
parser.add_argument('--naver', type=str, default='true', help='Download from naver.com (boolean)')
parser.add_argument('--full', type=str, default='false',
help='Download full resolution image instead of thumbnails (slow)')
@@ -351,8 +370,8 @@
help='No GUI mode. Acceleration for full_resolution mode. '
'But unstable on thumbnail mode. '
'Default: "auto" - false if full=false, true if full=true')
parser.add_argument('--limit', type=int, default=0,
help='Maximum count of images to download per site. (0: infinite)')
parser.add_argument('--limit', type=int, default=100,
help='Maximum count of images to download per site.')
parser.add_argument('--proxy-list', type=str, default='',
help='The comma separated proxy list like: "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
'Every thread will randomly choose one from the list.')
@@ -361,6 +380,7 @@
_skip = False if str(args.skip).lower() == 'false' else True
_threads = args.threads
_google = False if str(args.google).lower() == 'false' else True
_transparent = False if str(args.transparent).lower() == 'false' else True
_naver = False if str(args.naver).lower() == 'false' else True
_full = False if str(args.full).lower() == 'false' else True
_face = False if str(args.face).lower() == 'false' else True
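Note that this flag parsing treats any value other than the literal string `'false'` as true, so a typo like `--transparent flase` silently enables the filter. A stricter parser, not part of this commit, might look like:
```
def str2bool(value: str) -> bool:
    # Illustrative strict boolean parser for CLI flags (not in the commit).
    v = value.strip().lower()
    if v in ('true', '1', 'yes'):
        return True
    if v in ('false', '0', 'no'):
        return False
    raise ValueError('expected a boolean, got {!r}'.format(value))
```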
@@ -376,10 +396,10 @@
_no_gui = False

print(
'Options - skip:{}, threads:{}, google:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, _proxy_list:{}'
.format(_skip, _threads, _google, _naver, _full, _face, _no_gui, _limit, _proxy_list))
'Options - skip:{}, threads:{}, google:{}, transparent:{}, naver:{}, full_resolution:{}, face:{}, no_gui:{}, limit:{}, _proxy_list:{}'
.format(_skip, _threads, _google, _transparent, _naver, _full, _face, _no_gui, _limit, _proxy_list))

crawler = AutoCrawler(skip_already_exist=_skip, n_threads=_threads,
do_google=_google, do_naver=_naver, full_resolution=_full,
do_google=_google, transparent=_transparent, do_naver=_naver, full_resolution=_full,
face=_face, no_gui=_no_gui, limit=_limit, proxy_list=_proxy_list)
crawler.do_crawling()
