Merge pull request YoongiKim#52 from YoongiKim/merge_fork
Merge HyeongminMoon's fork
YoongiKim authored May 23, 2023
2 parents de31723 + cc9f51f commit 2518344
Showing 2 changed files with 42 additions and 16 deletions.
31 changes: 22 additions & 9 deletions collect_links.py
@@ -132,13 +132,14 @@ def google(self, keyword, add_url=""):
         except ElementNotVisibleException:
             pass

-        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class="bRMDJf islir"]')
+        photo_grid_boxes = self.browser.find_elements(By.XPATH, '//div[@class=" bRMDJf islir"]')

         print('Scraping links')

         links = []

-        for box in photo_grid_boxes:
+        for idx, box in enumerate(photo_grid_boxes):
+            # print('Scraping', idx)
             try:
                 imgs = box.find_elements(By.TAG_NAME, 'img')
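The one-character selector change is load-bearing: XPath's @class= comparison is an exact string match, and Google's grid markup apparently now emits a leading space in the class attribute, so the old selector silently returned an empty list. A more change-tolerant alternative, as a minimal sketch (not the committed fix) assuming the same Selenium 4 API used in this file:

    from selenium.webdriver.common.by import By

    def find_photo_grid_boxes(browser):
        # contains() matches the class token anywhere in @class, so it survives
        # leading whitespace and extra classes. Trade-off: it would also match
        # longer class names that merely contain "bRMDJf".
        return browser.find_elements(By.XPATH, '//div[contains(@class, "bRMDJf")]')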
@@ -197,7 +198,7 @@ def naver(self, keyword, add_url=""):

         return links

-    def google_full(self, keyword, add_url=""):
+    def google_full(self, keyword, add_url="", limit=100):
         print('[Full Resolution Mode]')

         self.browser.get("https://www.google.com/search?q={}&tbm=isch{}".format(keyword, add_url))
@@ -216,15 +217,23 @@ def google_full(self, keyword, add_url=""):
         last_scroll = 0
         scroll_patience = 0

-        while True:
+        NUM_MAX_RETRY = 30
+        NUM_MAX_SCROLL_PATIENCE = 100
+        for _ in range(limit):
             try:
                 xpath = '//div[@id="islsp"]//div[@class="v4dQwb"]'
                 div_box = self.browser.find_element(By.XPATH, xpath)
                 self.highlight(div_box)

-                xpath = '//img[@class="n3VNCb"]'
-                img = div_box.find_element(By.XPATH, xpath)
-                self.highlight(img)
+                for _ in range(NUM_MAX_RETRY):
+                    try:
+                        xpath = '//img[@class="n3VNCb pT0Scc KAlRDb"]'
+                        img = div_box.find_element(By.XPATH, xpath)
+                        self.highlight(img)
+                        break
+                    except:
+                        time.sleep(0.1)
+                        pass

                 xpath = '//div[@class="k7O2sd"]'
                 loading_bar = div_box.find_element(By.XPATH, xpath)
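The single image lookup becomes a bounded poll: up to NUM_MAX_RETRY attempts spaced 0.1 s apart, giving a slow viewer pane roughly three seconds to render before the code moves on. The same pattern extracted as a standalone sketch; NoSuchElementException is what the bare except: in the diff actually absorbs, and returning a sentinel avoids the diff's subtle edge case where img stays unbound (or stale from the previous iteration) if every retry fails:

    import time

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    NUM_MAX_RETRY = 30  # 30 tries x 0.1 s = roughly a 3-second budget per image

    def wait_for_element(root, xpath):
        """Poll for an element under `root`; return it, or None on timeout."""
        for _ in range(NUM_MAX_RETRY):
            try:
                return root.find_element(By.XPATH, xpath)
            except NoSuchElementException:
                time.sleep(0.1)  # pane not rendered yet; try again shortly
        return None  # caller must handle the miss explicitly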
@@ -240,6 +249,9 @@ def google_full(self, keyword, add_url=""):
                 print('%d: %s' % (count, src))
                 count += 1

+            except KeyboardInterrupt:
+                break
+
             except StaleElementReferenceException:
                 # print('[Expected Exception - StaleElementReferenceException]')
                 pass
@@ -253,8 +265,9 @@ def google_full(self, keyword, add_url=""):
                 scroll_patience = 0
                 last_scroll = scroll

-            if scroll_patience >= 30:
-                break
+            if scroll_patience >= NUM_MAX_SCROLL_PATIENCE:
+                elem.send_keys(Keys.RIGHT)
+                continue

             elem.send_keys(Keys.RIGHT)
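Note the changed stall policy: the old code aborted the whole crawl after 30 stalled scrolls, while the new code presses RIGHT (which advances Google's image viewer) and keeps going, leaving termination to the for _ in range(limit) bound. A minimal sketch of that policy, with advance as a hypothetical stand-in for elem.send_keys(Keys.RIGHT):

    NUM_MAX_SCROLL_PATIENCE = 100

    def handle_stall(scroll_patience, advance):
        # Skip past a stuck image instead of aborting; the bounded outer loop,
        # not scroll progress, now decides when the crawl ends.
        if scroll_patience >= NUM_MAX_SCROLL_PATIENCE:
            advance()
            return True  # caller should `continue`
        return False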

27 changes: 20 additions & 7 deletions main.py
@@ -18,6 +18,7 @@
 import requests
 import shutil
 from multiprocessing import Pool
+import signal
 import argparse
 from collect_links import CollectLinks
 import imghdr
@@ -187,7 +188,7 @@ def download_images(self, keyword, links, site_name, max_count=0):
                     ext = 'png'
                     is_base64 = True
                 else:
-                    response = requests.get(link, stream=True)
+                    response = requests.get(link, stream=True, timeout=10)
                     ext = self.get_extension_from_link(link)
                     is_base64 = False
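requests applies no timeout by default, so stream=True against a dead or throttling server could hang a worker indefinitely; timeout=10 bounds the connect wait and each read wait (not the whole transfer), and the resulting requests.exceptions.Timeout is caught by the surrounding except and turned into a skipped link. The shape of the guarded download, as a self-contained sketch (raise_for_status is an addition here, not part of the commit):

    import requests

    def fetch(link):
        try:
            response = requests.get(link, stream=True, timeout=10)
            response.raise_for_status()  # treat HTTP errors like network errors
            return response.content
        except requests.exceptions.RequestException as e:
            print('Download failed - ', e)
            return None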

@@ -210,6 +211,9 @@ def download_images(self, keyword, links, site_name, max_count=0):
                     os.rename(path, path2)
                     print('Renamed extension {} -> {}'.format(ext, ext2))

+            except KeyboardInterrupt:
+                break
+
             except Exception as e:
                 print('Download failed - ', e)
                 continue
@@ -237,7 +241,7 @@ def download_from_site(self, keyword, site_code):
                 links = collect.naver(keyword, add_url)

             elif site_code == Sites.GOOGLE_FULL:
-                links = collect.google_full(keyword, add_url)
+                links = collect.google_full(keyword, add_url, self.limit)

             elif site_code == Sites.NAVER_FULL:
                 links = collect.naver_full(keyword, add_url)
@@ -254,10 +258,14 @@ def download_from_site(self, keyword, site_code):

         except Exception as e:
             print('Exception {}:{} - {}'.format(site_name, keyword, e))
+            return

     def download(self, args):
         self.download_from_site(keyword=args[0], site_code=args[1])

+    def init_worker(self):
+        signal.signal(signal.SIGINT, signal.SIG_IGN)
+
     def do_crawling(self):
         keywords = self.get_keywords()

@@ -283,10 +291,15 @@ def do_crawling(self):
             else:
                 tasks.append([keyword, Sites.NAVER])

-        pool = Pool(self.n_threads)
-        pool.map_async(self.download, tasks)
-        pool.close()
-        pool.join()
+        try:
+            pool = Pool(self.n_threads, initializer=self.init_worker)
+            pool.map(self.download, tasks)
+        except KeyboardInterrupt:
+            pool.terminate()
+            pool.join()
+        else:
+            pool.terminate()
+            pool.join()
         print('Task ended. Pool join.')

         self.imbalance_check()
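This is the standard recipe for an interruptible multiprocessing pool: Ctrl+C sends SIGINT to the whole foreground process group, so each worker installs SIG_IGN in an initializer and the parent alone fields the KeyboardInterrupt, terminating the pool cleanly; the blocking map replaces map_async so the exception surfaces at a well-defined point. Terminating even on the success path is safe here because map has already returned every result. The pattern in isolation:

    import signal
    from multiprocessing import Pool

    def init_worker():
        # Workers ignore SIGINT; only the parent reacts to Ctrl+C.
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    def work(n):
        return n * n

    if __name__ == '__main__':
        pool = Pool(4, initializer=init_worker)
        try:
            print(pool.map(work, range(10)))
            pool.close()          # normal shutdown: no more tasks coming
        except KeyboardInterrupt:
            pool.terminate()      # Ctrl+C: stop workers immediately
        finally:
            pool.join()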
@@ -352,7 +365,7 @@ def imbalance_check(self):
                              'But unstable on thumbnail mode. '
                              'Default: "auto" - false if full=false, true if full=true')
     parser.add_argument('--limit', type=int, default=0,
-                        help='Maximum count of images to download per site. (0: infinite)')
+                        help='Maximum count of images to download per site.')
     parser.add_argument('--proxy-list', type=str, default='',
                         help='The comma separated proxy list like: "socks://127.0.0.1:1080,http://127.0.0.1:1081". '
                              'Every thread will randomly choose one from the list.')
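Dropping "(0: infinite)" from the help text is consequential, not cosmetic: google_full now iterates for _ in range(limit), and range(0) is empty, so in full-resolution mode --limit 0 would collect nothing rather than everything.

    # Why 0 no longer means "infinite" in full mode:
    assert list(range(0)) == []  # `for _ in range(0):` never runs its body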