Commit

Update Google & Naver
YoongiKim committed Apr 15, 2024
1 parent b54e46e commit fd9b594
Showing 1 changed file with 23 additions and 21 deletions.
44 changes: 23 additions & 21 deletions collect_links.py
@@ -29,7 +29,7 @@
class CollectLinks:
def __init__(self, no_gui=False, proxy=None):
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--no-sandbox') # To maintain user cookies
chrome_options.add_argument('--disable-dev-shm-usage')
if no_gui:
chrome_options.add_argument('--headless')
@@ -97,26 +97,27 @@ def google(self, keyword, add_url=""):

elem = self.browser.find_element(By.TAG_NAME, "body")

for i in range(60):
last_scroll = 0
scroll_patience = 0
NUM_MAX_SCROLL_PATIENCE = 50

while True:
elem.send_keys(Keys.PAGE_DOWN)
time.sleep(0.2)

try:
                # You may need to change this, because Google Images' markup changes frequently.
# btn_more = self.browser.find_element(By.XPATH, '//input[@value="결과 더보기"]')
# self.wait_and_click('//input[@id="smb"]')
self.wait_and_click('//input[@type="button"]')

for i in range(60):
elem.send_keys(Keys.PAGE_DOWN)
time.sleep(0.2)
scroll = self.get_scroll()
if scroll == last_scroll:
scroll_patience += 1
else:
scroll_patience = 0
last_scroll = scroll

except ElementNotVisibleException:
pass
if scroll_patience >= NUM_MAX_SCROLL_PATIENCE:
break

print('Scraping links')

imgs = self.browser.find_elements(By.XPATH, '//div[@class="isv-r PNCib ViTmJb BUooTd"]//img[@class="rg_i Q4LuWd"]')
imgs = self.browser.find_elements(By.XPATH, '//div[@jsname="dTDiAc"]/div[@jsname="qQjpJ"]//img')

links = []
for idx, img in enumerate(imgs):
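
For reference, the new scrolling logic in google() reduces to the standalone sketch below. It assumes a plain Selenium 4 Chrome driver; get_scroll() here is a hypothetical stand-in for the class helper and is assumed to read the window's vertical scroll offset via JavaScript.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

NUM_MAX_SCROLL_PATIENCE = 50

def get_scroll(driver):
    # Assumption: the results page scrolls the window itself, not an inner container.
    return driver.execute_script("return window.pageYOffset")

driver = webdriver.Chrome()
driver.get("https://www.google.com/search?q=cat&tbm=isch")
body = driver.find_element(By.TAG_NAME, "body")

last_scroll = 0
scroll_patience = 0
while True:
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    scroll = get_scroll(driver)
    if scroll == last_scroll:
        scroll_patience += 1    # no new content since the last key press
    else:
        scroll_patience = 0     # the page grew, reset the counter
        last_scroll = scroll
    if scroll_patience >= NUM_MAX_SCROLL_PATIENCE:
        break                   # assume we have reached the end of the results
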
Expand Down Expand Up @@ -148,7 +149,7 @@ def naver(self, keyword, add_url=""):
elem.send_keys(Keys.PAGE_DOWN)
time.sleep(0.2)

imgs = self.browser.find_elements(By.XPATH, '//div[@class="photo_bx api_ani_send _photoBox"]//img[@class="_image _listImage"]')
imgs = self.browser.find_elements(By.XPATH, '//div[@class="tile_item _fe_image_tab_content_tile"]//img[@class="_fe_image_tab_content_thumbnail_image"]')

print('Scraping links')

@@ -175,11 +176,11 @@ def google_full(self, keyword, add_url="", limit=100):
self.browser.get("https://www.google.com/search?q={}&tbm=isch{}".format(keyword, add_url))
time.sleep(1)

self.wait_and_click('//div[@class="isv-r PNCib ViTmJb BUooTd"]//img[@class="rg_i Q4LuWd"]')
# Click the first image to get full resolution images
self.wait_and_click('//div[@jsname="dTDiAc"]')
time.sleep(1)

body = self.browser.find_element(By.TAG_NAME, "body")
sidebar = body.find_element(By.XPATH, '//div[@id="islsp"]')

print('Scraping links')

@@ -192,7 +193,8 @@

while len(links) < limit:
try:
xpath = '//div[@id="islsp"]//div[@class="tvh9oe BIB1wf hVa2Fd"]//img[@class="sFlh5c pT0Scc iPVvYb"]'
                # Google renders a compressed image first, then overlays it with the full-resolution image later.
xpath = '//div[@jsname="figiqf"]//img[not(contains(@src,"gstatic.com"))]'

t1 = time.time()
while True:
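
The intent of the new XPath is to skip the gstatic.com thumbnail and wait for the full-resolution src. A minimal polling sketch under that assumption (the jsname value and the src filter come from the diff; the helper name and timeout are made up for this example):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

XPATH_FULL_IMG = '//div[@jsname="figiqf"]//img[not(contains(@src, "gstatic.com"))]'

def wait_for_full_image(driver, timeout=10):
    # until() retries the lambda, ignoring NoSuchElementException,
    # until it returns an element or the timeout expires.
    img = WebDriverWait(driver, timeout).until(
        lambda d: d.find_element(By.XPATH, XPATH_FULL_IMG))
    return img.get_attribute('src')
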
@@ -252,7 +254,8 @@ def naver_full(self, keyword, add_url=""):

print('Scraping links')

self.wait_and_click('//div[@class="photo_bx api_ani_send _photoBox"]')
# Click the first image
self.wait_and_click('//div[@class="tile_item _fe_image_tab_content_tile"]//img[@class="_fe_image_tab_content_thumbnail_image"]')
time.sleep(1)

links = []
@@ -263,7 +266,7 @@

while True:
try:
xpath = '//div[@class="image _imageBox"]/img[@class="_image"]'
xpath = '//img[@class="_fe_image_viewer_image_fallback_target"]'
imgs = self.browser.find_elements(By.XPATH, xpath)

for img in imgs:
@@ -292,7 +295,6 @@ def naver_full(self, keyword, add_url=""):
break

elem.send_keys(Keys.RIGHT)
elem.send_keys(Keys.PAGE_DOWN)

links = self.remove_duplicates(links)

