feat: Adapt to Google page updates

QianyanTech · Apr 16, 2024 · de560c8 · de560c8 · Patty-OFurniture · Apr 20, 2024
1 parent 2ff2ce0
commit de560c8
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 16 deletions.
diff --git a/crawler.py b/crawler.py
@@ -42,25 +42,25 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=
     base_url = "https://www.google.com/search?tbm=isch&hl=en"
     keywords_str = "&q=" + quote(keywords)
     query_url = base_url + keywords_str
-    
+
     if safe_mode is True:
         query_url += "&safe=on"
     else:
         query_url += "&safe=off"
-    
+
     filter_url = "&tbs="
 
     if color is not None:
         if color == "bw":
             filter_url += "ic:gray%2C"
         else:
             filter_url += "ic:specific%2Cisc:{}%2C".format(color.lower())
-    
+
     if image_type is not None:
         if image_type.lower() == "linedrawing":
             image_type = "lineart"
         filter_url += "itp:{}".format(image_type)
-        
+
     if face_only is True:
         filter_url += "itp:face"
 
@@ -73,7 +73,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
     thumb_elements = []
     while True:
         try:
-            thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
+            # old way to get thumb_elements
+            # thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
+            # Adapt to the updated Google image search page
+            thumb_elements = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img")
             my_print("Find {} images.".format(len(thumb_elements)), quiet)
             if len(thumb_elements) >= max_number:
                 break
@@ -90,7 +93,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
         except Exception as e:
             print("Exception ", e)
             pass
-    
+
     if len(thumb_elements) == 0:
         return []
 
@@ -109,16 +112,17 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
             print("Error while clicking in thumbnail:", e)
             retry_click.append(elem)
 
-    if len(retry_click) > 0:    
+    if len(retry_click) > 0:
         my_print("Retry some failed clicks ...", quiet)
         for elem in retry_click:
             try:
                 if elem.is_displayed() and elem.is_enabled():
                     elem.click()
             except Exception as e:
                 print("Error while retrying click:", e)
-
-    image_elements = driver.find_elements(By.CLASS_NAME, "islib")
+
+    # image_elements = driver.find_elements(By.CLASS_NAME, "islib")
+    image_elements = driver.find_elements(By.CSS_SELECTOR, ".ob5Hkd > a")
     image_urls = list()
     url_pattern = r"imgurl=\S*&amp;imgrefurl"
 
@@ -138,10 +142,10 @@ def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=No
     filter_url = "&qft="
     if face_only is True:
         filter_url += "+filterui:face-face"
-    
+
     if image_type is not None:
         filter_url += "+filterui:photo-{}".format(image_type)
-    
+
     if color is not None:
         if color == "bw" or color == "color":
             filter_url += "+filterui:color2-{}".format(color.lower())
@@ -183,7 +187,7 @@ def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False,
     proxies = None
     if proxy and proxy_type:
         proxies = {"http": "{}://{}".format(proxy_type, proxy),
-                   "https": "{}://{}".format(proxy_type, proxy)}                             
+                   "https": "{}://{}".format(proxy_type, proxy)}
     start = 1
     image_urls = []
     while start <= max_number:
@@ -309,7 +313,7 @@ def process_batch(batch_no, batch_size):
 
 
 def crawl_image_urls(keywords, engine="Google", max_number=10000,
-                     face_only=False, safe_mode=False, proxy=None, 
+                     face_only=False, safe_mode=False, proxy=None,
                      proxy_type="http", quiet=False, browser="chrome_headless", image_type=None, color=None):
     """
     Scrape image urls of keywords from Google Image Search

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 chromedriver-autoinstaller==0.4.0
 pyinstaller==5.9.0
-PyQt5==5.15.9
+PyQt5==5.15.10
 requests==2.31.0
 selenium==4.8.3
diff --git a/utils.py b/utils.py
@@ -13,7 +13,7 @@ def gen_valid_dir_name_for_keywords(keywords):
 class AppConfig(object):
     def __init__(self):
         self.engine = "Google"
-        
+
         self.driver = "chrome_headless"
 
         self.keywords = ""
@@ -33,7 +33,7 @@ def __init__(self):
 
     def to_command_paras(self):
         str_paras = ""
- 
+
         str_paras += ' -e ' + self.engine
 
         str_paras += ' -d ' + self.driver
@@ -72,6 +72,7 @@ def gen_keywords_list_from_file(filepath):
 def resolve_dependencies(driver=str):
     if "chrome" in driver:
         print("Checking Google Chrome and chromedriver ...")
+        # If you have already installed Chromium/Chrome and a chromedriver of the same version but still get an error, try commenting out the following three lines.
         driver_path = chromedriver_autoinstaller.install()
         if not driver_path:
             return False