✨ Extract base64 from Google thumbnail

AtropsCooper · Feb 8, 2023 · 4f5c9f9 · 4f5c9f9
1 parent 3c7f13c
commit 4f5c9f9
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 26 deletions.
diff --git a/PicImageSearch/google.py b/PicImageSearch/google.py
@@ -1,6 +1,5 @@
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
-from urllib.parse import quote
 
 from lxml.html import HTMLParser, fromstring
 from pyquery import PyQuery
@@ -29,9 +28,11 @@ def __init__(self, **request_kwargs: Any):
     def _slice(resp: str, index: int = 1) -> GoogleResponse:
+        """Parse the HTML of a results page into a ``GoogleResponse``.
+
+        Args:
+            resp: HTML text of the results page.
+            index: Page number stored on the returned response (default 1).
+
+        Returns:
+            A ``GoogleResponse`` built from the ``.g`` nodes, the pagination
+            cells and the ``<script>`` elements of the page.
+        """
         utf8_parser = HTMLParser(encoding="utf-8")
         d = PyQuery(fromstring(resp, parser=utf8_parser))
+
+        # The <script> elements are handed to GoogleResponse, which pulls the
+        # base64 thumbnails out of their text.
+        images_data = d.find("script")
         data = d.find(".g")
+        # First and last <td> are dropped; presumably the previous/next
+        # pagination cells (TODO confirm against a live page).
         pages = list(d.find("td").items())[1:-1]
-        return GoogleResponse(data, pages, index)
+        return GoogleResponse(data, pages, index, images_data)
 
     async def goto_page(self, url: str, index: int) -> GoogleResponse:
         resp_text, _, _ = await self.get(url)
@@ -53,19 +54,18 @@ async def search(
         • .raw[2] = Second index of simplified data that was found <Should start from index 2, because from there is matching image>\n
         • .raw[2].title = First index of title that was found\n
         • .raw[2].url = First index of url source that was found\n
-        • .raw[2].thumbnail = First index of url image that was found
+        • .raw[2].thumbnail = First index of base64 string image that was found
         """
         if url:
-            encoded_image_url = quote(url, safe="")
-            params = {"image_url": encoded_image_url}
-            resp_text, _, _ = await self.get(self.url, params=params)
-        elif file:
-            data: Dict[str, Any] = (
-                {"encoded_image": file}
-                if isinstance(file, bytes)
-                else {"encoded_image": open(file, "rb")}
-            )
-            resp_text, _, _ = await self.post(f"{self.url}/upload", data=data)
-        else:
+            file = await self.download(url)
+
+        if not url or not file:
             raise ValueError("url or file is required")
+
+        data: Dict[str, Any] = (
+            {"encoded_image": file}
+            if isinstance(file, bytes)
+            else {"encoded_image": open(file, "rb")}
+        )
+        resp_text, _, _ = await self.post(f"{self.url}/upload", data=data)
         return self._slice(resp_text, 1)
diff --git a/PicImageSearch/model/google.py b/PicImageSearch/model/google.py
@@ -1,31 +1,47 @@
 from typing import List
+from re import compile
+from json import dump
 
 from pyquery import PyQuery
 
 
 class GoogleItem:
+    """A single search result extracted from one result node."""
+
-    def __init__(self, data: PyQuery):
+    def __init__(self, data: PyQuery, thumbnail: str = ""):
+        """Read title, link and thumbnail for one result.
+
+        Args:
+            data: Node of a single result; the text of its ``h3`` becomes the
+                title and the ``href`` of its first ``a`` becomes the url.
+            thumbnail: Thumbnail string resolved by the caller. Optional so
+                that the earlier one-argument call ``GoogleItem(data)`` keeps
+                working.
+        """
-        self.origin: PyQuery = data  # 原始数据
+        self.origin: PyQuery = data  # raw node this item was built from
         self.title: str = data("h3").text()
         self.url: str = data("a").eq(0).attr("href")
-        self.thumbnail: str = ""
-        thumbnail = data("img")
-        if (
-            thumbnail
-            and thumbnail.attr("src")
-            != "data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="
-        ):
-            self.thumbnail = thumbnail.attr("src")
-
+        # Callers may pass None when no thumbnail matched; store "" so the
+        # attribute is always a str, as annotated.
+        self.thumbnail: str = thumbnail or ""
+
 
 class GoogleResponse:
+    """One parsed page of search results plus its pagination nodes."""
+
-    def __init__(self, data: PyQuery, pages: List[PyQuery], index: int):
+    def __init__(
+        self,
+        data: PyQuery,
+        pages: List[PyQuery],
+        index: int,
+        images_data: PyQuery,
+    ):
+        """Build the result items and pagination info for one page.
+
+        Args:
+            data: Result nodes; one ``GoogleItem`` is created per node.
+            pages: Pagination nodes, one per reachable page.
+            index: Number of the current page.
+            images_data: ``<script>`` elements whose text carries the base64
+                thumbnails together with the ``dimg_*`` ids they belong to.
+        """
-        self.origin: PyQuery = data  # 原始数据
-        # 结果返回值
-        self.raw: List[GoogleItem] = [GoogleItem(i) for i in data.items()]
+        self.origin: PyQuery = data  # raw result nodes
+        thumbnails: dict = self.create_list_thumbnail(images_data)
+        # Parsed results. A result whose <img> has no id, or whose id has no
+        # entry in the scripts, gets None instead of raising KeyError.
+        self.raw: List[GoogleItem] = []
+        for item in data.items():
+            img_id = item("img").attr("id")
+            thumbnail = thumbnails.get(img_id) if img_id else None
+            self.raw.append(GoogleItem(item, thumbnail))
-        self.index: int = index  # 当前页
-        self.page: int = len(pages)  # 总页数
-        self.pages: List[PyQuery] = pages  # 页面源
+        self.index: int = index  # current page number
+        self.page: int = len(pages)  # total number of pages
+        self.pages: List[PyQuery] = pages  # pagination nodes
 
     def get_page_url(self, index: int) -> str:
+        """Return the absolute URL of page ``index`` (1-based).
+
+        Raises:
+            IndexError: If ``index`` is larger than the number of pages.
+        """
         return f'https://www.google.com{self.pages[index - 1]("a").eq(0).attr("href")}'
+
+    @staticmethod
+    def create_list_thumbnail(data: PyQuery) -> dict:
+        """Map ``dimg_*`` image ids to the base64 data URIs found in ``data``.
+
+        The text of ``data`` is scanned for base64 image data URIs and for
+        bracketed lists of ``dimg_<n>`` ids. The n-th id list is paired with
+        the n-th data URI, and every id in the list maps to that URI.
+
+        Args:
+            data: Nodes whose combined text is searched.
+
+        Returns:
+            Dict from image id to data URI, with the literal ``\\x3d``
+            escapes replaced by ``=``. Empty when nothing matches.
+        """
+        base_64_regex = compile(r"(data:image\/(?:jpeg|jpg|png|gif);base64,[^'\"]+)")
+        extract_id = compile(r"(\[(?:[\"']dimg_\d+['\"],?\s*)*[\"']dimg_\d+['\"]\])")
+
+        script_text = data.text()
+        images = base_64_regex.findall(script_text)
+        id_lists = extract_id.findall(script_text)
+
+        thumbnails: dict = {}
+        # zip() stops at the shorter sequence, so an id list without a
+        # matching image is skipped instead of raising IndexError.
+        for id_list, image in zip(id_lists, images):
+            image = image.replace("\\x3d", "=")
+            for image_id in id_list.strip("[]").split(","):
+                # The pattern allows whitespace after each comma; strip it
+                # along with the quotes so the key equals the <img> id.
+                thumbnails[image_id.strip().strip("\"'")] = image
+        return thumbnails