Skip to content

Commit

Permalink
✨ Extract base64 from Google thumbnail
Browse files Browse the repository at this point in the history
  • Loading branch information
lleans committed Feb 8, 2023
1 parent 3c7f13c commit 4f5c9f9
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 26 deletions.
28 changes: 14 additions & 14 deletions PicImageSearch/google.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from pathlib import Path
from typing import Any, Dict, Optional, Union
from urllib.parse import quote

from lxml.html import HTMLParser, fromstring
from pyquery import PyQuery
Expand Down Expand Up @@ -29,9 +28,11 @@ def __init__(self, **request_kwargs: Any):
def _slice(resp: str, index: int = 1) -> GoogleResponse:
    """Parse a result page and wrap it in a ``GoogleResponse``.

    Args:
        resp: HTML text of the results page.
        index: page number to record on the response (defaults to 1).

    Returns:
        A ``GoogleResponse`` built from the ``.g`` result nodes, the
        pagination cells and the page's ``<script>`` nodes.
    """
    # Force UTF-8 so lxml does not guess the encoding of the page.
    utf8_parser = HTMLParser(encoding="utf-8")
    d = PyQuery(fromstring(resp, parser=utf8_parser))

    # The <script> nodes are handed to GoogleResponse alongside the results.
    images_data = d.find("script")
    data = d.find(".g")
    # Drop the first and last <td>; only the cells in between are kept as pages.
    pages = list(d.find("td").items())[1:-1]
    # Single return: the earlier three-argument return made this one
    # unreachable and never passed the script nodes on.
    return GoogleResponse(data, pages, index, images_data)

async def goto_page(self, url: str, index: int) -> GoogleResponse:
resp_text, _, _ = await self.get(url)
Expand All @@ -53,19 +54,18 @@ async def search(
• .raw[2] = Second index of simplified data that was found <Should start from index 2, because from there is matching image>\n
• .raw[2].title = First index of title that was found\n
• .raw[2].url = First index of url source that was found\n
• .raw[2].thumbnail = First index of url image that was found
• .raw[2].thumbnail = First index of base64 string image that was found
"""
if url:
encoded_image_url = quote(url, safe="")
params = {"image_url": encoded_image_url}
resp_text, _, _ = await self.get(self.url, params=params)
elif file:
data: Dict[str, Any] = (
{"encoded_image": file}
if isinstance(file, bytes)
else {"encoded_image": open(file, "rb")}
)
resp_text, _, _ = await self.post(f"{self.url}/upload", data=data)
else:
file = await self.download(url)

if not url or not file:
raise ValueError("url or file is required")

data: Dict[str, Any] = (
{"encoded_image": file}
if isinstance(file, bytes)
else {"encoded_image": open(file, "rb")}
)
resp_text, _, _ = await self.post(f"{self.url}/upload", data=data)
return self._slice(resp_text, 1)
40 changes: 28 additions & 12 deletions PicImageSearch/model/google.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,47 @@
from typing import List
from re import compile
from json import dump

from pyquery import PyQuery


class GoogleItem:
    """A single entry of a Google reverse-image-search result page.

    Args:
        data: selection for one result node; it is queried for its ``h3``
            text and for the ``href`` of its first ``a`` element.
        thumbnail: thumbnail string already resolved by the caller.
            Optional so that a one-argument ``GoogleItem(data)`` call keeps
            working; ``None`` is normalised to an empty string.
    """

    def __init__(self, data: PyQuery, thumbnail: str = ""):
        self.origin: PyQuery = data  # original data
        self.title: str = data("h3").text()
        self.url: str = data("a").eq(0).attr("href")
        # Store exactly what the caller resolved. The parameter must not be
        # overwritten with an <img> selection, and the attribute stays a
        # string even when no thumbnail is available.
        self.thumbnail: str = thumbnail or ""

class GoogleResponse:
    """One parsed page of Google reverse-image-search results.

    Args:
        data: selection of result nodes; one ``GoogleItem`` is built per node.
        pages: pagination nodes, one per result page.
        index: number of the current page.
        images_data: selection whose ``text()`` contains the inline
            thumbnail payloads and the ``dimg_<n>`` ids they belong to.
    """

    # Inline image payload: a data URI up to the closing quote.
    _BASE64_IMAGE_RE = compile(
        r"(data:image\/(?:jpeg|jpg|png|gif);base64,[^'\"]+)"
    )
    # Bracketed list of one or more quoted ``dimg_<n>`` ids.
    _DIMG_ID_LIST_RE = compile(
        r"(\[(?:[\"']dimg_\d+['\"],?\s*)*[\"']dimg_\d+['\"]\])"
    )
    # A single id inside such a list.
    _DIMG_ID_RE = compile(r"dimg_\d+")

    def __init__(
        self,
        data: PyQuery,
        pages: List[PyQuery],
        index: int,
        images_data: PyQuery,
    ):
        self.origin: PyQuery = data  # original data
        thumbnails: dict = self.create_list_thumbnail(images_data)
        # Result values. A node without an <img> id, or with an id that has
        # no extracted payload, gets an empty thumbnail instead of raising.
        self.raw: List[GoogleItem] = [
            GoogleItem(node, thumbnails.get(node("img").attr("id"), ""))
            for node in data.items()
        ]
        self.index: int = index  # current page
        self.page: int = len(pages)  # total number of pages
        self.pages: List[PyQuery] = pages  # page sources

    def get_page_url(self, index: int) -> str:
        """Return the absolute URL of result page ``index`` (1-based)."""
        href = self.pages[index - 1]("a").eq(0).attr("href")
        return f"https://www.google.com{href}"

    @staticmethod
    def create_list_thumbnail(data: PyQuery) -> dict:
        """Map each ``dimg_<n>`` id found in ``data.text()`` to its data URI.

        The n-th id list in the text is paired with the n-th data URI. Every
        id in a list receives the same URI, and the ``\\x3d`` escape is
        turned back into ``=``.

        Returns:
            A dict of id -> data-URI string. Id lists with no matching URI
            (and URIs with no matching id list) are ignored.
        """
        text = data.text()
        images = GoogleResponse._BASE64_IMAGE_RE.findall(text)
        id_lists = GoogleResponse._DIMG_ID_LIST_RE.findall(text)

        thumbnails: dict = {}
        # zip() stops at the shorter sequence, so a count mismatch cannot
        # raise IndexError.
        for id_list, image in zip(id_lists, images):
            src = image.replace("\\x3d", "=")
            # Matching the ids directly leaves no brackets, quotes or
            # whitespace in the keys.
            for img_id in GoogleResponse._DIMG_ID_RE.findall(id_list):
                thumbnails[img_id] = src
        return thumbnails

0 comments on commit 4f5c9f9

Please sign in to comment.