Commit b9fa4c3

Added cloudscraper to bypass new Kobo bot protection

NotSimone committed Jun 18, 2024
1 parent e640871 commit b9fa4c3
Showing 131 changed files with 50,282 additions and 253 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Simon Hua

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
8 changes: 8 additions & 0 deletions README.md
@@ -17,3 +17,11 @@ To fetch the metadata of multiple books at once, select some books, right click
- If the first match is not correct, try raising the "Number of matches to fetch" option in the plugin configuration and fetching metadata again with the "Download metadata" button in the individual metadata editor. This lets you select from the configured number of possible matches.
- If you are having trouble matching a series, check what it is called on the Kobo store and try matching that title.
- If you know what the correct match should be, try filling in the ISBN in the Identifiers field with the format `isbn:xxxxxxxxxxxxx`. The plugin will then perform the metadata search with that ISBN.
- If you are getting 503 errors, wait and try again the next day. This is probably Cloudflare's bot detection triggering, and it appears to have some kind of lockout mechanism (see the sketch below).
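For reference, here is a minimal sketch of how cloudscraper gets past the Cloudflare challenge. It uses cloudscraper's documented `create_scraper` entry point; the plugin's actual request handling lives in `kobo_metadata.py`, so treat this as an illustration rather than the exact plugin code:

```python
import cloudscraper

# create_scraper() returns a drop-in replacement for requests.Session that
# transparently solves Cloudflare's anti-bot challenges.
scraper = cloudscraper.create_scraper()
resp = scraper.get(
    "https://www.kobo.com/au/en/search",
    params={"query": "9781761108105", "fcmedia": "Book"},
)
print(resp.status_code)  # 200 once the challenge is solved, rather than 503
```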

## Used Open Source Software
This plugin bundles the following open-source software:
- [cloudscraper](https://github.com/VeNoMouS/cloudscraper) ([MIT License](https://github.com/VeNoMouS/cloudscraper/blob/master/LICENSE))
- [requests](https://github.com/psf/requests) ([Apache 2.0 License](https://github.com/psf/requests/blob/main/LICENSE))
- [urllib3](https://github.com/urllib3/urllib3) ([MIT License](https://github.com/urllib3/urllib3/blob/main/LICENSE.txt))
- [idna](https://github.com/kjd/idna) ([BSD-3 License](https://github.com/kjd/idna/blob/master/LICENSE.md))
270 changes: 19 additions & 251 deletions __init__.py
@@ -1,23 +1,10 @@
import re
import string
from queue import Queue
from typing import List, Optional, Tuple
from urllib.parse import urlencode

from calibre import browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors
from calibre.utils.config_base import tweaks
from calibre.utils.date import parse_only_date
from calibre.utils.logging import Log
from lxml import html
from calibre.ebooks.metadata.sources.base import Option, Source


class KoboMetadata(Source):
name = "Kobo Metadata"
author = "NotSimone"
version = (1, 5, 0)
version = (1, 6, 0)
minimum_calibre_version = (5, 0, 0)
description = _("Downloads metadata and covers from Kobo")

@@ -37,8 +24,6 @@ class KoboMetadata(Source):
has_html_comments = True
supports_gzip_transfer_encoding = True

BASE_URL = "https://www.kobo.com/"

COUNTRIES = {
"ca": _("Canada"),
"us": _("United States"),
@@ -135,17 +120,23 @@ class KoboMetadata(Source):
),
)

_impl = None

def __init__(self, *args, **kwargs):
Source.__init__(self, *args, **kwargs)
with self:
from .kobo_metadata import KoboMetadataImpl

def get_book_url(self, identifiers) -> Optional[Tuple]:
self._impl = KoboMetadataImpl(self)

def get_book_url(self, identifiers):
isbn = identifiers.get("isbn", None)
if isbn:
# Example output: "https://www.kobo.com/au/en/search?query=9781761108105"
return ("isbn", isbn, self._get_search_url(isbn, 1))
return ("isbn", isbn, self._impl.get_search_url(isbn, 1))
return None

def get_cached_cover_url(self, identifiers) -> Optional[str]:
def get_cached_cover_url(self, identifiers):
isbn = identifiers.get("isbn", None)

if isbn is not None:
@@ -163,38 +154,7 @@ def identify(
identifiers={},
timeout=30,
) -> None:
log.info(f"KoboMetadata::identify: title: {title}, authors: {authors}, identifiers: {identifiers}")

isbn = check_isbn(identifiers.get("isbn", None))
urls = []

if isbn:
log.info(f"KoboMetadata::identify: Getting metadata with isbn: {isbn}")
# isbn searches will (sometimes) redirect to the product page
isbn_urls = self._perform_query(isbn, log, timeout)
if isbn_urls:
urls.append(isbn_urls[0])

query = self._generate_query(title, authors)
log.info(f"KoboMetadata::identify: Searching with query: {query}")
urls.extend(self._perform_query(query, log, timeout))

index = 0
for url in urls:
log.info(f"KoboMetadata::identify: Looking up metadata with url: {url}")
try:
metadata = self._lookup_metadata(url, log, timeout)
except Exception as e:
log.error(f"KoboMetadata::identify: Got exception looking up metadata: {e}")
return

if metadata:
metadata.source_relevance = index
result_queue.put(metadata)
else:
log.info("KoboMetadata::identify:: Could not find matching book")
index += 1
return
self._impl.identify(result_queue, title, authors, identifiers, self.prefs, timeout, log)

def download_cover(
self,
@@ -210,209 +170,17 @@ def download_cover(
cover_url = self.get_cached_cover_url(identifiers)
if not cover_url:
log.info("KoboMetadata::download_cover: No cached url found, running identify")
res_queue = Queue()
self.identify(log, res_queue, abort, title, authors, identifiers, timeout)
if res_queue.empty():
log.error("KoboMetadata::download_cover: Could not identify book")
return
self._impl.get_cover_url(title, authors, identifiers, self.prefs, timeout, log)

metadata = res_queue.get()
cover_url = self.get_cached_cover_url(metadata)
# Try again now that we set the cached url
cover_url = self.get_cached_cover_url(identifiers)
if not cover_url:
log.error("KoboMetadata::download_cover: Could not find cover")
log.error("KoboMetadata::download_cover: Could not get cover")
return

br = self._get_browser()
try:
cover = br.open_novisit(cover_url, timeout=timeout).read()
cover = self._impl.get_cover(cover_url, timeout)
result_queue.put((self, cover))
except Exception as e:
log.error(f"KoboMetadata::download_cover: Got exception while opening cover url: {e}")
return

result_queue.put((self, cover))

def _get_search_url(self, search_str: str, page_number: int) -> str:
query = {"query": search_str, "fcmedia": "Book", "pageNumber": page_number, "fclanguages": "all"}
return f"{self.BASE_URL}{self.prefs['country']}/en/search?{urlencode(query)}"

def _generate_query(self, title: str, authors: list[str]) -> str:
# Remove leading zeroes from the title if configured
# Kobo search doesn't do a great job of matching numbers
title = " ".join(
x.lstrip("0") if self.prefs["remove_leading_zeroes"] else x
for x in self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)
)

if authors:
title += " " + " ".join(self.get_author_tokens(authors))

return title
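# Rough worked example, assuming remove_leading_zeroes is enabled: for the
# hypothetical title "Vol. 010 The Beginning" and author ["Jane Doe"], the
# title tokens come back roughly as ["Vol.", "010", "The", "Beginning"],
# "010".lstrip("0") gives "10", and the resulting query is roughly
# "Vol. 10 The Beginning Jane Doe".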

def _get_browser(self) -> browser:
br: browser = self.browser
br.set_header(
"User-Agent",
"Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0",
)
return br

# Returns [lxml html element, is search result]
def _get_webpage(self, url: str, log: Log, timeout: int) -> Tuple[Optional[html.Element], bool]:
br = self._get_browser()
try:
resp = br.open_novisit(url, timeout=timeout)
tree = html.fromstring(resp.read())
is_search = "/search?" in resp.geturl()
return (tree, is_search)
except Exception as e:
log.error(f"KoboMetadata::_get_webpage: Got exception while opening url: {e}")
return (None, False)
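# Note on the is_search flag: open_novisit follows redirects, so resp.geturl()
# reports the final URL. An ISBN query that Kobo redirects straight to a
# product page no longer contains "/search?", which is how callers tell a
# product page apart from a normal results page.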

# Returns a list of urls that match our search
def _perform_query(self, query: str, log: Log, timeout: int) -> list[str]:
url = self._get_search_url(query, 1)
log.info(f"KoboMetadata::identify: Searching for book with url: {url}")

tree, is_search = self._get_webpage(url, log, timeout)
if tree is None:
log.info(f"KoboMetadata::_lookup_metadata: Could not get url: {url}")
return []

# Query redirected straight to product page
if not is_search:
return [url]

results = self._get_search_matches(tree, log)

page_num = 2
# a reasonable default for how many we should try before we give up
max_page_num = 4
while len(results) < self.prefs["num_matches"] and page_num < max_page_num:
url = self._get_search_url(query, page_num)
tree, is_search = self._get_webpage(url, log, timeout)
assert tree and is_search
results.extend(self._get_search_matches(tree, log))
page_num += 1

return results[: self.prefs["num_matches"]]
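# Worked example: with num_matches = 10 and a first page yielding 6 links, the
# loop fetches page 2 and then page 3 at most (page_num < max_page_num stops
# it before page 4), and the final slice trims the combined list to at most 10.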

def _get_search_matches(self, page: html.Element, log: Log) -> List[str]:
# Kobo seems to have partially moved to a new webpage for their search pages
if len(page.xpath("//div[@data-testid='search-result-widget']")):
log.info("KoboMetadata::_get_search_matches: Detected new search page")
result_elements = page.xpath("//a[@data-testid='title']")
# Only get every second because the page includes mobile and web urls
return [x.get("href") for x in result_elements[::2]]

# Old
log.info("KoboMetadata::_get_search_matches: Detected old search page")
result_elements = page.xpath("//h2[@class='title product-field']/a")
return [x.get("href") for x in result_elements]

# Given the url for a book, parse and return the metadata
def _lookup_metadata(self, url: str, log: Log, timeout: int) -> Optional[Metadata]:
tree, is_search = self._get_webpage(url, log, timeout)
if tree is None or is_search:
log.info(f"KoboMetadata::_lookup_metadata: Could not get url: {url}")
return None

title_elements = tree.xpath("//h1[@class='title product-field']")
title = title_elements[0].text.strip()
log.info(f"KoboMetadata::_lookup_metadata: Got title: {title}")

authors_elements = tree.xpath("//span[@class='visible-contributors']/a")
authors = fixauthors([x.text for x in authors_elements])
log.info(f"KoboMetadata::_lookup_metadata: Got authors: {authors}")

metadata = Metadata(title, authors)

series_elements = tree.xpath("//span[@class='series product-field']")
if series_elements:
# Books in series but without an index get a nested series product-field class
# With index: https://www.kobo.com/au/en/ebook/fourth-wing-1
# Without index: https://www.kobo.com/au/en/ebook/les-damnees-de-la-mer-femmes-et-frontieres-en-mediterranee
series_name_element = series_elements[-1].xpath("span[@class='product-sequence-field']/a")
if series_name_element:
metadata.series = series_name_element[0].text
log.info(f"KoboMetadata::_lookup_metadata: Got series: {metadata.series}")

series_index_element = series_elements[-1].xpath("span[@class='sequenced-name-prefix']")
if series_index_element:
series_index_match = re.match("Book (.*) - ", series_index_element[0].text)
if series_index_match:
metadata.series_index = series_index_match.groups(0)[0]
log.info(f"KoboMetadata::_lookup_metadata: Got series_index: {metadata.series_index}")

book_details_elements = tree.xpath("//div[@class='bookitem-secondary-metadata']/ul/li")
if book_details_elements:
metadata.publisher = book_details_elements[0].text.strip()
log.info(f"KoboMetadata::_lookup_metadata: Got publisher: {metadata.publisher}")
for x in book_details_elements[1:]:
descriptor = x.text.strip()
if descriptor == "Release Date:":
metadata.pubdate = parse_only_date(x.xpath("span")[0].text)
log.info(f"KoboMetadata::_lookup_metadata: Got pubdate: {metadata.pubdate}")
elif descriptor == "ISBN:":
metadata.isbn = x.xpath("span")[0].text
log.info(f"KoboMetadata::_lookup_metadata: Got isbn: {metadata.isbn}")
elif descriptor == "Language:":
metadata.language = x.xpath("span")[0].text
log.info(f"KoboMetadata::_lookup_metadata: Got language: {metadata.language}")

tags_elements = tree.xpath("//ul[@class='category-rankings']/meta[@property='genre']")
if tags_elements:
# Calibre doesn't like commas in tags
metadata.tags = {x.get("content").replace(", ", " ") for x in tags_elements}
log.info(f"KoboMetadata::_lookup_metadata: Got tags: {metadata.tags}")

synopsis_elements = tree.xpath("//div[@class='synopsis-description']")
if synopsis_elements:
metadata.comments = html.tostring(synopsis_elements[0], method="html")
log.info(f"KoboMetadata::_lookup_metadata: Got comments: {metadata.comments}")

cover_elements = tree.xpath("//img[contains(@class, 'cover-image')]")
if cover_elements:
# Sample: https://cdn.kobo.com/book-images/44f0e8b9-3338-4d1c-bd6e-e88e82cb8fad/353/569/90/False/holly-23.jpg
cover_url = "https:" + cover_elements[0].get("src")
if self.prefs["resize_cover"]:
# Change the resolution from 353x569 to maximum_cover_size (default 1650x2200)
# Kobo will resize to match the width and have the correct aspect ratio
width, height = tweaks["maximum_cover_size"]
cover_url = cover_url.replace("353/569/90", f"{width}/{height}/100")
else:
# Removing this gets the original cover art (probably)
# Sample: https://cdn.kobo.com/book-images/44f0e8b9-3338-4d1c-bd6e-e88e82cb8fad/holly-23.jpg
cover_url = cover_url.replace("353/569/90/False/", "")
self.cache_identifier_to_cover_url(metadata.isbn, cover_url)
log.info(f"KoboMetadata::_lookup_metadata: Got cover: {cover_url}")

blacklisted_title = self._check_title_blacklist(title, log)
if blacklisted_title:
log.info(f"KoboMetadata::_lookup_metadata: Hit blacklisted word(s) in the title: {blacklisted_title}")
return None

blacklisted_tags = self._check_tag_blacklist(metadata.tags, log)
if blacklisted_tags:
log.info(f"KoboMetadata::_lookup_metadata: Hit blacklisted tag(s): {blacklisted_tags}")
return None

return metadata

# Returns the set of words in the title that are also blacklisted
def _check_title_blacklist(self, title: str, log: Log) -> set[str]:
if not self.prefs["title_blacklist"]:
return None

blacklisted_words = {x.strip().lower() for x in self.prefs["title_blacklist"].split(",")}
log.info(f"KoboMetadata::_check_title_blacklist: blacklisted title words: {blacklisted_words}")
# Remove punctuation from title string
title_str = title.translate(str.maketrans("", "", string.punctuation))
return blacklisted_words.intersection(title_str.lower().split(" "))
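# Worked example: with title_blacklist set to "sampler, preview", a title like
# "Holly: Free Preview!" loses its punctuation ("Holly Free Preview"),
# lowercases to {"holly", "free", "preview"}, and intersects the blacklist as
# {"preview"}, so _lookup_metadata rejects the match.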

# Returns the set of tags that are also blacklisted
def _check_tag_blacklist(self, tags: set[str], log: Log) -> set[str]:
if not self.prefs["tag_blacklist"]:
return None

blacklisted_tags = {x.strip().lower() for x in self.prefs["tag_blacklist"].split(",")}
log.info(f"KoboMetadata::_check_tag_blacklist: blacklisted tags: {blacklisted_tags}")
return blacklisted_tags.intersection({x.lower() for x in tags})
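# Worked example: with tag_blacklist set to "box set, omnibus", a book tagged
# {"Fiction", "Omnibus"} lowercases to {"fiction", "omnibus"}; the
# intersection {"omnibus"} is non-empty, so the book is rejected.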
4 changes: 4 additions & 0 deletions certifi/__init__.py
@@ -0,0 +1,4 @@
from .core import contents, where

__all__ = ["contents", "where"]
__version__ = "2024.06.02"
12 changes: 12 additions & 0 deletions certifi/__main__.py
@@ -0,0 +1,12 @@
import argparse

from certifi import contents, where

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()

if args.contents:
print(contents())
else:
print(where())
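# Usage sketch for this vendored CLI (the printed path will vary by install):
#   $ python -m certifi
#   /path/to/certifi/cacert.pem
#   $ python -m certifi -c   # prints the PEM bundle contents instead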