
Commit d19cc29

🔨 Run adblock on HTTPX request event hook (#126)
roniemartinez authored Mar 31, 2022
1 parent 90ceb53 commit d19cc29
Showing 7 changed files with 93 additions and 19 deletions.
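In short: the adblock check moves out of `iter_urls()` (where it only saw the top-level document URL) and into an HTTPX "request" event hook, so every request made by the HTTPX-based scrapers is checked just before it is sent and aborted by raising `httpx.RequestError`. Below is a minimal sketch of that pattern with plain HTTPX and braveblock, independent of dude; the filter rule and URL are illustrative, not part of this commit.

```python
import httpx
from braveblock import Adblocker

# Illustrative rule/URL -- any braveblock filter list works here.
adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])


def block_request_if_needed(request: httpx.Request) -> None:
    url = str(request.url)
    if adblock.check_network_urls(url=url, source_url=url, request_type="other"):
        # Raising inside a "request" event hook aborts the request before it is sent;
        # the error propagates to whoever called client.get()/client.send().
        raise httpx.RequestError(f"URL {url} has been blocked.", request=request)


with httpx.Client(event_hooks={"request": [block_request_if_needed]}) as client:
    try:
        client.get("https://dude.ron.sh/blockme.css")
    except httpx.RequestError as exc:
        print(exc)  # URL https://dude.ron.sh/blockme.css has been blocked.
```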
3 changes: 0 additions & 3 deletions dude/base.py
@@ -331,9 +331,6 @@ def iter_urls(self) -> Iterable[str]:
                 if urlparse(url).netloc not in self.allowed_domains:
                     logger.info("URL %s is not in allowed domains.", url)
                     continue
-                if self.adblock.check_network_urls(url=url, source_url=url, request_type="document"):
-                    logger.info("URL %s has been blocked.", url)
-                    continue
                 yield url
         except IndexError:
             pass
10 changes: 6 additions & 4 deletions dude/optional/beautifulsoup_scraper.py
@@ -9,12 +9,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class BeautifulSoupScraper(ScraperAbstract):
+class BeautifulSoupScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using BeautifulSoup4 parser and HTTPX for requests
     """
@@ -62,7 +62,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -96,7 +96,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
10 changes: 6 additions & 4 deletions dude/optional/lxml_scraper.py
@@ -10,12 +10,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class LxmlScraper(ScraperAbstract):
+class LxmlScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using lxml parser backend and HTTPX for requests
     """
@@ -63,7 +63,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -99,7 +99,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
10 changes: 6 additions & 4 deletions dude/optional/parsel_scraper.py
@@ -9,12 +9,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class ParselScraper(ScraperAbstract):
+class ParselScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using Parsel parser backend and HTTPX for requests
     """
@@ -62,7 +62,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -97,7 +97,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
20 changes: 18 additions & 2 deletions dude/optional/utils.py
@@ -3,6 +3,7 @@
 from typing import Optional, Tuple
 
 import httpx
+from httpx import Request
 
 logger = logging.getLogger(__name__)
 
@@ -35,7 +36,7 @@ async def async_http_get(client: httpx.AsyncClient, url: str) -> Tuple[Optional[str], str]:
         response = await client.get(url)
         response.raise_for_status()
         return response.text, str(response.url)
-    except httpx.HTTPStatusError as e:
+    except (httpx.HTTPStatusError, httpx.RequestError) as e:
         logger.warning(e)
         return None, url
 
@@ -49,6 +50,21 @@ def http_get(client: httpx.Client, url: str) -> Tuple[Optional[str], str]:
         response = client.get(url)
         response.raise_for_status()
         return response.text, str(response.url)
-    except httpx.HTTPStatusError as e:
+    except (httpx.HTTPStatusError, httpx.RequestError) as e:
         logger.warning(e)
         return None, url
+
+
+class HTTPXMixin:
+    def _block_httpx_request_if_needed(self, request: Request) -> None:
+        url = str(request.url)
+        source_url = (
+            request.headers.get("referer") or request.headers.get("origin") or request.headers.get("host") or url
+        )
+        if self.adblock.check_network_urls(  # type: ignore
+            url=url,
+            source_url=source_url,
+            request_type=request.headers.get("sec-fetch-dest") or "other",
+        ):
+            logger.info("URL %s has been blocked.", url)
+            raise httpx.RequestError(message=f"URL {url} has been blocked.", request=request)
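
With the broadened `except` clauses above, a blocked request degrades gracefully rather than crashing the scraper: the hook raises `httpx.RequestError`, `http_get`/`async_http_get` log the error and return `(None, url)`, and the page is skipped. A rough sketch of that interaction, reusing the filter rule from the new test (assuming braveblock matches the URL below):

```python
import httpx
from braveblock import Adblocker
from dude.optional.beautifulsoup_scraper import BeautifulSoupScraper
from dude.optional.utils import http_get

scraper = BeautifulSoupScraper()
scraper.adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])

with httpx.Client(event_hooks={"request": [scraper._block_httpx_request_if_needed]}) as client:
    # The hook raises httpx.RequestError before the request goes out;
    # http_get catches it, logs a warning, and reports the page as unavailable.
    content, url = http_get(client, "https://dude.ron.sh/blockme.css")

print(content)  # None
```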
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pydude"
-version = "0.14.0"
+version = "0.15.0"
 repository = "https://github.com/roniemartinez/dude"
 description = "dude uncomplicated data extraction"
 authors = ["Ronie Martinez <ronmarti18@gmail.com>"]
57 changes: 56 additions & 1 deletion tests/test_bs4.py
@@ -1,12 +1,21 @@
 import sys
-from typing import Any, Callable, Dict, List
+from typing import Any, Callable, Dict, List, Optional
 from unittest import mock
 
 import httpx
 import pytest
+from braveblock import Adblocker
 from bs4 import BeautifulSoup
 
 from dude import Scraper
+from dude.optional.beautifulsoup_scraper import BeautifulSoupScraper
 
 
+@pytest.fixture()
+def scraper_application_with_bs4_parser() -> Scraper:
+    scraper = BeautifulSoupScraper()
+    scraper.adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
+    return Scraper(scraper=scraper)
+
+
 @pytest.fixture()
@@ -88,6 +97,35 @@ def url(element: BeautifulSoup) -> Dict:
         return {"url": element["href"]}
 
 
+@pytest.fixture()
+def bs4_select_with_parser(scraper_application_with_bs4_parser: Scraper) -> None:
+    @scraper_application_with_bs4_parser.group(css=".custom-group")
+    @scraper_application_with_bs4_parser.select(css=".title")
+    def title(element: BeautifulSoup) -> Dict:
+        return {"title": element.get_text()}
+
+    @scraper_application_with_bs4_parser.select(css=".title", group_css=".custom-group")
+    def empty(element: BeautifulSoup) -> Dict:
+        return {}
+
+    @scraper_application_with_bs4_parser.group(css=".custom-group")
+    @scraper_application_with_bs4_parser.select(css=".title", url="example.com")
+    def url_dont_match(element: BeautifulSoup) -> Dict:
+        return {"title": element.get_text()}
+
+    @scraper_application_with_bs4_parser.select(css=".url", group_css=".custom-group")
+    def url(element: BeautifulSoup) -> Dict:
+        return {"url": element["href"]}
+
+
+@pytest.fixture()
+def scraper_with_parser_save(scraper_application_with_bs4_parser: Scraper, mock_database: mock.MagicMock) -> None:
+    @scraper_application_with_bs4_parser.save("custom")
+    def save_to_database(data: Any, output: Optional[str]) -> bool:
+        mock_database.save(data)
+        return True
+
+
 def test_full_flow_bs4(
     scraper_application: Scraper,
     bs4_select: None,
@@ -269,3 +307,20 @@ def test_unsupported_regex(
 
     with pytest.raises(Exception):
         scraper_application.run(urls=[test_url], pages=2, format="custom", parser="bs4")
+
+
+def test_scraper_with_parser(
+    scraper_application_with_bs4_parser: Scraper,
+    bs4_select_with_parser: None,
+    scraper_with_parser_save: None,
+    mock_database: mock.MagicMock,
+) -> None:
+    assert scraper_application_with_bs4_parser.has_async is False
+    assert scraper_application_with_bs4_parser.scraper is not None
+    assert len(scraper_application_with_bs4_parser.scraper.rules) == 4
+
+    scraper_application_with_bs4_parser.run(
+        urls=["https://dude.ron.sh/blockme.css"], pages=2, format="custom", parser="bs4"
+    )
+
+    mock_database.save.assert_not_called()
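
The same path through the public API, roughly what the new test exercises; this sketch assumes `run()`'s default output handling, whereas the test registers a custom save format.

```python
from braveblock import Adblocker
from dude import Scraper
from dude.optional.beautifulsoup_scraper import BeautifulSoupScraper

bs4_scraper = BeautifulSoupScraper()
bs4_scraper.adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
app = Scraper(scraper=bs4_scraper)


@app.select(css=".title")
def title(element) -> dict:
    return {"title": element.get_text()}


# Every request issued by the scraper passes through the adblock hook,
# so the lone URL below is blocked and no data is extracted.
app.run(urls=["https://dude.ron.sh/blockme.css"], parser="bs4")
```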
