Skip to content

Commit 49ff6e2

Browse files
authored
feat: blocking detection for playwright crawler (#328)
### Description - Add blocking detection for `PlaywrightCrawler`. - Improve documentation of the `PlaywrightCrawler` and related components. ### Issues - Closes: #239 ### Testing - Only manual, until we have #197 ready. ### Checklist - [x] Changes are described in the `CHANGELOG.md` - [x] CI passed
1 parent c630818 commit 49ff6e2

4 files changed

Lines changed: 103 additions & 9 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
### Features
66

7-
- Integrate proxies into PlaywrightCrawler.
7+
- Support for proxy configuration in `PlaywrightCrawler`.
8+
- Blocking detection in `PlaywrightCrawler`.
89
- Expose `crawler.log` to public.
910

1011
### Bug fixes

src/crawlee/basic_crawler/types.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,14 @@ def __call__( # noqa: D102
9191

9292

9393
class EnqueueLinksFunction(Protocol):
94-
"""Type of a function for enqueueing links based on a selector."""
94+
"""Type of a function for enqueueing links based on a selector.
95+
96+
Args:
97+
selector: CSS selector used to find the elements containing the links.
98+
label: Label for the newly created `Request` objects, used for request routing.
99+
user_data: User data to be provided to the newly created `Request` objects.
100+
**kwargs: Additional arguments for the `add_requests` method.
101+
"""
95102

96103
def __call__( # noqa: D102
97104
self,

src/crawlee/playwright_crawler/playwright_crawler.py

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
from typing_extensions import Unpack
77

8+
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
89
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
10+
from crawlee.basic_crawler.errors import SessionError
911
from crawlee.browsers import BrowserPool
1012
from crawlee.enqueue_strategy import EnqueueStrategy
1113
from crawlee.models import BaseRequestData
@@ -18,7 +20,24 @@
1820

1921

2022
class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
21-
"""A crawler that fetches the request URL using `Playwright`."""
23+
"""A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
24+
25+
`PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests,
26+
request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and
27+
properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling
28+
other pages.
29+
30+
This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers
31+
to download web pages and extract data. For websites that do not require JavaScript, consider using
32+
`BeautifulSoupCrawler`, which uses raw HTTP requests and is much faster.
33+
34+
`PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided
35+
request handler function via the `Router`. Users can interact with the page and extract the data using
36+
the Playwright API.
37+
38+
Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is internally
39+
managed by the `BrowserPool`.
40+
"""
2241

2342
def __init__(
2443
self,
@@ -50,19 +69,42 @@ def __init__(
5069

5170
self._browser_pool = browser_pool
5271

53-
kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto)
72+
# Compose the context pipeline with the Playwright-specific context enhancer.
73+
kwargs['_context_pipeline'] = (
74+
ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
75+
)
5476
kwargs['_additional_context_managers'] = [self._browser_pool]
55-
5677
kwargs.setdefault('_logger', logging.getLogger(__name__))
5778

5879
super().__init__(**kwargs)
5980

60-
async def _page_goto(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
81+
async def _make_http_request(
82+
self,
83+
context: BasicCrawlingContext,
84+
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
85+
"""Enhance the crawling context by making an HTTP request using Playwright.
86+
87+
Args:
88+
context: The basic crawling context to be enhanced.
89+
90+
Raises:
91+
ValueError: If the browser pool is not initialized.
92+
SessionError: If the URL cannot be loaded by the browser.
93+
94+
Yields:
95+
An enhanced crawling context with Playwright-specific features.
96+
"""
6197
if self._browser_pool is None:
6298
raise ValueError('Browser pool is not initialized.')
6399

100+
# Create a new browser page, navigate to the URL and get response.
64101
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
65-
await crawlee_page.page.goto(context.request.url)
102+
response = await crawlee_page.page.goto(context.request.url)
103+
104+
if response is None:
105+
raise SessionError(f'Failed to load the URL: {context.request.url}')
106+
107+
# Set the loaded URL to the actual URL after redirection.
66108
context.request.loaded_url = crawlee_page.page.url
67109

68110
async def enqueue_links(
@@ -72,6 +114,7 @@ async def enqueue_links(
72114
user_data: dict | None = None,
73115
**kwargs: Unpack[AddRequestsKwargs],
74116
) -> None:
117+
"""The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function."""
75118
kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
76119

77120
requests = list[BaseRequestData]()
@@ -102,7 +145,43 @@ async def enqueue_links(
102145
proxy_info=context.proxy_info,
103146
log=context.log,
104147
page=crawlee_page.page,
148+
response=response,
105149
enqueue_links=enqueue_links,
106150
)
107151

108152
await crawlee_page.page.close()
153+
154+
async def _handle_blocked_request(
155+
self,
156+
crawling_context: PlaywrightCrawlingContext,
157+
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
158+
"""Enhance the crawling context by handling blocked requests.
159+
160+
Args:
161+
crawling_context: The crawling context to be checked for blocking.
162+
163+
Raises:
164+
SessionError: If the session is blocked based on the HTTP status code or the response content.
165+
166+
Yields:
167+
The original crawling context if the session is not blocked.
168+
"""
169+
if self._retry_on_blocked:
170+
status_code = crawling_context.response.status
171+
172+
# Check if the session is blocked based on the HTTP status code.
173+
if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
174+
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')
175+
176+
matched_selectors = [
177+
selector for selector in RETRY_CSS_SELECTORS if (await crawling_context.page.query_selector(selector))
178+
]
179+
180+
# Check if the session is blocked based on the response content.
181+
if matched_selectors:
182+
raise SessionError(
183+
'Assuming the session is blocked - '
184+
f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
185+
)
186+
187+
yield crawling_context

src/crawlee/playwright_crawler/types.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,19 @@
66
from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction
77

88
if TYPE_CHECKING:
9-
from playwright.async_api import Page
9+
from playwright.async_api import Page, Response
1010

1111

1212
@dataclass(frozen=True)
1313
class PlaywrightCrawlingContext(BasicCrawlingContext):
14-
"""Crawling context used by PlaywrightSoupCrawler."""
14+
"""Crawling context used by `PlaywrightCrawler`.
15+
16+
Args:
17+
page: The Playwright `Page` object.
18+
response: The Playwright `Response` object.
19+
enqueue_links: The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function.
20+
"""
1521

1622
page: Page
23+
response: Response
1724
enqueue_links: EnqueueLinksFunction

0 commit comments

Comments
 (0)