Skip to content

Commit 5dd5b60

Browse files
feat: Add pre_navigation_hooks to PlaywrightCrawler (#631)
### Description

<!-- The purpose of the PR, list of the changes, ... -->

Add a new decorator for registering pre-navigation hooks.

Example use:

```python
from crawlee.playwright_crawler import PlaywrightCrawler

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = PlaywrightCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
    )

    @crawler.pre_navigation_hook
    async def hooky(context) -> None:
        print(f'Hook1')

    @crawler.pre_navigation_hook
    async def hooky2(context) -> None:
        print(f'Hook2')

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )
```

### Issues

<!-- If applicable, reference any related GitHub issues -->

- Closes: #427

### Checklist

- [x] CI passed
1 parent f9463e7 commit 5dd5b60

7 files changed

Lines changed: 97 additions & 24 deletions

File tree

docs/examples/code/playwright_crawler.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import asyncio
22

3-
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
3+
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
44

55

66
async def main() -> None:
@@ -47,6 +47,14 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
4747
# Find a link to the next page and enqueue it if it exists.
4848
await context.enqueue_links(selector='.morelink')
4949

50+
# Define a hook that will be called each time before navigating to a new URL.
51+
# The hook receives a context parameter, providing access to the request and
52+
# browser page among other things. In this example, we log the URL being
53+
# navigated to.
54+
@crawler.pre_navigation_hook
55+
async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
56+
context.log.info(f'Navigating to {context.request.url} ...')
57+
5058
# Run the crawler with the initial list of URLs.
5159
await crawler.run(['https://news.ycombinator.com/'])
5260

docs/examples/playwright_crawler.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`Play
1212

1313
The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content.
1414

15+
A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation.
16+
1517
<CodeBlock className="language-python">
1618
{PlaywrightCrawlerExample}
1719
</CodeBlock>
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
# Re-export the public Playwright crawler API. The imports are guarded so that a missing
# optional dependency produces an actionable installation hint instead of a bare ImportError.
try:
    from ._playwright_crawler import PlaywrightCrawler
    from ._playwright_crawling_context import PlaywrightCrawlingContext
    from ._playwright_pre_navigation_context import PlaywrightPreNavigationContext
except ImportError as exc:
    raise ImportError(
        # Adjacent string literals are concatenated verbatim, so the separating space must be explicit.
        "To import anything from this subpackage, you need to install the 'playwright' extra. "
        "For example, if you use pip, run `pip install 'crawlee[playwright]'`.",
    ) from exc

__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavigationContext']

src/crawlee/playwright_crawler/_playwright_crawler.py

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4-
from typing import TYPE_CHECKING
4+
from typing import TYPE_CHECKING, Awaitable, Callable
55

66
from pydantic import ValidationError
77
from typing_extensions import Unpack
@@ -14,6 +14,7 @@
1414
from crawlee.browsers import BrowserPool
1515
from crawlee.errors import SessionError
1616
from crawlee.playwright_crawler._playwright_crawling_context import PlaywrightCrawlingContext
17+
from crawlee.playwright_crawler._playwright_pre_navigation_context import PlaywrightPreNavigationContext
1718
from crawlee.playwright_crawler._utils import infinite_scroll
1819

1920
if TYPE_CHECKING:
@@ -95,16 +96,41 @@ def __init__(
9596

9697
# Compose the context pipeline with the Playwright-specific context enhancer.
9798
kwargs['_context_pipeline'] = (
98-
ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
99+
ContextPipeline().compose(self._open_page).compose(self._navigate).compose(self._handle_blocked_request)
99100
)
100101
kwargs['_additional_context_managers'] = [self._browser_pool]
101102
kwargs.setdefault('_logger', logging.getLogger(__name__))
103+
self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavigationContext], Awaitable[None]]] = []
102104

103105
super().__init__(**kwargs)
104106

105-
async def _make_http_request(
107+
async def _open_page(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightPreNavigationContext, None]:
    """Open a new browser page and run the registered pre-navigation hooks on it.

    Args:
        context: The basic crawling context to be enhanced with the browser page.

    Raises:
        ValueError: If the browser pool is not initialized.

    Yields:
        The pre-navigation context: the basic crawling context extended with the Playwright `page`.
    """
    if self._browser_pool is None:
        raise ValueError('Browser pool is not initialized.')

    # Create a new browser page
    crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)

    pre_navigation_context = PlaywrightPreNavigationContext(
        request=context.request,
        session=context.session,
        add_requests=context.add_requests,
        send_request=context.send_request,
        push_data=context.push_data,
        proxy_info=context.proxy_info,
        get_key_value_store=context.get_key_value_store,
        log=context.log,
        page=crawlee_page.page,
    )

    try:
        # Hooks run in registration order, before any navigation takes place.
        for hook in self._pre_navigation_hooks:
            await hook(pre_navigation_context)
    except BaseException:
        # The page is otherwise only closed by the `async with context.page` block in `_navigate`.
        # When a hook fails, that step never runs, so close the page here to avoid leaking it.
        await crawlee_page.page.close()
        raise

    yield pre_navigation_context
130+
131+
async def _navigate(
106132
self,
107-
context: BasicCrawlingContext,
133+
context: PlaywrightPreNavigationContext,
108134
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
109135
"""Executes an HTTP request by navigating the Playwright page from the context to the request URL.
110136
@@ -119,21 +145,15 @@ async def _make_http_request(
119145
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, and
120146
infinite_scroll).
121147
"""
122-
if self._browser_pool is None:
123-
raise ValueError('Browser pool is not initialized.')
124-
125-
# Create a new browser page
126-
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
127-
128-
async with crawlee_page.page:
148+
async with context.page:
129149
# Navigate to the URL and get response.
130-
response = await crawlee_page.page.goto(context.request.url)
150+
response = await context.page.goto(context.request.url)
131151

132152
if response is None:
133153
raise SessionError(f'Failed to load the URL: {context.request.url}')
134154

135155
# Set the loaded URL to the actual URL after redirection.
136-
context.request.loaded_url = crawlee_page.page.url
156+
context.request.loaded_url = context.page.url
137157

138158
async def enqueue_links(
139159
*,
@@ -148,7 +168,7 @@ async def enqueue_links(
148168
requests = list[BaseRequestData]()
149169
user_data = user_data or {}
150170

151-
elements = await crawlee_page.page.query_selector_all(selector)
171+
elements = await context.page.query_selector_all(selector)
152172

153173
for element in elements:
154174
url = await element.get_attribute('href')
@@ -187,8 +207,8 @@ async def enqueue_links(
187207
proxy_info=context.proxy_info,
188208
get_key_value_store=context.get_key_value_store,
189209
log=context.log,
190-
page=crawlee_page.page,
191-
infinite_scroll=lambda: infinite_scroll(crawlee_page.page),
210+
page=context.page,
211+
infinite_scroll=lambda: infinite_scroll(context.page),
192212
response=response,
193213
enqueue_links=enqueue_links,
194214
)
@@ -227,3 +247,11 @@ async def _handle_blocked_request(
227247
)
228248

229249
yield context
250+
251+
def pre_navigation_hook(
    self,
    hook: Callable[[PlaywrightPreNavigationContext], Awaitable[None]],
) -> Callable[[PlaywrightPreNavigationContext], Awaitable[None]]:
    """Register a hook to be called before each navigation.

    Can be called directly or used as a decorator (`@crawler.pre_navigation_hook`).

    Args:
        hook: A coroutine function to be called before each navigation.

    Returns:
        The same `hook`, unchanged, so that the decorated name stays bound to the function
        instead of being rebound to `None`.
    """
    self._pre_navigation_hooks.append(hook)
    return hook

src/crawlee/playwright_crawler/_playwright_crawling_context.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,21 @@
33
from dataclasses import dataclass
44
from typing import TYPE_CHECKING, Awaitable, Callable
55

6-
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
6+
from crawlee.playwright_crawler._playwright_pre_navigation_context import PlaywrightPreNavigationContext
77

88
if TYPE_CHECKING:
9-
from playwright.async_api import Page, Response
9+
from playwright.async_api import Response
10+
11+
from crawlee._types import EnqueueLinksFunction
1012

1113

1214
@dataclass(frozen=True)
13-
class PlaywrightCrawlingContext(BasicCrawlingContext):
15+
class PlaywrightCrawlingContext(PlaywrightPreNavigationContext):
1416
"""The crawling context used by the `PlaywrightCrawler`.
1517
1618
It provides access to key objects as well as utility functions for handling crawling tasks.
1719
"""
1820

19-
page: Page
20-
"""The Playwright `Page` object for the current page."""
21-
2221
response: Response
2322
"""The Playwright `Response` object containing the response details for the current URL."""
2423

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import TYPE_CHECKING
5+
6+
from crawlee._types import BasicCrawlingContext
7+
8+
if TYPE_CHECKING:
9+
from playwright.async_api import Page
10+
11+
12+
@dataclass(frozen=True)
13+
class PlaywrightPreNavigationContext(BasicCrawlingContext):
14+
"""The pre-navigation context used by the `PlaywrightCrawler`.
15+
16+
It provides access to the `Page` object for the current browser page.
17+
"""
18+
19+
page: Page
20+
"""The Playwright `Page` object for the current page."""

tests/unit/playwright_crawler/test_playwright_crawler.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
131131
assert 'headless' not in headers['User-Agent'].lower()
132132

133133
assert headers['User-Agent'] == PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT
134+
135+
136+
async def test_pre_navigation_hook() -> None:
137+
crawler = PlaywrightCrawler()
138+
mock_hook = mock.AsyncMock(return_value=None)
139+
140+
crawler.pre_navigation_hook(mock_hook)
141+
142+
@crawler.router.default_handler
143+
async def request_handler(_context: PlaywrightCrawlingContext) -> None:
144+
pass
145+
146+
await crawler.run(['https://example.com', 'https://httpbin.org'])
147+
148+
assert mock_hook.call_count == 2

0 commit comments

Comments
 (0)