feat: Add pre_navigation_hooks to PlaywrightCrawler (#631)

Prathamesh010 · web-flow · commit 5dd5b60e2a44 · 2024-10-30T13:46:12.000+01:00
### Description

Add a new `pre_navigation_hook` decorator for registering hooks that run before each navigation.

Example usage:

```python
from crawlee.playwright_crawler import PlaywrightCrawler

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = PlaywrightCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
    )

    @crawler.pre_navigation_hook
    async def hooky(context) -> None:
        print(f'Hook1')

    @crawler.pre_navigation_hook
    async def hooky2(context) -> None:
        print(f'Hook2')

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )
```

### Issues

- Closes: #427

### Checklist

- [x] CI passed
diff --git a/docs/examples/code/playwright_crawler.py b/docs/examples/code/playwright_crawler.py
@@ -1,6 +1,6 @@
 import asyncio
 
-from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
 
 
 async def main() -> None:
@@ -47,6 +47,14 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         # Find a link to the next page and enqueue it if it exists.
         await context.enqueue_links(selector='.morelink')
 
+    # Define a hook that will be called each time before navigating to a new URL.
+    # The hook receives a context parameter, providing access to the request and
+    # browser page among other things. In this example, we log the URL being
+    # navigated to.
+    @crawler.pre_navigation_hook
+    async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
+        context.log.info(f'Navigating to {context.request.url} ...')
+
     # Run the crawler with the initial list of URLs.
     await crawler.run(['https://news.ycombinator.com/'])
 
diff --git a/docs/examples/playwright_crawler.mdx b/docs/examples/playwright_crawler.mdx
@@ -12,6 +12,8 @@ This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`Play
 
 The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content.
 
+A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation.
+
 <CodeBlock className="language-python">
     {PlaywrightCrawlerExample}
 </CodeBlock>
diff --git a/src/crawlee/playwright_crawler/__init__.py b/src/crawlee/playwright_crawler/__init__.py
@@ -1,10 +1,11 @@
 try:
     from ._playwright_crawler import PlaywrightCrawler
     from ._playwright_crawling_context import PlaywrightCrawlingContext
+    from ._playwright_pre_navigation_context import PlaywrightPreNavigationContext
 except ImportError as exc:
     raise ImportError(
         "To import anything from this subpackage, you need to install the 'playwright' extra."
         "For example, if you use pip, run `pip install 'crawlee[playwright]'`.",
     ) from exc
 
-__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext']
+__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavigationContext']
diff --git a/src/crawlee/playwright_crawler/_playwright_crawler.py b/src/crawlee/playwright_crawler/_playwright_crawler.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Awaitable, Callable
 
 from pydantic import ValidationError
 from typing_extensions import Unpack
@@ -14,6 +14,7 @@
 from crawlee.browsers import BrowserPool
 from crawlee.errors import SessionError
 from crawlee.playwright_crawler._playwright_crawling_context import PlaywrightCrawlingContext
+from crawlee.playwright_crawler._playwright_pre_navigation_context import PlaywrightPreNavigationContext
 from crawlee.playwright_crawler._utils import infinite_scroll
 
 if TYPE_CHECKING:
@@ -95,16 +96,41 @@ def __init__(
 
         # Compose the context pipeline with the Playwright-specific context enhancer.
         kwargs['_context_pipeline'] = (
-            ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
+            ContextPipeline().compose(self._open_page).compose(self._navigate).compose(self._handle_blocked_request)
         )
         kwargs['_additional_context_managers'] = [self._browser_pool]
         kwargs.setdefault('_logger', logging.getLogger(__name__))
+        self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavigationContext], Awaitable[None]]] = []
 
         super().__init__(**kwargs)
 
-    async def _make_http_request(
+    async def _open_page(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightPreNavigationContext, None]:
+        if self._browser_pool is None:
+            raise ValueError('Browser pool is not initialized.')
+
+        # Create a new browser page
+        crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
+
+        pre_navigation_context = PlaywrightPreNavigationContext(
+            request=context.request,
+            session=context.session,
+            add_requests=context.add_requests,
+            send_request=context.send_request,
+            push_data=context.push_data,
+            proxy_info=context.proxy_info,
+            get_key_value_store=context.get_key_value_store,
+            log=context.log,
+            page=crawlee_page.page,
+        )
+
+        for hook in self._pre_navigation_hooks:
+            await hook(pre_navigation_context)
+
+        yield pre_navigation_context
+
+    async def _navigate(
         self,
-        context: BasicCrawlingContext,
+        context: PlaywrightPreNavigationContext,
     ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
         """Executes an HTTP request utilizing the `BrowserPool` and the `Playwright` library.
 
@@ -119,21 +145,15 @@ async def _make_http_request(
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, and
                 infinite_scroll).
         """
-        if self._browser_pool is None:
-            raise ValueError('Browser pool is not initialized.')
-
-        # Create a new browser page
-        crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
-
-        async with crawlee_page.page:
+        async with context.page:
             # Navigate to the URL and get response.
-            response = await crawlee_page.page.goto(context.request.url)
+            response = await context.page.goto(context.request.url)
 
             if response is None:
                 raise SessionError(f'Failed to load the URL: {context.request.url}')
 
             # Set the loaded URL to the actual URL after redirection.
-            context.request.loaded_url = crawlee_page.page.url
+            context.request.loaded_url = context.page.url
 
             async def enqueue_links(
                 *,
@@ -148,7 +168,7 @@ async def enqueue_links(
                 requests = list[BaseRequestData]()
                 user_data = user_data or {}
 
-                elements = await crawlee_page.page.query_selector_all(selector)
+                elements = await context.page.query_selector_all(selector)
 
                 for element in elements:
                     url = await element.get_attribute('href')
@@ -187,8 +207,8 @@ async def enqueue_links(
                 proxy_info=context.proxy_info,
                 get_key_value_store=context.get_key_value_store,
                 log=context.log,
-                page=crawlee_page.page,
-                infinite_scroll=lambda: infinite_scroll(crawlee_page.page),
+                page=context.page,
+                infinite_scroll=lambda: infinite_scroll(context.page),
                 response=response,
                 enqueue_links=enqueue_links,
             )
@@ -227,3 +247,11 @@ async def _handle_blocked_request(
                 )
 
         yield context
+
+    def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavigationContext], Awaitable[None]]) -> None:
+        """Register a hook to be called before each navigation.
+
+        Args:
+            hook: A coroutine function to be called before each navigation.
+        """
+        self._pre_navigation_hooks.append(hook)
diff --git a/src/crawlee/playwright_crawler/_playwright_crawling_context.py b/src/crawlee/playwright_crawler/_playwright_crawling_context.py
@@ -3,22 +3,21 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Awaitable, Callable
 
-from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
+from crawlee.playwright_crawler._playwright_pre_navigation_context import PlaywrightPreNavigationContext
 
 if TYPE_CHECKING:
-    from playwright.async_api import Page, Response
+    from playwright.async_api import Response
+
+    from crawlee._types import EnqueueLinksFunction
 
 
 @dataclass(frozen=True)
-class PlaywrightCrawlingContext(BasicCrawlingContext):
+class PlaywrightCrawlingContext(PlaywrightPreNavigationContext):
     """The crawling context used by the `PlaywrightCrawler`.
 
     It provides access to key objects as well as utility functions for handling crawling tasks.
     """
 
-    page: Page
-    """The Playwright `Page` object for the current page."""
-
     response: Response
     """The Playwright `Response` object containing the response details for the current URL."""
 
diff --git a/src/crawlee/playwright_crawler/_playwright_pre_navigation_context.py b/src/crawlee/playwright_crawler/_playwright_pre_navigation_context.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from crawlee._types import BasicCrawlingContext
+
+if TYPE_CHECKING:
+    from playwright.async_api import Page
+
+
+@dataclass(frozen=True)
+class PlaywrightPreNavigationContext(BasicCrawlingContext):
+    """The pre-navigation crawling context used by the `PlaywrightCrawler`.
+
+    It provides access to the `Page` object for the current browser page.
+    """
+
+    page: Page
+    """The Playwright `Page` object for the current page."""
diff --git a/tests/unit/playwright_crawler/test_playwright_crawler.py b/tests/unit/playwright_crawler/test_playwright_crawler.py
@@ -131,3 +131,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     assert 'headless' not in headers['User-Agent'].lower()
 
     assert headers['User-Agent'] == PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT
+
+
+async def test_pre_navigation_hook() -> None:
+    crawler = PlaywrightCrawler()
+    mock_hook = mock.AsyncMock(return_value=None)
+
+    crawler.pre_navigation_hook(mock_hook)
+
+    @crawler.router.default_handler
+    async def request_handler(_context: PlaywrightCrawlingContext) -> None:
+        pass
+
+    await crawler.run(['https://example.com', 'https://httpbin.org'])
+
+    assert mock_hook.call_count == 2