feat: blocking detection for playwright crawler (#328)

vdusek · web-flow · commit 49ff6e25c12a · 2024-07-19T14:08:41.000+02:00
### Description

- Add blocking detection for `PlaywrightCrawler`.
- Improve documentation of the `PlaywrightCrawler` and related components.

### Issues

- Closes: #239

### Testing

- Only manual, until we have #197 ready.

### Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,8 @@
 
 ### Features
 
-- Integrate proxies into PlaywrightCrawler.
+- Support for proxy configuration in `PlaywrightCrawler`.
+- Blocking detection in `PlaywrightCrawler`.
 - Expose `crawler.log` to public.
 
 ### Bug fixes
diff --git a/src/crawlee/basic_crawler/types.py b/src/crawlee/basic_crawler/types.py
@@ -91,7 +91,14 @@ def __call__(  # noqa: D102
 
 
 class EnqueueLinksFunction(Protocol):
-    """Type of a function for enqueueing links based on a selector."""
+    """Type of a function for enqueueing links based on a selector.
+
+    Args:
+        selector: CSS selector used to find the elements containing the links.
+        label: Label for the newly created `Request` objects, used for request routing.
+        user_data: User data to be provided to the newly created `Request` objects.
+        **kwargs: Additional arguments for the `add_requests` method.
+    """
 
     def __call__(  # noqa: D102
         self,
diff --git a/src/crawlee/playwright_crawler/playwright_crawler.py b/src/crawlee/playwright_crawler/playwright_crawler.py
@@ -5,7 +5,9 @@
 
 from typing_extensions import Unpack
 
+from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
+from crawlee.basic_crawler.errors import SessionError
 from crawlee.browsers import BrowserPool
 from crawlee.enqueue_strategy import EnqueueStrategy
 from crawlee.models import BaseRequestData
@@ -18,7 +20,24 @@
 
 
 class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
-    """A crawler that fetches the request URL using `Playwright`."""
+    """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
+
+    `PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests,
+    request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and
+    properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling
+    other pages.
+
+    This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers
+    to download web pages and extract data. For websites that do not require JavaScript, consider using
+    `BeautifulSoupCrawler`, which uses raw HTTP requests and is much faster.
+
+    `PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided
+    request handler function via the `Router`. Users can interact with the page and extract the data using
+    the Playwright API.
+
+    Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, are internally
+    managed by the `BrowserPool`.
+    """
 
     def __init__(
         self,
@@ -50,19 +69,42 @@ def __init__(
 
         self._browser_pool = browser_pool
 
-        kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto)
+        # Compose the context pipeline with the Playwright-specific context enhancer.
+        kwargs['_context_pipeline'] = (
+            ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
+        )
         kwargs['_additional_context_managers'] = [self._browser_pool]
-
         kwargs.setdefault('_logger', logging.getLogger(__name__))
 
         super().__init__(**kwargs)
 
-    async def _page_goto(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+    async def _make_http_request(
+        self,
+        context: BasicCrawlingContext,
+    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+        """Enhance the crawling context by making an HTTP request using Playwright.
+
+        Args:
+            context: The basic crawling context to be enhanced.
+
+        Raises:
+            ValueError: If the browser pool is not initialized.
+            SessionError: If the URL cannot be loaded by the browser.
+
+        Yields:
+            An enhanced crawling context with Playwright-specific features.
+        """
         if self._browser_pool is None:
             raise ValueError('Browser pool is not initialized.')
 
+        # Create a new browser page, navigate to the URL and get response.
         crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
-        await crawlee_page.page.goto(context.request.url)
+        response = await crawlee_page.page.goto(context.request.url)
+
+        if response is None:
+            raise SessionError(f'Failed to load the URL: {context.request.url}')
+
+        # Set the loaded URL to the actual URL after redirection.
         context.request.loaded_url = crawlee_page.page.url
 
         async def enqueue_links(
@@ -72,6 +114,7 @@ async def enqueue_links(
             user_data: dict | None = None,
             **kwargs: Unpack[AddRequestsKwargs],
         ) -> None:
+            """The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function."""
             kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
 
             requests = list[BaseRequestData]()
@@ -102,7 +145,43 @@ async def enqueue_links(
             proxy_info=context.proxy_info,
             log=context.log,
             page=crawlee_page.page,
+            response=response,
             enqueue_links=enqueue_links,
         )
 
         await crawlee_page.page.close()
+
+    async def _handle_blocked_request(
+        self,
+        crawling_context: PlaywrightCrawlingContext,
+    ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+        """Try to detect if the request is blocked based on the HTTP status code or the response content.
+
+        Args:
+            crawling_context: The crawling context to be checked for blocking.
+
+        Raises:
+            SessionError: If the session is blocked based on the HTTP status code or the response content.
+
+        Yields:
+            The original crawling context if the session is not blocked.
+        """
+        if self._retry_on_blocked:
+            status_code = crawling_context.response.status
+
+            # Check if the session is blocked based on the HTTP status code.
+            if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
+                raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')
+
+            matched_selectors = [
+                selector for selector in RETRY_CSS_SELECTORS if (await crawling_context.page.query_selector(selector))
+            ]
+
+            # Check if the session is blocked based on the response content
+            if matched_selectors:
+                raise SessionError(
+                    'Assuming the session is blocked - '
+                    f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
+                )
+
+        yield crawling_context
diff --git a/src/crawlee/playwright_crawler/types.py b/src/crawlee/playwright_crawler/types.py
@@ -6,12 +6,19 @@
 from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction
 
 if TYPE_CHECKING:
-    from playwright.async_api import Page
+    from playwright.async_api import Page, Response
 
 
 @dataclass(frozen=True)
 class PlaywrightCrawlingContext(BasicCrawlingContext):
-    """Crawling context used by PlaywrightSoupCrawler."""
+    """Crawling context used by PlaywrightCrawler.
+
+    Args:
+        page: The Playwright `Page` object.
+        response: The Playwright `Response` object.
+        enqueue_links: The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function.
+    """
 
     page: Page
+    response: Response
     enqueue_links: EnqueueLinksFunction