55
66from typing_extensions import Unpack
77
8+ from crawlee ._utils .blocked import RETRY_CSS_SELECTORS
89from crawlee .basic_crawler import BasicCrawler , BasicCrawlerOptions , ContextPipeline
10+ from crawlee .basic_crawler .errors import SessionError
911from crawlee .browsers import BrowserPool
1012from crawlee .enqueue_strategy import EnqueueStrategy
1113from crawlee .models import BaseRequestData
1820
1921
2022class PlaywrightCrawler (BasicCrawler [PlaywrightCrawlingContext ]):
21- """A crawler that fetches the request URL using `Playwright`."""
23+ """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
24+
25+ `PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests,
26+ request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and
27+ properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling
28+ other pages.
29+
30+ This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers
31+ to download web pages and extract data. For websites that do not require JavaScript, consider using
32+ `BeautifulSoupCrawler`, which uses raw HTTP requests and is much faster.
33+
34+ `PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided
35+ request handler function via the `Router`. Users can interact with the page and extract the data using
36+ the Playwright API.
37+
38+ Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is internally
39+ managed by the `BrowserPool`.
40+ """
2241
2342 def __init__ (
2443 self ,
@@ -50,19 +69,42 @@ def __init__(
5069
5170 self ._browser_pool = browser_pool
5271
53- kwargs ['_context_pipeline' ] = ContextPipeline ().compose (self ._page_goto )
72+ # Compose the context pipeline with the Playwright-specific context enhancer.
73+ kwargs ['_context_pipeline' ] = (
74+ ContextPipeline ().compose (self ._make_http_request ).compose (self ._handle_blocked_request )
75+ )
5476 kwargs ['_additional_context_managers' ] = [self ._browser_pool ]
55-
5677 kwargs .setdefault ('_logger' , logging .getLogger (__name__ ))
5778
5879 super ().__init__ (** kwargs )
5980
60- async def _page_goto (self , context : BasicCrawlingContext ) -> AsyncGenerator [PlaywrightCrawlingContext , None ]:
81+ async def _make_http_request (
82+ self ,
83+ context : BasicCrawlingContext ,
84+ ) -> AsyncGenerator [PlaywrightCrawlingContext , None ]:
85+ """Enhance the crawling context by making an HTTP request using Playwright.
86+
87+ Args:
88+ context: The basic crawling context to be enhanced.
89+
90+ Raises:
91+ ValueError: If the browser pool is not initialized.
92+ SessionError: If the URL cannot be loaded by the browser.
93+
94+ Yields:
95+ An enhanced crawling context with Playwright-specific features.
96+ """
6197 if self ._browser_pool is None :
6298 raise ValueError ('Browser pool is not initialized.' )
6399
100+ # Create a new browser page, navigate to the URL and get response.
64101 crawlee_page = await self ._browser_pool .new_page (proxy_info = context .proxy_info )
65- await crawlee_page .page .goto (context .request .url )
102+ response = await crawlee_page .page .goto (context .request .url )
103+
104+ if response is None :
105+ raise SessionError (f'Failed to load the URL: { context .request .url } ' )
106+
107+ # Set the loaded URL to the actual URL after redirection.
66108 context .request .loaded_url = crawlee_page .page .url
67109
68110 async def enqueue_links (
@@ -72,6 +114,7 @@ async def enqueue_links(
72114 user_data : dict | None = None ,
73115 ** kwargs : Unpack [AddRequestsKwargs ],
74116 ) -> None :
117+ """The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function."""
75118 kwargs .setdefault ('strategy' , EnqueueStrategy .SAME_HOSTNAME )
76119
77120 requests = list [BaseRequestData ]()
@@ -102,7 +145,43 @@ async def enqueue_links(
102145 proxy_info = context .proxy_info ,
103146 log = context .log ,
104147 page = crawlee_page .page ,
148+ response = response ,
105149 enqueue_links = enqueue_links ,
106150 )
107151
108152 await crawlee_page .page .close ()
153+
async def _handle_blocked_request(
    self,
    crawling_context: PlaywrightCrawlingContext,
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
    """Context pipeline step that rejects responses which look like a blocked session.

    The checks run only when `self._retry_on_blocked` is truthy; otherwise the context is passed
    through untouched.

    Args:
        crawling_context: The Playwright crawling context whose response and page are inspected.

    Raises:
        SessionError: If the session reports the response status code as blocked, or if the page
            contains an element matching any of the `RETRY_CSS_SELECTORS`.

    Yields:
        The same crawling context, unchanged, when no sign of blocking is found.
    """
    if self._retry_on_blocked:
        status_code = crawling_context.response.status
        session = crawling_context.session

        # First signal: the session itself classifies the status code as a block.
        if session and session.is_blocked_status_code(status_code=status_code):
            raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')

        # Second signal: probe the page, one selector at a time, for known block markers.
        matched_selectors = []
        for selector in RETRY_CSS_SELECTORS:
            element = await crawling_context.page.query_selector(selector)
            if element:
                matched_selectors.append(selector)

        if matched_selectors:
            joined_selectors = '; '.join(matched_selectors)
            raise SessionError(
                'Assuming the session is blocked - '
                f'HTTP response matched the following selectors: {joined_selectors}'
            )

    yield crawling_context
0 commit comments