12 changes: 8 additions & 4 deletions docs/examples/code/adaptive_playwright_crawler.py
@@ -15,7 +15,9 @@ async def main() -> None:
)

@crawler.router.handler(label='label')
async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
async def request_handler_for_label(
context: AdaptivePlaywrightCrawlingContext,
) -> None:
# Do some processing using `page`
some_locator = context.page.locator('div').first
await some_locator.wait_for()
@@ -35,8 +37,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
@crawler.pre_navigation_hook
async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
"""Hook executed both in static sub crawler and playwright sub crawler."""
# Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled
# without playwright.
# Trying to access context.page in this hook would raise `AdaptiveContextError`
# for pages crawled without playwright.
context.log.info(f'pre navigation hook for: {context.request.url} ...')

@crawler.pre_navigation_hook(playwright_only=True)
@@ -47,7 +49,9 @@ async def some_routing_function(route: Route) -> None:
await route.continue_()

await context.page.route('*/**', some_routing_function)
context.log.info(f'Playwright only pre navigation hook for: {context.request.url} ...')
context.log.info(
f'Playwright only pre navigation hook for: {context.request.url} ...'
)

# Run the crawler with the initial list of URLs.
await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])
6 changes: 5 additions & 1 deletion docs/examples/code/beautifulsoup_crawler.py
@@ -1,7 +1,11 @@
import asyncio
from datetime import timedelta

from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.crawlers import (
BasicCrawlingContext,
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
)


async def main() -> None:
25 changes: 17 additions & 8 deletions docs/examples/code/beautifulsoup_crawler_keep_alive.py
@@ -6,20 +6,25 @@

async def main() -> None:
crawler = BeautifulSoupCrawler(
# Keep the crawler alive even when there are no requests to be processed at the moment.
# Keep the crawler alive even when there are no requests to be processed now.
keep_alive=True,
)

def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:
"""Stop crawler once specific url is visited. Just an example of guard condition to stop the crawler."""
"""Stop crawler once specific url is visited.

Example of guard condition to stop the crawler."""
if context.request.url == 'https://crawlee.dev/docs/examples':
crawler.stop('Stop crawler that was in keep_alive state after specific url was visited')
crawler.stop(
'Stop crawler that was in keep_alive state after specific url was visited'
)
else:
context.log.info('keep_alive=True, waiting for more requests to come.')

async def add_request_later(url: str, after_s: int) -> None:
"""Add requests to the queue after some time. This can be done by external code."""
# Just an example of request being added to the crawler later, when it is waiting due to `keep_alive=True`.
"""Add requests to the queue after some time. Can be done by external code."""
# Just an example of request being added to the crawler later,
# when it is waiting due to `keep_alive=True`.
await asyncio.sleep(after_s)
await crawler.add_requests([url])

@@ -33,11 +38,15 @@ async def request_handler(context: BasicCrawlingContext) -> None:

# Start some tasks that will add some requests later to simulate real situation,
# where requests are added later by external code.
add_request_later_task1 = asyncio.create_task(add_request_later(url='https://crawlee.dev', after_s=1))
add_request_later_task2 = asyncio.create_task(add_request_later(url='https://crawlee.dev/docs/examples', after_s=5))
add_request_later_task1 = asyncio.create_task(
add_request_later(url='https://crawlee.dev', after_s=1)
)
add_request_later_task2 = asyncio.create_task(
add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)
)

# Run the crawler without the initial list of requests.
# It will wait for more requests to be added to the queue later due to `keep_alive=True`.
# Wait for more requests to be added to the queue later due to `keep_alive=True`.
await crawler.run()

await asyncio.gather(add_request_later_task1, add_request_later_task2)
4 changes: 3 additions & 1 deletion docs/examples/code/beautifulsoup_crawler_stop.py
@@ -20,7 +20,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:

# Create custom condition to stop crawler once it finds what it is looking for.
if 'crawlee' in context.request.url:
crawler.stop(reason='Manual stop of crawler after finding `crawlee` in the url.')
crawler.stop(
reason='Manual stop of crawler after finding `crawlee` in the url.'
)

# Extract data from the page.
data = {
@@ -15,8 +15,8 @@ async def main() -> None:
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Enqueue all links found on the page. Any URLs found will be matched by this strategy,
# even if they go off the site you are currently crawling.
# Enqueue all links found on the page. Any URLs found will be matched by
# this strategy, even if they go off the site you are currently crawling.
await context.enqueue_links(strategy=EnqueueStrategy.ALL)

# Run the crawler with the initial list of requests.
6 changes: 5 additions & 1 deletion docs/examples/code/playwright_block_requests.py
@@ -1,6 +1,10 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
6 changes: 5 additions & 1 deletion docs/examples/code/playwright_crawler.py
@@ -1,6 +1,10 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
23 changes: 16 additions & 7 deletions docs/examples/code/playwright_crawler_with_camoufox.py
@@ -4,31 +4,40 @@
from camoufox import AsyncNewBrowser
from typing_extensions import override

from crawlee.browsers import BrowserPool, PlaywrightBrowserController, PlaywrightBrowserPlugin
from crawlee.browsers import (
BrowserPool,
PlaywrightBrowserController,
PlaywrightBrowserPlugin,
)
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


class CamoufoxPlugin(PlaywrightBrowserPlugin):
"""Example browser plugin that uses Camoufox browser, but otherwise keeps the functionality of
PlaywrightBrowserPlugin."""
"""Example browser plugin that uses Camoufox browser,
but otherwise keeps the functionality of PlaywrightBrowserPlugin.
"""

@override
async def new_browser(self) -> PlaywrightBrowserController:
if not self._playwright:
raise RuntimeError('Playwright browser plugin is not initialized.')

return PlaywrightBrowserController(
browser=await AsyncNewBrowser(self._playwright, **self._browser_launch_options),
max_open_pages_per_browser=1, # Increase, if camoufox can handle it in your use case.
header_generator=None, # This turns off the crawlee header_generation. Camoufox has its own.
browser=await AsyncNewBrowser(
self._playwright, **self._browser_launch_options
),
# Increase, if camoufox can handle it in your use case.
max_open_pages_per_browser=1,
# This turns off the crawlee header_generation. Camoufox has its own.
header_generator=None,
)


async def main() -> None:
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
# Custom browser pool. This gives users full control over browsers used by the crawler.
# Custom browser pool. Gives users full control over browsers used by the crawler.
browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]),
)

@@ -1,12 +1,16 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, HeaderGeneratorOptions, ScreenOptions
from crawlee.fingerprint_suite import (
DefaultFingerprintGenerator,
HeaderGeneratorOptions,
ScreenOptions,
)


async def main() -> None:
# Use default fingerprint generator with desired fingerprint options.
# Generator will try to generate real looking browser fingerprint based on the options.
# Generator will generate real looking browser fingerprint based on the options.
# Unspecified fingerprint options will be automatically selected by the generator.
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
@@ -9,7 +9,8 @@ async def main() -> None:
browser_type='chromium',
# Browser launch options
browser_launch_options={
# For support `msedge` channel you need to install it `playwright install msedge`
# For support `msedge` channel you need to install it
# `playwright install msedge`
'channel': 'msedge',
'slow_mo': 200,
},
14 changes: 11 additions & 3 deletions docs/guides/code/playwright_crawler/multiple_launch_example.py
@@ -6,8 +6,12 @@

async def main() -> None:
# Create a plugin for each required browser.
plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium', max_open_pages_per_browser=1)
plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox', max_open_pages_per_browser=1)
plugin_chromium = PlaywrightBrowserPlugin(
browser_type='chromium', max_open_pages_per_browser=1
)
plugin_firefox = PlaywrightBrowserPlugin(
browser_type='firefox', max_open_pages_per_browser=1
)

crawler = PlaywrightCrawler(
browser_pool=BrowserPool(plugins=[plugin_chromium, plugin_firefox]),
@@ -17,7 +21,11 @@ async def main() -> None:

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
browser_name = context.page.context.browser.browser_type.name if context.page.context.browser else 'undefined'
browser_name = (
context.page.context.browser.browser_type.name
if context.page.context.browser
else 'undefined'
)
context.log.info(f'Processing {context.request.url} with {browser_name} ...')

await context.enqueue_links()
@@ -1,6 +1,10 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
13 changes: 10 additions & 3 deletions docs/guides/code/proxy_management/tiers_bs_example.py
@@ -8,12 +8,19 @@ async def main() -> None:
# Create a ProxyConfiguration object and pass it to the crawler.
proxy_configuration = ProxyConfiguration(
tiered_proxy_urls=[
# No proxy tier. (Not needed, but optional in case you do not want to use any proxy on lowest tier.)
# No proxy tier.
# Optional in case you do not want to use any proxy on lowest tier.
[None],
# lower tier, cheaper, preferred as long as they work
['http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/'],
[
'http://cheap-datacenter-proxy-1.com/',
'http://cheap-datacenter-proxy-2.com/',
],
# higher tier, more expensive, used as a fallback
['http://expensive-residential-proxy-1.com/', 'http://expensive-residential-proxy-2.com/'],
[
'http://expensive-residential-proxy-1.com/',
'http://expensive-residential-proxy-2.com/',
],
]
)
crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration)
13 changes: 10 additions & 3 deletions docs/guides/code/proxy_management/tiers_pw_example.py
@@ -8,12 +8,19 @@ async def main() -> None:
# Create a ProxyConfiguration object and pass it to the crawler.
proxy_configuration = ProxyConfiguration(
tiered_proxy_urls=[
# No proxy tier. (Not needed, but optional in case you do not want to use any proxy on lowest tier.)
# No proxy tier.
# Optional in case you do not want to use any proxy on lowest tier.
[None],
# lower tier, cheaper, preferred as long as they work
['http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/'],
[
'http://cheap-datacenter-proxy-1.com/',
'http://cheap-datacenter-proxy-2.com/',
],
# higher tier, more expensive, used as a fallback
['http://expensive-residential-proxy-1.com/', 'http://expensive-residential-proxy-2.com/'],
[
'http://expensive-residential-proxy-1.com/',
'http://expensive-residential-proxy-2.com/',
],
]
)
crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration)
4 changes: 3 additions & 1 deletion docs/guides/code/storages/rq_basic_example.py
@@ -12,7 +12,9 @@ async def main() -> None:
await request_queue.add_request('https://apify.com/')

# Add multiple requests as a batch.
await request_queue.add_requests_batched(['https://crawlee.dev/', 'https://crawlee.dev/python/'])
await request_queue.add_requests_batched(
['https://crawlee.dev/', 'https://crawlee.dev/python/']
)

# Fetch and process requests from the queue.
while request := await request_queue.fetch_next_request():
4 changes: 2 additions & 2 deletions docs/guides/code/storages/rq_with_crawler_example.py
@@ -4,8 +4,8 @@


async def main() -> None:
# Create a new crawler (it can be any subclass of BasicCrawler). Request queue is a default
# request manager, it will be opened, and fully managed if not specified.
# Create a new crawler (it can be any subclass of BasicCrawler). Request queue is
# a default request manager, it will be opened, and fully managed if not specified.
crawler = HttpCrawler()

# Define the default request handler, which will be called for every request.
@@ -10,7 +10,9 @@ async def main() -> None:
request_queue = await RequestQueue.open(name='my-request-queue')

# Interact with the request queue directly, e.g. add a batch of requests.
await request_queue.add_requests_batched(['https://apify.com/', 'https://crawlee.dev/'])
await request_queue.add_requests_batched(
['https://apify.com/', 'https://crawlee.dev/']
)

# Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
# list as request manager to it. It will be managed by the crawler.
4 changes: 3 additions & 1 deletion docs/introduction/code/03_transform_request.py
@@ -4,7 +4,9 @@
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
def transform_request(
request_options: RequestOptions,
) -> RequestOptions | RequestTransformAction:
# Skip requests to PDF files
if request_options['url'].endswith('.pdf'):
return 'skip'
5 changes: 3 additions & 2 deletions docs/introduction/code/04_sanity_check.py
@@ -13,8 +13,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
# the elements we want to interact with are present in the DOM.
await context.page.wait_for_selector('.collection-block-item')

# Execute a function within the browser context to target the collection card elements
# and extract their text content, trimming any leading or trailing whitespace.
# Execute a function within the browser context to target the collection
# card elements and extract their text content, trimming any leading or
# trailing whitespace.
category_texts = await context.page.eval_on_selector_all(
'.collection-block-item',
'(els) => els.map(el => el.textContent.trim())',
4 changes: 3 additions & 1 deletion docs/introduction/code/06_scraping.py
@@ -23,7 +23,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
title = await context.page.locator('.product-meta h1').text_content()

# Extract the SKU using its selector.
sku = await context.page.locator('span.product-meta__sku-number').text_content()
sku = await context.page.locator(
'span.product-meta__sku-number'
).text_content()

# Locate the price element that contains the '$' sign and filter out
# the visually hidden elements.
4 changes: 3 additions & 1 deletion docs/introduction/code/07_final_code.py
@@ -23,7 +23,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
title = await context.page.locator('.product-meta h1').text_content()

# Extract the SKU using its selector.
sku = await context.page.locator('span.product-meta__sku-number').text_content()
sku = await context.page.locator(
'span.product-meta__sku-number'
).text_content()

# Locate the price element that contains the '$' sign and filter out
# the visually hidden elements.
9 changes: 9 additions & 0 deletions docs/pyproject.toml
@@ -0,0 +1,9 @@
# Line length differs from the rest of the code to make sure that the example code
# shown on the generated documentation webpages fits without a horizontal scrollbar,
# keeping it readable.

[tool.ruff]
# Inherit all from project top configuration file.
extend = "../pyproject.toml"

# Override just line length
line-length = 90 # Maximum that fits the docs webpage. Longer lines would need a scrollbar.
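
As a quick sanity check after applying this configuration, a small standalone script like the one below can report any docs example that still exceeds the limit. This is only an illustrative sketch and not part of the PR; it assumes it is run from the repository root and that the examples live under docs/ as shown in this diff.

from pathlib import Path

# Mirrors the `line-length` override in docs/pyproject.toml.
MAX_LINE_LENGTH = 90

for path in sorted(Path('docs').rglob('*.py')):
    lines = path.read_text(encoding='utf-8').splitlines()
    for lineno, line in enumerate(lines, start=1):
        if len(line) > MAX_LINE_LENGTH:
            # Report the offending file, line number, and line length.
            print(f'{path}:{lineno}: {len(line)} characters (limit {MAX_LINE_LENGTH})')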