12 changes: 8 additions & 4 deletions docs/examples/code/adaptive_playwright_crawler.py
@@ -15,7 +15,9 @@ async def main() -> None:
)

@crawler.router.handler(label='label')
async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
async def request_handler_for_label(
context: AdaptivePlaywrightCrawlingContext,
) -> None:
# Do some processing using `page`
some_locator = context.page.locator('div').first
await some_locator.wait_for()
@@ -35,8 +37,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
@crawler.pre_navigation_hook
async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
"""Hook executed both in static sub crawler and playwright sub crawler."""
# Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled
# without playwright.
# Trying to access context.page in this hook would raise `AdaptiveContextError`
# for pages crawled without playwright.
context.log.info(f'pre navigation hook for: {context.request.url} ...')

@crawler.pre_navigation_hook(playwright_only=True)
@@ -47,7 +49,9 @@ async def some_routing_function(route: Route) -> None:
await route.continue_()

await context.page.route('*/**', some_routing_function)
context.log.info(f'Playwright only pre navigation hook for: {context.request.url} ...')
context.log.info(
f'Playwright only pre navigation hook for: {context.request.url} ...'
)

# Run the crawler with the initial list of URLs.
await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])
6 changes: 5 additions & 1 deletion docs/examples/code/beautifulsoup_crawler.py
@@ -1,7 +1,11 @@
import asyncio
from datetime import timedelta

from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.crawlers import (
BasicCrawlingContext,
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
)


async def main() -> None:
25 changes: 17 additions & 8 deletions docs/examples/code/beautifulsoup_crawler_keep_alive.py
@@ -6,20 +6,25 @@

async def main() -> None:
crawler = BeautifulSoupCrawler(
# Keep the crawler alive even when there are no requests to be processed at the moment.
# Keep the crawler alive even when there are no requests to be processed now.
keep_alive=True,
)

def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:
"""Stop crawler once specific url is visited. Just an example of guard condition to stop the crawler."""
"""Stop crawler once specific url is visited.

Example of guard condition to stop the crawler."""
if context.request.url == 'https://crawlee.dev/docs/examples':
crawler.stop('Stop crawler that was in keep_alive state after specific url was visited')
crawler.stop(
'Stop crawler that was in keep_alive state after specific url was visited'
)
else:
context.log.info('keep_alive=True, waiting for more requests to come.')

async def add_request_later(url: str, after_s: int) -> None:
"""Add requests to the queue after some time. This can be done by external code."""
# Just an example of request being added to the crawler later, when it is waiting due to `keep_alive=True`.
"""Add requests to the queue after some time. Can be done by external code."""
# Just an example of request being added to the crawler later,
# when it is waiting due to `keep_alive=True`.
await asyncio.sleep(after_s)
await crawler.add_requests([url])

@@ -33,11 +38,15 @@ async def request_handler(context: BasicCrawlingContext) -> None:

# Start some tasks that will add some requests later to simulate real situation,
# where requests are added later by external code.
add_request_later_task1 = asyncio.create_task(add_request_later(url='https://crawlee.dev', after_s=1))
add_request_later_task2 = asyncio.create_task(add_request_later(url='https://crawlee.dev/docs/examples', after_s=5))
add_request_later_task1 = asyncio.create_task(
add_request_later(url='https://crawlee.dev', after_s=1)
)
add_request_later_task2 = asyncio.create_task(
add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)
)

# Run the crawler without the initial list of requests.
# It will wait for more requests to be added to the queue later due to `keep_alive=True`.
# Wait for more requests to be added to the queue later due to `keep_alive=True`.
await crawler.run()

await asyncio.gather(add_request_later_task1, add_request_later_task2)
4 changes: 3 additions & 1 deletion docs/examples/code/beautifulsoup_crawler_stop.py
@@ -20,7 +20,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:

# Create custom condition to stop crawler once it finds what it is looking for.
if 'crawlee' in context.request.url:
crawler.stop(reason='Manual stop of crawler after finding `crawlee` in the url.')
crawler.stop(
reason='Manual stop of crawler after finding `crawlee` in the url.'
)

# Extract data from the page.
data = {
@@ -15,8 +15,8 @@ async def main() -> None:
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Enqueue all links found on the page. Any URLs found will be matched by this strategy,
# even if they go off the site you are currently crawling.
# Enqueue all links found on the page. Any URLs found will be matched by
# this strategy, even if they go off the site you are currently crawling.
await context.enqueue_links(strategy=EnqueueStrategy.ALL)

# Run the crawler with the initial list of requests.
6 changes: 5 additions & 1 deletion docs/examples/code/playwright_block_requests.py
@@ -1,6 +1,10 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
6 changes: 5 additions & 1 deletion docs/examples/code/playwright_crawler.py
@@ -1,6 +1,10 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
23 changes: 16 additions & 7 deletions docs/examples/code/playwright_crawler_with_camoufox.py
@@ -4,31 +4,40 @@
from camoufox import AsyncNewBrowser
from typing_extensions import override

from crawlee.browsers import BrowserPool, PlaywrightBrowserController, PlaywrightBrowserPlugin
from crawlee.browsers import (
BrowserPool,
PlaywrightBrowserController,
PlaywrightBrowserPlugin,
)
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


class CamoufoxPlugin(PlaywrightBrowserPlugin):
"""Example browser plugin that uses Camoufox browser, but otherwise keeps the functionality of
PlaywrightBrowserPlugin."""
"""Example browser plugin that uses Camoufox browser,
but otherwise keeps the functionality of PlaywrightBrowserPlugin.
"""

@override
async def new_browser(self) -> PlaywrightBrowserController:
if not self._playwright:
raise RuntimeError('Playwright browser plugin is not initialized.')

return PlaywrightBrowserController(
browser=await AsyncNewBrowser(self._playwright, **self._browser_launch_options),
max_open_pages_per_browser=1, # Increase, if camoufox can handle it in your use case.
header_generator=None, # This turns off the crawlee header_generation. Camoufox has its own.
browser=await AsyncNewBrowser(
self._playwright, **self._browser_launch_options
),
# Increase, if camoufox can handle it in your use case.
max_open_pages_per_browser=1,
# This turns off the crawlee header_generation. Camoufox has its own.
header_generator=None,
)


async def main() -> None:
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
# Custom browser pool. This gives users full control over browsers used by the crawler.
# Custom browser pool. Gives users full control over browsers used by the crawler.
browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]),
)

@@ -1,12 +1,16 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, HeaderGeneratorOptions, ScreenOptions
from crawlee.fingerprint_suite import (
DefaultFingerprintGenerator,
HeaderGeneratorOptions,
ScreenOptions,
)


async def main() -> None:
# Use default fingerprint generator with desired fingerprint options.
# Generator will try to generate real looking browser fingerprint based on the options.
# Generator will generate real looking browser fingerprint based on the options.
# Unspecified fingerprint options will be automatically selected by the generator.
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
@@ -9,7 +9,8 @@ async def main() -> None:
browser_type='chromium',
# Browser launch options
browser_launch_options={
# For support `msedge` channel you need to install it `playwright install msedge`
# For support `msedge` channel you need to install it
# `playwright install msedge`
'channel': 'msedge',
'slow_mo': 200,
},
14 changes: 11 additions & 3 deletions docs/guides/code/playwright_crawler/multiple_launch_example.py
@@ -6,8 +6,12 @@

async def main() -> None:
# Create a plugin for each required browser.
plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium', max_open_pages_per_browser=1)
plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox', max_open_pages_per_browser=1)
plugin_chromium = PlaywrightBrowserPlugin(
browser_type='chromium', max_open_pages_per_browser=1
)
plugin_firefox = PlaywrightBrowserPlugin(
browser_type='firefox', max_open_pages_per_browser=1
)

crawler = PlaywrightCrawler(
browser_pool=BrowserPool(plugins=[plugin_chromium, plugin_firefox]),
@@ -17,7 +21,11 @@ async def main() -> None:

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
browser_name = context.page.context.browser.browser_type.name if context.page.context.browser else 'undefined'
browser_name = (
context.page.context.browser.browser_type.name
if context.page.context.browser
else 'undefined'
)
context.log.info(f'Processing {context.request.url} with {browser_name} ...')

await context.enqueue_links()
@@ -1,6 +1,10 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)


async def main() -> None:
13 changes: 10 additions & 3 deletions docs/guides/code/proxy_management/tiers_bs_example.py
@@ -8,12 +8,19 @@ async def main() -> None:
# Create a ProxyConfiguration object and pass it to the crawler.
proxy_configuration = ProxyConfiguration(
tiered_proxy_urls=[
# No proxy tier. (Not needed, but optional in case you do not want to use any proxy on lowest tier.)
# No proxy tier.
# Optional in case you do not want to use any proxy on lowest tier.
[None],
# lower tier, cheaper, preferred as long as they work
['http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/'],
[
'http://cheap-datacenter-proxy-1.com/',
'http://cheap-datacenter-proxy-2.com/',
],
# higher tier, more expensive, used as a fallback
['http://expensive-residential-proxy-1.com/', 'http://expensive-residential-proxy-2.com/'],
[
'http://expensive-residential-proxy-1.com/',
'http://expensive-residential-proxy-2.com/',
],
]
)
crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration)
13 changes: 10 additions & 3 deletions docs/guides/code/proxy_management/tiers_pw_example.py
@@ -8,12 +8,19 @@ async def main() -> None:
# Create a ProxyConfiguration object and pass it to the crawler.
proxy_configuration = ProxyConfiguration(
tiered_proxy_urls=[
# No proxy tier. (Not needed, but optional in case you do not want to use any proxy on lowest tier.)
# No proxy tier.
# Optional in case you do not want to use any proxy on lowest tier.
[None],
# lower tier, cheaper, preferred as long as they work
['http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/'],
[
'http://cheap-datacenter-proxy-1.com/',
'http://cheap-datacenter-proxy-2.com/',
],
# higher tier, more expensive, used as a fallback
['http://expensive-residential-proxy-1.com/', 'http://expensive-residential-proxy-2.com/'],
[
'http://expensive-residential-proxy-1.com/',
'http://expensive-residential-proxy-2.com/',
],
]
)
crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration)
4 changes: 3 additions & 1 deletion docs/guides/code/storages/rq_basic_example.py
@@ -12,7 +12,9 @@ async def main() -> None:
await request_queue.add_request('https://apify.com/')

# Add multiple requests as a batch.
await request_queue.add_requests_batched(['https://crawlee.dev/', 'https://crawlee.dev/python/'])
await request_queue.add_requests_batched(
['https://crawlee.dev/', 'https://crawlee.dev/python/']
)

# Fetch and process requests from the queue.
while request := await request_queue.fetch_next_request():
4 changes: 2 additions & 2 deletions docs/guides/code/storages/rq_with_crawler_example.py
@@ -4,8 +4,8 @@


async def main() -> None:
# Create a new crawler (it can be any subclass of BasicCrawler). Request queue is a default
# request manager, it will be opened, and fully managed if not specified.
# Create a new crawler (it can be any subclass of BasicCrawler). Request queue is
# a default request manager, it will be opened, and fully managed if not specified.
crawler = HttpCrawler()

# Define the default request handler, which will be called for every request.
@@ -10,7 +10,9 @@ async def main() -> None:
request_queue = await RequestQueue.open(name='my-request-queue')

# Interact with the request queue directly, e.g. add a batch of requests.
await request_queue.add_requests_batched(['https://apify.com/', 'https://crawlee.dev/'])
await request_queue.add_requests_batched(
['https://apify.com/', 'https://crawlee.dev/']
)

# Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
# list as request manager to it. It will be managed by the crawler.
4 changes: 3 additions & 1 deletion docs/introduction/code/03_transform_request.py
@@ -4,7 +4,9 @@
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
def transform_request(
request_options: RequestOptions,
) -> RequestOptions | RequestTransformAction:
# Skip requests to PDF files
if request_options['url'].endswith('.pdf'):
return 'skip'
5 changes: 3 additions & 2 deletions docs/introduction/code/04_sanity_check.py
@@ -13,8 +13,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
# the elements we want to interact with are present in the DOM.
await context.page.wait_for_selector('.collection-block-item')

# Execute a function within the browser context to target the collection card elements
# and extract their text content, trimming any leading or trailing whitespace.
# Execute a function within the browser context to target the collection
# card elements and extract their text content, trimming any leading or
# trailing whitespace.
category_texts = await context.page.eval_on_selector_all(
'.collection-block-item',
'(els) => els.map(el => el.textContent.trim())',
4 changes: 3 additions & 1 deletion docs/introduction/code/06_scraping.py
@@ -23,7 +23,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
title = await context.page.locator('.product-meta h1').text_content()

# Extract the SKU using its selector.
sku = await context.page.locator('span.product-meta__sku-number').text_content()
sku = await context.page.locator(
'span.product-meta__sku-number'
).text_content()

# Locate the price element that contains the '$' sign and filter out
# the visually hidden elements.
4 changes: 3 additions & 1 deletion docs/introduction/code/07_final_code.py
@@ -23,7 +23,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
title = await context.page.locator('.product-meta h1').text_content()

# Extract the SKU using its selector.
sku = await context.page.locator('span.product-meta__sku-number').text_content()
sku = await context.page.locator(
'span.product-meta__sku-number'
).text_content()

# Locate the price element that contains the '$' sign and filter out
# the visually hidden elements.
9 changes: 9 additions & 0 deletions docs/pyproject.toml
@@ -0,0 +1,9 @@
# Line length differs from the rest of the code to make sure that the example code
# shown on the generated documentation webpages fits without a horizontal scrollbar,
# keeping it readable.

[tool.ruff]
# Inherit all from project top configuration file.
extend = "../pyproject.toml"

# Override just line length
line-length = 90 # Maximum that fits the docs webpage. Longer lines would need a scrollbar.
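
As a quick sanity check after applying this configuration, a small standalone script like the one below can report any docs example that still exceeds the limit. This is only an illustrative sketch and not part of the PR; it assumes it is run from the repository root and that the examples live under docs/ as shown in this diff.

from pathlib import Path

# Mirrors the `line-length` override in docs/pyproject.toml.
MAX_LINE_LENGTH = 90

for path in sorted(Path('docs').rglob('*.py')):
    lines = path.read_text(encoding='utf-8').splitlines()
    for lineno, line in enumerate(lines, start=1):
        if len(line) > MAX_LINE_LENGTH:
            # Report the offending file, line number, and line length.
            print(f'{path}:{lineno}: {len(line)} characters (limit {MAX_LINE_LENGTH})')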