Skip to content

Commit 39b2d24

Browse files
authored
feat(cli): Add Adaptive and Stagehand crawler templates (#1888)
Adds three crawler types to the cookiecutter project template and to the scheduled E2E template tests:

- `adaptive-beautifulsoup` — uses `AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()`.
- `adaptive-parsel` — uses `AdaptivePlaywrightCrawler.with_parsel_static_parser()`.
- `stagehand` — uses `StagehandCrawler`. The template ships a placeholder API key; the E2E test only validates the build and skips the run.
1 parent 981f937 commit 39b2d24

14 files changed

Lines changed: 147 additions & 22 deletions

File tree

.github/workflows/on_schedule_tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
fail-fast: false
2828
max-parallel: 12
2929
matrix:
30-
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"]
30+
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel", "stagehand"]
3131
http-client: ["httpx", "curl_impersonate", "impit"]
3232
package-manager: ["pip", "uv", "poetry"]
3333

src/crawlee/project_template/cookiecutter.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"project_name": "crawlee-python-project",
33
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
4-
"crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"],
4+
"crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit", "stagehand"],
55
"__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
66
"http_client": ["impit", "httpx", "curl-impersonate"],
77
"package_manager": ["poetry", "pip", "uv"],

src/crawlee/project_template/hooks/post_gen_project.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# % set needs_playwright = cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand'
12
import platform
23
import subprocess
34
from pathlib import Path
@@ -12,7 +13,7 @@
1213
subprocess.check_call(['uv', 'sync'])
1314
# % endif
1415

15-
# % if cookiecutter.crawler_type == 'playwright'
16+
# % if needs_playwright
1617
manager = "{{ cookiecutter.package_manager }}"
1718
subprocess.check_call([manager, 'run', 'playwright', 'install'])
1819
# % endif
@@ -38,7 +39,7 @@
3839
subprocess.check_output([str(path / 'pip'), 'freeze']).decode()
3940
)
4041

41-
# % if cookiecutter.crawler_type == 'playwright'
42+
# % if needs_playwright
4243
subprocess.check_call([str(path / 'playwright'), 'install'])
4344
# % endif
4445
# % endif
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# % extends 'main.py'
2+
3+
# % block import
4+
from crawlee.crawlers import AdaptivePlaywrightCrawler
5+
# % endblock
6+
7+
# % block instantiation
8+
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
9+
request_handler=router,
10+
max_requests_per_crawl=10,
11+
{{ self.http_client_instantiation() }})
12+
# % endblock
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# % extends 'main.py'
2+
3+
# % block import
4+
from crawlee.crawlers import AdaptivePlaywrightCrawler
5+
# % endblock
6+
7+
# % block instantiation
8+
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
9+
request_handler=router,
10+
max_requests_per_crawl=10,
11+
{{ self.http_client_instantiation() }})
12+
# % endblock
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# % extends 'main.py'
2+
3+
# % block import
4+
import os
5+
6+
from crawlee.browsers import StagehandOptions
7+
from crawlee.crawlers import StagehandCrawler
8+
# % endblock
9+
10+
# % block instantiation
11+
model_api_key = os.environ.get('OPENAI_API_KEY')
12+
if model_api_key is None:
13+
raise ValueError('The OPENAI_API_KEY environment variable is not set.')
14+
15+
crawler = StagehandCrawler(
16+
request_handler=router,
17+
headless=True,
18+
max_requests_per_crawl=10,
19+
stagehand_options=StagehandOptions(
20+
model_api_key=model_api_key,
21+
),
22+
{{ self.http_client_instantiation() }})
23+
# % endblock
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
2+
from crawlee.router import Router
3+
4+
router = Router[AdaptivePlaywrightCrawlingContext]()
5+
6+
7+
@router.default_handler
8+
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
9+
"""Default request handler."""
10+
context.log.info(f'Processing {context.request.url} ...')
11+
title = context.parsed_content.find('title')
12+
await context.push_data(
13+
{
14+
'url': context.request.loaded_url,
15+
'title': title.text if title else None,
16+
}
17+
)
18+
19+
await context.enqueue_links()
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
2+
from crawlee.router import Router
3+
4+
router = Router[AdaptivePlaywrightCrawlingContext]()
5+
6+
7+
@router.default_handler
8+
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
9+
"""Default request handler."""
10+
context.log.info(f'Processing {context.request.url} ...')
11+
title = context.parsed_content.xpath('//title/text()').get()
12+
await context.push_data(
13+
{
14+
'url': context.request.loaded_url,
15+
'title': title,
16+
}
17+
)
18+
19+
await context.enqueue_links()
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from crawlee.crawlers import StagehandCrawlingContext
2+
from crawlee.router import Router
3+
4+
router = Router[StagehandCrawlingContext]()
5+
6+
7+
@router.default_handler
8+
async def default_handler(context: StagehandCrawlingContext) -> None:
9+
"""Default request handler."""
10+
context.log.info(f'Processing {context.request.url} ...')
11+
12+
data = await context.page.extract(instruction='Get the page title and main heading.')
13+
14+
await context.push_data(
15+
{
16+
'url': context.request.loaded_url,
17+
'data': data.model_dump(),
18+
}
19+
)
20+
21+
await context.enqueue_links()

src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
11
# First, specify the base Docker image.
22
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
33
# You can also use any other image from Docker Hub.
4-
# % if cookiecutter.crawler_type == 'playwright'
5-
FROM apify/actor-python-playwright:3.13
4+
# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand'
5+
# % set base_image = 'apify/actor-python-playwright:3.13'
66
# % elif cookiecutter.crawler_type == 'playwright-camoufox'
7-
FROM apify/actor-python-playwright-camoufox:3.13
7+
# % set base_image = 'apify/actor-python-playwright-camoufox:3.13'
88
# % elif cookiecutter.crawler_type == 'playwright-chrome'
9-
FROM apify/actor-python-playwright-chrome:3.13
9+
# % set base_image = 'apify/actor-python-playwright-chrome:3.13'
1010
# % elif cookiecutter.crawler_type == 'playwright-firefox'
11-
FROM apify/actor-python-playwright-firefox:3.13
11+
# % set base_image = 'apify/actor-python-playwright-firefox:3.13'
1212
# % elif cookiecutter.crawler_type == 'playwright-webkit'
13-
FROM apify/actor-python-playwright-webkit:3.13
13+
# % set base_image = 'apify/actor-python-playwright-webkit:3.13'
1414
# % else
15-
FROM apify/actor-python:3.13
15+
# % set base_image = 'apify/actor-python:3.13'
1616
# % endif
17+
FROM {{ base_image }}
1718

1819
RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*
1920

0 commit comments

Comments
 (0)