Skip to content

Commit d1445e4

Browse files
authored
feat: mask Playwright's "headless" headers (#545)
### Description

- See the issue description for more information.
- Playwright headless Chromium before:

```json
{ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "Accept-Encoding": "gzip, deflate, br, zstd", "Host": "httpbin.org", "Priority": "u=0, i", "Sec-Ch-Ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"HeadlessChrome\";v=\"128\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"Linux\"", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "?1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/128.0.6613.18 Safari/537.36", "X-Amzn-Trace-Id": "Root=1-66d04117-141b301674c02e4e2136f1f1" }
```

- Playwright headless Chromium after:

```json
{ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Language": "en-US,en;q=0.9", "Host": "httpbin.org", "Priority": "u=0, i", "Sec-Ch-Ua": "\"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"macOS\"", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "?1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "X-Amzn-Trace-Id": "Root=1-66f56373-7367a87b176db2f27025c5d1" }
```

### Issues

- Closes: #401

### Testing

- New unit tests were implemented.

### Checklist

- [x] CI passed
1 parent a2f3ce6 commit d1445e4

15 files changed

Lines changed: 339 additions & 47 deletions

File tree

src/crawlee/_types.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,12 +235,12 @@ def __init__(self, headers: Mapping[str, str] | None = None) -> None:
235235
"""
236236
# Ensure immutability by sorting and fixing the order.
237237
headers = headers or {}
238-
headers = {k.lower(): v for k, v in headers.items()}
238+
headers = {k.capitalize(): v for k, v in headers.items()}
239239
self._headers = dict(sorted(headers.items()))
240240

241241
def __getitem__(self, key: str) -> str:
242242
"""Get the value of a header by its name, case-insensitive."""
243-
return self._headers[key.lower()]
243+
return self._headers[key.capitalize()]
244244

245245
def __iter__(self) -> Iterator[str]:
246246
"""Return an iterator over the header names."""
@@ -261,3 +261,13 @@ def __setitem__(self, key: str, value: str) -> None:
261261
def __delitem__(self, key: str) -> None:
262262
"""Prevent deleting a header, as the object is immutable."""
263263
raise TypeError(f'{self.__class__.__name__} is immutable')
264+
265+
def __or__(self, other: Mapping[str, str]) -> HttpHeaders:
266+
"""Return a new instance of `HttpHeaders` combining this one with another one."""
267+
combined_headers = {**self._headers, **other}
268+
return HttpHeaders(combined_headers)
269+
270+
def __ror__(self, other: Mapping[str, str]) -> HttpHeaders:
271+
"""Support reversed | operation (other | self)."""
272+
combined_headers = {**other, **self._headers}
273+
return HttpHeaders(combined_headers)

src/crawlee/browsers/_base_browser_controller.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from playwright.async_api import Page
1313

14+
from crawlee.browsers._types import BrowserType
1415
from crawlee.proxy_configuration import ProxyInfo
1516

1617

@@ -50,6 +51,11 @@ def has_free_capacity(self) -> bool:
5051
def is_browser_connected(self) -> bool:
5152
"""Return whether the browser is connected."""
5253

54+
@property
55+
@abstractmethod
56+
def browser_type(self) -> BrowserType:
57+
"""Return the type of the browser."""
58+
5359
@abstractmethod
5460
async def new_page(
5561
self,

src/crawlee/browsers/_base_browser_plugin.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
from __future__ import annotations
44

55
from abc import ABC, abstractmethod
6-
from typing import TYPE_CHECKING, Any, Literal
6+
from typing import TYPE_CHECKING, Any
77

88
if TYPE_CHECKING:
99
from collections.abc import Mapping
1010
from types import TracebackType
1111

1212
from crawlee.browsers._base_browser_controller import BaseBrowserController
13+
from crawlee.browsers._types import BrowserType
1314

1415

1516
class BaseBrowserPlugin(ABC):
@@ -24,7 +25,7 @@ class BaseBrowserPlugin(ABC):
2425

2526
@property
2627
@abstractmethod
27-
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
28+
def browser_type(self) -> BrowserType:
2829
"""Return the browser type name."""
2930

3031
@property

src/crawlee/browsers/_browser_pool.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@
77
from collections import defaultdict
88
from datetime import timedelta
99
from logging import getLogger
10-
from typing import TYPE_CHECKING, Any, Literal
10+
from typing import TYPE_CHECKING, Any
1111
from weakref import WeakValueDictionary
1212

1313
from crawlee._utils.crypto import crypto_random_object_id
1414
from crawlee._utils.recurring_task import RecurringTask
1515
from crawlee.browsers._base_browser_controller import BaseBrowserController
1616
from crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin
17-
from crawlee.browsers._types import CrawleePage
17+
from crawlee.browsers._types import BrowserType, CrawleePage
1818

1919
if TYPE_CHECKING:
2020
from collections.abc import Mapping, Sequence
@@ -94,7 +94,7 @@ def with_default_plugin(
9494
cls,
9595
*,
9696
headless: bool | None = None,
97-
browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
97+
browser_type: BrowserType | None = None,
9898
**kwargs: Any,
9999
) -> BrowserPool:
100100
"""Create a new instance with a single `BaseBrowserPlugin` configured with the provided options.

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
from __future__ import annotations
44

55
from datetime import datetime, timedelta, timezone
6-
from typing import TYPE_CHECKING, Any
6+
from typing import TYPE_CHECKING, Any, cast
77

8-
from playwright.async_api import Page
8+
from playwright.async_api import BrowserContext, Page, ProxySettings
99
from typing_extensions import override
1010

1111
from crawlee.browsers._base_browser_controller import BaseBrowserController
12+
from crawlee.browsers._types import BrowserType
13+
from crawlee.fingerprint_suite import HeaderGenerator
1214

1315
if TYPE_CHECKING:
1416
from collections.abc import Mapping
@@ -26,17 +28,29 @@ class PlaywrightBrowserController(BaseBrowserController):
2628
"""
2729

2830
AUTOMATION_LIBRARY = 'playwright'
31+
_DEFAULT_HEADER_GENERATOR = HeaderGenerator()
2932

30-
def __init__(self, browser: Browser, *, max_open_pages_per_browser: int = 20) -> None:
33+
def __init__(
34+
self,
35+
browser: Browser,
36+
*,
37+
max_open_pages_per_browser: int = 20,
38+
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
39+
) -> None:
3140
"""Create a new instance.
3241
3342
Args:
3443
browser: The browser instance to control.
3544
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
45+
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
46+
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
47+
disable automatic header modifications.
3648
"""
3749
self._browser = browser
3850
self._max_open_pages_per_browser = max_open_pages_per_browser
51+
self._header_generator = header_generator
3952

53+
self._browser_context: BrowserContext | None = None
4054
self._pages = list[Page]()
4155
self._last_page_opened_at = datetime.now(timezone.utc)
4256

@@ -70,26 +84,25 @@ def has_free_capacity(self) -> bool:
7084
def is_browser_connected(self) -> bool:
7185
return self._browser.is_connected()
7286

87+
@property
88+
@override
89+
def browser_type(self) -> BrowserType:
90+
return cast(BrowserType, self._browser.browser_type.name)
91+
7392
@override
7493
async def new_page(
7594
self,
7695
page_options: Mapping[str, Any] | None = None,
7796
proxy_info: ProxyInfo | None = None,
7897
) -> Page:
79-
page_options = dict(page_options) if page_options else {}
80-
81-
# If "proxy_info" is provided and no proxy is already set in "page_options", configure the proxy.
82-
if proxy_info and 'proxy' not in page_options:
83-
page_options['proxy'] = {
84-
'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
85-
'username': proxy_info.username,
86-
'password': proxy_info.password,
87-
}
98+
if not self._browser_context:
99+
self._browser_context = await self._create_browser_context(proxy_info)
88100

89101
if not self.has_free_capacity:
90102
raise ValueError('Cannot open more pages in this browser.')
91103

92-
page = await self._browser.new_page(**page_options)
104+
page_options = dict(page_options) if page_options else {}
105+
page = await self._browser_context.new_page(**page_options)
93106

94107
# Handle page close event
95108
page.on(event='close', f=self._on_page_close)
@@ -114,3 +127,31 @@ async def close(self, *, force: bool = False) -> None:
114127
def _on_page_close(self, page: Page) -> None:
115128
"""Handle actions after a page is closed."""
116129
self._pages.remove(page)
130+
131+
async def _create_browser_context(self, proxy_info: ProxyInfo | None = None) -> BrowserContext:
132+
"""Create a new browser context with the specified proxy settings."""
133+
if self._header_generator:
134+
common_headers = self._header_generator.get_common_headers()
135+
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
136+
user_agent_header = self._header_generator.get_user_agent_header(browser_type=self.browser_type)
137+
extra_http_headers = dict(common_headers | sec_ch_ua_headers | user_agent_header)
138+
user_agent = user_agent_header.get('User-Agent')
139+
else:
140+
extra_http_headers = None
141+
user_agent = None
142+
143+
proxy = (
144+
ProxySettings(
145+
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
146+
username=proxy_info.username,
147+
password=proxy_info.password,
148+
)
149+
if proxy_info
150+
else None
151+
)
152+
153+
return await self._browser.new_context(
154+
user_agent=user_agent,
155+
extra_http_headers=extra_http_headers,
156+
proxy=proxy,
157+
)

src/crawlee/browsers/_playwright_browser_plugin.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from __future__ import annotations
44

55
from logging import getLogger
6-
from typing import TYPE_CHECKING, Any, Literal
6+
from typing import TYPE_CHECKING, Any
77

88
from playwright.async_api import Playwright, async_playwright
99
from typing_extensions import override
@@ -15,6 +15,8 @@
1515
from collections.abc import Mapping
1616
from types import TracebackType
1717

18+
from crawlee.browsers._types import BrowserType
19+
1820
logger = getLogger(__name__)
1921

2022

@@ -29,7 +31,7 @@ class PlaywrightBrowserPlugin(BaseBrowserPlugin):
2931
def __init__(
3032
self,
3133
*,
32-
browser_type: Literal['chromium', 'firefox', 'webkit'] = 'chromium',
34+
browser_type: BrowserType = 'chromium',
3335
browser_options: Mapping[str, Any] | None = None,
3436
page_options: Mapping[str, Any] | None = None,
3537
max_open_pages_per_browser: int = 20,
@@ -53,7 +55,7 @@ def __init__(
5355

5456
@property
5557
@override
56-
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
58+
def browser_type(self) -> BrowserType:
5759
return self._browser_type
5860

5961
@property

src/crawlee/browsers/_types.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66
if TYPE_CHECKING:
77
from playwright.async_api import Page
88

9+
BrowserType = Literal['chromium', 'firefox', 'webkit']
10+
911

1012
@dataclass
1113
class CrawleePage:
1214
"""Represents a page object within a browser, with additional metadata for tracking and management."""
1315

1416
id: str
15-
browser_type: Literal['chromium', 'firefox', 'webkit']
17+
browser_type: BrowserType
1618
page: Page

src/crawlee/fingerprint_suite/_consts.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,17 @@
66

77
COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'
88

9+
# Playwright default headers (user-agents and sec-ch) for headless browsers.
10+
PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
11+
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA = '"Not=A?Brand";v="8", "Chromium";v="124", "Google Chrome";v="124"'
12+
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE = '?0'
13+
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM = '"macOS"'
14+
15+
# Default User-Agent sent in place of Playwright's headless Firefox one.
# The Gecko revision token is spelled "rv:<version>"; the colon was missing,
# which made the masking header itself malformed.
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
)
18+
PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15'
19+
920
# Random 1000 user agents from Apify fingerprint dataset.
1021
USER_AGENT_POOL = [
1122
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',

src/crawlee/fingerprint_suite/_header_generator.py

Lines changed: 72 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,86 @@
33
import random
44
from typing import TYPE_CHECKING
55

6-
from crawlee.fingerprint_suite._consts import COMMON_ACCEPT, COMMON_ACCEPT_LANGUAGE, USER_AGENT_POOL
6+
from crawlee._types import HttpHeaders
7+
from crawlee.fingerprint_suite._consts import (
8+
COMMON_ACCEPT,
9+
COMMON_ACCEPT_LANGUAGE,
10+
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA,
11+
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE,
12+
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM,
13+
PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT,
14+
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
15+
PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT,
16+
USER_AGENT_POOL,
17+
)
718

819
if TYPE_CHECKING:
9-
from collections.abc import Mapping
20+
from crawlee.browsers._types import BrowserType
1021

1122

1223
class HeaderGenerator:
13-
"""Generates common headers for HTTP requests."""
24+
"""Generates realistic looking or browser-like HTTP headers."""
1425

15-
def get_common_headers(self) -> Mapping[str, str]:
16-
"""Get common headers for HTTP requests.
26+
def get_common_headers(self) -> HttpHeaders:
27+
"""Get common HTTP headers ("Accept", "Accept-Language").
1728
18-
We do not modify the 'Accept-Encoding', 'Connection' and other headers. They should be included and handled
19-
by the HTTP client.
20-
21-
Returns:
22-
Dictionary containing common headers.
29+
We do not modify the "Accept-Encoding", "Connection" and other headers. They should be included and handled
30+
by the HTTP client or browser.
2331
"""
24-
return {
32+
headers = {
2533
'Accept': COMMON_ACCEPT,
2634
'Accept-Language': COMMON_ACCEPT_LANGUAGE,
27-
'User-Agent': random.choice(USER_AGENT_POOL),
2835
}
36+
return HttpHeaders(headers)
37+
38+
def get_random_user_agent_header(self) -> HttpHeaders:
39+
"""Get a random User-Agent header."""
40+
headers = {'User-Agent': random.choice(USER_AGENT_POOL)}
41+
return HttpHeaders(headers)
42+
43+
def get_user_agent_header(
44+
self,
45+
*,
46+
browser_type: BrowserType = 'chromium',
47+
) -> HttpHeaders:
48+
"""Get the User-Agent header based on the browser type."""
49+
headers = dict[str, str]()
50+
51+
if browser_type == 'chromium':
52+
headers['User-Agent'] = PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT
53+
54+
elif browser_type == 'firefox':
55+
headers['User-Agent'] = PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT
56+
57+
elif browser_type == 'webkit':
58+
headers['User-Agent'] = PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT
59+
60+
else:
61+
raise ValueError(f'Unsupported browser type: {browser_type}')
62+
63+
return HttpHeaders(headers)
64+
65+
def get_sec_ch_ua_headers(
66+
self,
67+
*,
68+
browser_type: BrowserType = 'chromium',
69+
) -> HttpHeaders:
70+
"""Get the Sec-Ch-Ua headers based on the browser type."""
71+
headers = dict[str, str]()
72+
73+
if browser_type == 'chromium':
74+
# Currently, only Chromium uses Sec-Ch-Ua headers.
75+
headers['Sec-Ch-Ua'] = PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA
76+
headers['Sec-Ch-Ua-Mobile'] = PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE
77+
headers['Sec-Ch-Ua-Platform'] = PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM
78+
79+
elif browser_type == 'firefox': # noqa: SIM114
80+
pass
81+
82+
elif browser_type == 'webkit':
83+
pass
84+
85+
else:
86+
raise ValueError(f'Unsupported browser type: {browser_type}')
87+
88+
return HttpHeaders(headers)

0 commit comments

Comments
 (0)