Skip to content

Commit cfc66d9

Browse files
committed
fix: Filter sitemap-derived URLs by enqueue strategy
1 parent ba6e555 commit cfc66d9

7 files changed

Lines changed: 289 additions & 41 deletions

File tree

src/crawlee/_utils/robots.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
from logging import getLogger
44
from typing import TYPE_CHECKING
5+
from urllib.parse import urlparse
56

67
from protego import Protego
78
from yarl import URL
89

910
from crawlee._utils.sitemap import Sitemap
11+
from crawlee._utils.urls import matches_enqueue_strategy
1012
from crawlee._utils.web import is_status_code_client_error
1113

1214
if TYPE_CHECKING:
@@ -90,8 +92,23 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
9092
return bool(self._robots.can_fetch(str(check_url), user_agent))
9193

9294
def get_sitemaps(self) -> list[str]:
93-
"""Get the list of sitemaps urls from the robots.txt file."""
94-
return list(self._robots.sitemaps)
95+
"""Get the list of same-host sitemap URLs from the robots.txt file.
96+
97+
Sitemap entries pointing to a different host than the robots.txt file are filtered out, as required by the
98+
robots.txt specification.
99+
"""
100+
origin_url = str(self._original_url)
101+
parsed_origin = urlparse(origin_url)
102+
same_host_sitemaps: list[str] = []
103+
for sitemap_url in self._robots.sitemaps:
104+
if matches_enqueue_strategy('same-hostname', target_url=sitemap_url, origin_url=parsed_origin):
105+
same_host_sitemaps.append(sitemap_url)
106+
else:
107+
logger.warning(
108+
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {origin_url!r}: '
109+
f'cross-host sitemap entries are not allowed by the robots.txt specification.'
110+
)
111+
return same_host_sitemaps
95112

96113
def get_crawl_delay(self, user_agent: str = '*') -> int | None:
97114
"""Get the crawl delay for the given user agent.

src/crawlee/_utils/urls.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
from __future__ import annotations
22

3+
import tempfile
4+
from functools import lru_cache
35
from typing import TYPE_CHECKING
6+
from urllib.parse import ParseResult, urlparse
47

58
from pydantic import AnyHttpUrl, TypeAdapter
9+
from tldextract import TLDExtract
10+
from typing_extensions import assert_never
611
from yarl import URL
712

813
if TYPE_CHECKING:
914
from collections.abc import Iterator
1015
from logging import Logger
1116

17+
from crawlee._types import EnqueueStrategy
18+
1219

1320
def is_url_absolute(url: str) -> bool:
1421
"""Check if a URL is absolute."""
@@ -51,3 +58,52 @@ def validate_http_url(value: str | None) -> str | None:
5158
_http_url_adapter.validate_python(value)
5259

5360
return value
61+
62+
63+
@lru_cache(maxsize=1)
64+
def _get_tld_extractor() -> TLDExtract:
65+
"""Return a lazily-initialized `TLDExtract` instance shared across the module."""
66+
# `mkdtemp` (vs `TemporaryDirectory`) returns a path whose lifetime is tied to the process — `TemporaryDirectory`
67+
# is collected immediately when its return value is discarded, which would race the directory out from under
68+
# tldextract.
69+
return TLDExtract(cache_dir=tempfile.mkdtemp())
70+
71+
72+
def matches_enqueue_strategy(
73+
strategy: EnqueueStrategy,
74+
*,
75+
target_url: str | ParseResult,
76+
origin_url: str | ParseResult,
77+
) -> bool:
78+
"""Check whether `target_url` matches `origin_url` under the given enqueue strategy.
79+
80+
Args:
81+
strategy: The enqueue strategy to apply.
82+
target_url: The URL to be evaluated.
83+
origin_url: The reference URL the target is compared against.
84+
85+
Returns:
86+
`True` if `target_url` is allowed under `strategy` relative to `origin_url`, `False` otherwise.
87+
"""
88+
target = urlparse(target_url) if isinstance(target_url, str) else target_url
89+
origin = urlparse(origin_url) if isinstance(origin_url, str) else origin_url
90+
91+
if strategy == 'all':
92+
return True
93+
94+
if origin.hostname is None or target.hostname is None:
95+
return False
96+
97+
if strategy == 'same-hostname':
98+
return target.hostname == origin.hostname
99+
100+
if strategy == 'same-domain':
101+
extractor = _get_tld_extractor()
102+
origin_domain = extractor.extract_str(origin.hostname).top_domain_under_public_suffix
103+
target_domain = extractor.extract_str(target.hostname).top_domain_under_public_suffix
104+
return origin_domain == target_domain
105+
106+
if strategy == 'same-origin':
107+
return target.hostname == origin.hostname and target.scheme == origin.scheme and target.port == origin.port
108+
109+
assert_never(strategy)

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import logging
77
import signal
88
import sys
9-
import tempfile
109
import threading
1110
import traceback
1211
from asyncio import CancelledError
@@ -21,8 +20,7 @@
2120
from weakref import WeakKeyDictionary
2221

2322
from cachetools import LRUCache
24-
from tldextract import TLDExtract
25-
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
23+
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack
2624
from yarl import URL
2725

2826
from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
@@ -47,7 +45,7 @@
4745
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
4846
from crawlee._utils.recurring_task import RecurringTask
4947
from crawlee._utils.robots import RobotsTxtFile
50-
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
48+
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, matches_enqueue_strategy
5149
from crawlee._utils.wait import wait_for
5250
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
5351
from crawlee.errors import (
@@ -484,7 +482,6 @@ async def persist_state_factory() -> KeyValueStore:
484482
# Internal, not explicitly configurable components
485483
self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
486484
self._robots_txt_lock = asyncio.Lock()
487-
self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
488485
self._snapshotter = Snapshotter.from_config(config)
489486
self._autoscaled_pool = AutoscaledPool(
490487
system_status=SystemStatus(self._snapshotter),
@@ -1094,32 +1091,13 @@ def _check_enqueue_strategy(
10941091
origin_url: ParseResult,
10951092
) -> bool:
10961093
"""Check if a URL matches the enqueue_strategy."""
1097-
if strategy == 'all':
1098-
return True
1099-
1100-
if origin_url.hostname is None or target_url.hostname is None:
1094+
if strategy != 'all' and (origin_url.hostname is None or target_url.hostname is None):
11011095
self.log.debug(
11021096
f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or '
11031097
f'target_url = {target_url.geturl()}'
11041098
)
1105-
return False
1106-
1107-
if strategy == 'same-hostname':
1108-
return target_url.hostname == origin_url.hostname
1109-
1110-
if strategy == 'same-domain':
1111-
origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
1112-
target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
1113-
return origin_domain == target_domain
1114-
1115-
if strategy == 'same-origin':
1116-
return (
1117-
target_url.hostname == origin_url.hostname
1118-
and target_url.scheme == origin_url.scheme
1119-
and target_url.port == origin_url.port
1120-
)
11211099

1122-
assert_never(strategy)
1100+
return matches_enqueue_strategy(strategy, target_url=target_url, origin_url=origin_url)
11231101

11241102
def _check_url_patterns(
11251103
self,

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from contextlib import suppress
66
from logging import getLogger
77
from typing import TYPE_CHECKING, Annotated, Any
8+
from urllib.parse import urlparse
89

910
from pydantic import BaseModel, ConfigDict, Field
1011
from typing_extensions import override
@@ -14,6 +15,7 @@
1415
from crawlee._utils.globs import Glob
1516
from crawlee._utils.recoverable_state import RecoverableState
1617
from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
18+
from crawlee._utils.urls import matches_enqueue_strategy
1719
from crawlee.request_loaders._request_loader import RequestLoader
1820

1921
if TYPE_CHECKING:
@@ -22,6 +24,7 @@
2224
from types import TracebackType
2325

2426
from crawlee import RequestTransformAction
27+
from crawlee._types import EnqueueStrategy
2528
from crawlee.http_clients import HttpClient
2629
from crawlee.proxy_configuration import ProxyInfo
2730
from crawlee.storage_clients.models import ProcessedRequest
@@ -111,6 +114,7 @@ def __init__(
111114
proxy_info: ProxyInfo | None = None,
112115
include: list[re.Pattern[Any] | Glob] | None = None,
113116
exclude: list[re.Pattern[Any] | Glob] | None = None,
117+
enqueue_strategy: EnqueueStrategy = 'same-hostname',
114118
max_buffer_size: int = 200,
115119
persist_state_key: str | None = None,
116120
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -122,6 +126,10 @@ def __init__(
122126
proxy_info: Optional proxy to use for fetching sitemaps.
123127
include: List of glob or regex patterns to include URLs.
124128
exclude: List of glob or regex patterns to exclude URLs.
129+
enqueue_strategy: Strategy used to decide which sitemap-derived URLs (both nested-sitemap entries and
130+
URL entries) are kept relative to the parent sitemap URL. Defaults to `'same-hostname'`, matching
131+
the sitemap protocol's same-host expectation and the `enqueue_links` default; pass `'all'` to
132+
disable filtering.
125133
max_buffer_size: Maximum number of URLs to buffer in memory.
126134
http_client: the instance of `HttpClient` to use for fetching sitemaps.
127135
persist_state_key: A key for persisting the loader's state in the KeyValueStore.
@@ -135,6 +143,7 @@ def __init__(
135143
self._sitemap_urls = sitemap_urls
136144
self._include = include
137145
self._exclude = exclude
146+
self._enqueue_strategy = enqueue_strategy
138147
self._proxy_info = proxy_info
139148
self._max_buffer_size = max_buffer_size
140149
self._transform_request_function = transform_request_function
@@ -235,6 +244,9 @@ async def _load_sitemaps(self) -> None:
235244
state.in_progress_sitemap_url = sitemap_url
236245

237246
parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
247+
# Parse the parent sitemap URL once per outer iteration; `matches_enqueue_strategy` is called per
248+
# entry below, and re-parsing the same string thousands of times for large sitemaps is wasteful.
249+
parsed_sitemap_url = urlparse(sitemap_url)
238250

239251
async for item in parse_sitemap(
240252
[SitemapSource(type='url', url=sitemap_url)],
@@ -245,6 +257,14 @@ async def _load_sitemaps(self) -> None:
245257
if isinstance(item, NestedSitemap):
246258
# Add nested sitemap to queue
247259
if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
260+
if not matches_enqueue_strategy(
261+
self._enqueue_strategy, target_url=item.loc, origin_url=parsed_sitemap_url
262+
):
263+
logger.warning(
264+
f'Skipping nested sitemap {item.loc!r}: does not match enqueue strategy '
265+
f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
266+
)
267+
continue
248268
state.pending_sitemap_urls.append(item.loc)
249269
continue
250270

@@ -261,6 +281,15 @@ async def _load_sitemaps(self) -> None:
261281
if not self._check_url_patterns(url, self._include, self._exclude):
262282
continue
263283

284+
if not matches_enqueue_strategy(
285+
self._enqueue_strategy, target_url=url, origin_url=parsed_sitemap_url
286+
):
287+
logger.warning(
288+
f'Skipping sitemap URL {url!r}: does not match enqueue strategy '
289+
f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
290+
)
291+
continue
292+
264293
# Check if we have capacity in the queue
265294
await self._queue_has_capacity.wait()
266295

@@ -326,7 +355,7 @@ async def fetch_next_request(self) -> Request | None:
326355
continue
327356

328357
url = state.url_queue.popleft()
329-
request_option = RequestOptions(url=url)
358+
request_option = RequestOptions(url=url, enqueue_strategy=self._enqueue_strategy)
330359

331360
if len(state.url_queue) < self._max_buffer_size:
332361
self._queue_has_capacity.set()

tests/unit/_utils/test_robots.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111

1212

1313
async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None:
14+
"""`RobotsTxtFile.find` constructs the correct /robots.txt URL and successfully parses the response."""
1415
robots_file = await RobotsTxtFile.find(str(server_url), http_client)
15-
assert len(robots_file.get_sitemaps()) > 0
16+
# The fixture's robots.txt disallows /deny_all/ — proves the file was fetched and parsed.
17+
assert not robots_file.is_allowed(str(server_url / 'deny_all/page.html'))
1618

1719

1820
async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None:
@@ -24,9 +26,33 @@ async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClien
2426

2527

2628
async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None:
29+
"""Cross-host sitemap entries are dropped from the test fixture's robots.txt."""
2730
robots = await RobotsTxtFile.find(str(server_url), http_client)
28-
assert len(robots.get_sitemaps()) == 2
29-
assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'}
31+
# The fixture lists `http://not-exists.com/sitemap_*.xml`, which is cross-host relative to `server_url` and
32+
# therefore filtered out per the robots.txt specification.
33+
assert robots.get_sitemaps() == []
34+
35+
36+
async def test_extract_same_host_sitemaps_urls() -> None:
37+
"""Sitemap entries on the same host as the robots.txt are returned."""
38+
content = 'User-agent: *\nSitemap: http://example.com/sitemap_1.xml\nSitemap: http://example.com/sitemap_2.xml\n'
39+
robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
40+
assert set(robots.get_sitemaps()) == {
41+
'http://example.com/sitemap_1.xml',
42+
'http://example.com/sitemap_2.xml',
43+
}
44+
45+
46+
async def test_extract_sitemaps_urls_filters_cross_host() -> None:
47+
"""Cross-host `Sitemap:` directives in robots.txt are silently filtered."""
48+
content = (
49+
'User-agent: *\n'
50+
'Sitemap: http://example.com/legit.xml\n'
51+
'Sitemap: http://other.test/payload.xml\n'
52+
'Sitemap: gopher://internal:6379/_PING\n'
53+
)
54+
robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
55+
assert robots.get_sitemaps() == ['http://example.com/legit.xml']
3056

3157

3258
async def test_parse_from_content() -> None:

tests/unit/_utils/test_urls.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING
4+
35
import pytest
46
from pydantic import ValidationError
57

6-
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, validate_http_url
8+
from crawlee._utils.urls import (
9+
convert_to_absolute_url,
10+
is_url_absolute,
11+
matches_enqueue_strategy,
12+
validate_http_url,
13+
)
14+
15+
if TYPE_CHECKING:
16+
from crawlee._types import EnqueueStrategy
717

818

919
def test_is_url_absolute() -> None:
@@ -55,3 +65,30 @@ def test_validate_http_url() -> None:
5565
def test_validate_http_url_rejects_non_http_scheme(invalid_url: str) -> None:
5666
with pytest.raises(ValidationError):
5767
validate_http_url(invalid_url)
68+
69+
70+
@pytest.mark.parametrize(
71+
('strategy', 'origin', 'target', 'expected'),
72+
[
73+
# 'all' lets everything through, even with empty/cross-host targets
74+
('all', 'https://example.com/', 'https://other.test/', True),
75+
('all', 'https://example.com/', 'gopher://internal:6379/_PING', True),
76+
# 'same-hostname' is exact host equality
77+
('same-hostname', 'https://example.com/a', 'https://example.com/b', True),
78+
('same-hostname', 'https://example.com/', 'https://www.example.com/', False),
79+
('same-hostname', 'https://example.com/', 'https://other.test/', False),
80+
# 'same-domain' allows subdomains under the same registrable domain
81+
('same-domain', 'https://example.com/', 'https://www.example.com/', True),
82+
('same-domain', 'https://example.com/', 'https://api.example.com/', True),
83+
('same-domain', 'https://example.com/', 'https://other.test/', False),
84+
# 'same-origin' requires scheme + host + port match
85+
('same-origin', 'https://example.com/', 'https://example.com/path', True),
86+
('same-origin', 'https://example.com/', 'http://example.com/', False),
87+
('same-origin', 'https://example.com/', 'https://example.com:8443/', False),
88+
# missing hostname rejects everything except 'all'
89+
('same-hostname', 'https://example.com/', 'not-a-url', False),
90+
('same-domain', 'not-a-url', 'https://example.com/', False),
91+
],
92+
)
93+
def test_matches_enqueue_strategy(strategy: EnqueueStrategy, origin: str, target: str, *, expected: bool) -> None:
94+
assert matches_enqueue_strategy(strategy, target_url=target, origin_url=origin) is expected

0 commit comments

Comments
 (0)