fix: Filter sitemap-derived URLs by enqueue strategy

vdusek · vdusek · commit cfc66d92ef22 · 2026-04-30T17:17:13.000+02:00
diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py
@@ -2,11 +2,13 @@
 
 from logging import getLogger
 from typing import TYPE_CHECKING
+from urllib.parse import urlparse
 
 from protego import Protego
 from yarl import URL
 
 from crawlee._utils.sitemap import Sitemap
+from crawlee._utils.urls import matches_enqueue_strategy
 from crawlee._utils.web import is_status_code_client_error
 
 if TYPE_CHECKING:
@@ -90,8 +92,23 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
         return bool(self._robots.can_fetch(str(check_url), user_agent))
 
     def get_sitemaps(self) -> list[str]:
-        """Get the list of sitemaps urls from the robots.txt file."""
-        return list(self._robots.sitemaps)
+        """Get the list of same-host sitemap URLs from the robots.txt file.
+
+        Sitemap entries pointing to a different host than the robots.txt file are filtered out, as required by the
+        robots.txt specification.
+        """
+        origin_url = str(self._original_url)
+        parsed_origin = urlparse(origin_url)
+        same_host_sitemaps: list[str] = []
+        for sitemap_url in self._robots.sitemaps:
+            if matches_enqueue_strategy('same-hostname', target_url=sitemap_url, origin_url=parsed_origin):
+                same_host_sitemaps.append(sitemap_url)
+            else:
+                logger.warning(
+                    f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {origin_url!r}: '
+                    f'cross-host sitemap entries are not allowed by the robots.txt specification.'
+                )
+        return same_host_sitemaps
 
     def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         """Get the crawl delay for the given user agent.
diff --git a/src/crawlee/_utils/urls.py b/src/crawlee/_utils/urls.py
@@ -1,14 +1,21 @@
 from __future__ import annotations
 
+import tempfile
+from functools import lru_cache
 from typing import TYPE_CHECKING
+from urllib.parse import ParseResult, urlparse
 
 from pydantic import AnyHttpUrl, TypeAdapter
+from tldextract import TLDExtract
+from typing_extensions import assert_never
 from yarl import URL
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
     from logging import Logger
 
+    from crawlee._types import EnqueueStrategy
+
 
 def is_url_absolute(url: str) -> bool:
     """Check if a URL is absolute."""
@@ -51,3 +58,52 @@ def validate_http_url(value: str | None) -> str | None:
         _http_url_adapter.validate_python(value)
 
     return value
+
+
+@lru_cache(maxsize=1)
+def _get_tld_extractor() -> TLDExtract:
+    """Return a lazily-initialized `TLDExtract` instance shared across the module."""
+    # `mkdtemp` (vs `TemporaryDirectory`) creates a directory that is never removed automatically, so it stays valid
+    # for the whole process. A discarded `TemporaryDirectory` is cleaned up on garbage collection, which would delete
+    # the directory out from under tldextract.
+    return TLDExtract(cache_dir=tempfile.mkdtemp())
+
+
+def matches_enqueue_strategy(
+    strategy: EnqueueStrategy,
+    *,
+    target_url: str | ParseResult,
+    origin_url: str | ParseResult,
+) -> bool:
+    """Check whether `target_url` matches `origin_url` under the given enqueue strategy.
+
+    Args:
+        strategy: The enqueue strategy to apply.
+        target_url: The URL to be evaluated.
+        origin_url: The reference URL the target is compared against.
+
+    Returns:
+        `True` if `target_url` is allowed under `strategy` relative to `origin_url`, `False` otherwise.
+    """
+    target = urlparse(target_url) if isinstance(target_url, str) else target_url
+    origin = urlparse(origin_url) if isinstance(origin_url, str) else origin_url
+
+    if strategy == 'all':
+        return True
+
+    if origin.hostname is None or target.hostname is None:
+        return False
+
+    if strategy == 'same-hostname':
+        return target.hostname == origin.hostname
+
+    if strategy == 'same-domain':
+        extractor = _get_tld_extractor()
+        origin_domain = extractor.extract_str(origin.hostname).top_domain_under_public_suffix
+        target_domain = extractor.extract_str(target.hostname).top_domain_under_public_suffix
+        return origin_domain == target_domain
+
+    if strategy == 'same-origin':
+        return target.hostname == origin.hostname and target.scheme == origin.scheme and target.port == origin.port
+
+    assert_never(strategy)
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -6,7 +6,6 @@
 import logging
 import signal
 import sys
-import tempfile
 import threading
 import traceback
 from asyncio import CancelledError
@@ -21,8 +20,7 @@
 from weakref import WeakKeyDictionary
 
 from cachetools import LRUCache
-from tldextract import TLDExtract
-from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
+from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack
 from yarl import URL
 
 from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
@@ -47,7 +45,7 @@
 from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
-from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
+from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, matches_enqueue_strategy
 from crawlee._utils.wait import wait_for
 from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
 from crawlee.errors import (
@@ -484,7 +482,6 @@ async def persist_state_factory() -> KeyValueStore:
         # Internal, not explicitly configurable components
         self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
         self._robots_txt_lock = asyncio.Lock()
-        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
         self._snapshotter = Snapshotter.from_config(config)
         self._autoscaled_pool = AutoscaledPool(
             system_status=SystemStatus(self._snapshotter),
@@ -1094,32 +1091,13 @@ def _check_enqueue_strategy(
         origin_url: ParseResult,
     ) -> bool:
         """Check if a URL matches the enqueue_strategy."""
-        if strategy == 'all':
-            return True
-
-        if origin_url.hostname is None or target_url.hostname is None:
+        if strategy != 'all' and (origin_url.hostname is None or target_url.hostname is None):
             self.log.debug(
                 f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or '
                 f'target_url = {target_url.geturl()}'
             )
-            return False
-
-        if strategy == 'same-hostname':
-            return target_url.hostname == origin_url.hostname
-
-        if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
-            return origin_domain == target_domain
-
-        if strategy == 'same-origin':
-            return (
-                target_url.hostname == origin_url.hostname
-                and target_url.scheme == origin_url.scheme
-                and target_url.port == origin_url.port
-            )
 
-        assert_never(strategy)
+        return matches_enqueue_strategy(strategy, target_url=target_url, origin_url=origin_url)
 
     def _check_url_patterns(
         self,
diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -5,6 +5,7 @@
 from contextlib import suppress
 from logging import getLogger
 from typing import TYPE_CHECKING, Annotated, Any
+from urllib.parse import urlparse
 
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
@@ -14,6 +15,7 @@
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
+from crawlee._utils.urls import matches_enqueue_strategy
 from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
@@ -22,6 +24,7 @@
     from types import TracebackType
 
     from crawlee import RequestTransformAction
+    from crawlee._types import EnqueueStrategy
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -111,6 +114,7 @@ def __init__(
         proxy_info: ProxyInfo | None = None,
         include: list[re.Pattern[Any] | Glob] | None = None,
         exclude: list[re.Pattern[Any] | Glob] | None = None,
+        enqueue_strategy: EnqueueStrategy = 'same-hostname',
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -122,6 +126,10 @@ def __init__(
             proxy_info: Optional proxy to use for fetching sitemaps.
             include: List of glob or regex patterns to include URLs.
             exclude: List of glob or regex patterns to exclude URLs.
+            enqueue_strategy: Strategy used to decide which sitemap-derived URLs (both nested-sitemap entries and
+                URL entries) are kept relative to the parent sitemap URL. Defaults to `'same-hostname'`, matching
+                the sitemap protocol's same-host expectation and the `enqueue_links` default; pass `'all'` to
+                disable filtering.
             max_buffer_size: Maximum number of URLs to buffer in memory.
             http_client: the instance of `HttpClient` to use for fetching sitemaps.
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
@@ -135,6 +143,7 @@ def __init__(
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
+        self._enqueue_strategy = enqueue_strategy
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
         self._transform_request_function = transform_request_function
@@ -235,6 +244,9 @@ async def _load_sitemaps(self) -> None:
                     state.in_progress_sitemap_url = sitemap_url
 
                 parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
+                # Parse the parent sitemap URL once per outer iteration; `matches_enqueue_strategy` is called per
+                # entry below, and re-parsing the same string thousands of times for large sitemaps is wasteful.
+                parsed_sitemap_url = urlparse(sitemap_url)
 
                 async for item in parse_sitemap(
                     [SitemapSource(type='url', url=sitemap_url)],
@@ -245,6 +257,14 @@ async def _load_sitemaps(self) -> None:
                     if isinstance(item, NestedSitemap):
                         # Add nested sitemap to queue
                         if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                            if not matches_enqueue_strategy(
+                                self._enqueue_strategy, target_url=item.loc, origin_url=parsed_sitemap_url
+                            ):
+                                logger.warning(
+                                    f'Skipping nested sitemap {item.loc!r}: does not match enqueue strategy '
+                                    f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
+                                )
+                                continue
                             state.pending_sitemap_urls.append(item.loc)
                         continue
 
@@ -261,6 +281,15 @@ async def _load_sitemaps(self) -> None:
                         if not self._check_url_patterns(url, self._include, self._exclude):
                             continue
 
+                        if not matches_enqueue_strategy(
+                            self._enqueue_strategy, target_url=url, origin_url=parsed_sitemap_url
+                        ):
+                            logger.warning(
+                                f'Skipping sitemap URL {url!r}: does not match enqueue strategy '
+                                f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
+                            )
+                            continue
+
                         # Check if we have capacity in the queue
                         await self._queue_has_capacity.wait()
 
@@ -326,7 +355,7 @@ async def fetch_next_request(self) -> Request | None:
                     continue
 
                 url = state.url_queue.popleft()
-                request_option = RequestOptions(url=url)
+                request_option = RequestOptions(url=url, enqueue_strategy=self._enqueue_strategy)
 
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
diff --git a/tests/unit/_utils/test_robots.py b/tests/unit/_utils/test_robots.py
@@ -11,8 +11,10 @@
 
 
 async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None:
+    """`RobotsTxtFile.find` constructs the correct /robots.txt URL and successfully parses the response."""
     robots_file = await RobotsTxtFile.find(str(server_url), http_client)
-    assert len(robots_file.get_sitemaps()) > 0
+    # The fixture's robots.txt disallows /deny_all/ — proves the file was fetched and parsed.
+    assert not robots_file.is_allowed(str(server_url / 'deny_all/page.html'))
 
 
 async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None:
@@ -24,9 +26,33 @@ async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClien
 
 
 async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None:
+    """Cross-host sitemap entries listed in the test fixture's robots.txt are filtered out of the result."""
     robots = await RobotsTxtFile.find(str(server_url), http_client)
-    assert len(robots.get_sitemaps()) == 2
-    assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'}
+    # The fixture lists `http://not-exists.com/sitemap_*.xml`, which is cross-host relative to `server_url` and
+    # therefore filtered out per the robots.txt specification.
+    assert robots.get_sitemaps() == []
+
+
+async def test_extract_same_host_sitemaps_urls() -> None:
+    """Sitemap entries on the same host as the robots.txt are returned."""
+    content = 'User-agent: *\nSitemap: http://example.com/sitemap_1.xml\nSitemap: http://example.com/sitemap_2.xml\n'
+    robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
+    assert set(robots.get_sitemaps()) == {
+        'http://example.com/sitemap_1.xml',
+        'http://example.com/sitemap_2.xml',
+    }
+
+
+async def test_extract_sitemaps_urls_filters_cross_host() -> None:
+    """Cross-host `Sitemap:` directives in robots.txt are filtered out (a warning is logged for each)."""
+    content = (
+        'User-agent: *\n'
+        'Sitemap: http://example.com/legit.xml\n'
+        'Sitemap: http://other.test/payload.xml\n'
+        'Sitemap: gopher://internal:6379/_PING\n'
+    )
+    robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
+    assert robots.get_sitemaps() == ['http://example.com/legit.xml']
 
 
 async def test_parse_from_content() -> None:
diff --git a/tests/unit/_utils/test_urls.py b/tests/unit/_utils/test_urls.py
@@ -1,9 +1,19 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pytest
 from pydantic import ValidationError
 
-from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, validate_http_url
+from crawlee._utils.urls import (
+    convert_to_absolute_url,
+    is_url_absolute,
+    matches_enqueue_strategy,
+    validate_http_url,
+)
+
+if TYPE_CHECKING:
+    from crawlee._types import EnqueueStrategy
 
 
 def test_is_url_absolute() -> None:
@@ -55,3 +65,30 @@ def test_validate_http_url() -> None:
 def test_validate_http_url_rejects_non_http_scheme(invalid_url: str) -> None:
     with pytest.raises(ValidationError):
         validate_http_url(invalid_url)
+
+
+@pytest.mark.parametrize(
+    ('strategy', 'origin', 'target', 'expected'),
+    [
+        # 'all' lets everything through, even cross-host or non-HTTP(S) targets
+        ('all', 'https://example.com/', 'https://other.test/', True),
+        ('all', 'https://example.com/', 'gopher://internal:6379/_PING', True),
+        # 'same-hostname' is exact host equality
+        ('same-hostname', 'https://example.com/a', 'https://example.com/b', True),
+        ('same-hostname', 'https://example.com/', 'https://www.example.com/', False),
+        ('same-hostname', 'https://example.com/', 'https://other.test/', False),
+        # 'same-domain' allows subdomains under the same registrable domain
+        ('same-domain', 'https://example.com/', 'https://www.example.com/', True),
+        ('same-domain', 'https://example.com/', 'https://api.example.com/', True),
+        ('same-domain', 'https://example.com/', 'https://other.test/', False),
+        # 'same-origin' requires scheme + host + port match
+        ('same-origin', 'https://example.com/', 'https://example.com/path', True),
+        ('same-origin', 'https://example.com/', 'http://example.com/', False),
+        ('same-origin', 'https://example.com/', 'https://example.com:8443/', False),
+        # missing hostname rejects everything except 'all'
+        ('same-hostname', 'https://example.com/', 'not-a-url', False),
+        ('same-domain', 'not-a-url', 'https://example.com/', False),
+    ],
+)
+def test_matches_enqueue_strategy(strategy: EnqueueStrategy, origin: str, target: str, *, expected: bool) -> None:
+    assert matches_enqueue_strategy(strategy, target_url=target, origin_url=origin) is expected
diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py