55from contextlib import suppress
66from logging import getLogger
77from typing import TYPE_CHECKING , Annotated , Any
8+ from urllib .parse import urlparse
89
910from pydantic import BaseModel , ConfigDict , Field
1011from typing_extensions import override
1415from crawlee ._utils .globs import Glob
1516from crawlee ._utils .recoverable_state import RecoverableState
1617from crawlee ._utils .sitemap import NestedSitemap , ParseSitemapOptions , SitemapSource , SitemapUrl , parse_sitemap
18+ from crawlee ._utils .urls import matches_enqueue_strategy
1719from crawlee .request_loaders ._request_loader import RequestLoader
1820
1921if TYPE_CHECKING :
2224 from types import TracebackType
2325
2426 from crawlee import RequestTransformAction
27+ from crawlee ._types import EnqueueStrategy
2528 from crawlee .http_clients import HttpClient
2629 from crawlee .proxy_configuration import ProxyInfo
2730 from crawlee .storage_clients .models import ProcessedRequest
@@ -111,6 +114,7 @@ def __init__(
111114 proxy_info : ProxyInfo | None = None ,
112115 include : list [re .Pattern [Any ] | Glob ] | None = None ,
113116 exclude : list [re .Pattern [Any ] | Glob ] | None = None ,
117+ enqueue_strategy : EnqueueStrategy = 'same-hostname' ,
114118 max_buffer_size : int = 200 ,
115119 persist_state_key : str | None = None ,
116120 transform_request_function : Callable [[RequestOptions ], RequestOptions | RequestTransformAction ] | None = None ,
@@ -122,6 +126,10 @@ def __init__(
122126 proxy_info: Optional proxy to use for fetching sitemaps.
123127 include: List of glob or regex patterns to include URLs.
124128 exclude: List of glob or regex patterns to exclude URLs.
129+ enqueue_strategy: Strategy used to decide which sitemap-derived URLs (both nested-sitemap entries and
130+ URL entries) are kept relative to the parent sitemap URL. Defaults to `'same-hostname'`, matching
131+ the sitemap protocol's same-host expectation and the `enqueue_links` default; pass `'all'` to
132+ disable filtering.
125133 max_buffer_size: Maximum number of URLs to buffer in memory.
126134 http_client: The instance of `HttpClient` to use for fetching sitemaps.
127135 persist_state_key: A key for persisting the loader's state in the KeyValueStore.
@@ -135,6 +143,7 @@ def __init__(
135143 self ._sitemap_urls = sitemap_urls
136144 self ._include = include
137145 self ._exclude = exclude
146+ self ._enqueue_strategy = enqueue_strategy
138147 self ._proxy_info = proxy_info
139148 self ._max_buffer_size = max_buffer_size
140149 self ._transform_request_function = transform_request_function
@@ -235,6 +244,9 @@ async def _load_sitemaps(self) -> None:
235244 state .in_progress_sitemap_url = sitemap_url
236245
237246 parse_options = ParseSitemapOptions (max_depth = 0 , emit_nested_sitemaps = True , sitemap_retries = 3 )
247+ # Parse the parent sitemap URL once per outer iteration; `matches_enqueue_strategy` is called per
248+ # entry below, and re-parsing the same string thousands of times for large sitemaps is wasteful.
249+ parsed_sitemap_url = urlparse (sitemap_url )
238250
239251 async for item in parse_sitemap (
240252 [SitemapSource (type = 'url' , url = sitemap_url )],
@@ -245,6 +257,14 @@ async def _load_sitemaps(self) -> None:
245257 if isinstance (item , NestedSitemap ):
246258 # Add nested sitemap to queue
247259 if item .loc not in state .pending_sitemap_urls and item .loc not in state .processed_sitemap_urls :
260+ if not matches_enqueue_strategy (
261+ self ._enqueue_strategy , target_url = item .loc , origin_url = parsed_sitemap_url
262+ ):
263+ logger .warning (
264+ f'Skipping nested sitemap { item .loc !r} : does not match enqueue strategy '
265+ f'{ self ._enqueue_strategy !r} relative to { sitemap_url !r} .'
266+ )
267+ continue
248268 state .pending_sitemap_urls .append (item .loc )
249269 continue
250270
@@ -261,6 +281,15 @@ async def _load_sitemaps(self) -> None:
261281 if not self ._check_url_patterns (url , self ._include , self ._exclude ):
262282 continue
263283
284+ if not matches_enqueue_strategy (
285+ self ._enqueue_strategy , target_url = url , origin_url = parsed_sitemap_url
286+ ):
287+ logger .warning (
288+ f'Skipping sitemap URL { url !r} : does not match enqueue strategy '
289+ f'{ self ._enqueue_strategy !r} relative to { sitemap_url !r} .'
290+ )
291+ continue
292+
264293 # Check if we have capacity in the queue
265294 await self ._queue_has_capacity .wait ()
266295
@@ -326,7 +355,7 @@ async def fetch_next_request(self) -> Request | None:
326355 continue
327356
328357 url = state .url_queue .popleft ()
329- request_option = RequestOptions (url = url )
358+ request_option = RequestOptions (url = url , enqueue_strategy = self . _enqueue_strategy )
330359
331360 if len (state .url_queue ) < self ._max_buffer_size :
332361 self ._queue_has_capacity .set ()
0 commit comments