55import warnings
66from functools import partial
77from typing import TYPE_CHECKING , Any , Callable , Generic , Literal , Union
8+ from urllib .parse import urlparse
89
910from pydantic import ValidationError
1011from typing_extensions import NotRequired , TypedDict , TypeVar
@@ -344,8 +345,6 @@ async def extract_links(
344345
345346 The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.
346347 """
347- kwargs .setdefault ('strategy' , 'same-hostname' )
348-
349348 requests = list [Request ]()
350349 skipped = list [str ]()
351350 base_user_data = user_data or {}
@@ -354,7 +353,15 @@ async def extract_links(
354353
355354 robots_txt_file = await self ._get_robots_txt_file_for_url (context .request .url )
356355
356+ strategy = kwargs .get ('strategy' , 'same-hostname' )
357+ include_blobs = kwargs .get ('include' )
358+ exclude_blobs = kwargs .get ('exclude' )
359+ limit_requests = kwargs .get ('limit' )
360+
357361 for element in elements :
362+ if limit_requests and len (requests ) >= limit_requests :
363+ break
364+
358365 url = await element .get_attribute ('href' )
359366
360367 if url :
@@ -368,26 +375,31 @@ async def extract_links(
368375 skipped .append (url )
369376 continue
370377
371- request_option = RequestOptions ({'url' : url , 'user_data' : {** base_user_data }, 'label' : label })
372-
373- if transform_request_function :
374- transform_request_option = transform_request_function (request_option )
375- if transform_request_option == 'skip' :
378+ if self ._check_enqueue_strategy (
379+ strategy ,
380+ target_url = urlparse (url ),
381+ origin_url = urlparse (context .request .url ),
382+ ) and self ._check_url_patterns (url , include_blobs , exclude_blobs ):
383+ request_option = RequestOptions ({'url' : url , 'user_data' : {** base_user_data }, 'label' : label })
384+
385+ if transform_request_function :
386+ transform_request_option = transform_request_function (request_option )
387+ if transform_request_option == 'skip' :
388+ continue
389+ if transform_request_option != 'unchanged' :
390+ request_option = transform_request_option
391+
392+ try :
393+ request = Request .from_url (** request_option )
394+ except ValidationError as exc :
395+ context .log .debug (
396+ f'Skipping URL "{ url } " due to invalid format: { exc } . '
397+ 'This may be caused by a malformed URL or unsupported URL scheme. '
398+ 'Please ensure the URL is correct and retry.'
399+ )
376400 continue
377- if transform_request_option != 'unchanged' :
378- request_option = transform_request_option
379-
380- try :
381- request = Request .from_url (** request_option )
382- except ValidationError as exc :
383- context .log .debug (
384- f'Skipping URL "{ url } " due to invalid format: { exc } . '
385- 'This may be caused by a malformed URL or unsupported URL scheme. '
386- 'Please ensure the URL is correct and retry.'
387- )
388- continue
389401
390- requests .append (request )
402+ requests .append (request )
391403
392404 if skipped :
393405 skipped_tasks = [
0 commit comments