Skip to content

Commit 6db365d

Browse files
kozlice and vdusek authored
feat: allow non-href links extract & enqueue (#1781)
### Description

Right now the `href` attribute is hardcoded for the `enqueue_links` and `extract_links` methods. This change allows callers to:

- grab image links
- grab links from exotic attributes in SPAs

Examples:

```python
await context.extract_links(selector="nav ul li", attribute="data-href")
await context.enqueue_links(selector=".gallery .item img", attribute="src", label="image")
```

### Issues

?

### Testing

Covered by new tests.

### Checklist

- [ ] CI passed

---------

Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
1 parent b6894b8 commit 6db365d

14 files changed

Lines changed: 341 additions & 175 deletions

File tree

docs/guides/code_examples/http_crawlers/selectolax_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def is_matching_selector(
4545

4646
@override
4747
def find_links(
48-
self, parsed_content: LexborHTMLParser, selector: str
48+
self, parsed_content: LexborHTMLParser, selector: str, attribute: str
4949
) -> Iterable[str]:
5050
"""Extract href attributes from elements matching the selector.
5151
@@ -54,7 +54,7 @@ def find_links(
5454
link: LexborNode
5555
urls: list[str] = []
5656
for link in parsed_content.css(selector):
57-
url = link.attributes.get('href')
57+
url = link.attributes.get(attribute)
5858
if url:
5959
urls.append(url.strip())
6060
return urls

src/crawlee/_types.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ def __call__(
387387
self,
388388
*,
389389
selector: str | None = None,
390+
attribute: str | None = None,
390391
label: str | None = None,
391392
user_data: dict[str, Any] | None = None,
392393
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -411,6 +412,7 @@ def __call__(
411412
self,
412413
*,
413414
selector: str | None = None,
415+
attribute: str | None = None,
414416
label: str | None = None,
415417
user_data: dict[str, Any] | None = None,
416418
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -428,6 +430,7 @@ def __call__(
428430
- `PlaywrightCrawler` supports CSS and XPath selectors.
429431
- `ParselCrawler` supports CSS selectors.
430432
- `BeautifulSoupCrawler` supports CSS selectors.
433+
attribute: Which node attribute to extract the links from.
431434
label: Label for the newly created `Request` objects, used for request routing.
432435
user_data: User data to be provided to the newly created `Request` objects.
433436
transform_request_function: A function that takes `RequestOptions` and returns either:
@@ -457,6 +460,7 @@ def __call__(
457460
self,
458461
*,
459462
selector: str = 'a',
463+
attribute: str = 'href',
460464
label: str | None = None,
461465
user_data: dict[str, Any] | None = None,
462466
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -470,6 +474,7 @@ def __call__(
470474
- `PlaywrightCrawler` supports CSS and XPath selectors.
471475
- `ParselCrawler` supports CSS selectors.
472476
- `BeautifulSoupCrawler` supports CSS selectors.
477+
attribute: Which node attribute to extract the links from.
473478
label: Label for the newly created `Request` objects, used for request routing.
474479
user_data: User data to be provided to the newly created `Request` objects.
475480
transform_request_function: A function that takes `RequestOptions` and returns either:

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ def _create_extract_links_function(
176176
async def extract_links(
177177
*,
178178
selector: str = 'a',
179+
attribute: str = 'href',
179180
label: str | None = None,
180181
user_data: dict[str, Any] | None = None,
181182
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -191,10 +192,12 @@ async def extract_links(
191192
kwargs.setdefault('strategy', 'same-hostname')
192193
strategy = kwargs.get('strategy', 'same-hostname')
193194

194-
links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
195+
links_iterator: Iterator[str] = iter(
196+
self._parser.find_links(parsed_content, selector=selector, attribute=attribute)
197+
)
195198

196199
# Get base URL from <base> tag if present
197-
extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
200+
extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href'))
198201
base_url: str = (
199202
str(extracted_base_urls[0])
200203
if extracted_base_urls

src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,13 @@ def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> b
9393
"""
9494

9595
@abstractmethod
96-
def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]:
96+
def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]:
9797
"""Find all links in result using selector.
9898
9999
Args:
100100
parsed_content: Parsed HTTP response. Result of `parse` method.
101101
selector: String used to define matching pattern for finding links.
102+
attribute: Which node attribute to extract the links from.
102103
103104
Returns:
104105
Iterable of strings that contain found links.

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,7 @@ def _create_enqueue_links_function(
997997
async def enqueue_links(
998998
*,
999999
selector: str | None = None,
1000+
attribute: str | None = None,
10001001
label: str | None = None,
10011002
user_data: dict[str, Any] | None = None,
10021003
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -1010,9 +1011,9 @@ async def enqueue_links(
10101011
kwargs.setdefault('strategy', 'same-hostname')
10111012

10121013
if requests:
1013-
if any((selector, label, user_data, transform_request_function)):
1014+
if any((selector, attribute, label, user_data, transform_request_function)):
10141015
raise ValueError(
1015-
'You cannot provide `selector`, `label`, `user_data` or '
1016+
'You cannot provide `selector`, `attribute`, `label`, `user_data` or '
10161017
'`transform_request_function` arguments when `requests` is provided.'
10171018
)
10181019
# Add directly passed requests.
@@ -1024,6 +1025,7 @@ async def enqueue_links(
10241025
await context.add_requests(
10251026
await extract_links(
10261027
selector=selector or 'a',
1028+
attribute=attribute or 'href',
10271029
label=label,
10281030
user_data=user_data,
10291031
transform_request_function=transform_request_function,

src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,11 @@ async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:
3838
return tuple(match for match in parsed_content.select(selector))
3939

4040
@override
41-
def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]:
41+
def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]:
4242
link: Tag
4343
urls: list[str] = []
4444
for link in parsed_content.select(selector):
45-
url = link.attrs.get('href')
45+
url = link.attrs.get(attribute)
4646
if url:
4747
urls.append(url.strip())
4848
return urls

src/crawlee/crawlers/_http/_http_parser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,7 @@ def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: #
4343
return False
4444

4545
@override
46-
def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument.
46+
def find_links(
47+
self, parsed_content: bytes, selector: str, attribute: str
48+
) -> Iterable[str]: # Intentional unused argument.
4749
return []

src/crawlee/crawlers/_parsel/_parsel_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool:
3737
return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None
3838

3939
@override
40-
def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]:
40+
def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]:
4141
link: Selector
4242
urls: list[str] = []
4343
for link in parsed_content.css(selector):
44-
url = link.xpath('@href').get()
44+
url = link.xpath(f'@{attribute}').get()
4545
if url:
4646
urls.append(url.strip())
4747
return urls

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContex
373373
async def extract_links(
374374
*,
375375
selector: str = 'a',
376+
attribute: str = 'href',
376377
label: str | None = None,
377378
user_data: dict | None = None,
378379
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -394,7 +395,7 @@ async def extract_links(
394395

395396
elements = await context.page.query_selector_all(selector)
396397
links_iterator: Iterator[str] = iter(
397-
[url for element in elements if (url := await element.get_attribute('href')) is not None]
398+
[url for element in elements if (url := await element.get_attribute(attribute)) is not None]
398399
)
399400

400401
# Get base URL from <base> tag if present

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 93 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -50,20 +50,41 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
5050

5151
await crawler.run(requests)
5252

53-
first_visited = visit.call_args_list[0][0][0]
54-
visited = {call[0][0] for call in visit.call_args_list}
55-
56-
assert first_visited == redirect_url
57-
assert visited == {
58-
redirect_url,
59-
str(server_url / 'sub_index'),
60-
str(server_url / 'page_1'),
61-
str(server_url / 'page_2'),
62-
str(server_url / 'page_3'),
63-
str(server_url / 'page_4'),
64-
str(server_url / 'base_page'),
65-
str(server_url / 'base_subpath/page_5'),
66-
}
53+
expected_visit_calls = [
54+
mock.call(redirect_url),
55+
mock.call(str(server_url / 'sub_index')),
56+
mock.call(str(server_url / 'page_1')),
57+
mock.call(str(server_url / 'page_2')),
58+
mock.call(str(server_url / 'page_3')),
59+
mock.call(str(server_url / 'page_4')),
60+
mock.call(str(server_url / 'base_page')),
61+
mock.call(str(server_url / 'base_subpath/page_5')),
62+
]
63+
assert visit.mock_calls[0] == expected_visit_calls[0]
64+
visit.assert_has_calls(expected_visit_calls, any_order=True)
65+
66+
67+
async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
68+
redirect_target = str(server_url / 'start_enqueue_non_href')
69+
redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))
70+
requests = [redirect_url]
71+
72+
crawler = BeautifulSoupCrawler(http_client=http_client)
73+
visit = mock.Mock()
74+
75+
@crawler.router.default_handler
76+
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
77+
visit(context.request.url)
78+
await context.enqueue_links(selector='img', attribute='src')
79+
80+
await crawler.run(requests)
81+
82+
expected_visit_calls = [
83+
mock.call(redirect_url),
84+
mock.call(str(server_url / 'base_subpath/image_1')),
85+
mock.call(str(server_url / 'image_2')),
86+
]
87+
visit.assert_has_calls(expected_visit_calls, any_order=True)
6788

6889

6990
async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:
@@ -77,8 +98,11 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
7798

7899
await crawler.run([str(server_url / 'start_enqueue')])
79100

80-
visited = {call[0][0] for call in visit.call_args_list}
81-
assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')}
101+
expected_visit_calls = [
102+
mock.call(str(server_url / 'start_enqueue')),
103+
mock.call(str(server_url / 'sub_index')),
104+
]
105+
visit.assert_has_calls(expected_visit_calls, any_order=True)
82106

83107

84108
async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:
@@ -128,18 +152,17 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
128152

129153
await crawler.run([str(server_url / 'start_enqueue')])
130154

131-
visited = {call[0][0] for call in visit.call_args_list}
132-
133155
# url /page_3 should not be visited
134-
assert visited == {
135-
str(server_url / 'start_enqueue'),
136-
str(server_url / 'sub_index'),
137-
str(server_url / 'page_1'),
138-
str(server_url / 'page_2'),
139-
str(server_url / 'base_page'),
140-
str(server_url / 'page_4'),
141-
str(server_url / 'base_subpath/page_5'),
142-
}
156+
expected_visit_calls = [
157+
mock.call(str(server_url / 'start_enqueue')),
158+
mock.call(str(server_url / 'sub_index')),
159+
mock.call(str(server_url / 'page_1')),
160+
mock.call(str(server_url / 'page_2')),
161+
mock.call(str(server_url / 'base_page')),
162+
mock.call(str(server_url / 'page_4')),
163+
mock.call(str(server_url / 'base_subpath/page_5')),
164+
]
165+
visit.assert_has_calls(expected_visit_calls, any_order=True)
143166

144167
# # all urls added to `enqueue_links` must have a custom header
145168
assert headers[1]['transform-header'] == 'my-header'
@@ -167,14 +190,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
167190
await context.enqueue_links()
168191

169192
await crawler.run([str(server_url / 'start_enqueue')])
170-
visited = {call[0][0] for call in visit.call_args_list}
171193

172-
assert visited == {
173-
str(server_url / 'start_enqueue'),
174-
str(server_url / 'sub_index'),
175-
str(server_url / 'base_page'),
176-
str(server_url / 'base_subpath/page_5'),
177-
}
194+
expected_visit_calls = [
195+
mock.call(str(server_url / 'start_enqueue')),
196+
mock.call(str(server_url / 'sub_index')),
197+
mock.call(str(server_url / 'base_page')),
198+
mock.call(str(server_url / 'base_subpath/page_5')),
199+
]
200+
visit.assert_has_calls(expected_visit_calls, any_order=True)
178201

179202

180203
async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
@@ -198,17 +221,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non
198221

199222
await crawler.run([str(server_url / 'problematic_links')])
200223

201-
visited = {call[0][0] for call in visit.call_args_list}
202-
failed = {call[0][0] for call in fail.call_args_list}
203-
204224
# Email must be skipped
205225
# https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
206-
assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
226+
expected_visit_calls = [
227+
mock.call(str(server_url / 'problematic_links')),
228+
mock.call('https://avatars.githubusercontent.com/apify'),
229+
]
230+
visit.assert_has_calls(expected_visit_calls, any_order=True)
207231

208232
# The budplaceholder.com does not exist.
209-
assert failed == {
210-
'https://budplaceholder.com/',
211-
}
233+
expected_fail_calls = [
234+
mock.call('https://budplaceholder.com/'),
235+
]
236+
fail.assert_has_calls(expected_fail_calls, any_order=True)
212237

213238

214239
async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
@@ -225,14 +250,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
225250

226251
await crawler.run([str(server_url / 'start_enqueue')])
227252

228-
skipped = {call[0][0] for call in skip.call_args_list}
229-
230-
assert skipped == {
231-
str(server_url / 'page_1'),
232-
str(server_url / 'page_2'),
233-
str(server_url / 'page_3'),
234-
str(server_url / 'page_4'),
235-
}
253+
expected_skip_calls = [
254+
mock.call(str(server_url / 'page_1')),
255+
mock.call(str(server_url / 'page_2')),
256+
mock.call(str(server_url / 'page_3')),
257+
mock.call(str(server_url / 'page_4')),
258+
]
259+
skip.assert_has_calls(expected_skip_calls, any_order=True)
236260

237261

238262
async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
@@ -250,6 +274,21 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
250274
assert extracted_links[0] == str(server_url / 'page_1')
251275

252276

277+
async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
278+
crawler = BeautifulSoupCrawler(http_client=http_client)
279+
extracted_links: list[str] = []
280+
281+
@crawler.router.default_handler
282+
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
283+
links = await context.extract_links(selector='li', attribute='data-href')
284+
extracted_links.extend(request.url for request in links)
285+
286+
await crawler.run([str(server_url / 'non_href_links')])
287+
288+
assert len(extracted_links) == 1
289+
assert extracted_links[0] == str(server_url / 'page_2')
290+
291+
253292
@pytest.mark.parametrize(
254293
('queue_name', 'queue_alias', 'by_id'),
255294
[
@@ -444,12 +483,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
444483

445484
await crawler.run(requests)
446485

447-
first_visited = visit.call_args_list[0][0][0]
448-
visited = {call[0][0] for call in visit.call_args_list}
449-
450-
assert first_visited == start_url
451486
# Only one link should be enqueued from sub_index due to the limit
452-
assert visited == {
453-
start_url,
454-
str(server_url / 'page_3'),
455-
}
487+
expected_visit_calls = [
488+
mock.call(start_url),
489+
mock.call(str(server_url / 'page_3')),
490+
]
491+
visit.assert_has_calls(expected_visit_calls, any_order=True)

0 commit comments

Comments
 (0)