@@ -50,20 +50,41 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
5050
5151 await crawler .run (requests )
5252
53- first_visited = visit .call_args_list [0 ][0 ][0 ]
54- visited = {call [0 ][0 ] for call in visit .call_args_list }
55-
56- assert first_visited == redirect_url
57- assert visited == {
58- redirect_url ,
59- str (server_url / 'sub_index' ),
60- str (server_url / 'page_1' ),
61- str (server_url / 'page_2' ),
62- str (server_url / 'page_3' ),
63- str (server_url / 'page_4' ),
64- str (server_url / 'base_page' ),
65- str (server_url / 'base_subpath/page_5' ),
66- }
53+ expected_visit_calls = [
54+ mock .call (redirect_url ),
55+ mock .call (str (server_url / 'sub_index' )),
56+ mock .call (str (server_url / 'page_1' )),
57+ mock .call (str (server_url / 'page_2' )),
58+ mock .call (str (server_url / 'page_3' )),
59+ mock .call (str (server_url / 'page_4' )),
60+ mock .call (str (server_url / 'base_page' )),
61+ mock .call (str (server_url / 'base_subpath/page_5' )),
62+ ]
63+ assert visit .mock_calls [0 ] == expected_visit_calls [0 ]
64+ visit .assert_has_calls (expected_visit_calls , any_order = True )
65+
66+
async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
    """Check that `enqueue_links` can follow a non-`href` attribute (`src` of `img` elements).

    The crawl starts from the `redirect` endpoint of `redirect_server_url`, with the
    `start_enqueue_non_href` page passed as its `url` query parameter.
    """
    visit = mock.Mock()
    crawler = BeautifulSoupCrawler(http_client=http_client)

    target_page = str(server_url / 'start_enqueue_non_href')
    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=target_page))

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Record the request URL first, then enqueue whatever the `src` of each `img` points to.
        visit(context.request.url)
        await context.enqueue_links(selector='img', attribute='src')

    await crawler.run([redirect_url])

    # Call order is not checked; each listed URL must have been passed to `visit`.
    image_paths = ('base_subpath/image_1', 'image_2')
    visit.assert_has_calls(
        [mock.call(redirect_url), *(mock.call(str(server_url / path)) for path in image_paths)],
        any_order=True,
    )
6788
6889
6990async def test_enqueue_links_selector (server_url : URL , http_client : HttpClient ) -> None :
@@ -77,8 +98,11 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
7798
7899 await crawler .run ([str (server_url / 'start_enqueue' )])
79100
80- visited = {call [0 ][0 ] for call in visit .call_args_list }
81- assert visited == {str (server_url / 'start_enqueue' ), str (server_url / 'sub_index' )}
101+ expected_visit_calls = [
102+ mock .call (str (server_url / 'start_enqueue' )),
103+ mock .call (str (server_url / 'sub_index' )),
104+ ]
105+ visit .assert_has_calls (expected_visit_calls , any_order = True )
82106
83107
84108async def test_enqueue_links_with_max_crawl (server_url : URL , http_client : HttpClient ) -> None :
@@ -128,18 +152,17 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
128152
129153 await crawler .run ([str (server_url / 'start_enqueue' )])
130154
131- visited = {call [0 ][0 ] for call in visit .call_args_list }
132-
133155 # url /page_3 should not be visited
134- assert visited == {
135- str (server_url / 'start_enqueue' ),
136- str (server_url / 'sub_index' ),
137- str (server_url / 'page_1' ),
138- str (server_url / 'page_2' ),
139- str (server_url / 'base_page' ),
140- str (server_url / 'page_4' ),
141- str (server_url / 'base_subpath/page_5' ),
142- }
156+ expected_visit_calls = [
157+ mock .call (str (server_url / 'start_enqueue' )),
158+ mock .call (str (server_url / 'sub_index' )),
159+ mock .call (str (server_url / 'page_1' )),
160+ mock .call (str (server_url / 'page_2' )),
161+ mock .call (str (server_url / 'base_page' )),
162+ mock .call (str (server_url / 'page_4' )),
163+ mock .call (str (server_url / 'base_subpath/page_5' )),
164+ ]
165+ visit .assert_has_calls (expected_visit_calls , any_order = True )
143166
144167 # # all urls added to `enqueue_links` must have a custom header
145168 assert headers [1 ]['transform-header' ] == 'my-header'
@@ -167,14 +190,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
167190 await context .enqueue_links ()
168191
169192 await crawler .run ([str (server_url / 'start_enqueue' )])
170- visited = {call [0 ][0 ] for call in visit .call_args_list }
171193
172- assert visited == {
173- str (server_url / 'start_enqueue' ),
174- str (server_url / 'sub_index' ),
175- str (server_url / 'base_page' ),
176- str (server_url / 'base_subpath/page_5' ),
177- }
194+ expected_visit_calls = [
195+ mock .call (str (server_url / 'start_enqueue' )),
196+ mock .call (str (server_url / 'sub_index' )),
197+ mock .call (str (server_url / 'base_page' )),
198+ mock .call (str (server_url / 'base_subpath/page_5' )),
199+ ]
200+ visit .assert_has_calls (expected_visit_calls , any_order = True )
178201
179202
180203async def test_respect_robots_txt_with_problematic_links (server_url : URL , http_client : HttpClient ) -> None :
@@ -198,17 +221,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non
198221
199222 await crawler .run ([str (server_url / 'problematic_links' )])
200223
201- visited = {call [0 ][0 ] for call in visit .call_args_list }
202- failed = {call [0 ][0 ] for call in fail .call_args_list }
203-
204224 # Email must be skipped
205225 # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
206- assert visited == {str (server_url / 'problematic_links' ), 'https://avatars.githubusercontent.com/apify' }
226+ expected_visit_calls = [
227+ mock .call (str (server_url / 'problematic_links' )),
228+ mock .call ('https://avatars.githubusercontent.com/apify' ),
229+ ]
230+ visit .assert_has_calls (expected_visit_calls , any_order = True )
207231
208232 # The budplaceholder.com does not exist.
209- assert failed == {
210- 'https://budplaceholder.com/' ,
211- }
233+ expected_fail_calls = [
234+ mock .call ('https://budplaceholder.com/' ),
235+ ]
236+ fail .assert_has_calls (expected_fail_calls , any_order = True )
212237
213238
214239async def test_on_skipped_request (server_url : URL , http_client : HttpClient ) -> None :
@@ -225,14 +250,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
225250
226251 await crawler .run ([str (server_url / 'start_enqueue' )])
227252
228- skipped = {call [0 ][0 ] for call in skip .call_args_list }
229-
230- assert skipped == {
231- str (server_url / 'page_1' ),
232- str (server_url / 'page_2' ),
233- str (server_url / 'page_3' ),
234- str (server_url / 'page_4' ),
235- }
253+ expected_skip_calls = [
254+ mock .call (str (server_url / 'page_1' )),
255+ mock .call (str (server_url / 'page_2' )),
256+ mock .call (str (server_url / 'page_3' )),
257+ mock .call (str (server_url / 'page_4' )),
258+ ]
259+ skip .assert_has_calls (expected_skip_calls , any_order = True )
236260
237261
238262async def test_extract_links (server_url : URL , http_client : HttpClient ) -> None :
@@ -250,6 +274,21 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
250274 assert extracted_links [0 ] == str (server_url / 'page_1' )
251275
252276
async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
    """Check that `extract_links` can read link targets from a non-`href` attribute (`data-href` on `li`)."""
    collected_urls: list[str] = []
    crawler = BeautifulSoupCrawler(http_client=http_client)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        found = await context.extract_links(selector='li', attribute='data-href')
        for request in found:
            collected_urls.append(request.url)

    start_url = str(server_url / 'non_href_links')
    await crawler.run([start_url])

    # Exactly one link is expected, and it must point at `page_2`.
    assert collected_urls == [str(server_url / 'page_2')]
290+
291+
253292@pytest .mark .parametrize (
254293 ('queue_name' , 'queue_alias' , 'by_id' ),
255294 [
@@ -444,12 +483,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
444483
445484 await crawler .run (requests )
446485
447- first_visited = visit .call_args_list [0 ][0 ][0 ]
448- visited = {call [0 ][0 ] for call in visit .call_args_list }
449-
450- assert first_visited == start_url
451486 # Only one link should be enqueued from sub_index due to the limit
452- assert visited == {
453- start_url ,
454- str (server_url / 'page_3' ),
455- }
487+ expected_visit_calls = [
488+ mock .call (start_url ),
489+ mock .call (str (server_url / 'page_3' )),
490+ ]
491+ visit .assert_has_calls (expected_visit_calls , any_order = True )
0 commit comments