from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .conftest import MakeActorFunction, RunActorFunction


async def test_actor_on_platform_max_crawl_depth(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_crawl_depth."""

    async def main() -> None:
        """The crawler entry point."""
        import re

        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            crawler = ParselCrawler(max_crawl_depth=2)
            finished = []
            # Matches only /2, /22, /222, ... so each page enqueues a single deeper link.
            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links(include=[enqueue_pattern])
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            # Only pages at depth 0, 1 and 2 are visited; deeper links are not followed.
            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']

    actor = await make_actor(label='crawler-max-depth', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'


async def test_actor_on_platform_max_requests_per_crawl(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_requests_per_crawl."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee import ConcurrencySettings
        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            # Limit concurrency to one so the number of processed requests stays deterministic.
            crawler = ParselCrawler(
                max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1)
            )
            finished = []

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links()
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            assert len(finished) == 3

    actor = await make_actor(label='crawler-max-requests', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'


async def test_actor_on_platform_max_request_retries(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_request_retries."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            max_retries = 3
            crawler = ParselCrawler(max_request_retries=max_retries)
            failed_counter = 0

            @crawler.error_handler
            async def error_handler(_: BasicCrawlingContext, __: Exception) -> None:
                # Count request failures reported to the error handler.
                nonlocal failed_counter
                failed_counter += 1

            @crawler.router.default_handler
            async def default_handler(_: ParselCrawlingContext) -> None:
                # Fail on every attempt so the request keeps being retried up to the limit.
                raise RuntimeError('Some error')

            await crawler.run(['http://localhost:8080/'])
            assert failed_counter == max_retries, f'{failed_counter=}'

    actor = await make_actor(label='crawler-max-retries', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'