Improve default autoscaled pool settings #1391

@vdusek

After #1384 and #1388 were merged, it appears that the main performance bottleneck in the default setup is now the autoscaled pool's default configuration, i.e. the ConcurrencySettings passed to the crawler.
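
For reference, a minimal sketch of the knobs involved (the values are illustrative, not the library's actual defaults, and apart from desired_concurrency the parameter names are my recollection of the API):

from crawlee import ConcurrencySettings

# Illustrative values only; not the library's actual defaults.
settings = ConcurrencySettings(
    min_concurrency=1,        # the pool never scales below this
    desired_concurrency=10,   # the starting point the pool scales from
    max_concurrency=100,      # the pool never scales above this
    max_tasks_per_minute=float('inf'),  # optional rate limit
)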

Here’s an example with the default settings on my machine:

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # In-memory storage and an HTTPX-based HTTP client.
    storage_client = MemoryStorageClient()
    http_client = HttpxHttpClient()

    # No ConcurrencySettings passed, so the defaults apply.
    crawler = ParselCrawler(
        storage_client=storage_client,
        http_client=http_client,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing URL: {context.request.url}...')
        # Store the URL and page title, then enqueue all discovered links.
        data = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }
        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Final stats:

┌───────────────────────────────┬────────────┐
│ requests_finished             │ 2363       │
│ requests_failed               │ 0          │
│ retry_histogram               │ [2363]     │
│ request_avg_failed_duration   │ None       │
│ request_avg_finished_duration │ 168.4ms    │
│ requests_finished_per_minute  │ 1611       │
│ requests_failed_per_minute    │ 0          │
│ request_total_duration        │ 6min 38.0s │
│ requests_total                │ 2363       │
│ crawler_runtime               │ 1min 28.0s │
└───────────────────────────────┴────────────┘

Now, running the same example with a desired_concurrency of 20:

import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    storage_client = MemoryStorageClient()
    http_client = HttpxHttpClient()

    crawler = ParselCrawler(
        storage_client=storage_client,
        http_client=http_client,
        # The only change from the previous run: a higher desired concurrency.
        concurrency_settings=ConcurrencySettings(
            desired_concurrency=20,
        ),
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing URL: {context.request.url}...')
        data = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }
        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Final stats:

┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 2363        │
│ requests_failed               │ 0           │
│ retry_histogram               │ [2363]      │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 270.9ms     │
│ requests_finished_per_minute  │ 3493        │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 10min 40.2s │
│ requests_total                │ 2363        │
│ crawler_runtime               │ 40.59s      │
└───────────────────────────────┴─────────────┘
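
Both runs finish the same 2363 requests, so the numbers are directly comparable: raising desired_concurrency to 20 cuts crawler_runtime from 1min 28.0s to 40.59s and lifts throughput from 1611 to 3493 finished requests per minute, roughly a 2.2x speedup. Average per-request duration rises from 168.4ms to 270.9ms (hence the larger request_total_duration), which is expected with more requests in flight.

To compare runs programmatically rather than reading the logged table, the statistics can be captured from the run; a sketch, assuming crawler.run() returns the statistics object printed above and that its field names match the table (it would replace the last line of main() in the examples):

# Sketch: assumes run() returns the statistics shown in the table above.
stats = await crawler.run(['https://crawlee.dev'])
print(stats.requests_finished, stats.crawler_runtime)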

Labels: t-tooling (issues with this label are owned by the tooling team)