After #1384 and #1388 were merged, it appears that the main performance bottleneck in the default configuration is now the autoscaled pool, i.e. the default `ConcurrencySettings` passed to the crawler.
Here’s an example with the default settings on my machine:
```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    storage_client = MemoryStorageClient()
    http_client = HttpxHttpClient()

    crawler = ParselCrawler(
        storage_client=storage_client,
        http_client=http_client,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing URL: {context.request.url}...')
        data = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }
        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

Final stats:
```
┌───────────────────────────────┬────────────┐
│ requests_finished             │ 2363       │
│ requests_failed               │ 0          │
│ retry_histogram               │ [2363]     │
│ request_avg_failed_duration   │ None       │
│ request_avg_finished_duration │ 168.4ms    │
│ requests_finished_per_minute  │ 1611       │
│ requests_failed_per_minute    │ 0          │
│ request_total_duration        │ 6min 38.0s │
│ requests_total                │ 2363       │
│ crawler_runtime               │ 1min 28.0s │
└───────────────────────────────┴────────────┘
```
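For reference, `ConcurrencySettings` exposes knobs such as `min_concurrency` and `max_concurrency` alongside `desired_concurrency`. A minimal sketch (the values here are illustrative, not recommendations):

```python
from crawlee import ConcurrencySettings

# Illustrative values only: the autoscaled pool starts out at
# desired_concurrency and then scales between min_concurrency and
# max_concurrency based on system load.
settings = ConcurrencySettings(
    min_concurrency=5,
    desired_concurrency=20,
    max_concurrency=100,
)
```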
Now, running the same example with `desired_concurrency=20`:
```python
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    storage_client = MemoryStorageClient()
    http_client = HttpxHttpClient()

    crawler = ParselCrawler(
        storage_client=storage_client,
        http_client=http_client,
        concurrency_settings=ConcurrencySettings(
            desired_concurrency=20,
        ),
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing URL: {context.request.url}...')
        data = {
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        }
        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

Final stats:
```
┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 2363        │
│ requests_failed               │ 0           │
│ retry_histogram               │ [2363]      │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 270.9ms     │
│ requests_finished_per_minute  │ 3493        │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 10min 40.2s │
│ requests_total                │ 2363        │
│ crawler_runtime               │ 40.59s      │
└───────────────────────────────┴─────────────┘
```
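A quick sanity check on the numbers above:

```python
# Values taken from the two stats tables above.
default_runtime = 88.0   # 1min 28.0s with default ConcurrencySettings
tuned_runtime = 40.59    # with desired_concurrency=20

# Roughly a 2.2x speedup overall, even though the average per-request
# duration rose from 168.4ms to 270.9ms (more requests in flight).
print(f'speedup: {default_runtime / tuned_runtime:.2f}x')
```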