Skip to content

Commit 00e65ec

Browse files
committed
Persist Crawler statistics to Crawler KVS
1 parent 37f6473 commit 00e65ec

File tree

3 files changed

+52
-16
lines changed

3 files changed

+52
-16
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -437,15 +437,23 @@ def __init__(
437437
self._statistics_log_format = statistics_log_format
438438

439439
# Statistics
440-
self._statistics = statistics or cast(
441-
'Statistics[TStatisticsState]',
442-
Statistics.with_default_state(
443-
persistence_enabled=True,
444-
periodic_message_logger=self._logger,
445-
statistics_log_format=self._statistics_log_format,
446-
log_message='Current request statistics:',
447-
),
448-
)
440+
if statistics:
441+
self._statistics = statistics
442+
else:
443+
444+
async def persist_state_factory() -> KeyValueStore:
445+
return await self.get_key_value_store()
446+
447+
self._statistics = cast(
448+
'Statistics[TStatisticsState]',
449+
Statistics.with_default_state(
450+
persistence_enabled=True,
451+
periodic_message_logger=self._logger,
452+
statistics_log_format=self._statistics_log_format,
453+
log_message='Current request statistics:',
454+
persist_state_kvs_factory=persist_state_factory,
455+
),
456+
)
449457

450458
# Additional context managers to enter and exit
451459
self._additional_context_managers = _additional_context_managers or []

src/crawlee/statistics/_statistics.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def with_default_state(
130130
persistence_enabled: bool = False,
131131
persist_state_kvs_name: str | None = None,
132132
persist_state_key: str | None = None,
133+
persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
133134
log_message: str = 'Statistics',
134135
periodic_message_logger: Logger | None = None,
135136
log_interval: timedelta = timedelta(minutes=1),
@@ -141,6 +142,7 @@ def with_default_state(
141142
persistence_enabled=persistence_enabled,
142143
persist_state_kvs_name=persist_state_kvs_name,
143144
persist_state_key=persist_state_key,
145+
persist_state_kvs_factory=persist_state_kvs_factory,
144146
log_message=log_message,
145147
periodic_message_logger=periodic_message_logger,
146148
log_interval=log_interval,

tests/unit/test_configuration.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from crawlee import service_locator
99
from crawlee.configuration import Configuration
1010
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
11+
from crawlee.statistics import Statistics
1112
from crawlee.storage_clients import MemoryStorageClient
1213
from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient
1314

@@ -35,26 +36,51 @@ def test_global_configuration_works_reversed() -> None:
3536
)
3637

3738

38-
async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: URL) -> None:
39+
async def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None:
40+
"""Make the Crawler use MemoryStorageClient which can't persist state."""
41+
service_locator.set_configuration(
42+
Configuration(
43+
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
44+
)
45+
)
46+
crawler = HttpCrawler(storage_client=MemoryStorageClient())
47+
48+
@crawler.router.default_handler
49+
async def default_handler(context: HttpCrawlingContext) -> None:
50+
await context.push_data({'url': context.request.url})
51+
52+
await crawler.run([str(server_url)])
53+
54+
# Verify that no files were created in the storage directory.
55+
content = list(tmp_path.iterdir())
56+
assert content == [], 'Expected the storage directory to be empty, but it is not.'
57+
58+
59+
async def test_storage_persisted_with_explicit_statistics_with_persistable_storage(
60+
tmp_path: Path, server_url: URL
61+
) -> None:
62+
"""Make the Crawler use MemoryStorageClient which can't persist state,
63+
but pass it an explicit Statistics instance, which falls back to the global FileSystemStorageClient() that can persist state."""
64+
3965
configuration = Configuration(
4066
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
4167
)
42-
storage_client = MemoryStorageClient()
43-
4468
service_locator.set_configuration(configuration)
45-
service_locator.set_storage_client(storage_client)
69+
service_locator.set_storage_client(FileSystemStorageClient())
4670

47-
crawler = HttpCrawler()
71+
crawler = HttpCrawler(
72+
storage_client=MemoryStorageClient(), statistics=Statistics.with_default_state(persistence_enabled=True)
73+
)
4874

4975
@crawler.router.default_handler
5076
async def default_handler(context: HttpCrawlingContext) -> None:
5177
await context.push_data({'url': context.request.url})
5278

5379
await crawler.run([str(server_url)])
5480

55-
# Verify that no files were created in the storage directory.
81+
# Verify that files were created in the storage directory.
5682
content = list(tmp_path.iterdir())
57-
assert content == [], 'Expected the storage directory to be empty, but it is not.'
83+
assert content != [], 'Expected the storage directory to contain files, but it does not.'
5884

5985

6086
async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None:

0 commit comments

Comments
 (0)