|  | 
| 8 | 8 | from crawlee import service_locator | 
| 9 | 9 | from crawlee.configuration import Configuration | 
| 10 | 10 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext | 
|  | 11 | +from crawlee.statistics import Statistics | 
| 11 | 12 | from crawlee.storage_clients import MemoryStorageClient | 
| 12 | 13 | from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient | 
| 13 | 14 | 
 | 
| @@ -35,26 +36,51 @@ def test_global_configuration_works_reversed() -> None: | 
| 35 | 36 |     ) | 
| 36 | 37 | 
 | 
| 37 | 38 | 
 | 
| 38 |  | -async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: URL) -> None: | 
|  | 39 | +async def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None: | 
|  | 40 | +    """Make the Crawler use MemoryStorageClient which can't persist state.""" | 
|  | 41 | +    service_locator.set_configuration( | 
|  | 42 | +        Configuration( | 
|  | 43 | +            crawlee_storage_dir=str(tmp_path),  # type: ignore[call-arg] | 
|  | 44 | +        ) | 
|  | 45 | +    ) | 
|  | 46 | +    crawler = HttpCrawler(storage_client=MemoryStorageClient()) | 
|  | 47 | + | 
|  | 48 | +    @crawler.router.default_handler | 
|  | 49 | +    async def default_handler(context: HttpCrawlingContext) -> None: | 
|  | 50 | +        await context.push_data({'url': context.request.url}) | 
|  | 51 | + | 
|  | 52 | +    await crawler.run([str(server_url)]) | 
|  | 53 | + | 
|  | 54 | +    # Verify that no files were created in the storage directory. | 
|  | 55 | +    content = list(tmp_path.iterdir()) | 
|  | 56 | +    assert content == [], 'Expected the storage directory to be empty, but it is not.' | 
|  | 57 | + | 
|  | 58 | + | 
|  | 59 | +async def test_storage_persisted_with_explicit_statistics_with_persistable_storage( | 
|  | 60 | +    tmp_path: Path, server_url: URL | 
|  | 61 | +) -> None: | 
|  | 62 | +    """Make the Crawler use MemoryStorageClient, which can't persist state, | 
|  | 63 | +    but pass it explicit statistics that use the global FileSystemStorageClient(), which can persist state.""" | 
|  | 64 | + | 
| 39 | 65 |     configuration = Configuration( | 
| 40 | 66 |         crawlee_storage_dir=str(tmp_path),  # type: ignore[call-arg] | 
| 41 | 67 |     ) | 
| 42 |  | -    storage_client = MemoryStorageClient() | 
| 43 |  | - | 
| 44 | 68 |     service_locator.set_configuration(configuration) | 
| 45 |  | -    service_locator.set_storage_client(storage_client) | 
|  | 69 | +    service_locator.set_storage_client(FileSystemStorageClient()) | 
| 46 | 70 | 
 | 
| 47 |  | -    crawler = HttpCrawler() | 
|  | 71 | +    crawler = HttpCrawler( | 
|  | 72 | +        storage_client=MemoryStorageClient(), statistics=Statistics.with_default_state(persistence_enabled=True) | 
|  | 73 | +    ) | 
| 48 | 74 | 
 | 
| 49 | 75 |     @crawler.router.default_handler | 
| 50 | 76 |     async def default_handler(context: HttpCrawlingContext) -> None: | 
| 51 | 77 |         await context.push_data({'url': context.request.url}) | 
| 52 | 78 | 
 | 
| 53 | 79 |     await crawler.run([str(server_url)]) | 
| 54 | 80 | 
 | 
| 55 |  | -    # Verify that no files were created in the storage directory. | 
|  | 81 | +    # Verify that files were created in the storage directory. | 
| 56 | 82 |     content = list(tmp_path.iterdir()) | 
| 57 |  | -    assert content == [], 'Expected the storage directory to be empty, but it is not.' | 
|  | 83 | +    assert content != [], 'Expected the storage directory to contain files, but it does not.' | 
| 58 | 84 | 
 | 
| 59 | 85 | 
 | 
| 60 | 86 | async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None: | 
|  | 
0 commit comments