
Commit bb5c53a

Merge pull request #13 from TogetherCrew/fix/8-urls-limit
Fix: changing request_queue_id everytime!
2 parents (83ad0ac + 2f0fcc7), commit bb5c53a

File tree

2 files changed (+13, -4 lines)

hivemind_etl/website/crawlee_client.py
7 additions, 0 deletions

@@ -1,5 +1,6 @@
 import asyncio
 from typing import Any
+import uuid
 
 from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
 from defusedxml import ElementTree as ET
@@ -21,6 +22,12 @@ def __init__(
         # do not persist crawled data to local storage
         self.crawler._configuration.persist_storage = False
         self.crawler._configuration.write_metadata = False
+        self.crawler._configuration.purge_on_start = True
+
+        # changing the id each time so it wouldn't continue
+        # fetching the previous links
+        config = self.crawler._configuration.get_global_configuration()
+        config.default_request_queue_id = uuid.uuid4().hex
 
         @self.crawler.router.default_handler
         async def request_handler(context: PlaywrightCrawlingContext) -> None:
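Why the change works: the crawler's request queue is keyed by default_request_queue_id, so reusing the same id lets a new run resume URLs queued by a previous run. Below is a minimal, self-contained sketch of that pattern using only the standard library; QueueScopedClient is a hypothetical stand-in for CrawleeClient, not code from this repository.

import uuid


class QueueScopedClient:
    """Hypothetical stand-in for CrawleeClient; not part of this repository."""

    def __init__(self) -> None:
        # Mirrors `config.default_request_queue_id = uuid.uuid4().hex` from the
        # diff above: a fresh id per instance means a fresh (empty) request
        # queue, so nothing queued by an earlier run is picked up again.
        self.request_queue_id = uuid.uuid4().hex


first = QueueScopedClient()
second = QueueScopedClient()
assert first.request_queue_id != second.request_queue_id  # no carry-over between runs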

tests/unit/test_website_etl.py
6 additions, 4 deletions

@@ -30,15 +30,17 @@ async def test_extract(self):
                 "title": "Example",
             }
         ]
-
+
         # Mock the CrawleeClient class instead of the instance
-        with patch('hivemind_etl.website.website_etl.CrawleeClient') as MockCrawleeClient:
+        with patch(
+            "hivemind_etl.website.website_etl.CrawleeClient"
+        ) as MockCrawleeClient:
             mock_client_instance = AsyncMock()
             mock_client_instance.crawl.return_value = mocked_data
             MockCrawleeClient.return_value = mock_client_instance
-
+
             extracted_data = await self.website_etl.extract(urls)
-
+
             self.assertEqual(extracted_data, mocked_data)
             MockCrawleeClient.assert_called_once()
             mock_client_instance.crawl.assert_awaited_once_with(links=urls)
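The test keeps the same strategy as before, only reflowed for line length: patch the CrawleeClient class where website_etl looks it up, hand back an AsyncMock instance, and assert on the awaited crawl() call. The sketch below is a hedged, self-contained illustration of that mocking pattern; FakeETL is a made-up stand-in for the repository's WebsiteETL, and the data values are illustrative.

import asyncio
from unittest.mock import AsyncMock, MagicMock


class FakeETL:
    """Hypothetical stand-in for WebsiteETL; only used to drive the mocks."""

    def __init__(self, client_cls) -> None:
        self._client_cls = client_cls

    async def extract(self, urls: list[str]) -> list[dict]:
        client = self._client_cls()
        return await client.crawl(links=urls)


async def main() -> None:
    mocked_data = [{"url": "https://example.com", "title": "Example"}]

    # patch(...) would normally produce this MagicMock for the class; the
    # instance it returns is an AsyncMock so crawl() can be awaited.
    MockCrawleeClient = MagicMock()
    mock_client_instance = AsyncMock()
    mock_client_instance.crawl.return_value = mocked_data
    MockCrawleeClient.return_value = mock_client_instance

    extracted_data = await FakeETL(MockCrawleeClient).extract(["https://example.com"])

    MockCrawleeClient.assert_called_once()
    mock_client_instance.crawl.assert_awaited_once_with(links=["https://example.com"])
    assert extracted_data == mocked_data


asyncio.run(main())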
