
Commit 612e73a

fix: re-instantiate the crawler for each route!
It seems the crawler client was caching results (or capping the crawl at 20 URLs) even when more URLs were requested. We now re-instantiate the crawler client for each given route, so each route can be crawled up to the 20-URL maximum.
1 parent 86a9cdd commit 612e73a

File tree

1 file changed: +4 −3 lines changed


hivemind_etl/website/website_etl.py

Lines changed: 4 additions & 3 deletions
@@ -24,7 +24,7 @@ def __init__(
         collection_name = "website"
 
         # preparing the data extractor and ingestion pipelines
-        self.crawlee_client = CrawleeClient()
+        # self.crawlee_client = CrawleeClient()
         self.ingestion_pipeline = CustomIngestionPipeline(
             self.community_id, collection_name=collection_name
         )
@@ -51,9 +51,10 @@ async def extract(
 
         extracted_data = []
         for url in urls:
+            crawlee_client = CrawleeClient()
             logging.info(f"Crawling {url} and its routes!")
-            data = await self.crawlee_client.crawl(links=[url])
-            logging.info(f"{len(data)} data is extracted.")
+            data = await crawlee_client.crawl(links=[url])
+            logging.info(f"{len(data)} data is extracted for route: {url}")
             extracted_data.extend(data)
 
         logging.info(f"Extracted {len(extracted_data)} documents!")

0 commit comments
