Skip to content

Commit 418c066

Browse files
committed
feat: enhance crawling process with improved data extraction!
Extracting one URL and its routes at a time, then merging the results.
1 parent 7d8ee36 commit 418c066

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

hivemind_etl/website/website_etl.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
from typing import Any
23

34
from hivemind_etl.website.crawlee_client import CrawleeClient
@@ -47,7 +48,13 @@ async def extract(
4748
"""
4849
if not urls:
4950
raise ValueError("No URLs provided for crawling")
50-
extracted_data = await self.crawlee_client.crawl(urls)
51+
52+
extracted_data = []
53+
for url in urls:
54+
logging.info(f"Crawling {url} and its routes!")
55+
extracted_data.extend(await self.crawlee_client.crawl(links=[url]))
56+
57+
logging.info(f"Extracted {len(extracted_data)} documents!")
5158

5259
if not extracted_data:
5360
raise ValueError(f"No data extracted from URLs: {urls}")

0 commit comments

Comments (0)