
Commit eafff89

Merge pull request #297 from AgentOps-AI/firecrawl-tool
Firecrawl tool
2 parents a996ba3 + 25e8da8

3 files changed: +134 −22

agentstack/_tools/agentql/__init__.py

Lines changed: 20 additions & 20 deletions
In the rendered diff the removed and added lines read almost identically; the only visible content change is the un-escaped underscores in `social_media_links`, so the rest of the rewrite appears to be a whitespace-level cleanup of the docstring.

````diff
@@ -17,32 +17,32 @@ def query_data(url: str, query: Optional[str], prompt: Optional[str]) -> dict:
 
     AgentQL query to scrape the url.
 
-    Here is a guide on AgentQL query syntax:
+    Here is a guide on AgentQL query syntax:
 
-    Enclose all AgentQL query terms within curly braces `{}`. The following query structure isn't valid because the term "social\_media\_links" is wrongly enclosed within parenthesis `()`.
+    Enclose all AgentQL query terms within curly braces `{}`. The following query structure isn't valid because the term "social_media_links" is wrongly enclosed within parenthesis `()`.
 
-    ```
-    ( # Should be {
-        social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
-    ) # Should be }
-    ```
+    ```
+    ( # Should be {
+        social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
+    ) # Should be }
+    ```
 
-    The following query is also invalid since its missing the curly braces `{}`
+    The following query is also invalid since its missing the curly braces `{}`
 
-    ```
-    # should include {
-    social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
-    # should include }
-    ```
+    ```
+    # should include {
+    social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
+    # should include }
+    ```
 
-    You can't include new lines in your semantic context. The following query structure isn't valid because the semantic context isn't contained within one line.
+    You can't include new lines in your semantic context. The following query structure isn't valid because the semantic context isn't contained within one line.
 
-    ```
-    {
-        social_media_links(The icons that lead
-        to Facebook, Snapchat, etc.)[]
-    }
-    ```
+    ```
+    {
+        social_media_links(The icons that lead
+        to Facebook, Snapchat, etc.)[]
+    }
+    ```
     """
     payload = {
         "url": url,
````

agentstack/_tools/firecrawl/__init__.py

Lines changed: 104 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 import os
 from firecrawl import FirecrawlApp
-
+from typing import List, Dict, Any, Optional
 app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))
 
 
@@ -38,3 +38,106 @@ def retrieve_web_crawl(crawl_id: str):
     will tell you if the crawl is finished. If it is not, wait some more time then try again.
     """
     return app.check_crawl_status(crawl_id)
+
+
+def batch_scrape(urls: List[str], formats: List[str] = ['markdown', 'html']):
+    """
+    Batch scrape multiple URLs simultaneously.
+
+    Args:
+        urls: List of URLs to scrape
+        formats: List of desired output formats (e.g., ['markdown', 'html'])
+
+    Returns:
+        Dictionary containing the batch scrape results
+    """
+    batch_result = app.batch_scrape_urls(urls, {'formats': formats})
+    return batch_result
+
+
+def async_batch_scrape(urls: List[str], formats: List[str] = ['markdown', 'html']):
+    """
+    Asynchronously batch scrape multiple URLs.
+
+    Args:
+        urls: List of URLs to scrape
+        formats: List of desired output formats (e.g., ['markdown', 'html'])
+
+    Returns:
+        Dictionary containing the job ID and status URL
+    """
+    batch_job = app.async_batch_scrape_urls(urls, {'formats': formats})
+    return batch_job
+
+
+def check_batch_status(job_id: str):
+    """
+    Check the status of an asynchronous batch scrape job.
+
+    Args:
+        job_id: The ID of the batch scrape job
+
+    Returns:
+        Dictionary containing the current status and results if completed
+    """
+    return app.check_batch_scrape_status(job_id)
+
+
+def extract_data(urls: List[str], schema: Optional[Dict[str, Any]] = None, prompt: Optional[str] = None) -> Dict[
+    str, Any]:
+    """
+    Extract structured data from URLs using LLMs.
+
+    Args:
+        urls: List of URLs to extract data from
+        schema: Optional JSON schema defining the structure of data to extract
+        prompt: Optional natural language prompt describing the data to extract
+
+    Returns:
+        Dictionary containing the extracted structured data
+    """
+    params: Dict[str, Any] = {}
+
+    if prompt is not None:
+        params['prompt'] = prompt
+    elif schema is not None:
+        params['schema'] = schema
+
+    data = app.extract(urls, params)
+    return data
+
+
+def map_website(url: str, search: Optional[str] = None):
+    """
+    Map a website to get all URLs, with optional search functionality.
+
+    Args:
+        url: The base URL to map
+        search: Optional search term to filter URLs
+
+    Returns:
+        Dictionary containing the list of discovered URLs
+    """
+    params = {'search': search} if search else {}
+    map_result = app.map_url(url, params)
+    return map_result
+
+
+def batch_extract(urls: List[str], extract_params: Dict[str, Any]):
+    """
+    Batch extract structured data from multiple URLs.
+
+    Args:
+        urls: List of URLs to extract data from
+        extract_params: Dictionary containing extraction parameters including prompt or schema
+
+    Returns:
+        Dictionary containing the extracted data from all URLs
+    """
+    params = {
+        'formats': ['extract'],
+        'extract': extract_params
+    }
+
+    batch_result = app.batch_scrape_urls(urls, params)
+    return batch_result
```
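
Taken together, the new functions support an asynchronous batch workflow plus structured extraction and site mapping. The sketch below shows how they might be called; it assumes `FIRECRAWL_API_KEY` is set in the environment (as the module requires), and the URLs, schema, and the `'id'` key on the returned job dict are placeholder assumptions, not something this diff guarantees.

```python
# Minimal usage sketch for the newly added Firecrawl tools (illustrative only).
from agentstack._tools.firecrawl import (
    async_batch_scrape,
    check_batch_status,
    extract_data,
    map_website,
)

urls = ["https://example.com/blog", "https://example.com/docs"]  # placeholders

# Start an asynchronous batch scrape, then poll it by job ID.
job = async_batch_scrape(urls, formats=['markdown'])
status = check_batch_status(job['id'])  # assumes the returned dict exposes the job ID under 'id'

# Extract structured data; per extract_data, a prompt takes precedence over a schema.
schema = {"type": "object", "properties": {"title": {"type": "string"}}}
extracted = extract_data(urls, schema=schema)

# Map a site and filter the discovered URLs by a search term.
links = map_website("https://example.com", search="pricing")
```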

agentstack/_tools/firecrawl/config.json

Lines changed: 10 additions & 1 deletion
```diff
@@ -8,6 +8,15 @@
   "dependencies": [
     "firecrawl-py>=1.6.4"
   ],
-  "tools": ["web_scrape", "web_crawl", "retrieve_web_crawl"],
+  "tools": [
+    "web_scrape",
+    "web_crawl",
+    "retrieve_web_crawl",
+    "batch_scrape",
+    "check_batch_status",
+    "extract_data",
+    "map_website",
+    "batch_extract"
+  ],
   "cta": "Create an API key at https://www.firecrawl.dev/"
 }
```
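
`batch_extract`, the last entry added to the tools list above, simply wraps `batch_scrape_urls` with the `'extract'` format. A minimal, illustrative call (the prompt and URLs are placeholders) might look like:

```python
# Illustrative only: batch_extract forwards extract_params to Firecrawl's 'extract' format.
from agentstack._tools.firecrawl import batch_extract

result = batch_extract(
    urls=["https://example.com/team", "https://example.com/about"],  # placeholders
    extract_params={"prompt": "List the names and roles of the people mentioned on each page."},
)
```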
