Skip to content

Commit

Permalink
Better scraping pipe and session metadata (nottelabs#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
giordano-lucas authored Jan 14, 2025
1 parent 6d8095c commit 7998a9e
Show file tree
Hide file tree
Showing 30 changed files with 713 additions and 189 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ llm_usage.jsonl
llm_parsing_error.jsonl

**/__pycache__/**
.DS_Store
**/.DS_Store
4 changes: 1 addition & 3 deletions examples/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,7 @@ async def run(self, task: str, url: str | None = None) -> AgentOutput:
- At every step, you will be provided with a list of actions you can take.
- If you are asked to accept cookies to continue, please accept them. Accepting cookies is MANDATORY.
- If you see one action about cookie management, you should stop thinking about the goal and accept cookies DIRECTLY.
- If you are asked to signin to continue sign in if needed using the following credentials:
email/username: hello@notte.ai
password: notte123
- If you are asked to signin / signup to continue browsing, abort the task and explain why you can't proceed.
""",
},
{
Expand Down
3 changes: 3 additions & 0 deletions notte/actions/space.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ def actions(
actions += SpecialAction.list() # type: ignore
return actions

def special_actions(self) -> list[SpecialAction]:
    """List the special actions that are always available in this action space."""
    # SpecialAction.list() is untyped upstream, hence the ignore
    specials: list[SpecialAction] = SpecialAction.list()  # type: ignore
    return specials

def sample(
self,
status: Literal[ActionStatus, "all"] = "valid",
Expand Down
4 changes: 3 additions & 1 deletion notte/browser/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def only_failed_actions(node: NotteNode) -> bool:

filtered_graph = self.node.subtree_filter(only_failed_actions)
if filtered_graph is None:
logger.error(f"No nodes left in context after filtering of exesting actions for url {self.snapshot.url}")
logger.error(
f"No nodes left in context after filtering of exesting actions for url {self.snapshot.metadata.url}"
)
return None

return Context(
Expand Down
20 changes: 15 additions & 5 deletions notte/browser/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
from notte.actions.executor import get_executor
from notte.browser.context import Context
from notte.browser.node_type import A11yTree
from notte.browser.snapshot import BrowserSnapshot
from notte.browser.snapshot import BrowserSnapshot, SnapshotMetadata
from notte.common.resource import AsyncResource
from notte.utils.url import is_valid_url


class BrowserArgs(TypedDict):
Expand Down Expand Up @@ -119,8 +120,10 @@ async def snapshot(self, screenshot: bool | None = None, retries: int = 5) -> Br
take_screenshot = screenshot if screenshot is not None else self._screenshot
snapshot_screenshot = await self.page.screenshot() if take_screenshot else None
return BrowserSnapshot(
title=await self.page.title(),
url=self.page.url,
metadata=SnapshotMetadata(
title=await self.page.title(),
url=self.page.url,
),
html_content=html_content,
a11y_tree=a11y_tree,
screenshot=snapshot_screenshot,
Expand All @@ -143,7 +146,14 @@ async def goto(
raise RuntimeError("Browser not started. Call `start` first.")
if url is None or url == self.page.url:
return await self.snapshot()
_ = await self.page.goto(url)
if not is_valid_url(url, check_reachability=False):
raise ValueError(
f"Invalid URL: {url}. Check if the URL is reachable. URLs should start with https:// or http://"
)
try:
_ = await self.page.goto(url)
except Exception as e:
raise ValueError(f"Failed to navigate to {url}. Check if the URL is reachable.") from e
await self.long_wait()
return await self.snapshot()

Expand All @@ -156,7 +166,7 @@ async def execute_action(
"""Execute action in async mode"""
if not self.page:
raise RuntimeError("Browser not started. Call `start` first.")
if self.page.url != context.snapshot.url:
if self.page.url != context.snapshot.metadata.url:
raise ValueError(("Browser is not on the expected page. " "Use `goto` to navigate to the expected page."))
action_executor = get_executor(action)
is_success = await action_executor(self.page)
Expand Down
12 changes: 0 additions & 12 deletions notte/browser/node_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,6 @@
from loguru import logger


def clean_url(url: str) -> str:
    """Normalize a URL for display and comparison.

    Strips, in order: the query string (e.g. ``?tfs=CBwQ...``), a single
    trailing slash, the ``https://`` / ``http://`` scheme, and a leading
    ``www.``.

    Args:
        url: the raw URL.

    Returns:
        The cleaned URL, e.g. ``https://www.example.com/a?x=1`` -> ``example.com/a``.
    """
    # drop anything after '?' (query string)
    base = url.split("?")[0]
    # drop a single trailing slash
    base = base.removesuffix("/")
    # strip the scheme and a leading 'www.' as true prefixes only:
    # str.replace (the previous approach) would also clobber occurrences
    # of these substrings later in the path or hostname.
    for prefix in ("https://", "http://"):
        base = base.removeprefix(prefix)
    return base.removeprefix("www.")


class A11yNode(TypedDict, total=False):
# from the a11y tree
role: Required[str]
Expand Down
49 changes: 7 additions & 42 deletions notte/browser/observation.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,26 @@
import datetime as dt
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

import requests
from dataclasses import dataclass

from notte.actions.space import ActionSpace
from notte.browser.node_type import clean_url
from notte.browser.snapshot import BrowserSnapshot
from notte.browser.snapshot import BrowserSnapshot, SnapshotMetadata
from notte.data.space import DataSpace
from notte.utils.url import clean_url

try:
from PIL import Image
except ImportError:
Image = None # type: ignore


class ImageCategory(Enum):
ICON = "icon"
CONTENT_IMAGE = "content_image"
DECORATIVE = "decorative"
SVG_ICON = "svg_icon"
SVG_CONTENT = "svg_content"


@dataclass
class ImageData:
id: str
url: str | None = None
category: ImageCategory | None = None

def bytes(self) -> bytes:
if self.url is None:
raise ValueError("Image URL is not available")
return requests.get(self.url).content


@dataclass
class DataSpace:
markdown: str | None = None
images: list[ImageData] | None = None
structured: list[dict[str, Any]] | None = None


@dataclass
class Observation:
title: str
url: str
timestamp: dt.datetime = field(default_factory=dt.datetime.now)
metadata: SnapshotMetadata
screenshot: bytes | None = None
_space: ActionSpace | None = None
data: DataSpace | None = None

@property
def clean_url(self) -> str:
return clean_url(self.url)
return clean_url(self.metadata.url)

@property
def space(self) -> ActionSpace:
Expand Down Expand Up @@ -83,9 +50,7 @@ def from_snapshot(
snapshot: BrowserSnapshot, space: ActionSpace | None = None, data: DataSpace | None = None
) -> "Observation":
return Observation(
title=snapshot.title,
url=snapshot.url,
timestamp=snapshot.timestamp,
metadata=snapshot.metadata,
screenshot=snapshot.screenshot,
_space=space,
data=data,
Expand Down
13 changes: 10 additions & 3 deletions notte/browser/snapshot.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
import datetime as dt
from dataclasses import dataclass, field

from notte.browser.node_type import A11yTree, clean_url
from notte.browser.node_type import A11yTree
from notte.pipe.preprocessing.a11y.traversal import set_of_interactive_nodes
from notte.utils.url import clean_url


@dataclass
class BrowserSnapshot:
class SnapshotMetadata:
title: str
url: str
timestamp: dt.datetime = field(default_factory=dt.datetime.now)


@dataclass
class BrowserSnapshot:
metadata: SnapshotMetadata
html_content: str
a11y_tree: A11yTree
screenshot: bytes | None
timestamp: dt.datetime = field(default_factory=dt.datetime.now)

@property
def clean_url(self) -> str:
return clean_url(self.url)
return clean_url(self.metadata.url)

def compare_with(self, other: "BrowserSnapshot") -> bool:
inodes = set_of_interactive_nodes(self.a11y_tree.simple)
Expand Down
6 changes: 3 additions & 3 deletions notte/common/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,10 @@ def textify(self, obs: Observation) -> str:
raise ValueError("No data or actions found")
return f"""
Webpage information:
- URL: {obs.url}
- Title: {obs.title}
- URL: {obs.metadata.url}
- Title: {obs.metadata.title}
- Description: {obs.space.description or "No description available"}
- Timestamp: {obs.timestamp.strftime("%Y-%m-%d %H:%M:%S")}
- Timestamp: {obs.metadata.timestamp.strftime("%Y-%m-%d %H:%M:%S")}
- Page category: {obs.space.category.value if obs.space.category is not None else "No category available"}
{text}
{self.POST_INSTRUCTIONS}
Expand Down
32 changes: 32 additions & 0 deletions notte/data/space.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from dataclasses import dataclass
from enum import Enum
from typing import Any

import requests


class ImageCategory(Enum):
    """Classification of an image found on a scraped web page."""

    ICON = "icon"
    CONTENT_IMAGE = "content_image"
    DECORATIVE = "decorative"
    SVG_ICON = "svg_icon"
    SVG_CONTENT = "svg_content"


@dataclass
class ImageData:
    """An image referenced on a scraped page, fetchable on demand."""

    # identifier for this image (assigned by the scraping pipeline — TODO confirm uniqueness scope)
    id: str
    # source URL of the image; None when it could not be resolved
    url: str | None = None
    # classification of the image; None when not yet categorized
    category: ImageCategory | None = None

    def bytes(self) -> bytes:
        """Download and return the raw image bytes from `self.url`.

        Returns:
            The response body as bytes (whatever the server returns,
            including error-page bodies — HTTP status is not checked).

        Raises:
            ValueError: if no URL is available for this image.
            requests.RequestException: on network failure or timeout.
        """
        if self.url is None:
            raise ValueError("Image URL is not available")
        # a timeout is mandatory: requests.get without one can block forever
        return requests.get(self.url, timeout=30).content


@dataclass
class DataSpace:
    """Container for the data extracted from a page by the scraping pipe.

    All fields are optional: a scrape may produce any subset of markdown
    text, images and structured records.
    """

    # markdown rendering of the page content, when available
    markdown: str | None = None
    # images found on the page, when image scraping was requested
    images: list[ImageData] | None = None
    # structured records extracted from the page — NOTE(review): schema is
    # producer-defined and not constrained here; verify against the pipe that fills it
    structured: list[dict[str, Any]] | None = None
2 changes: 2 additions & 0 deletions notte/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,14 @@ async def step(
async def scrape(
self,
url: str | None = None,
only_main_content: bool = True,
scrape_images: bool = False,
) -> Observation:
if url is not None:
_ = await self.goto(url)
self.obs.data = await self._data_scraping_pipe.forward(
self.context,
only_main_content=only_main_content,
scrape_images=scrape_images,
)
return self.obs
Expand Down
14 changes: 8 additions & 6 deletions notte/llms/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,12 @@ def extract(
if self.outer_tag:
pattern = f"<{self.outer_tag}>(.*?)</{self.outer_tag}>"
match = re.search(pattern, content, re.DOTALL)
if not match:
if match:
# perfect case, we have <outer_tag>...</outer_tag>
content = match.group(1).strip()
else:
splits = text.split(f"<{self.outer_tag}>")
# In this case, we want to fail if <outer_tag> is not found at least once
if fail_if_final_tag or len(splits) == 1:
raise ValueError(f"No content found within <{self.outer_tag}> tags in the response: {text}")
possible_match = splits[1]
Expand All @@ -76,11 +80,9 @@ def extract(
)
possible_match = splits[0].strip()
# if there is not html tag in `possible_match` then we can safely return it
if not re.search(r"<[^>]*>", possible_match):
return possible_match
raise ValueError(f"No content found within <{self.outer_tag}> tags in the response: {text}")

content = match.group(1).strip()
if re.search(r"<[^>]*>", possible_match):
raise ValueError(f"No content found within <{self.outer_tag}> tags in the response: {text}")
content = possible_match

if self.inner_tag:
pattern = f"```{self.inner_tag}(.*?)```"
Expand Down
Loading

0 comments on commit 7998a9e

Please sign in to comment.