Skip to content

Commit

Permalink
Better scraping pipe and session metadata (nottelabs#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
giordano-lucas authored Jan 14, 2025
1 parent 6d8095c commit 7998a9e
Show file tree
Hide file tree
Showing 30 changed files with 713 additions and 189 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ llm_usage.jsonl
llm_parsing_error.jsonl

**/__pycache__/**
.DS_Store
**/.DS_Store
4 changes: 1 addition & 3 deletions examples/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,7 @@ async def run(self, task: str, url: str | None = None) -> AgentOutput:
- At every step, you will be provided with a list of actions you can take.
- If you are asked to accept cookies to continue, please accept them. Accepting cookies is MANDATORY.
- If you see one action about cookie management, you should stop thinking about the goal and accept cookies DIRECTLY.
- If you are asked to signin to continue sign in if needed using the following credentials:
email/username: hello@notte.ai
password: notte123
- If you are asked to signin / signup to continue browsing, abort the task and explain why you can't proceed.
""",
},
{
Expand Down
3 changes: 3 additions & 0 deletions notte/actions/space.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ def actions(
actions += SpecialAction.list() # type: ignore
return actions

def special_actions(self) -> list[SpecialAction]:
    """List the special actions that are always available in this action space."""
    # SpecialAction.list() is untyped upstream, hence the ignore
    specials: list[SpecialAction] = SpecialAction.list()  # type: ignore
    return specials

def sample(
self,
status: Literal[ActionStatus, "all"] = "valid",
Expand Down
4 changes: 3 additions & 1 deletion notte/browser/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def only_failed_actions(node: NotteNode) -> bool:

filtered_graph = self.node.subtree_filter(only_failed_actions)
if filtered_graph is None:
logger.error(f"No nodes left in context after filtering of exesting actions for url {self.snapshot.url}")
logger.error(
f"No nodes left in context after filtering of exesting actions for url {self.snapshot.metadata.url}"
)
return None

return Context(
Expand Down
20 changes: 15 additions & 5 deletions notte/browser/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
from notte.actions.executor import get_executor
from notte.browser.context import Context
from notte.browser.node_type import A11yTree
from notte.browser.snapshot import BrowserSnapshot
from notte.browser.snapshot import BrowserSnapshot, SnapshotMetadata
from notte.common.resource import AsyncResource
from notte.utils.url import is_valid_url


class BrowserArgs(TypedDict):
Expand Down Expand Up @@ -119,8 +120,10 @@ async def snapshot(self, screenshot: bool | None = None, retries: int = 5) -> Br
take_screenshot = screenshot if screenshot is not None else self._screenshot
snapshot_screenshot = await self.page.screenshot() if take_screenshot else None
return BrowserSnapshot(
title=await self.page.title(),
url=self.page.url,
metadata=SnapshotMetadata(
title=await self.page.title(),
url=self.page.url,
),
html_content=html_content,
a11y_tree=a11y_tree,
screenshot=snapshot_screenshot,
Expand All @@ -143,7 +146,14 @@ async def goto(
raise RuntimeError("Browser not started. Call `start` first.")
if url is None or url == self.page.url:
return await self.snapshot()
_ = await self.page.goto(url)
if not is_valid_url(url, check_reachability=False):
raise ValueError(
f"Invalid URL: {url}. Check if the URL is reachable. URLs should start with https:// or http://"
)
try:
_ = await self.page.goto(url)
except Exception as e:
raise ValueError(f"Failed to navigate to {url}. Check if the URL is reachable.") from e
await self.long_wait()
return await self.snapshot()

Expand All @@ -156,7 +166,7 @@ async def execute_action(
"""Execute action in async mode"""
if not self.page:
raise RuntimeError("Browser not started. Call `start` first.")
if self.page.url != context.snapshot.url:
if self.page.url != context.snapshot.metadata.url:
raise ValueError(("Browser is not on the expected page. " "Use `goto` to navigate to the expected page."))
action_executor = get_executor(action)
is_success = await action_executor(self.page)
Expand Down
12 changes: 0 additions & 12 deletions notte/browser/node_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,6 @@
from loguru import logger


def clean_url(url: str) -> str:
    """Normalize a URL for display and comparison.

    Strips, in order: the query string (e.g. ``?tfs=CBwQ...``), a single
    trailing slash, the ``https://`` / ``http://`` scheme, and a leading
    ``www.``.

    Args:
        url: the raw URL.

    Returns:
        The cleaned URL, e.g. ``https://www.example.com/a?x=1`` -> ``example.com/a``.
    """
    # drop anything after '?' (query string)
    base = url.split("?")[0]
    # drop a single trailing slash
    base = base.removesuffix("/")
    # strip the scheme and a leading 'www.' as true prefixes only:
    # str.replace (the previous approach) would also clobber occurrences
    # of these substrings later in the path or hostname.
    for prefix in ("https://", "http://"):
        base = base.removeprefix(prefix)
    return base.removeprefix("www.")


class A11yNode(TypedDict, total=False):
# from the a11y tree
role: Required[str]
Expand Down
49 changes: 7 additions & 42 deletions notte/browser/observation.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,26 @@
import datetime as dt
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

import requests
from dataclasses import dataclass

from notte.actions.space import ActionSpace
from notte.browser.node_type import clean_url
from notte.browser.snapshot import BrowserSnapshot
from notte.browser.snapshot import BrowserSnapshot, SnapshotMetadata
from notte.data.space import DataSpace
from notte.utils.url import clean_url

try:
from PIL import Image
except ImportError:
Image = None # type: ignore


class ImageCategory(Enum):
ICON = "icon"
CONTENT_IMAGE = "content_image"
DECORATIVE = "decorative"
SVG_ICON = "svg_icon"
SVG_CONTENT = "svg_content"


@dataclass
class ImageData:
id: str
url: str | None = None
category: ImageCategory | None = None

def bytes(self) -> bytes:
if self.url is None:
raise ValueError("Image URL is not available")
return requests.get(self.url).content


@dataclass
class DataSpace:
markdown: str | None = None
images: list[ImageData] | None = None
structured: list[dict[str, Any]] | None = None


@dataclass
class Observation:
title: str
url: str
timestamp: dt.datetime = field(default_factory=dt.datetime.now)
metadata: SnapshotMetadata
screenshot: bytes | None = None
_space: ActionSpace | None = None
data: DataSpace | None = None

@property
def clean_url(self) -> str:
return clean_url(self.url)
return clean_url(self.metadata.url)

@property
def space(self) -> ActionSpace:
Expand Down Expand Up @@ -83,9 +50,7 @@ def from_snapshot(
snapshot: BrowserSnapshot, space: ActionSpace | None = None, data: DataSpace | None = None
) -> "Observation":
return Observation(
title=snapshot.title,
url=snapshot.url,
timestamp=snapshot.timestamp,
metadata=snapshot.metadata,
screenshot=snapshot.screenshot,
_space=space,
data=data,
Expand Down
13 changes: 10 additions & 3 deletions notte/browser/snapshot.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
import datetime as dt
from dataclasses import dataclass, field

from notte.browser.node_type import A11yTree, clean_url
from notte.browser.node_type import A11yTree
from notte.pipe.preprocessing.a11y.traversal import set_of_interactive_nodes
from notte.utils.url import clean_url


@dataclass
class BrowserSnapshot:
class SnapshotMetadata:
title: str
url: str
timestamp: dt.datetime = field(default_factory=dt.datetime.now)


@dataclass
class BrowserSnapshot:
metadata: SnapshotMetadata
html_content: str
a11y_tree: A11yTree
screenshot: bytes | None
timestamp: dt.datetime = field(default_factory=dt.datetime.now)

@property
def clean_url(self) -> str:
return clean_url(self.url)
return clean_url(self.metadata.url)

def compare_with(self, other: "BrowserSnapshot") -> bool:
inodes = set_of_interactive_nodes(self.a11y_tree.simple)
Expand Down
6 changes: 3 additions & 3 deletions notte/common/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,10 @@ def textify(self, obs: Observation) -> str:
raise ValueError("No data or actions found")
return f"""
Webpage information:
- URL: {obs.url}
- Title: {obs.title}
- URL: {obs.metadata.url}
- Title: {obs.metadata.title}
- Description: {obs.space.description or "No description available"}
- Timestamp: {obs.timestamp.strftime("%Y-%m-%d %H:%M:%S")}
- Timestamp: {obs.metadata.timestamp.strftime("%Y-%m-%d %H:%M:%S")}
- Page category: {obs.space.category.value if obs.space.category is not None else "No category available"}
{text}
{self.POST_INSTRUCTIONS}
Expand Down
32 changes: 32 additions & 0 deletions notte/data/space.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from dataclasses import dataclass
from enum import Enum
from typing import Any

import requests


class ImageCategory(Enum):
    """Classification of an image found on a scraped web page."""

    ICON = "icon"
    CONTENT_IMAGE = "content_image"
    DECORATIVE = "decorative"
    SVG_ICON = "svg_icon"
    SVG_CONTENT = "svg_content"


@dataclass
class ImageData:
    """An image referenced on a scraped page, fetchable on demand."""

    # identifier for this image (assigned by the scraping pipeline — TODO confirm uniqueness scope)
    id: str
    # source URL of the image; None when it could not be resolved
    url: str | None = None
    # classification of the image; None when not yet categorized
    category: ImageCategory | None = None

    def bytes(self) -> bytes:
        """Download and return the raw image bytes from `self.url`.

        Returns:
            The response body as bytes (whatever the server returns,
            including error-page bodies — HTTP status is not checked).

        Raises:
            ValueError: if no URL is available for this image.
            requests.RequestException: on network failure or timeout.
        """
        if self.url is None:
            raise ValueError("Image URL is not available")
        # a timeout is mandatory: requests.get without one can block forever
        return requests.get(self.url, timeout=30).content


@dataclass
class DataSpace:
    """Container for the data extracted from a page by the scraping pipe.

    All fields are optional: a scrape may produce any subset of markdown
    text, images and structured records.
    """

    # markdown rendering of the page content, when available
    markdown: str | None = None
    # images found on the page, when image scraping was requested
    images: list[ImageData] | None = None
    # structured records extracted from the page — NOTE(review): schema is
    # producer-defined and not constrained here; verify against the pipe that fills it
    structured: list[dict[str, Any]] | None = None
2 changes: 2 additions & 0 deletions notte/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,14 @@ async def step(
async def scrape(
self,
url: str | None = None,
only_main_content: bool = True,
scrape_images: bool = False,
) -> Observation:
if url is not None:
_ = await self.goto(url)
self.obs.data = await self._data_scraping_pipe.forward(
self.context,
only_main_content=only_main_content,
scrape_images=scrape_images,
)
return self.obs
Expand Down
14 changes: 8 additions & 6 deletions notte/llms/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,12 @@ def extract(
if self.outer_tag:
pattern = f"<{self.outer_tag}>(.*?)</{self.outer_tag}>"
match = re.search(pattern, content, re.DOTALL)
if not match:
if match:
# perfect case, we have <outer_tag>...</outer_tag>
content = match.group(1).strip()
else:
splits = text.split(f"<{self.outer_tag}>")
# In this case, we want to fail if <outer_tag> is not found at least once
if fail_if_final_tag or len(splits) == 1:
raise ValueError(f"No content found within <{self.outer_tag}> tags in the response: {text}")
possible_match = splits[1]
Expand All @@ -76,11 +80,9 @@ def extract(
)
possible_match = splits[0].strip()
# if there is not html tag in `possible_match` then we can safely return it
if not re.search(r"<[^>]*>", possible_match):
return possible_match
raise ValueError(f"No content found within <{self.outer_tag}> tags in the response: {text}")

content = match.group(1).strip()
if re.search(r"<[^>]*>", possible_match):
raise ValueError(f"No content found within <{self.outer_tag}> tags in the response: {text}")
content = possible_match

if self.inner_tag:
pattern = f"```{self.inner_tag}(.*?)```"
Expand Down
Loading

0 comments on commit 7998a9e

Please sign in to comment.