Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 74 additions & 8 deletions openadapt_ml/baselines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
- Track B: ReAct-style reasoning with coordinates
- Track C: Set-of-Mark element selection

Based on SOTA patterns from:
- Claude Computer Use (Anthropic)
- Microsoft UFO/UFO2
- OSWorld benchmark
- Agent-S/Agent-S2 (Simular AI)

Usage:
from openadapt_ml.baselines import UnifiedBaselineAdapter, BaselineConfig, TrackConfig

Expand All @@ -21,35 +27,95 @@
track=TrackConfig.track_c(),
)
adapter = UnifiedBaselineAdapter(config)

# OSWorld-compatible configuration
config = BaselineConfig(
provider="openai",
model="gpt-5.2",
track=TrackConfig.osworld_compatible(),
)

# Parse responses directly
from openadapt_ml.baselines import UnifiedResponseParser, ElementRegistry

parser = UnifiedResponseParser()
action = parser.parse('{"action": "CLICK", "x": 0.5, "y": 0.3}')

# With element ID to coordinate conversion
registry = ElementRegistry.from_a11y_tree(tree)
parser = UnifiedResponseParser(element_registry=registry)
action = parser.parse_and_resolve('{"action": "CLICK", "element_id": 17}')
"""

from openadapt_ml.baselines.adapter import UnifiedBaselineAdapter
from openadapt_ml.baselines.config import (
# Enums
ActionOutputFormat,
CoordinateSystem,
TrackType,
# Config dataclasses
BaselineConfig,
ModelSpec,
ReActConfig,
ScreenConfig,
SoMConfig,
TrackConfig,
TrackType,
# Registry
MODELS,
get_model_spec,
# Helper functions
get_default_model,
get_model_spec,
)
from openadapt_ml.baselines.parser import (
ElementRegistry,
ParsedAction,
UIElement,
UnifiedResponseParser,
)
from openadapt_ml.baselines.prompts import (
# System prompts
FORMAT_PROMPTS,
SYSTEM_PROMPT_OSWORLD,
SYSTEM_PROMPT_TRACK_A,
SYSTEM_PROMPT_TRACK_B,
SYSTEM_PROMPT_TRACK_C,
SYSTEM_PROMPT_UFO,
SYSTEM_PROMPTS,
# Builder class
PromptBuilder,
)
from openadapt_ml.baselines.parser import ParsedAction, UnifiedResponseParser
from openadapt_ml.baselines.prompts import PromptBuilder

# Public API of the baselines package. Every name appears exactly once,
# grouped by the submodule it is imported from above.
__all__ = [
    # Main adapter
    "UnifiedBaselineAdapter",
    # Configuration - Enums
    "ActionOutputFormat",
    "CoordinateSystem",
    "TrackType",
    # Configuration - Dataclasses
    "BaselineConfig",
    "ModelSpec",
    "ReActConfig",
    "ScreenConfig",
    "SoMConfig",
    "TrackConfig",
    # Configuration - Registry
    "MODELS",
    # Configuration - Functions
    "get_default_model",
    "get_model_spec",
    # Parsing
    "ElementRegistry",
    "ParsedAction",
    "UIElement",
    "UnifiedResponseParser",
    # Prompts
    "FORMAT_PROMPTS",
    "PromptBuilder",
    "SYSTEM_PROMPT_OSWORLD",
    "SYSTEM_PROMPT_TRACK_A",
    "SYSTEM_PROMPT_TRACK_B",
    "SYSTEM_PROMPT_TRACK_C",
    "SYSTEM_PROMPT_UFO",
    "SYSTEM_PROMPTS",
]
197 changes: 190 additions & 7 deletions openadapt_ml/baselines/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
"""Configuration for baseline adapters.

Defines track types, model registry, and configuration dataclasses.
Based on SOTA patterns from:
- Claude Computer Use API
- Microsoft UFO/UFO2
- OSWorld benchmark
- Agent-S/Agent-S2
"""

from __future__ import annotations
Expand All @@ -23,61 +28,239 @@ class TrackType(str, Enum):
TRACK_C = "set_of_mark"


class CoordinateSystem(str, Enum):
    """Units in which an action's coordinates are expressed.

    Members are also plain strings, so they compare equal to their values.

    NORMALIZED: Fractions of the screen in the 0.0-1.0 range.
    PIXEL: Absolute pixel positions.
    PERCENTAGE: Percentages of the screen in the 0-100 range.
    """

    NORMALIZED = "normalized"  # 0.0-1.0, relative to screen
    PIXEL = "pixel"  # absolute pixels
    PERCENTAGE = "percentage"  # 0-100


class ActionOutputFormat(str, Enum):
    """Syntax the model is asked to use when emitting an action.

    Members are also plain strings, so they compare equal to their values.

    JSON: A structured JSON object.
    FUNCTION_CALL: A function-style call such as CLICK(x, y).
    PYAUTOGUI: PyAutoGUI-style Python code (OSWorld compatible).
    """

    JSON = "json"  # structured JSON object
    FUNCTION_CALL = "function_call"  # e.g. CLICK(x, y)
    PYAUTOGUI = "pyautogui"  # PyAutoGUI-style Python code


@dataclass
class SoMConfig:
    """Settings for the Set-of-Mark (SoM) element overlay.

    Describes which UI elements get a label and how that label is rendered.
    Based on patterns from the SoM paper and OMNI-parser.

    Attributes:
        overlay_enabled: Draw element overlays on the screenshot when True.
        label_format: Template for element labels ("[{id}]", "{id}", "e{id}").
        font_size: Label font size in pixels.
        label_background_color: RGBA tuple used behind the label.
        label_text_color: RGB tuple used for the label text.
        max_elements: Upper bound on labelled elements (0=unlimited).
        include_roles: Element roles to keep (None=all).
        exclude_roles: Element roles to drop.
        min_element_area: Smallest element area, in pixels, that is kept.
        include_invisible: Keep non-visible elements when True.
    """

    overlay_enabled: bool = True
    # Renders as "[1]", "1" or "e1" depending on the template.
    label_format: str = "[{id}]"
    font_size: int = 12
    # Blue
    label_background_color: tuple[int, int, int, int] = (0, 120, 255, 200)
    # White
    label_text_color: tuple[int, int, int] = (255, 255, 255)
    max_elements: int = 100
    # None = include all
    include_roles: list[str] | None = None
    # A factory is used so each instance owns its own list.
    exclude_roles: list[str] = field(
        default_factory=lambda: list(
            ("group", "generic", "static_text", "separator")
        )
    )
    # Minimum bbox area in pixels
    min_element_area: int = 100
    include_invisible: bool = False


@dataclass
class ReActConfig:
    """Settings for ReAct-style reasoning output.

    Governs the observation-thought-action cycle used in Track B.
    Based on the ReAct paper and UFO's Observation->Thought->Action pattern.

    Attributes:
        require_observation: Ask for an explicit observation when True.
        require_thought: Ask for a reasoning explanation when True.
        require_plan: Ask for a multi-step plan when True.
        max_plan_steps: Upper bound on steps in the plan output.
        thinking_budget: Token budget for thinking (Claude extended thinking).
    """

    require_observation: bool = True
    require_thought: bool = True
    require_plan: bool = False
    max_plan_steps: int = 5
    # For Claude extended thinking; None leaves the budget unset.
    thinking_budget: int | None = None


@dataclass
class ScreenConfig:
    """Screen/display configuration for coordinate handling.

    Attributes:
        width: Display width in pixels.
        height: Display height in pixels.
        coordinate_system: How coordinates are represented.
        scale_factor: DPI scale factor (1.0 = standard, 2.0 = retina).
            The conversion helpers below do not apply it.
    """

    width: int = 1920
    height: int = 1080
    coordinate_system: CoordinateSystem = CoordinateSystem.NORMALIZED
    scale_factor: float = 1.0

    def normalize_coords(self, x: float, y: float) -> tuple[float, float]:
        """Convert pixel coordinates to normalized (0-1) coordinates.

        Args:
            x: Horizontal position in pixels.
            y: Vertical position in pixels.

        Returns:
            ``(x / width, y / height)``.

        Raises:
            ZeroDivisionError: If ``width`` or ``height`` is 0.
        """
        return (x / self.width, y / self.height)

    def denormalize_coords(self, x: float, y: float) -> tuple[int, int]:
        """Convert normalized coordinates to the nearest pixel.

        Args:
            x: Horizontal position as a fraction of ``width``.
            y: Vertical position as a fraction of ``height``.

        Returns:
            ``(x * width, y * height)`` rounded to the nearest integers.
        """
        # Round instead of truncating: binary floats make 0.29 * 100 come out
        # as 28.999999999999996, which int() would turn into 28 and so break
        # the pixel -> normalized -> pixel round trip by one pixel.
        return (int(round(x * self.width)), int(round(y * self.height)))


@dataclass
class TrackConfig:
    """Configuration for a specific evaluation track.

    Instances are normally created through the preset classmethods
    (``track_a``, ``track_b``, ``track_c``, ``osworld_compatible``,
    ``ufo_compatible``). Each preset accepts keyword overrides for any field.

    Attributes:
        track_type: The track type (A, B, or C).
        output_format: Expected output format string.
        action_format: Style of action output (JSON, function, pyautogui).
        use_som: Whether to use Set-of-Mark overlay.
        som_config: Configuration for SoM (Track C).
        use_a11y_tree: Whether to include accessibility tree.
        max_a11y_elements: Max elements in a11y tree (truncation).
        include_reasoning: Whether to request reasoning steps.
        react_config: Configuration for ReAct (Track B).
        include_history: Whether to include action history.
        max_history_steps: Max history steps to include.
        screen_config: Screen/coordinate configuration.
        verify_after_action: Request screenshot verification after actions.
    """

    track_type: TrackType
    output_format: str
    action_format: ActionOutputFormat = ActionOutputFormat.JSON
    use_som: bool = False
    som_config: SoMConfig | None = None
    use_a11y_tree: bool = True
    max_a11y_elements: int = 50
    include_reasoning: bool = False
    react_config: ReActConfig | None = None
    include_history: bool = True
    max_history_steps: int = 5
    screen_config: ScreenConfig = field(default_factory=ScreenConfig)
    verify_after_action: bool = False  # Claude computer use best practice

    @classmethod
    def _from_preset(
        cls, preset: dict[str, Any], overrides: dict[str, Any]
    ) -> "TrackConfig":
        """Instantiate from ``preset`` with ``overrides`` taking precedence.

        Merging into one mapping before the call avoids the "got multiple
        values for keyword argument" TypeError that occurs when a field is
        passed both explicitly and through ``**kwargs``.

        Args:
            preset: Field values that define the track preset.
            overrides: Caller-supplied field values; these win on conflict.

        Returns:
            The constructed config.
        """
        return cls(**{**preset, **overrides})

    @classmethod
    def track_a(cls, **kwargs: Any) -> "TrackConfig":
        """Create Track A (Direct Coordinates) config.

        Simplest track: screenshot + goal -> coordinates.
        No reasoning or element IDs.

        Args:
            **kwargs: Field overrides applied on top of the preset.
        """
        return cls._from_preset(
            {
                "track_type": TrackType.TRACK_A,
                "output_format": '{"action": "CLICK", "x": float, "y": float}',
                "action_format": ActionOutputFormat.JSON,
                "use_som": False,
                "use_a11y_tree": True,
                "include_reasoning": False,
            },
            kwargs,
        )

    @classmethod
    def track_b(cls, **kwargs: Any) -> "TrackConfig":
        """Create Track B (ReAct with Coordinates) config.

        Includes observation->thought->action cycle.
        Based on ReAct, UFO, and Claude thinking patterns.

        Args:
            **kwargs: Field overrides applied on top of the preset. A missing
                or falsy ``react_config`` is replaced by ``ReActConfig()``.
        """
        react_config = kwargs.pop("react_config", None) or ReActConfig()
        return cls._from_preset(
            {
                "track_type": TrackType.TRACK_B,
                "output_format": '{"observation": str, "thought": str, "action": "CLICK", "x": float, "y": float}',
                "action_format": ActionOutputFormat.JSON,
                "use_som": False,
                "use_a11y_tree": True,
                "include_reasoning": True,
                "react_config": react_config,
            },
            kwargs,
        )

    @classmethod
    def track_c(cls, **kwargs: Any) -> "TrackConfig":
        """Create Track C (Set-of-Mark) config.

        Uses numbered element labels instead of coordinates.
        Based on SoM paper and OMNI-parser patterns.

        Args:
            **kwargs: Field overrides applied on top of the preset. A missing
                or falsy ``som_config`` is replaced by ``SoMConfig()``.
        """
        som_config = kwargs.pop("som_config", None) or SoMConfig()
        return cls._from_preset(
            {
                "track_type": TrackType.TRACK_C,
                "output_format": '{"action": "CLICK", "element_id": int}',
                "action_format": ActionOutputFormat.JSON,
                "use_som": True,
                "som_config": som_config,
                "use_a11y_tree": True,
                "include_reasoning": False,
            },
            kwargs,
        )

    @classmethod
    def osworld_compatible(cls, **kwargs: Any) -> "TrackConfig":
        """Create OSWorld-compatible config.

        Uses PyAutoGUI-style action format for OSWorld benchmark.

        Args:
            **kwargs: Field overrides applied on top of the preset.
        """
        return cls._from_preset(
            {
                "track_type": TrackType.TRACK_A,
                "output_format": "pyautogui.click(x, y)",
                "action_format": ActionOutputFormat.PYAUTOGUI,
                "use_som": False,
                "use_a11y_tree": True,
                "include_reasoning": False,
            },
            kwargs,
        )

    @classmethod
    def ufo_compatible(cls, **kwargs: Any) -> "TrackConfig":
        """Create UFO-compatible config.

        Uses UFO's AppAgent output format with observation/thought/plan.

        Args:
            **kwargs: Field overrides applied on top of the preset. A missing
                or falsy ``react_config`` defaults to a ReActConfig that
                requires observation, thought and plan; a missing or falsy
                ``som_config`` defaults to ``SoMConfig()``.
        """
        react_config = kwargs.pop("react_config", None) or ReActConfig(
            require_observation=True,
            require_thought=True,
            require_plan=True,
        )
        # Popped like in track_c so a caller-supplied som_config no longer
        # collides with the preset value.
        som_config = kwargs.pop("som_config", None) or SoMConfig()
        return cls._from_preset(
            {
                "track_type": TrackType.TRACK_B,
                "output_format": '{"Observation": str, "Thought": str, "ControlLabel": int, "Function": str, "Args": list}',
                "action_format": ActionOutputFormat.JSON,
                "use_som": True,
                "som_config": som_config,
                "use_a11y_tree": True,
                "include_reasoning": True,
                "react_config": react_config,
            },
            kwargs,
        )


Expand Down
Loading