OpenAdaptAI · abrichr · Jan 17, 2026 · Jan 17, 2026
diff --git a/openadapt_ml/baselines/__init__.py b/openadapt_ml/baselines/__init__.py
@@ -7,6 +7,12 @@
 - Track B: ReAct-style reasoning with coordinates
 - Track C: Set-of-Mark element selection
 
+Based on state-of-the-art (SOTA) patterns from:
+- Claude Computer Use (Anthropic)
+- Microsoft UFO/UFO2
+- OSWorld benchmark
+- Agent-S/Agent-S2 (Simular AI)
+
 Usage:
     from openadapt_ml.baselines import UnifiedBaselineAdapter, BaselineConfig, TrackConfig
 
@@ -21,35 +27,95 @@
         track=TrackConfig.track_c(),
     )
     adapter = UnifiedBaselineAdapter(config)
+
+    # OSWorld-compatible configuration
+    config = BaselineConfig(
+        provider="openai",
+        model="gpt-5.2",
+        track=TrackConfig.osworld_compatible(),
+    )
+
+    # Parse responses directly
+    from openadapt_ml.baselines import UnifiedResponseParser, ElementRegistry
+
+    parser = UnifiedResponseParser()
+    action = parser.parse('{"action": "CLICK", "x": 0.5, "y": 0.3}')
+
+    # Resolve element IDs to coordinates through an element registry
+    registry = ElementRegistry.from_a11y_tree(tree)
+    parser = UnifiedResponseParser(element_registry=registry)
+    action = parser.parse_and_resolve('{"action": "CLICK", "element_id": 17}')
 """
 
 from openadapt_ml.baselines.adapter import UnifiedBaselineAdapter
 from openadapt_ml.baselines.config import (
+    # Enums
+    ActionOutputFormat,
+    CoordinateSystem,
+    TrackType,
+    # Config dataclasses
     BaselineConfig,
     ModelSpec,
+    ReActConfig,
+    ScreenConfig,
+    SoMConfig,
     TrackConfig,
-    TrackType,
+    # Registry
     MODELS,
-    get_model_spec,
+    # Helper functions
     get_default_model,
+    get_model_spec,
+)
+from openadapt_ml.baselines.parser import (
+    ElementRegistry,
+    ParsedAction,
+    UIElement,
+    UnifiedResponseParser,
+)
+from openadapt_ml.baselines.prompts import (
+    # System prompts
+    FORMAT_PROMPTS,
+    SYSTEM_PROMPT_OSWORLD,
+    SYSTEM_PROMPT_TRACK_A,
+    SYSTEM_PROMPT_TRACK_B,
+    SYSTEM_PROMPT_TRACK_C,
+    SYSTEM_PROMPT_UFO,
+    SYSTEM_PROMPTS,
+    # Builder class
+    PromptBuilder,
 )
-from openadapt_ml.baselines.parser import ParsedAction, UnifiedResponseParser
-from openadapt_ml.baselines.prompts import PromptBuilder
 
 __all__ = [
     # Main adapter
     "UnifiedBaselineAdapter",
-    # Configuration
-    "BaselineConfig",
-    "TrackConfig",
+    # Configuration - Enums
+    "ActionOutputFormat",
+    "CoordinateSystem",
     "TrackType",
+    # Configuration - Dataclasses
+    "BaselineConfig",
     "ModelSpec",
+    "ReActConfig",
+    "ScreenConfig",
+    "SoMConfig",
+    "TrackConfig",
+    # Configuration - Registry
     "MODELS",
-    "get_model_spec",
+    # Configuration - Functions
     "get_default_model",
+    "get_model_spec",
     # Parsing
+    "ElementRegistry",
     "ParsedAction",
+    "UIElement",
     "UnifiedResponseParser",
     # Prompts
+    "FORMAT_PROMPTS",
     "PromptBuilder",
+    "SYSTEM_PROMPT_OSWORLD",
+    "SYSTEM_PROMPT_TRACK_A",
+    "SYSTEM_PROMPT_TRACK_B",
+    "SYSTEM_PROMPT_TRACK_C",
+    "SYSTEM_PROMPT_UFO",
+    "SYSTEM_PROMPTS",
 ]
diff --git a/openadapt_ml/baselines/config.py b/openadapt_ml/baselines/config.py
@@ -1,6 +1,11 @@
 """Configuration for baseline adapters.
 
 Defines track types, model registry, and configuration dataclasses.
+Based on SOTA patterns from:
+- Claude Computer Use API
+- Microsoft UFO/UFO2
+- OSWorld benchmark
+- Agent-S/Agent-S2
 """
 
 from __future__ import annotations
@@ -23,61 +28,239 @@ class TrackType(str, Enum):
     TRACK_C = "set_of_mark"
 
 
+class CoordinateSystem(str, Enum):
+    """Coordinate system for action output; members are also plain ``str`` values.
+
+    NORMALIZED: Coordinates as fractions of the screen size (0.0-1.0)
+    PIXEL: Absolute pixel coordinates
+    PERCENTAGE: Coordinates as percentages (0-100)
+    """
+
+    NORMALIZED = "normalized"
+    PIXEL = "pixel"
+    PERCENTAGE = "percentage"
+
+
+class ActionOutputFormat(str, Enum):
+    """Output format style for model responses; members are also plain ``str`` values.
+
+    JSON: Structured JSON object
+    FUNCTION_CALL: Function-style text such as CLICK(x, y)
+    PYAUTOGUI: PyAutoGUI-style Python code such as pyautogui.click(x, y) (OSWorld compatible)
+    """
+
+    JSON = "json"
+    FUNCTION_CALL = "function_call"
+    PYAUTOGUI = "pyautogui"
+
+
+@dataclass
+class SoMConfig:
+    """Configuration for Set-of-Mark (SoM) overlay.
+
+    Controls which UI elements are labeled and how the labels are drawn.
+    Based on patterns from the SoM paper and OMNI-parser.
+
+    Attributes:
+        overlay_enabled: Whether to draw element overlays on the screenshot.
+        label_format: Template for element labels ("[{id}]", "{id}", "e{id}").
+        font_size: Font size for labels in pixels.
+        label_background_color: RGBA tuple for label background.
+        label_text_color: RGB tuple for label text.
+        max_elements: Maximum elements to include (0=unlimited).
+        include_roles: Element roles to include (None=all).
+        exclude_roles: Element roles to exclude (default: group, generic, static_text, separator).
+        min_element_area: Minimum bounding-box area, in square pixels, for an element to be included.
+        include_invisible: Whether to include non-visible elements.
+    """
+
+    overlay_enabled: bool = True
+    label_format: str = "[{id}]"  # e.g. "[1]"; the other documented forms give "1" and "e1"
+    font_size: int = 12
+    label_background_color: tuple[int, int, int, int] = (0, 120, 255, 200)  # Blue, alpha 200
+    label_text_color: tuple[int, int, int] = (255, 255, 255)  # White
+    max_elements: int = 100
+    include_roles: list[str] | None = None  # None = include all
+    exclude_roles: list[str] = field(
+        default_factory=lambda: ["group", "generic", "static_text", "separator"]
+    )  # default_factory builds a separate list for every instance
+    min_element_area: int = 100  # Minimum bbox area in square pixels
+    include_invisible: bool = False
+
+
+@dataclass
+class ReActConfig:
+    """Configuration for ReAct-style reasoning.
+
+    Controls the observation-thought-action cycle used in Track B.
+    Based on the ReAct paper and UFO's Observation->Thought->Action pattern.
+
+    Attributes:
+        require_observation: Whether to require an explicit observation.
+        require_thought: Whether to require a reasoning explanation.
+        require_plan: Whether to require a multi-step plan (off by default).
+        max_plan_steps: Maximum steps in plan output.
+        thinking_budget: Token budget for thinking (Claude extended thinking); defaults to None.
+    """
+
+    require_observation: bool = True
+    require_thought: bool = True
+    require_plan: bool = False
+    max_plan_steps: int = 5
+    thinking_budget: int | None = None  # For Claude extended thinking
+
+
+@dataclass
+class ScreenConfig:
+    """Screen/display configuration for coordinate handling.
+
+    Attributes:
+        width: Display width in pixels.
+        height: Display height in pixels.
+        coordinate_system: How coordinates are represented; not consulted by the helpers below.
+        scale_factor: DPI scale factor (1.0 = standard, 2.0 = retina); not applied by the helpers below.
+    """
+
+    width: int = 1920
+    height: int = 1080
+    coordinate_system: CoordinateSystem = CoordinateSystem.NORMALIZED
+    scale_factor: float = 1.0
+
+    def normalize_coords(self, x: float, y: float) -> tuple[float, float]:
+        """Convert pixel coordinates to normalized (0-1) by dividing by width and height."""
+        return (x / self.width, y / self.height)  # ZeroDivisionError if width or height is 0
+
+    def denormalize_coords(self, x: float, y: float) -> tuple[int, int]:
+        """Convert normalized coordinates to pixels, truncating toward zero via int()."""
+        return (int(x * self.width), int(y * self.height))  # NOTE(review): int() truncates, so 29 px of 100 can round-trip to 28
+
+
 @dataclass
 class TrackConfig:
     """Configuration for a specific evaluation track.
 
     Attributes:
         track_type: The track type (A, B, or C).
         output_format: Expected output format string.
+        action_format: Style of action output (JSON, function call, PyAutoGUI).
         use_som: Whether to use Set-of-Mark overlay.
+        som_config: SoM settings; None by default, set by track_c() and ufo_compatible().
         use_a11y_tree: Whether to include accessibility tree.
         max_a11y_elements: Max elements in a11y tree (truncation).
         include_reasoning: Whether to request reasoning steps.
+        react_config: ReAct settings; None by default, set by track_b() and ufo_compatible().
         include_history: Whether to include action history.
         max_history_steps: Max history steps to include.
+        screen_config: Screen/coordinate configuration; a new ScreenConfig per instance by default.
+        verify_after_action: Request screenshot verification after actions (off by default).
     """
 
     track_type: TrackType
     output_format: str
+    action_format: ActionOutputFormat = ActionOutputFormat.JSON  # NOTE(review): added mid-signature, shifting later fields positionally; pass fields by keyword
     use_som: bool = False
+    som_config: SoMConfig | None = None
     use_a11y_tree: bool = True
     max_a11y_elements: int = 50
     include_reasoning: bool = False
+    react_config: ReActConfig | None = None
     include_history: bool = True
     max_history_steps: int = 5
+    screen_config: ScreenConfig = field(default_factory=ScreenConfig)
+    verify_after_action: bool = False  # Claude computer use best practice
 
     @classmethod
-    def track_a(cls) -> "TrackConfig":
-        """Create Track A (Direct Coordinates) config."""
+    def track_a(cls, **kwargs: Any) -> "TrackConfig":
+        """Create Track A (Direct Coordinates) config.
+
+        Simplest track: screenshot + goal -> coordinates, no reasoning or element IDs.
+        kwargs are forwarded to the constructor; repeating a field set below raises TypeError.
+        """
         return cls(
             track_type=TrackType.TRACK_A,
             output_format='{"action": "CLICK", "x": float, "y": float}',
+            action_format=ActionOutputFormat.JSON,
             use_som=False,
             use_a11y_tree=True,
             include_reasoning=False,
+            **kwargs,
         )
 
     @classmethod
-    def track_b(cls) -> "TrackConfig":
-        """Create Track B (ReAct with Coordinates) config."""
+    def track_b(cls, **kwargs: Any) -> "TrackConfig":
+        """Create Track B (ReAct with Coordinates) config.
+
+        Observation -> thought -> action cycle, after ReAct, UFO, and Claude thinking patterns.
+        A react_config kwarg (if not None) replaces the default ReActConfig(); other kwargs pass through.
+        """
+        react_config = kwargs.pop("react_config", None) or ReActConfig()
         return cls(
             track_type=TrackType.TRACK_B,
-            output_format='{"thought": str, "action": "CLICK", "x": float, "y": float}',
+            output_format='{"observation": str, "thought": str, "action": "CLICK", "x": float, "y": float}',
+            action_format=ActionOutputFormat.JSON,
             use_som=False,
             use_a11y_tree=True,
             include_reasoning=True,
+            react_config=react_config,
+            **kwargs,
         )
 
     @classmethod
-    def track_c(cls) -> "TrackConfig":
-        """Create Track C (Set-of-Mark) config."""
+    def track_c(cls, **kwargs: Any) -> "TrackConfig":
+        """Create Track C (Set-of-Mark) config.
+
+        Numbered element labels instead of coordinates, after the SoM paper and OMNI-parser.
+        A som_config kwarg (if not None) replaces the default SoMConfig(); other kwargs pass through.
+        """
+        som_config = kwargs.pop("som_config", None) or SoMConfig()
         return cls(
             track_type=TrackType.TRACK_C,
             output_format='{"action": "CLICK", "element_id": int}',
+            action_format=ActionOutputFormat.JSON,
             use_som=True,
+            som_config=som_config,
             use_a11y_tree=True,
             include_reasoning=False,
+            **kwargs,
+        )
+
+    @classmethod
+    def osworld_compatible(cls, **kwargs: Any) -> "TrackConfig":
+        """Create OSWorld-compatible config.
+
+        PyAutoGUI-style actions on a Track A config; kwargs are forwarded to the constructor.
+        """
+        return cls(
+            track_type=TrackType.TRACK_A,
+            output_format="pyautogui.click(x, y)",
+            action_format=ActionOutputFormat.PYAUTOGUI,
+            use_som=False,
+            use_a11y_tree=True,
+            include_reasoning=False,
+            **kwargs,
+        )
+
+    @classmethod
+    def ufo_compatible(cls, **kwargs: Any) -> "TrackConfig":
+        """Create UFO-compatible config (AppAgent-style observation/thought/plan output).
+
+        react_config/som_config kwargs replace the defaults; other kwargs go to the constructor.
+        """
+        react_config = kwargs.pop("react_config", None) or ReActConfig(
+            require_observation=True, require_thought=True, require_plan=True
+        )
+        # Pop som_config too; left in kwargs it would collide with the keyword below.
+        som_config = kwargs.pop("som_config", None) or SoMConfig()
+        return cls(
+            track_type=TrackType.TRACK_B,
+            output_format='{"Observation": str, "Thought": str, "ControlLabel": int, "Function": str, "Args": list}',
+            action_format=ActionOutputFormat.JSON,
+            use_som=True,
+            som_config=som_config,
+            use_a11y_tree=True,
+            include_reasoning=True,
+            react_config=react_config,
+            **kwargs,
         )