Skip to content

Commit 28c9287

Browse files
committed
fix: add support for claude 3.7
1 parent fdfadfb commit 28c9287

File tree

5 files changed

+309
-31
lines changed

5 files changed

+309
-31
lines changed

demo.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212

1313

1414
load_dotenv()
15+
MODEL = "claude-3-7-sonnet-20250219"
16+
17+
model_to_beta = {
18+
"claude-3-7-sonnet-20250219": "20250124",
19+
"claude-3-5-sonnet-20241022": "20241022",
20+
}
1521

1622
anthropic_client = Anthropic()
1723
invariant_client = InvariantClient() if "INVARIANT_API_KEY" in os.environ else None
@@ -24,9 +30,11 @@ async def run(playwright: Playwright, prompt: str):
2430
page = await context.new_page()
2531
await page.set_viewport_size({"width": 1024, "height": 768}) # Computer-use default
2632
await page.goto("https://www.google.com")
27-
playwright_tools = PlaywrightToolbox(page, use_cursor=True)
33+
playwright_tools = PlaywrightToolbox(
34+
page, use_cursor=True, beta_version=model_to_beta[MODEL]
35+
)
2836
messages = await sampling_loop(
29-
model="claude-3-5-sonnet-20241022",
37+
model=MODEL,
3038
anthropic_client=anthropic_client,
3139
messages=[{"role": "user", "content": prompt}],
3240
tools=playwright_tools,

loop.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@
2929

3030
from playwright_computer_use.async_api import PlaywrightToolbox, ToolResult
3131

32-
COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22"
32+
COMPUTER_USE_BETA_FLAG = {
33+
"20241022": "computer-use-2024-10-22",
34+
"20250124": "computer-use-2025-01-24",
35+
}
3336
PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31"
3437

3538

@@ -82,7 +85,7 @@ async def sampling_loop(
8285
)
8386
while True:
8487
enable_prompt_caching = False
85-
betas = [COMPUTER_USE_BETA_FLAG]
88+
betas = [COMPUTER_USE_BETA_FLAG[tools.beta_version]]
8689
image_truncation_threshold = only_n_most_recent_images or 0
8790

8891
if enable_prompt_caching:

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ dependencies = [
2626
"playwright",
2727
"Pillow",
2828
"invariant-sdk",
29-
"python-dotenv",
3029
]
3130
[project.urls]
3231
Homepage = "https://github.com/invariantlabs-ai/playwright-computer-use"

src/playwright_computer_use/async_api.py

Lines changed: 152 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,25 @@
22

33
import importlib.resources
44
import base64
5-
from enum import StrEnum
6-
from typing import Literal, TypedDict
5+
from typing import Literal, TypedDict, get_args, Type, cast
76
from playwright.async_api import Page
7+
from asyncio import sleep
88
from PIL import Image
99
import io
1010
from anthropic.types.beta import (
1111
BetaToolComputerUse20241022Param,
12+
BetaToolComputerUse20250124Param,
1213
BetaToolParam,
1314
BetaToolResultBlockParam,
1415
BetaTextBlockParam,
1516
BetaImageBlockParam,
1617
)
1718
from dataclasses import dataclass
1819

20+
TYPING_DELAY_MS = 12
1921
TYPING_GROUP_SIZE = 50
2022

21-
Action = Literal[
23+
Action_20241022 = Literal[
2224
"key",
2325
"type",
2426
"mouse_move",
@@ -31,6 +33,20 @@
3133
"cursor_position",
3234
]
3335

36+
Action_20250124 = (
37+
Action_20241022
38+
| Literal[
39+
"left_mouse_down",
40+
"left_mouse_up",
41+
"scroll",
42+
"hold_key",
43+
"wait",
44+
"triple_click",
45+
]
46+
)
47+
48+
ScrollDirection = Literal["up", "down", "left", "right"]
49+
3450

3551
class ComputerToolOptions(TypedDict):
3652
"""Options for the computer tool."""
@@ -71,20 +87,27 @@ def __init__(
7187
use_cursor: bool = True,
7288
screenshot_wait_until: Literal["load", "domcontentloaded", "networkidle"]
7389
| None = None,
90+
beta_version: Literal["20241022", "20250124"] = "20250124",
7491
):
7592
"""Create a new PlaywrightToolbox.
7693
7794
Args:
7895
page: The Async Playwright page to interact with.
7996
use_cursor: Whether to display the cursor in the screenshots or not.
8097
screenshot_wait_until: Optional, wait until the page is in a specific state before taking a screenshot. Default does not wait
81-
98+
beta_version: The version of the beta to use. Default is the latest version (Claude3.7)
8299
"""
83100
self.page = page
101+
self.beta_version = beta_version
102+
computer_tool_map: dict[str, Type[BasePlaywrightComputerTool]] = {
103+
"20241022": PlaywrightComputerTool20241022,
104+
"20250124": PlaywrightComputerTool20250124,
105+
}
106+
ComputerTool = computer_tool_map[beta_version]
84107
self.tools: list[
85-
PlaywrightComputerTool | PlaywrightSetURLTool | PlaywrightBackTool
108+
BasePlaywrightComputerTool | PlaywrightSetURLTool | PlaywrightBackTool
86109
] = [
87-
PlaywrightComputerTool(
110+
ComputerTool(
88111
page, use_cursor=use_cursor, screenshot_wait_until=screenshot_wait_until
89112
),
90113
PlaywrightSetURLTool(page),
@@ -181,11 +204,10 @@ async def __call__(self):
181204
return ToolResult(error=str(e))
182205

183206

184-
class PlaywrightComputerTool:
207+
class BasePlaywrightComputerTool:
185208
"""A tool that allows the agent to interact with Async Playwright Page."""
186209

187210
name: Literal["computer"] = "computer"
188-
api_type: Literal["computer_20241022"] = "computer_20241022"
189211

190212
@property
191213
def width(self) -> int:
@@ -206,9 +228,9 @@ def options(self) -> ComputerToolOptions:
206228
"display_number": 1, # hardcoded
207229
}
208230

209-
def to_params(self) -> BetaToolComputerUse20241022Param:
231+
def to_params(self):
210232
"""Params describing the tool. Used by Claude to understand this is a computer use tool."""
211-
return {"name": self.name, "type": self.api_type, **self.options}
233+
raise NotImplementedError("to_params must be implemented in the subclass")
212234

213235
def __init__(
214236
self,
@@ -233,7 +255,7 @@ def __init__(
233255
async def __call__(
234256
self,
235257
*,
236-
action: Action,
258+
action: Action_20241022,
237259
text: str | None = None,
238260
coordinate: tuple[int, int] | None = None,
239261
**kwargs,
@@ -252,7 +274,7 @@ async def __call__(
252274
x, y = coordinate
253275

254276
if action == "mouse_move":
255-
action = await self.page.mouse.move(x, y)
277+
await self.page.mouse.move(x, y)
256278
self.mouse_position = (x, y)
257279
return ToolResult(output=None, error=None, base64_image=None)
258280
elif action == "left_click_drag":
@@ -337,6 +359,124 @@ async def press_key(self, key: str):
337359
await self.page.keyboard.up(shift)
338360

339361

362+
class PlaywrightComputerTool20241022(BasePlaywrightComputerTool):
    """Computer-use tool backed by Playwright, advertised as ``computer_20241022``."""

    api_type: Literal["computer_20241022"] = "computer_20241022"

    def to_params(self) -> BetaToolComputerUse20241022Param:
        """Return the tool description sent to Claude (name, API type, display options)."""
        params = {"name": self.name, "type": self.api_type}
        # Display options are merged last, exactly as the ``**self.options``
        # spread did, so they win on any key collision.
        params.update(self.options)
        return params
370+
371+
372+
class PlaywrightComputerTool20250124(BasePlaywrightComputerTool):
    """Tool to interact with the computer using Playwright (Beta 24/01/2025).

    Extends the 2024-10-22 action set with ``left_mouse_down``,
    ``left_mouse_up``, ``scroll``, ``hold_key``, ``wait`` and ``triple_click``.
    Any action not handled here is delegated to the base implementation.
    """

    api_type: Literal["computer_20250124"] = "computer_20250124"

    # Pixels sent to the mouse wheel per unit of ``scroll_amount``.
    _SCROLL_STEP_PX = 100

    def to_params(self) -> BetaToolComputerUse20250124Param:
        """Params describing the tool. Used by Claude to understand this is a computer use tool."""
        return {"name": self.name, "type": self.api_type, **self.options}

    async def __call__(
        self,
        *,
        action: Action_20250124,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        scroll_direction: ScrollDirection | None = None,
        scroll_amount: int | None = None,
        duration: int | float | None = None,
        key: str | None = None,
        **kwargs,
    ):
        """Run an action.

        Args:
            action: The action to perform.
            text: Key name for ``hold_key``; rejected for click actions.
            coordinate: Optional ``(x, y)`` target. The mouse is moved there
                before scrolling or clicking; rejected for
                ``left_mouse_down`` / ``left_mouse_up``.
            scroll_direction: One of 'up', 'down', 'left', 'right' (``scroll`` only).
            scroll_amount: Non-negative number of scroll steps (``scroll`` only).
            duration: Seconds to hold the key (``hold_key``) or to sleep
                (``wait``); must be within ``[0, 100]``.
            key: Optional modifier key held down while clicking.
            **kwargs: Forwarded to the base implementation for legacy actions.

        Returns:
            A ``ToolResult``; ``wait`` returns a screenshot result.

        Raises:
            ToolError: If a parameter is missing, invalid, or not accepted
                for the requested action.
        """
        if action in ("left_mouse_down", "left_mouse_up"):
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action=}.")
            if action == "left_mouse_down":
                await self.page.mouse.down()
            else:
                await self.page.mouse.up()
            return ToolResult()

        if action == "scroll":
            if scroll_direction is None or scroll_direction not in get_args(
                ScrollDirection
            ):
                raise ToolError(
                    f"{scroll_direction=} must be 'up', 'down', 'left', or 'right'"
                )
            if not isinstance(scroll_amount, int) or scroll_amount < 0:
                raise ToolError(f"{scroll_amount=} must be a non-negative int")
            if coordinate is not None:
                x, y = coordinate
                await self.page.mouse.move(x, y)
                self.mouse_position = (x, y)
            pixels = scroll_amount * self._SCROLL_STEP_PX
            # Playwright wheel deltas: positive delta_y scrolls down and
            # positive delta_x scrolls right, so "up"/"left" are negative.
            scroll_params = {
                "up": {"delta_x": 0, "delta_y": -pixels},
                "down": {"delta_x": 0, "delta_y": pixels},
                "left": {"delta_x": -pixels, "delta_y": 0},
                "right": {"delta_x": pixels, "delta_y": 0},
            }[scroll_direction]

            await self.page.mouse.wheel(**scroll_params)
            return ToolResult()

        if action in ("hold_key", "wait"):
            if duration is None or not isinstance(duration, (int, float)):
                raise ToolError(f"{duration=} must be a number")
            if duration < 0:
                raise ToolError(f"{duration=} must be non-negative")
            if duration > 100:
                raise ToolError(f"{duration=} is too long.")

            if action == "hold_key":
                if text is None:
                    raise ToolError(f"text is required for {action}")
                # ``duration`` is in seconds (same unit ``wait`` sleeps for),
                # while Playwright's ``delay`` is in milliseconds.
                await self.page.keyboard.press(
                    to_playwright_key(text), delay=duration * 1000
                )
                return ToolResult()

            # action == "wait"
            await sleep(duration)
            return await self.screenshot()

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "triple_click",
            "middle_click",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                x, y = coordinate
                await self.page.mouse.move(x, y)
                self.mouse_position = (x, y)

            click_arg = {
                "left_click": {"button": "left", "click_count": 1},
                "right_click": {"button": "right", "click_count": 1},
                "middle_click": {"button": "middle", "click_count": 1},
                "double_click": {"button": "left", "click_count": 2, "delay": 10},
                "triple_click": {"button": "left", "click_count": 3, "delay": 10},
            }[action]

            modifier = to_playwright_key(key) if key else None
            if modifier:
                await self.page.keyboard.down(modifier)
            try:
                await self.page.mouse.click(
                    self.mouse_position[0], self.mouse_position[1], **click_arg
                )
            finally:
                # Always release the modifier so a failed click cannot leave
                # the key stuck down for subsequent actions.
                if modifier:
                    await self.page.keyboard.up(modifier)

            return ToolResult()

        # Everything else belongs to the 2024-10-22 action set.
        action = cast(Action_20241022, action)
        return await super().__call__(
            action=action, text=text, coordinate=coordinate, key=key, **kwargs
        )
478+
479+
340480
def to_playwright_key(key: str) -> str:
341481
"""Convert a key to the Playwright key format."""
342482
valid_keys = (

0 commit comments

Comments
 (0)