2
2
3
3
import importlib .resources
4
4
import base64
5
- from enum import StrEnum
6
- from typing import Literal , TypedDict
5
+ from typing import Literal , TypedDict , get_args , Type , cast
7
6
from playwright .async_api import Page
7
+ from asyncio import sleep
8
8
from PIL import Image
9
9
import io
10
10
from anthropic .types .beta import (
11
11
BetaToolComputerUse20241022Param ,
12
+ BetaToolComputerUse20250124Param ,
12
13
BetaToolParam ,
13
14
BetaToolResultBlockParam ,
14
15
BetaTextBlockParam ,
15
16
BetaImageBlockParam ,
16
17
)
17
18
from dataclasses import dataclass
18
19
20
+ TYPING_DELAY_MS = 12
19
21
TYPING_GROUP_SIZE = 50
20
22
21
- Action = Literal [
23
+ Action_20241022 = Literal [
22
24
"key" ,
23
25
"type" ,
24
26
"mouse_move" ,
31
33
"cursor_position" ,
32
34
]
33
35
36
+ Action_20250124 = (
37
+ Action_20241022
38
+ | Literal [
39
+ "left_mouse_down" ,
40
+ "left_mouse_up" ,
41
+ "scroll" ,
42
+ "hold_key" ,
43
+ "wait" ,
44
+ "triple_click" ,
45
+ ]
46
+ )
47
+
48
+ ScrollDirection = Literal ["up" , "down" , "left" , "right" ]
49
+
34
50
35
51
class ComputerToolOptions (TypedDict ):
36
52
"""Options for the computer tool."""
@@ -71,20 +87,27 @@ def __init__(
71
87
use_cursor : bool = True ,
72
88
screenshot_wait_until : Literal ["load" , "domcontentloaded" , "networkidle" ]
73
89
| None = None ,
90
+ beta_version : Literal ["20241022" , "20250124" ] = "20250124" ,
74
91
):
75
92
"""Create a new PlaywrightToolbox.
76
93
77
94
Args:
78
95
page: The Async Playwright page to interact with.
79
96
use_cursor: Whether to display the cursor in the screenshots or not.
80
97
screenshot_wait_until: Optional, wait until the page is in a specific state before taking a screenshot. Default does not wait
81
-
98
+ beta_version: The version of the beta to use. Default is the latest version (Claude3.7)
82
99
"""
83
100
self .page = page
101
+ self .beta_version = beta_version
102
+ computer_tool_map : dict [str , Type [BasePlaywrightComputerTool ]] = {
103
+ "20241022" : PlaywrightComputerTool20241022 ,
104
+ "20250124" : PlaywrightComputerTool20250124 ,
105
+ }
106
+ ComputerTool = computer_tool_map [beta_version ]
84
107
self .tools : list [
85
- PlaywrightComputerTool | PlaywrightSetURLTool | PlaywrightBackTool
108
+ BasePlaywrightComputerTool | PlaywrightSetURLTool | PlaywrightBackTool
86
109
] = [
87
- PlaywrightComputerTool (
110
+ ComputerTool (
88
111
page , use_cursor = use_cursor , screenshot_wait_until = screenshot_wait_until
89
112
),
90
113
PlaywrightSetURLTool (page ),
@@ -181,11 +204,10 @@ async def __call__(self):
181
204
return ToolResult (error = str (e ))
182
205
183
206
184
- class PlaywrightComputerTool :
207
+ class BasePlaywrightComputerTool :
185
208
"""A tool that allows the agent to interact with Async Playwright Page."""
186
209
187
210
name : Literal ["computer" ] = "computer"
188
- api_type : Literal ["computer_20241022" ] = "computer_20241022"
189
211
190
212
@property
191
213
def width (self ) -> int :
@@ -206,9 +228,9 @@ def options(self) -> ComputerToolOptions:
206
228
"display_number" : 1 , # hardcoded
207
229
}
208
230
209
- def to_params (self ) -> BetaToolComputerUse20241022Param :
231
+ def to_params (self ):
210
232
"""Params describing the tool. Used by Claude to understand this is a computer use tool."""
211
- return { "name" : self . name , "type" : self . api_type , ** self . options }
233
+ raise NotImplementedError ( "to_params must be implemented in the subclass" )
212
234
213
235
def __init__ (
214
236
self ,
@@ -233,7 +255,7 @@ def __init__(
233
255
async def __call__ (
234
256
self ,
235
257
* ,
236
- action : Action ,
258
+ action : Action_20241022 ,
237
259
text : str | None = None ,
238
260
coordinate : tuple [int , int ] | None = None ,
239
261
** kwargs ,
@@ -252,7 +274,7 @@ async def __call__(
252
274
x , y = coordinate
253
275
254
276
if action == "mouse_move" :
255
- action = await self .page .mouse .move (x , y )
277
+ await self .page .mouse .move (x , y )
256
278
self .mouse_position = (x , y )
257
279
return ToolResult (output = None , error = None , base64_image = None )
258
280
elif action == "left_click_drag" :
@@ -337,6 +359,124 @@ async def press_key(self, key: str):
337
359
await self .page .keyboard .up (shift )
338
360
339
361
362
class PlaywrightComputerTool20241022(BasePlaywrightComputerTool):
    """Playwright-backed computer tool for the 22/10/2024 computer-use beta.

    All action handling is inherited from the base class; this subclass only
    supplies the API type identifier and the matching tool parameters.
    """

    api_type: Literal["computer_20241022"] = "computer_20241022"

    def to_params(self) -> BetaToolComputerUse20241022Param:
        """Return the tool description sent to Claude for this beta version.

        The mapping starts with the tool name and API type and is then
        extended with the display options exposed by ``self.options``.
        """
        params = {"name": self.name, "type": self.api_type}
        # Options are merged last, exactly like ``**self.options`` in a literal.
        params.update(self.options)
        return params
370
+
371
+
372
class PlaywrightComputerTool20250124(BasePlaywrightComputerTool):
    """Tool to interact with the computer using Playwright (Beta 24/01/2025).

    Extends the base tool with the actions added in this beta version:
    ``left_mouse_down``, ``left_mouse_up``, ``scroll``, ``hold_key``, ``wait``
    and ``triple_click``. Click actions are also handled here so they can be
    combined with a held ``key``. Any other action is delegated to the base
    class implementation.
    """

    api_type: Literal["computer_20250124"] = "computer_20250124"

    def to_params(self) -> BetaToolComputerUse20250124Param:
        """Params describing the tool. Used by Claude to understand this is a computer use tool."""
        return {"name": self.name, "type": self.api_type, **self.options}

    async def __call__(
        self,
        *,
        action: Action_20250124,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        scroll_direction: ScrollDirection | None = None,
        scroll_amount: int | None = None,
        duration: int | float | None = None,
        key: str | None = None,
        **kwargs,
    ):
        """Run an action.

        Args:
            action: The action to perform.
            text: Key to hold for ``hold_key``; rejected for click actions.
            coordinate: Optional ``(x, y)`` target. The mouse is moved there
                before scrolling or clicking. Rejected for
                ``left_mouse_down`` / ``left_mouse_up``.
            scroll_direction: Direction for ``scroll``.
            scroll_amount: Non-negative number of scroll steps; each step is
                sent to Playwright as a wheel delta of 100.
            duration: Duration for ``hold_key`` / ``wait``, between 0 and 100.
                It is passed to ``asyncio.sleep`` for ``wait`` and therefore
                interpreted as seconds.
            key: Optional key held down while a click action is performed.
            **kwargs: Forwarded to the base implementation for actions that
                are not handled here.

        Returns:
            A ``ToolResult``; for ``wait`` the result of ``self.screenshot()``.

        Raises:
            ToolError: If a parameter is missing, unexpected, or out of range
                for the requested action.
        """
        if action in ("left_mouse_down", "left_mouse_up"):
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action=}.")
            if action == "left_mouse_down":
                await self.page.mouse.down()
            else:
                await self.page.mouse.up()
            return ToolResult()

        if action == "scroll":
            if scroll_direction not in get_args(ScrollDirection):
                raise ToolError(
                    f"{scroll_direction=} must be 'up', 'down', 'left', or 'right'"
                )
            if not isinstance(scroll_amount, int) or scroll_amount < 0:
                raise ToolError(f"{scroll_amount=} must be a non-negative int")
            if coordinate is not None:
                x, y = coordinate
                await self.page.mouse.move(x, y)
                self.mouse_position = (x, y)

            wheel_delta = scroll_amount * 100
            # Positive deltas scroll down / right, negative deltas scroll
            # up / left (the original mapping had left/right swapped).
            scroll_params = {
                "up": {"delta_x": 0, "delta_y": -wheel_delta},
                "down": {"delta_x": 0, "delta_y": wheel_delta},
                "left": {"delta_x": -wheel_delta, "delta_y": 0},
                "right": {"delta_x": wheel_delta, "delta_y": 0},
            }[scroll_direction]

            await self.page.mouse.wheel(**scroll_params)
            return ToolResult()

        if action in ("hold_key", "wait"):
            if duration is None or not isinstance(duration, (int, float)):
                raise ToolError(f"{duration=} must be a number")
            if duration < 0:
                raise ToolError(f"{duration=} must be non-negative")
            if duration > 100:
                raise ToolError(f"{duration=} is too long.")

            if action == "hold_key":
                if text is None:
                    raise ToolError(f"text is required for {action}")
                # Playwright's ``delay`` is expressed in milliseconds while
                # ``duration`` is in seconds (same unit as the ``wait`` branch).
                await self.page.keyboard.press(
                    to_playwright_key(text), delay=duration * 1000
                )
                return ToolResult()

            # action == "wait"
            await sleep(duration)
            return await self.screenshot()

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "triple_click",
            "middle_click",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                x, y = coordinate
                await self.page.mouse.move(x, y)
                self.mouse_position = (x, y)

            click_arg = {
                "left_click": {"button": "left", "click_count": 1},
                "right_click": {"button": "right", "click_count": 1},
                "middle_click": {"button": "middle", "click_count": 1},
                "double_click": {"button": "left", "click_count": 2, "delay": 10},
                "triple_click": {"button": "left", "click_count": 3, "delay": 10},
            }[action]

            held_key = to_playwright_key(key) if key else None
            if held_key is not None:
                # Keyboard calls on the async page are coroutines and must be
                # awaited, otherwise the key is never actually pressed.
                await self.page.keyboard.down(held_key)
            try:
                await self.page.mouse.click(
                    self.mouse_position[0], self.mouse_position[1], **click_arg
                )
            finally:
                # Always release the key so a failed click cannot leave it stuck.
                if held_key is not None:
                    await self.page.keyboard.up(held_key)

            return ToolResult()

        # Everything else is an action the previous tool version already knows.
        action = cast(Action_20241022, action)
        return await super().__call__(
            action=action, text=text, coordinate=coordinate, key=key, **kwargs
        )
478
+
479
+
340
480
def to_playwright_key (key : str ) -> str :
341
481
"""Convert a key to the Playwright key format."""
342
482
valid_keys = (
0 commit comments