Skip to content

Tweaks #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
# Playwright-computer-use
# Playwright Computer Use

This Repo contains a Claude computer use tool that interacts with Playwright.
Easily use the Claude `computer` tool to let an agent interact with a web browser on your machine (playwright).

This repo contains the required code to connect a Playwright browser to Claude's computer use capabilities. This enables you to use a browser as a tool for your agent, to interact with web pages, and achieve tasks that require a browser.

## Demo
The Demo consists of the computer use agent by Claude, with access to a Playwright instance.
To run the demo:
* Clone the Repo:
## Quickstart

Clone the Repo
```
git clone https://github.com/invariantlabs-ai/playwright-computer-use.git
```
* setup a virtual environment and install requirements

Install the dependencies:
```
python -m venv venv
. venv/bin/activate
pip install .
cd playwright-computer-use
pip install -e .
```
* create a `.env` basing on `.env-example`
* run `python demo.py "How long does it take to travel from Zurich to Milan?"`

## Install
Create a `.env` basing on `.env-example` ([Anthropic Key](https://console.anthropic.com) and an optional [Invariant Key](https://explorer.invariantlabs.ai) for tracing). Then run:

```
python demo.py "How long does it take to travel from Zurich to Milan?"
```

This will spawn an agent on your machine that attempts to achieve whatever task you have in mind in the browser.

## Install As Package

```
pip install git://git@github.com/invariantlabs-ai/playwright-computer-use.git
```
## Use
You can now include `PlaywrightToolbox` as a tool for `Claude`. It would work as any other tool.

## Using the PlaywrightToolbox as a Library

You can also include the `PlaywrightToolbox` as a tool for `Claude`, to enable the use of a playwright browser in an existing agent.

```python
tools = tools = PlaywrightToolbox(page=page, use_cursor=True)

Expand Down
13 changes: 9 additions & 4 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@

async def run(playwright: Playwright, prompt: str):
"""Setup tools and run loop."""
browser = await playwright.firefox.launch(headless=False)
context = await browser.new_context()
browser = await playwright.chromium.launch(headless=False)
if os.path.exists("storage_state.json"):
context = await browser.new_context(storage_state="storage_state.json")
else:
context = await browser.new_context()
page = await context.new_page()
await page.set_viewport_size({"width": 1024, "height": 768}) # Computer-use default
await page.goto("https://www.google.com")
await page.goto("https://aitinkerers.org")
playwright_tools = PlaywrightToolbox(page, use_cursor=True)
messages = await sampling_loop(
model="claude-3-5-sonnet-20241022",
Expand All @@ -38,14 +41,16 @@ async def run(playwright: Playwright, prompt: str):
if invariant_client is not None:
response = invariant_client.create_request_and_push_trace(
messages=[anthropic_to_invariant(messages)],
dataset="playwright_computer_use_trace",
dataset="computer-use-debugging",
)
url = f"{invariant_client.api_url}/trace/{response.id[0]}"
print(f"View the trace at {url}")
else:
print(
"No INVARIANT_API_KEY found. Add it to your .env file to push the trace to Invariant explorer https://explorer.invariantlabs.ai."
)
await context.storage_state(path="storage_state.json")

await browser.close()


Expand Down
5 changes: 5 additions & 0 deletions loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. At the end always ask for a screenshot, to make sure the state of the page is as you expect.
* The current date is {datetime.today().strftime("%A, %B %-d, %Y")}.
</SYSTEM_CAPABILITY>

Try to solve CAPTCHAs, your user can't do it themselves as they are vision impaired.

You can scroll every single view that you are seeing. If content seems truncated or unavailable, try scrolling to see more.
"""


Expand Down Expand Up @@ -117,6 +121,7 @@ async def sampling_loop(
system=[system],
tools=tools.to_params(),
betas=betas,
temperature=0.3,
)
if verbose:
sys.stdout.write(
Expand Down
52 changes: 50 additions & 2 deletions src/playwright_computer_use/async_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def __init__(self, page: Page, use_cursor: bool = True):
PlaywrightComputerTool(page, use_cursor=use_cursor),
PlaywrightSetURLTool(page),
PlaywrightBackTool(page),
LogTool(),
]

def to_params(self) -> list[BetaToolParam]:
Expand Down Expand Up @@ -171,6 +172,40 @@ async def __call__(self):
return ToolResult(error=str(e))


# tool like the above, but it only prints a [LOG] message, and does not interact with the playwright page.
# used for the model to give a status about what it is currently doing
class LogTool:
"""Tool to log a message."""

name: Literal["log"] = "log"

def __init__(self):
"""Create a new LogTool."""
super().__init__()

def to_params(self) -> BetaToolParam:
"""Params describing the tool. Description used by Claude to understand how to this use tool."""
return BetaToolParam(
name=self.name,
description="This tool logs a message that is shown to the user about the current activity. Always use this tool before any action sequence. Before pressing any button or making a change beyond navigation, e.g. write a message like 'Clicking the Buy button'.",
input_schema={
"type": "object",
"properties": {
"message": {
"type": "string",
"description": "The message to log.",
}
},
"required": ["message"],
},
)

async def __call__(self, *, message: str):
"""Print the message."""
print(f"[LOG] {message}")
return ToolResult()


class PlaywrightComputerTool:
"""A tool that allows the agent to interact with Async Playwright Page."""

Expand Down Expand Up @@ -301,7 +336,7 @@ async def __call__(
async def screenshot(self) -> ToolResult:
"""Take a screenshot of the current screen and return the base64 encoded image."""
if self.screenshot_wait_until is not None:
await self.page.wait_for_timeout(self.screenshot_wait_until)
await self.page.wait_for_load_state(self.screenshot_wait_until)
await self.page.wait_for_load_state()
screenshot = await self.page.screenshot()
image = Image.open(io.BytesIO(screenshot))
Expand All @@ -322,7 +357,20 @@ async def press_key(self, key: str):
shifts += key.split("+")[:-1]
for shift in shifts:
await self.page.keyboard.down(shift)
await self.page.keyboard.press(to_playwright_key(key))

prkey = to_playwright_key(key)
# for PageDown and PageUp scroll in the page
if prkey == "PageDown":
await self.page.mouse.wheel(
delta_y=0.5 * self.page.viewport_size["height"], delta_x=0
)
elif prkey == "PageUp":
await self.page.mouse.wheel(
delta_y=-0.5 * self.page.viewport_size["height"], delta_x=0
)
else:
await self.page.keyboard.press(prkey)

for shift in shifts:
await self.page.keyboard.up(shift)

Expand Down