invariantlabs-ai · mmilanta · Jan 20, 2025 · Jan 20, 2025 · Jan 20, 2025 · Jan 20, 2025
diff --git a/README.md b/README.md
@@ -1,30 +1,40 @@
-# Playwright-computer-use
+# Playwright Computer Use
 
-This Repo contains a Claude computer use tool that interacts with Playwright.
+Easily use the Claude `computer` tool to let an agent interact with a web browser on your machine (playwright).
 
+This repo contains the required code to connect a Playwright browser to Claude's computer use capabilities. This enables you to use a browser as a tool for your agent, to interact with web pages, and achieve tasks that require a browser.
 
-## Demo
-The Demo consists of the computer use agent by Claude, with access to a Playwright instance.
-To run the demo:
-* Clone the Repo:
+## Quickstart
+
+Clone the Repo
 ```
 git clone https://github.com/invariantlabs-ai/playwright-computer-use.git
 ```
-* setup a virtual environment and install requirements
+
+Install the dependencies:
 ```
-python -m venv venv
-. venv/bin/activate
-pip install .
+cd playwright-computer-use
+pip install -e .
 ```
-* create a `.env` basing on `.env-example`
-* run `python demo.py "How long does it take to travel from Zurich to Milan?"`
 
-## Install
+Create a `.env` basing on `.env-example` ([Anthropic Key](https://console.anthropic.com) and an optional [Invariant Key](https://explorer.invariantlabs.ai) for tracing). Then run:
+
+```
+python demo.py "How long does it take to travel from Zurich to Milan?"
+```
+
+This will spawn an agent on your machine that attempts to achieve whatever task you have in mind in the browser.
+
+## Install As Package
+
 ```
 pip install git://git@github.com/invariantlabs-ai/playwright-computer-use.git
 ```
-## Use
-You can now include `PlaywrightToolbox` as a tool for `Claude`. It would work as any other tool.
+
+## Using the PlaywrightToolbox as a Library
+
+You can also include the `PlaywrightToolbox` as a tool for `Claude`, to enable the use of a playwright browser in an existing agent.
+
 ```python
 tools = tools = PlaywrightToolbox(page=page, use_cursor=True)
 

diff --git a/demo.py b/demo.py
@@ -19,11 +19,14 @@
 
 async def run(playwright: Playwright, prompt: str):
     """Setup tools and run loop."""
-    browser = await playwright.firefox.launch(headless=False)
-    context = await browser.new_context()
+    browser = await playwright.chromium.launch(headless=False)
+    if os.path.exists("storage_state.json"):
+        context = await browser.new_context(storage_state="storage_state.json")
+    else:
+        context = await browser.new_context()
     page = await context.new_page()
     await page.set_viewport_size({"width": 1024, "height": 768})  # Computer-use default
-    await page.goto("https://www.google.com")
+    await page.goto("https://aitinkerers.org")
     playwright_tools = PlaywrightToolbox(page, use_cursor=True)
     messages = await sampling_loop(
         model="claude-3-5-sonnet-20241022",
@@ -38,14 +41,16 @@ async def run(playwright: Playwright, prompt: str):
     if invariant_client is not None:
         response = invariant_client.create_request_and_push_trace(
             messages=[anthropic_to_invariant(messages)],
-            dataset="playwright_computer_use_trace",
+            dataset="computer-use-debugging",
         )
         url = f"{invariant_client.api_url}/trace/{response.id[0]}"
         print(f"View the trace at {url}")
     else:
         print(
             "No INVARIANT_API_KEY found. Add it to your .env file to push the trace to Invariant explorer https://explorer.invariantlabs.ai."
         )
+    await context.storage_state(path="storage_state.json")
+
     await browser.close()
 
 

diff --git a/loop.py b/loop.py
@@ -47,6 +47,10 @@
 * When using your computer function calls, they take a while to run and send back to you.  Where possible/feasible, try to chain multiple of these calls all into one function calls request. At the end always ask for a screenshot, to make sure the state of the page is as you expect.
 * The current date is {datetime.today().strftime("%A, %B %-d, %Y")}.
 </SYSTEM_CAPABILITY>
+
+Try to solve CAPTCHAs, your user can't do it themselves as they are vision impaired.
+
+You can scroll every single view that you are seeing. If content seems truncated or unavailable, try scrolling to see more.
 """
 
 
@@ -117,6 +121,7 @@ async def sampling_loop(
                 system=[system],
                 tools=tools.to_params(),
                 betas=betas,
+                temperature=0.3,
             )
             if verbose:
                 sys.stdout.write(

diff --git a/src/playwright_computer_use/async_api.py b/src/playwright_computer_use/async_api.py
@@ -79,6 +79,7 @@ def __init__(self, page: Page, use_cursor: bool = True):
             PlaywrightComputerTool(page, use_cursor=use_cursor),
             PlaywrightSetURLTool(page),
             PlaywrightBackTool(page),
+            LogTool(),
         ]
 
     def to_params(self) -> list[BetaToolParam]:
@@ -171,6 +172,40 @@ async def __call__(self):
             return ToolResult(error=str(e))
 
 
+# tool like the above, but it only prints a [LOG] message, and does not interact with the playwright page.
+# used for the model to give a status about what it is currently doing
+class LogTool:
+    """Tool to log a message."""
+
+    name: Literal["log"] = "log"
+
+    def __init__(self):
+        """Create a new LogTool."""
+        super().__init__()
+
+    def to_params(self) -> BetaToolParam:
+        """Params describing the tool. Description used by Claude to understand how to this use tool."""
+        return BetaToolParam(
+            name=self.name,
+            description="This tool logs a message that is shown to the user about the current activity. Always use this tool before any action sequence. Before pressing any button or making a change beyond navigation, e.g. write a message like 'Clicking the Buy button'.",
+            input_schema={
+                "type": "object",
+                "properties": {
+                    "message": {
+                        "type": "string",
+                        "description": "The message to log.",
+                    }
+                },
+                "required": ["message"],
+            },
+        )
+
+    async def __call__(self, *, message: str):
+        """Print the message."""
+        print(f"[LOG] {message}")
+        return ToolResult()
+
+
 class PlaywrightComputerTool:
     """A tool that allows the agent to interact with Async Playwright Page."""
 
@@ -301,7 +336,7 @@ async def __call__(
     async def screenshot(self) -> ToolResult:
         """Take a screenshot of the current screen and return the base64 encoded image."""
         if self.screenshot_wait_until is not None:
-            await self.page.wait_for_timeout(self.screenshot_wait_until)
+            await self.page.wait_for_load_state(self.screenshot_wait_until)
         await self.page.wait_for_load_state()
         screenshot = await self.page.screenshot()
         image = Image.open(io.BytesIO(screenshot))
@@ -322,7 +357,20 @@ async def press_key(self, key: str):
             shifts += key.split("+")[:-1]
         for shift in shifts:
             await self.page.keyboard.down(shift)
-        await self.page.keyboard.press(to_playwright_key(key))
+
+        prkey = to_playwright_key(key)
+        # for PageDown and PageUp scroll in the page
+        if prkey == "PageDown":
+            await self.page.mouse.wheel(
+                delta_y=0.5 * self.page.viewport_size["height"], delta_x=0
+            )
+        elif prkey == "PageUp":
+            await self.page.mouse.wheel(
+                delta_y=-0.5 * self.page.viewport_size["height"], delta_x=0
+            )
+        else:
+            await self.page.keyboard.press(prkey)
+
         for shift in shifts:
             await self.page.keyboard.up(shift)