browserbase · tkattkat · Sep 9, 2025 · Aug 8, 2025 · Aug 8, 2025 · Aug 8, 2025
diff --git a/.changeset/pink-snakes-sneeze.md b/.changeset/pink-snakes-sneeze.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Replace operator handler with base of new agent
diff --git a/.changeset/tired-cats-repeat.md b/.changeset/tired-cats-repeat.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+replace operator agent with scaffold for new stagehand agent
diff --git a/docs/basics/agent.mdx b/docs/basics/agent.mdx
@@ -26,7 +26,11 @@ agent.execute("apply for a job at browserbase")
 
 ## Using `agent()`
 
-Here is how you can use `agent()` to create an agent.
+There are two ways to create agents in Stagehand:
+
+### Computer Use Agents
+
+Use computer use agents with specialized models from OpenAI or Anthropic: 
 
 <CodeGroup>
 ```typescript TypeScript
@@ -54,6 +58,18 @@ await agent.execute("apply for a job at Browserbase")
 ```
 </CodeGroup>
 
+### Use Stagehand Agent with Any LLM
+
+Use the agent without specifying a provider to utilize any model or LLM provider:
+
+<Note>Non-CUA agents are currently only supported in TypeScript.</Note>
+
+```typescript TypeScript
+const agent = stagehand.agent();
+await agent.execute("apply for a job at Browserbase")
+```
+
+
 ## MCP Integrations
 
 Agents can be enhanced with external tools and services through MCP (Model Context Protocol) integrations. This allows your agent to access external APIs and data sources beyond just browser interactions.

diff --git a/evals/index.eval.ts b/evals/index.eval.ts
@@ -33,7 +33,7 @@ import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
 import OpenAI from "openai";
 import { initStagehand } from "./initStagehand";
 import { AgentProvider } from "@/lib/agent/AgentProvider";
-import { AISdkClient } from "@/examples/external_clients/aisdk";
+import { AISdkClient } from "@/lib/llm/aisdk";
 import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider";
 import { loadApiKeyFromEnv } from "@/lib/utils";
 import { LogLine } from "@/types/log";

diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts
@@ -114,6 +114,11 @@ export const initStagehand = async ({
       model: modelName,
       provider: modelName.startsWith("claude") ? "anthropic" : "openai",
     } as AgentConfig;
+  } else {
+    agentConfig = {
+      model: modelName,
+      executionModel: "google/gemini-2.5-flash",
+    } as AgentConfig;
   }
 
   const agent = stagehand.agent(agentConfig);

diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
@@ -106,7 +106,11 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
 
 const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
   ? process.env.EVAL_AGENT_MODELS.split(",")
-  : ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"];
+  : [
+      "computer-use-preview-2025-03-11",
+      "claude-sonnet-4-20250514",
+      "anthropic/claude-sonnet-4-20250514",
+    ];
 
 /**
  * getModelList:

diff --git a/evals/tasks/agent/sf_library_card.ts b/evals/tasks/agent/sf_library_card.ts
@@ -10,19 +10,15 @@ export const sf_library_card: EvalFunction = async ({
 }) => {
   try {
     await stagehand.page.goto("https://sflib1.sfpl.org/selfreg");
-
     const agentResult = await agent.execute({
-      instruction:
-        "Fill in the 'Residential Address' field with '166 Geary St'",
+      instruction: "Fill in the 'street Address' field with '166 Geary St'",
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 3,
     });
     logger.log(agentResult);
-
-    await stagehand.page.mouse.wheel(0, -1000);
     const evaluator = new Evaluator(stagehand);
     const result = await evaluator.ask({
       question:
-        "Does the page show the 'Residential Address' field filled with '166 Geary St'?",
+        "Does the page show the 'street Address' field filled with '166 Geary St'?",
     });
 
     if (result.evaluation !== "YES" && result.evaluation !== "NO") {

diff --git a/lib/agent/tools/act.ts b/lib/agent/tools/act.ts
@@ -0,0 +1,55 @@
+import { tool } from "ai";
+import { z } from "zod/v3";
+import { StagehandPage } from "../../StagehandPage";
+
+/**
+ * Builds the agent's `act` tool, which performs one natural-language action
+ * (click, type) on the page through `stagehandPage.page.act`.
+ *
+ * When the first attempt reports its action as "an iframe", the same action is
+ * retried once with `iframes: true` and the retry's outcome is returned.
+ *
+ * @param stagehandPage - Page wrapper whose `page.act` carries out the action.
+ * @param executionModel - Optional model name forwarded to `act` as
+ *   `modelName`; when omitted, the first attempt passes the action string alone.
+ * @returns A tool whose result is `{ success, action, isIframe }`, or
+ *   `{ success: false, error }` when `act` throws.
+ */
+export const createActTool = (
+  stagehandPage: StagehandPage,
+  executionModel?: string,
+) =>
+  tool({
+    description: "Perform an action on the page (click, type)",
+    parameters: z.object({
+      action: z.string()
+        .describe(`Describe what to click, or type within in a short, specific phrase that mentions the element type. 
+          Examples:
+          - "click the Login button"
+          - "click the language dropdown"
+          - type "John" into the first name input
+          - type "Doe" into the last name input`),
+    }),
+    execute: async ({ action }) => {
+      try {
+        const result = executionModel
+          ? await stagehandPage.page.act({ action, modelName: executionModel })
+          : await stagehandPage.page.act(action);
+
+        // "an iframe" is the marker this tool checks for to decide that the
+        // action has to be retried with iframe support switched on.
+        if (result.action !== "an iframe") {
+          return {
+            success: result.success,
+            action: result.action,
+            isIframe: false,
+          };
+        }
+
+        const fallback = await stagehandPage.page.act(
+          executionModel
+            ? { action, modelName: executionModel, iframes: true }
+            : { action, iframes: true },
+        );
+        return {
+          success: fallback.success,
+          action: fallback.action,
+          isIframe: true,
+        };
+      } catch (error) {
+        // A caught value is not guaranteed to be an Error (it may even be
+        // null), so narrow before reading `.message`; this mirrors the
+        // handling in the extract tool.
+        const message = error instanceof Error ? error.message : String(error);
+        return { success: false, error: message };
+      }
+    },
+  });
diff --git a/lib/agent/tools/ariaTree.ts b/lib/agent/tools/ariaTree.ts
@@ -0,0 +1,35 @@
+import { tool } from "ai";
+import { z } from "zod/v3";
+import { StagehandPage } from "../../StagehandPage";
+
+/**
+ * Builds the agent tool that returns the `page_text` produced by a
+ * no-argument `page.extract()` call, together with the current page URL.
+ *
+ * Content whose estimated token count exceeds the budget is cut off and a
+ * truncation notice is appended.
+ *
+ * @param stagehandPage - Page wrapper used to read the page text and URL.
+ * @returns A tool whose result is `{ content, pageUrl }`.
+ */
+export const createAriaTreeTool = (stagehandPage: StagehandPage) => {
+  // The limit is a token budget; tokens are estimated at four characters each.
+  const tokenBudget = 70000;
+  const charBudget = tokenBudget * 4;
+  const truncationNotice =
+    "\n\n[CONTENT TRUNCATED: Exceeded 70,000 token limit]";
+
+  return tool({
+    description:
+      "gets the accessibility (ARIA) tree from the current page. this is useful for understanding the page structure and accessibility features. it should provide full context of what is on the page",
+    parameters: z.object({}),
+    execute: async () => {
+      const extraction = await stagehandPage.page.extract();
+      const pageText = extraction.page_text;
+      const pageUrl = stagehandPage.page.url();
+
+      const approxTokens = Math.ceil(pageText.length / 4);
+      const content =
+        approxTokens > tokenBudget
+          ? pageText.substring(0, charBudget) + truncationNotice
+          : pageText;
+
+      return { content, pageUrl };
+    },
+    experimental_toToolResultContent: (result) => {
+      // The result may already be a plain string; otherwise use its content.
+      let treeText;
+      if (typeof result === "string") {
+        treeText = result;
+      } else {
+        treeText = result.content;
+      }
+      return [{ type: "text", text: `Accessibility Tree:\n${treeText}` }];
+    },
+  });
+};
diff --git a/lib/agent/tools/close.ts b/lib/agent/tools/close.ts
@@ -0,0 +1,16 @@
+import { tool } from "ai";
+import { z } from "zod/v3";
+
+/**
+ * Builds the agent's `close` tool, which the model calls to finish a task.
+ *
+ * The tool performs no page interaction: it echoes the model's summary and
+ * completion flag back with `success: true`.
+ *
+ * @returns A tool whose result is `{ success: true, reasoning, taskComplete }`.
+ */
+export const createCloseTool = () => {
+  const closeParameters = z.object({
+    reasoning: z.string().describe("Summary of what was accomplished"),
+    taskComplete: z
+      .boolean()
+      .describe("Whether the task was completed successfully"),
+  });
+
+  return tool({
+    description: "Complete the task and close",
+    parameters: closeParameters,
+    execute: async ({ reasoning, taskComplete }) => ({
+      success: true,
+      reasoning,
+      taskComplete,
+    }),
+  });
+};
diff --git a/lib/agent/tools/extract.ts b/lib/agent/tools/extract.ts
@@ -0,0 +1,104 @@
+import { tool } from "ai";
+import { z } from "zod/v3";
+import { StagehandPage } from "../../StagehandPage";
+import { LogLine } from "@/types/log";
+
+/**
+ * Turns a Zod schema written as source text
+ * (e.g. "z.object({ price: z.number() })") into a live Zod schema by
+ * evaluating it with `z` in scope.
+ *
+ * SECURITY: `schemaStr` is evaluated with the `Function` constructor, which
+ * runs it as arbitrary JavaScript. The string arrives through the extract
+ * tool's `schema` parameter, so it should be treated as untrusted input.
+ * NOTE(review): consider a declarative schema format (for example JSON
+ * Schema) that can be parsed without evaluating code.
+ *
+ * @param schemaStr - Source text of an expression expected to yield a Zod schema.
+ * @param logger - Optional sink that receives a log line when evaluation fails.
+ * @returns The evaluated schema, or `z.any()` when evaluation throws or the
+ *   expression does not produce a Zod schema.
+ */
+function evaluateZodSchema(
+  schemaStr: string,
+  logger?: (message: LogLine) => void,
+): z.ZodTypeAny {
+  try {
+    // `z` is passed as the only parameter so the evaluated expression can
+    // reference it.
+    const schemaFunction = new Function("z", `return ${schemaStr}`);
+    const evaluated: unknown = schemaFunction(z);
+
+    // A string such as "42" evaluates without throwing but is not a schema;
+    // route it through the same fallback as a failed evaluation.
+    if (!(evaluated instanceof z.ZodType)) {
+      throw new TypeError("schema string did not evaluate to a Zod schema");
+    }
+    return evaluated;
+  } catch (error) {
+    logger?.({
+      category: "extract",
+      message: `Failed to evaluate schema string, using z.any(): ${error}`,
+      level: 1,
+      auxiliary: {
+        error: {
+          // The entry is labelled as a string, so pass text, not the raw
+          // error object.
+          value: error instanceof Error ? error.message : String(error),
+          type: "string",
+        },
+      },
+    });
+    return z.any();
+  }
+}
+
+/**
+ * Builds the agent's `extract` tool, which pulls structured data from the
+ * current page according to a Zod schema supplied as source text.
+ *
+ * @param stagehandPage - Page wrapper whose `page.extract` performs the extraction.
+ * @param executionModel - Optional model name; forwarded as `modelName` only
+ *   when provided.
+ * @param logger - Optional sink handed to the schema evaluator for failure logs.
+ * @returns A tool whose result is `{ success: true, data, timestamp }`, or
+ *   `{ success: false, error, timestamp }` when extraction throws.
+ */
+export const createExtractTool = (
+  stagehandPage: StagehandPage,
+  executionModel?: string,
+  logger?: (message: LogLine) => void,
+) =>
+  tool({
+    description: `Extract structured data from the current page based on a provided schema.
+
+    USAGE GUIDELINES:
+    - Keep schemas MINIMAL - only include fields essential for the task
+    - IMPORANT: only use this if explicitly asked for structured output. In most scenarios, you should use the aria tree tool over this. 
+    - If you need to extract a link, make sure the type defintion follows the format of z.string().url()
+    EXAMPLES:
+    1. Extract a single value:
+       instruction: "extract the product price"
+       schema: "z.object({ price: z.number()})"
+
+    2. Extract multiple fields:
+       instruction: "extract product name and price"
+       schema: "z.object({ name: z.string(), price: z.number() })"
+
+    3. Extract arrays:
+       instruction: "extract all product names and prices"
+       schema: "z.object({ products: z.array(z.object({ name: z.string(), price: z.number() })) })"`,
+    parameters: z.object({
+      instruction: z
+        .string()
+        .describe(
+          "Clear instruction describing what data to extract from the page",
+        ),
+      schema: z
+        .string()
+        .describe(
+          'Zod schema as a string (e.g., "z.object({ price: z.number() })")',
+        ),
+    }),
+    execute: async ({ instruction, schema }) => {
+      try {
+        const parsedSchema = evaluateZodSchema(schema, logger);
+
+        // Non-object schemas are wrapped under a `result` key so that a
+        // ZodObject is always what gets passed on.
+        const objectSchema =
+          parsedSchema instanceof z.ZodObject
+            ? parsedSchema
+            : z.object({ result: parsedSchema });
+
+        // Leave `modelName` out entirely unless an execution model was given.
+        const modelOverride = executionModel
+          ? { modelName: executionModel }
+          : {};
+
+        const data = await stagehandPage.page.extract({
+          instruction,
+          schema: objectSchema,
+          ...modelOverride,
+        });
+
+        return { success: true, data, timestamp: Date.now() };
+      } catch (error) {
+        const reason = error instanceof Error ? error.message : String(error);
+        return {
+          success: false,
+          error: `Failed to extract data: ${reason}`,
+          timestamp: Date.now(),
+        };
+      }
+    },
+  });
diff --git a/lib/agent/tools/fillform.ts b/lib/agent/tools/fillform.ts
@@ -0,0 +1,71 @@
+import { tool } from "ai";
+import { z } from "zod/v3";
+import { StagehandPage } from "../../StagehandPage";
+
+/**
+ * Builds the agent's `fillForm` tool, which fills several inputs in one call:
+ * a single `observe` resolves all requested typing actions, then each
+ * observed result is executed in order with `act`.
+ *
+ * NOTE(review): only each field's `action` text is used to build the observe
+ * instruction; `value` is accepted but never read here, so the text to type
+ * has to appear in `action` as well.
+ *
+ * @param stagehandPage - Page wrapper providing `page.observe` and `page.act`.
+ * @param executionModel - Optional model name forwarded to `observe` as `modelName`.
+ * @returns A tool whose result is `{ success, actions }`, where `success` is
+ *   true only if every executed action reported success. When `observe` or
+ *   `act` throws, the result is `{ success: false, actions, error }` with the
+ *   actions completed before the failure.
+ */
+export const createFillFormTool = (
+  stagehandPage: StagehandPage,
+  executionModel?: string,
+) =>
+  tool({
+    description: `📝 FORM FILL - SPECIALIZED MULTI-FIELD INPUT TOOL
+
+     CRITICAL: Use this for ANY form with 2+ input fields (text inputs, textareas, etc.)
+
+    WHY THIS TOOL EXISTS:
+    • Forms are the #1 use case for multi-field input
+    • Optimized specifically for input/textarea elements
+    • 4-6x faster than individual typing actions
+
+    Use fillForm: Pure form filling (inputs, textareas only)
+
+
+    MANDATORY USE CASES (always use fillForm for these):
+    Registration forms: name, email, password fields
+    Contact forms: name, email, message fields  
+    Checkout forms: address, payment info fields
+    Profile updates: multiple user data fields
+    Search filters: multiple criteria inputs
+
+
+
+    PARAMETER DETAILS:
+    • fields: Array of { action, value } objects.
+      – action: short description of where to type (e.g. "type 'john@example.com' into the email input").
+      – value: the exact text to enter.
+ `,
+    parameters: z.object({
+      fields: z
+        .array(
+          z.object({
+            action: z
+              .string()
+              .describe(
+                'Description of the typing action, e.g. "type foo into the bar field"',
+              ),
+            value: z.string().describe("Text to type into the target field"),
+          }),
+        )
+        .min(1, "Provide at least one field to fill"),
+    }),
+
+    execute: async ({ fields }) => {
+      // Declared outside the try so a failure part-way through can still
+      // report the actions that did complete.
+      const completedActions = [];
+      try {
+        const instruction = `Return observation results for the following actions: ${fields
+          .map((field) => field.action)
+          .join(", ")}`;
+
+        const observeResults = executionModel
+          ? await stagehandPage.page.observe({
+              instruction,
+              modelName: executionModel,
+            })
+          : await stagehandPage.page.observe(instruction);
+
+        // Run sequentially: each action is awaited before the next starts.
+        for (const result of observeResults) {
+          const action = await stagehandPage.page.act(result);
+          completedActions.push(action);
+        }
+
+        // Report overall success from the individual outcomes, not as an
+        // unconditional `true`.
+        return {
+          success: completedActions.every((completed) => completed.success),
+          actions: completedActions,
+        };
+      } catch (error) {
+        // Return a structured failure like the act and extract tools do, so
+        // the exception does not escape the tool call.
+        const message = error instanceof Error ? error.message : String(error);
+        return { success: false, actions: completedActions, error: message };
+      }
+    },
+  });