Skip to content

support scrolling #563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/plenty-ties-float.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

support scrolling in `act`
8 changes: 8 additions & 0 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,14 @@
{
"name": "extract_geniusee_2",
"categories": ["targeted_extract"]
},
{
"name": "scroll_50",
"categories": ["act"]
},
{
"name": "scroll_75",
"categories": ["act"]
}
]
}
48 changes: 48 additions & 0 deletions evals/tasks/scroll_50.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { initStagehand } from "@/evals/initStagehand";
import { EvalFunction } from "@/types/evals";

/**
 * Eval task: asks `act` to scroll 50% down https://aigrant.com/ and verifies
 * that the vertical centre of the viewport ends up within 200px of the
 * midpoint of the document.
 *
 * @param modelName - model identifier forwarded to `initStagehand`.
 * @param logger - eval logger; its collected logs are attached to the result.
 * @returns an object with `_success`, `logs`, `debugUrl` and `sessionUrl`;
 *   on failure it additionally carries a `message` describing the measured
 *   and expected positions.
 * @throws rethrows any error raised while navigating, acting or measuring;
 *   the Stagehand session is closed before the error propagates.
 */
export const scroll_50: EvalFunction = async ({ modelName, logger }) => {
  const { stagehand, initResponse } = await initStagehand({
    modelName,
    logger,
    domSettleTimeoutMs: 3000,
  });

  const { debugUrl, sessionUrl } = initResponse;

  // All page work happens inside try/finally so the browser session is
  // released even when goto/act/evaluate throws (previously it leaked).
  const measureScroll = async () => {
    try {
      await stagehand.page.goto("https://aigrant.com/");
      await stagehand.page.act({
        action: "Scroll 50% down the page",
        slowDomBasedAct: false,
      });

      // Fixed pause before measuring — presumably to let a smooth-scroll
      // animation finish; confirm against the act handler.
      await new Promise((resolve) => setTimeout(resolve, 5000));

      // Despite its name, `scrollTop` is the document-space position of the
      // viewport's vertical centre (scroll offset plus half the viewport).
      return await stagehand.page.evaluate(() => {
        return {
          scrollTop: window.scrollY + window.innerHeight / 2,
          scrollHeight: document.documentElement.scrollHeight,
        };
      });
    } finally {
      await stagehand.close();
    }
  };

  const scrollInfo = await measureScroll();

  // Accept anything within 200px of the document's midpoint.
  const halfwayScroll = scrollInfo.scrollHeight / 2;
  const halfwayReached = Math.abs(scrollInfo.scrollTop - halfwayScroll) <= 200;

  if (halfwayReached) {
    return {
      _success: true,
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  return {
    _success: false,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
    message: `Scroll position (${scrollInfo.scrollTop}px) is not halfway down the page (${halfwayScroll}px).`,
  };
};
49 changes: 49 additions & 0 deletions evals/tasks/scroll_75.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { initStagehand } from "@/evals/initStagehand";
import { EvalFunction } from "@/types/evals";

/**
 * Eval task: asks `act` to scroll 75% down https://aigrant.com/ and verifies
 * that the point three quarters of the way down the viewport ends up within
 * 200px of the point three quarters of the way down the document.
 *
 * @param modelName - model identifier forwarded to `initStagehand`.
 * @param logger - eval logger; its collected logs are attached to the result.
 * @returns an object with `_success`, `logs`, `debugUrl` and `sessionUrl`;
 *   on failure it additionally carries a `message` describing the measured
 *   and expected positions.
 * @throws rethrows any error raised while navigating, acting or measuring;
 *   the Stagehand session is closed before the error propagates.
 */
export const scroll_75: EvalFunction = async ({ modelName, logger }) => {
  const { stagehand, initResponse } = await initStagehand({
    modelName,
    logger,
    domSettleTimeoutMs: 3000,
  });

  const { debugUrl, sessionUrl } = initResponse;

  // All page work happens inside try/finally so the browser session is
  // released even when goto/act/evaluate throws (previously it leaked).
  const measureScroll = async () => {
    try {
      await stagehand.page.goto("https://aigrant.com/");
      await stagehand.page.act({
        action: "Scroll 75% down the page",
        slowDomBasedAct: false,
      });

      // Fixed pause before measuring — presumably to let a smooth-scroll
      // animation finish; confirm against the act handler.
      await new Promise((resolve) => setTimeout(resolve, 5000));

      // Despite its name, `scrollTop` is the document-space position of the
      // point 75% of the way down the viewport (scroll offset plus 0.75 of
      // the viewport height).
      return await stagehand.page.evaluate(() => {
        return {
          scrollTop: window.scrollY + window.innerHeight * 0.75,
          scrollHeight: document.documentElement.scrollHeight,
        };
      });
    } finally {
      await stagehand.close();
    }
  };

  const scrollInfo = await measureScroll();

  // Accept anything within 200px of the 75% mark of the document.
  const threeQuartersScroll = scrollInfo.scrollHeight * 0.75;
  const threeQuartersReached =
    Math.abs(scrollInfo.scrollTop - threeQuartersScroll) <= 200;

  if (threeQuartersReached) {
    return {
      _success: true,
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  return {
    _success: false,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
    message: `Scroll position (${scrollInfo.scrollTop}px) is not three quarters down the page (${threeQuartersScroll}px).`,
  };
};
2 changes: 2 additions & 0 deletions lib/dom/process.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
calculateViewportHeight,
canElementScroll,
getNodeFromXpath,
waitForDomSettle,
} from "./utils";
import { createStagehandContainer } from "./containerFactory";
import { StagehandContainer } from "./StagehandContainer";
Expand Down Expand Up @@ -518,6 +519,7 @@ export function getElementBoundingBoxes(xpath: string): Array<{
return boundingBoxes;
}

window.waitForDomSettle = waitForDomSettle;
window.processDom = processDom;
window.processAllOfDom = processAllOfDom;
window.storeDOM = storeDOM;
Expand Down
2 changes: 0 additions & 2 deletions lib/dom/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ export async function waitForDomSettle() {
});
}

window.waitForDomSettle = waitForDomSettle;

export function calculateViewportHeight() {
return Math.ceil(window.innerHeight * 0.75);
}
Expand Down
77 changes: 77 additions & 0 deletions lib/handlers/actHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
} from "@/types/stagehand";
import { SupportedPlaywrightAction } from "@/types/act";
import { buildActObservePrompt } from "../prompt";
import { getNodeFromXpath } from "@/lib/dom/utils";
/**
* NOTE: Vision support has been removed from this version of Stagehand.
* If useVision or verifierUseVision is set to true, a warning is logged and
Expand Down Expand Up @@ -448,6 +449,82 @@ export class StagehandActHandler {

throw new PlaywrightCommandException(e.message);
}
} else if (
method === "scrollTo" ||
method === "scroll" ||
method === "mouse.wheel"
) {
this.logger({
category: "action",
message: "scrolling element vertically to specified percentage",
level: 2,
auxiliary: {
xpath: { value: xpath, type: "string" },
coordinate: { value: JSON.stringify(args), type: "string" },
},
});

try {
const [yArg = "0%"] = args as string[];

await this.stagehandPage.page.evaluate(
({ xpath, yArg }) => {
function parsePercent(val: string): number {
const cleaned = val.trim().replace("%", "");
const num = parseFloat(cleaned);
return Number.isNaN(num) ? 0 : Math.max(0, Math.min(num, 100));
}

const elementNode = getNodeFromXpath(xpath);
if (!elementNode || elementNode.nodeType !== Node.ELEMENT_NODE) {
console.warn(`Could not locate element to scroll on.`);
return;
}

const element = elementNode as HTMLElement;
const yPct = parsePercent(yArg);

// Determine if <html> is actually the scrolled container
if (element.tagName.toLowerCase() === "html") {
// Scroll the entire page (window)
const scrollHeight = document.body.scrollHeight;
const viewportHeight = window.innerHeight;
const scrollTop = (scrollHeight - viewportHeight) * (yPct / 100);

window.scrollTo({
top: scrollTop,
left: window.scrollX,
behavior: "smooth",
});
} else {
// Otherwise, scroll the element itself
const scrollHeight = element.scrollHeight;
const clientHeight = element.clientHeight;
const scrollTop = (scrollHeight - clientHeight) * (yPct / 100);

element.scrollTo({
top: scrollTop,
left: element.scrollLeft,
behavior: "smooth",
});
}
},
{ xpath, yArg },
);
} catch (e) {
this.logger({
category: "action",
message: "error scrolling element vertically to percentage",
level: 1,
auxiliary: {
error: { value: (e as Error).message, type: "string" },
trace: { value: (e as Error).stack, type: "string" },
xpath: { value: xpath, type: "string" },
args: { value: JSON.stringify(args), type: "object" },
},
});
throw new PlaywrightCommandException((e as Error).message);
}
} else if (method === "fill" || method === "type") {
try {
await locator.fill("");
Expand Down
2 changes: 1 addition & 1 deletion lib/prompt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ export function buildActObservePrompt(
let instruction = `Find the most relevant element to perform an action on given the following action: ${action}.
Provide an action for this element such as ${supportedActions.join(", ")}, or any other playwright locator method. Remember that to users, buttons and links look the same in most cases.
If the action is completely unrelated to a potential action to be taken on the page, return an empty array.
ONLY return one action. If multiple actions are relevant, return the most relevant one.`;
ONLY return one action. If multiple actions are relevant, return the most relevant one. If the user is asking to scroll to a position on the page, e.g., 'halfway' or 0.75, etc, you must return the argument formatted as the correct percentage, e.g., '50%' or '75%', etc.`;

// Add variable names (not values) to the instruction if any
if (variables && Object.keys(variables).length > 0) {
Expand Down
1 change: 1 addition & 0 deletions types/act.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ export enum SupportedPlaywrightAction {
CLICK = "click",
FILL = "fill",
TYPE = "type",
SCROLL = "scrollTo",
}