From 10a3f7f787353af12bdb087f61853ad9d7e58ff5 Mon Sep 17 00:00:00 2001 From: veasion Date: Sat, 21 Dec 2024 15:06:32 +0800 Subject: [PATCH 1/3] open_url: first new window --- src/extension/tools/export_file.ts | 26 ++-------- src/extension/tools/open_url.ts | 48 ++++++++---------- src/extension/tools/tab_management.ts | 29 ++++++++++- src/extension/tools/web_search.ts | 4 +- src/extension/utils.ts | 70 +++++++++++++++++++++++++-- 5 files changed, 118 insertions(+), 59 deletions(-) diff --git a/src/extension/tools/export_file.ts b/src/extension/tools/export_file.ts index 84848b1..ae78618 100644 --- a/src/extension/tools/export_file.ts +++ b/src/extension/tools/export_file.ts @@ -1,5 +1,5 @@ import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; -import { getTabId, sleep, waitForTabComplete } from '../utils'; +import { getTabId, open_new_tab, sleep } from '../utils'; /** * Export file @@ -71,7 +71,7 @@ export class ExportFile implements Tool { if (!filename) { filename = new Date().getTime() + '.' + fileType; } else if (!(filename + '').endsWith(fileType)) { - filename += ('.' + fileType); + filename += '.' + fileType; } let tabId = await getTabId(context); try { @@ -81,7 +81,8 @@ export class ExportFile implements Tool { args: [filename, type, content], }); } catch (e) { - tabId = await newTabId(); + let tab = await open_new_tab('https://www.google.com', true); + tabId = tab.id as number; await chrome.scripting.executeScript({ target: { tabId: tabId as number }, func: exportFile, @@ -94,25 +95,6 @@ export class ExportFile implements Tool { } } -async function newTabId(): Promise { - let url = 'https://google.com'; - let window = await chrome.windows.create({ - type: 'normal', - state: 'maximized', - url: url, - } as any as chrome.windows.CreateData); - let windowId = window.id as number; - let tabs = window.tabs || [ - await chrome.tabs.create({ - url: url, - windowId: windowId, - }), - ]; - let tabId = tabs[0].id as number; - await waitForTabComplete(tabId); - return tabId; -} - function exportFile(filename: string, type: string, content: string) { const blob = new Blob([content], { type: type }); const link = document.createElement('a'); diff --git a/src/extension/tools/open_url.ts b/src/extension/tools/open_url.ts index e9b24de..c26a00e 100644 --- a/src/extension/tools/open_url.ts +++ b/src/extension/tools/open_url.ts @@ -1,5 +1,5 @@ import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; -import { getWindowId, waitForTabComplete } from '../utils'; +import { getWindowId, open_new_tab } from '../utils'; /** * Open Url @@ -31,7 +31,7 @@ export class OpenUrl implements Tool { /** * Open Url * - * @param {*} params { url: 'https://google.com', newWindow: true } + * @param {*} params { url: 'https://www.google.com', newWindow: true } * @returns > { tabId, windowId, title, success: true } */ async execute(context: ExecutionContext, params: unknown): Promise { @@ -39,38 +39,28 @@ export class OpenUrl implements Tool { throw new Error('Invalid parameters. Expected an object with a "url" property.'); } let { url, newWindow } = params as any; - let windowId: number; - let tabId: number; + if (!newWindow && !context.variables.get('windowId') && !context.variables.get('tabId')) { + // First mandatory opening of a new window + newWindow = true; + } + let tab: chrome.tabs.Tab; if (newWindow) { - let window = await chrome.windows.create({ - type: 'normal', - state: 'maximized', - url: url, - } as any as chrome.windows.CreateData); - windowId = window.id as number; - let tabs = window.tabs || [ - await chrome.tabs.create({ - url: url, - windowId: windowId, - }), - ]; - tabId = tabs[0].id as number; + tab = await open_new_tab(url, true); } else { - windowId = await getWindowId(context); - let tab = await chrome.tabs.create({ - url: url, - windowId: windowId, - }); - tabId = tab.id as number; + let windowId = await getWindowId(context); + tab = await open_new_tab(url, false, windowId); } - let tab = await waitForTabComplete(tabId); + let windowId = tab.windowId as number; + let tabId = tab.id as number; context.variables.set('windowId', windowId); context.variables.set('tabId', tabId); - let windowIds = context.variables.get('windowIds') as Array; - if (windowIds) { - windowIds.push(windowId); - } else { - context.variables.set('windowIds', [windowId] as Array); + if (newWindow) { + let windowIds = context.variables.get('windowIds') as Array; + if (windowIds) { + windowIds.push(windowId); + } else { + context.variables.set('windowIds', [windowId] as Array); + } } return { tabId, diff --git a/src/extension/tools/tab_management.ts b/src/extension/tools/tab_management.ts index 2e99662..9ee6e02 100644 --- a/src/extension/tools/tab_management.ts +++ b/src/extension/tools/tab_management.ts @@ -1,5 +1,5 @@ import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; -import { getTabId, getWindowId, sleep } from '../utils'; +import { getTabId, getWindowId, open_new_tab, sleep } from '../utils'; /** * Browser tab management @@ -21,7 +21,8 @@ export class TabManagement implements Tool { * \`tab_all\`: View all tabs and return the tabId and title. * \`current_tab\`: Get current tab information (tabId, url, title). * \`close_tab\`: Close the current tab. -* \`switch_tab [tabId]\`: Switch to the specified tab using tabId, eg: switch_tab 1000`, +* \`switch_tab [tabId]\`: Switch to the specified tab using tabId, eg: switch_tab 1000. +* \`new_tab [url]\`: Open a new tab window and open the URL, eg: new_tab https://www.google.com`, }, }, required: ['action'], @@ -84,6 +85,30 @@ export class TabManagement implements Tool { context.variables.set('tabId', tab.id); context.variables.set('windowId', tab.windowId); result = { tabId, windowId: tab.windowId, title: tab.title, url: tab.url }; + } else if (action.startsWith('new_tab')) { + let url = action.replace('new_tab', '').replace('[', '').replace(']', '').replace(/"/g, ''); + // First mandatory opening of a new window + let newWindow = !context.variables.get('windowId') && !context.variables.get('tabId'); + let tab: chrome.tabs.Tab; + if (newWindow) { + tab = await open_new_tab(url, true); + } else { + let windowId = await getWindowId(context); + tab = await open_new_tab(url, false, windowId); + } + let windowId = tab.windowId as number; + let tabId = tab.id as number; + context.variables.set('windowId', windowId); + context.variables.set('tabId', tabId); + if (newWindow) { + let windowIds = context.variables.get('windowIds') as Array; + if (windowIds) { + windowIds.push(windowId); + } else { + context.variables.set('windowIds', [windowId] as Array); + } + } + result = { tabId: tab.id, windowId: tab.windowId, title: tab.title, url: tab.url }; } return { result, diff --git a/src/extension/tools/web_search.ts b/src/extension/tools/web_search.ts index 8bf6255..3e72a79 100644 --- a/src/extension/tools/web_search.ts +++ b/src/extension/tools/web_search.ts @@ -31,7 +31,7 @@ export class WebSearch implements Tool { /** * search * - * @param {*} params { url: 'https://google.com', query: 'ai agent', maxResults: 5 } + * @param {*} params { url: 'https://www.google.com', query: 'ai agent', maxResults: 5 } * @returns > [{ title, url, content }] */ async execute(context: ExecutionContext, params: unknown): Promise { @@ -40,7 +40,7 @@ export class WebSearch implements Tool { } let { url, query, maxResults } = params as any; if (!url) { - url = 'https://google.com'; + url = 'https://www.google.com'; } let taskId = new Date().getTime() + ''; let searchs = [{ url: url as string, keyword: query as string }]; diff --git a/src/extension/utils.ts b/src/extension/utils.ts index af99c66..5c9ff7d 100644 --- a/src/extension/utils.ts +++ b/src/extension/utils.ts @@ -2,6 +2,23 @@ import { ExecutionContext } from '../types/action.types'; export async function getWindowId(context: ExecutionContext): Promise { let windowId = context.variables.get('windowId') as any; + if (windowId) { + try { + await chrome.windows.get(windowId); + } catch (e) { + windowId = null; + context.variables.delete('windowId'); + let tabId = context.variables.get('tabId') as any; + if (tabId) { + try { + let tab = await chrome.tabs.get(tabId); + windowId = tab.windowId; + } catch (e) { + context.variables.delete('tabId'); + } + } + } + } if (!windowId) { const window = await chrome.windows.getCurrent(); windowId = window.id; @@ -11,6 +28,14 @@ export async function getWindowId(context: ExecutionContext): Promise { export async function getTabId(context: ExecutionContext): Promise { let tabId = context.variables.get('tabId') as any; + if (tabId) { + try { + await chrome.tabs.get(tabId); + } catch (e) { + tabId = null; + context.variables.delete('tabId'); + } + } if (!tabId) { tabId = await getCurrentTabId(); } @@ -38,16 +63,53 @@ export function getCurrentTabId(): Promise { }); } +export async function open_new_tab( + url: string, + newWindow: boolean, + windowId?: number +): Promise { + let tabId; + if (newWindow) { + let window = await chrome.windows.create({ + type: 'normal', + state: 'maximized', + url: url, + } as any as chrome.windows.CreateData); + windowId = window.id as number; + let tabs = window.tabs || [ + await chrome.tabs.create({ + url: url, + windowId: windowId, + }), + ]; + tabId = tabs[0].id as number; + } else { + if (!windowId) { + const window = await chrome.windows.getCurrent(); + windowId = window.id; + } + let tab = await chrome.tabs.create({ + url: url, + windowId: windowId, + }); + tabId = tab.id as number; + } + return await waitForTabComplete(tabId); +} + export async function executeScript(tabId: number, func: any, args: any): Promise { let frameResults = await chrome.scripting.executeScript({ target: { tabId: tabId as number }, func: func, args: args, }); - return frameResults[0].result + return frameResults[0].result; } -export async function waitForTabComplete(tabId: number, timeout: number = 30_000): Promise { +export async function waitForTabComplete( + tabId: number, + timeout: number = 30_000 +): Promise { return new Promise(async (resolve, reject) => { let tab = await chrome.tabs.get(tabId); if (tab.status === 'complete') { @@ -56,9 +118,9 @@ export async function waitForTabComplete(tabId: number, timeout: number = 30_000 } const time = setTimeout(() => { chrome.tabs.onUpdated.removeListener(listener); - reject() + reject(); }, timeout); - const listener = async (updatedTabId: any, changeInfo: any, tab: any) => { + const listener = async (updatedTabId: number, changeInfo: any, tab: chrome.tabs.Tab) => { if (updatedTabId === tabId && changeInfo.status === 'complete') { chrome.tabs.onUpdated.removeListener(listener); clearTimeout(time); From cf271b13c254a148cc8a32ee5f66edfced9418f0 Mon Sep 17 00:00:00 2001 From: veasion Date: Sat, 21 Dec 2024 17:31:15 +0800 Subject: [PATCH 2/3] modify tool types --- src/core/eko.ts | 2 +- src/core/tool-registry.ts | 8 +- src/extension/tools/computer.ts | 27 ++- src/extension/tools/computer_web.ts | 30 +-- src/extension/tools/element_click.ts | 4 +- src/extension/tools/export_file.ts | 34 ++-- src/extension/tools/extract_content.ts | 10 +- src/extension/tools/open_url.ts | 11 +- src/extension/tools/screenshot.ts | 5 +- src/extension/tools/tab_management.ts | 48 +++-- src/extension/tools/web_search.ts | 13 +- src/services/tools/computer_use.ts | 191 ++++++++------------ src/types/action.types.ts | 8 +- src/types/index.ts | 1 + src/types/tools.types.ts | 80 ++++++++ test/integration/workflow.generator.test.ts | 4 +- test/unit/tool-registry.test.ts | 4 +- 17 files changed, 269 insertions(+), 211 deletions(-) create mode 100644 src/types/tools.types.ts diff --git a/src/core/eko.ts b/src/core/eko.ts index c521ba7..4bc5eb2 100644 --- a/src/core/eko.ts +++ b/src/core/eko.ts @@ -12,7 +12,7 @@ export default class Eko { throw Error('Not implemented'); } - public registerTool(tool: Tool): void { + public registerTool(tool: Tool): void { throw Error('Not implemented'); } diff --git a/src/core/tool-registry.ts b/src/core/tool-registry.ts index f696c40..b714d1d 100644 --- a/src/core/tool-registry.ts +++ b/src/core/tool-registry.ts @@ -5,9 +5,9 @@ import { ToolDefinition } from '../types/llm.types'; import { workflowSchema } from '../schemas/workflow.schema'; export class ToolRegistry { - private tools: Map = new Map(); + private tools: Map> = new Map(); - registerTool(tool: Tool): void { + registerTool(tool: Tool): void { if (this.tools.has(tool.name)) { throw new Error(`Tool with name ${tool.name} already registered`); } @@ -21,7 +21,7 @@ export class ToolRegistry { this.tools.delete(toolName); } - getTool(toolName: string): Tool { + getTool(toolName: string): Tool { const tool = this.tools.get(toolName); if (!tool) { throw new Error(`Tool with name ${toolName} not found`); @@ -33,7 +33,7 @@ export class ToolRegistry { return toolNames.every(name => this.tools.has(name)); } - getAllTools(): Tool[] { + getAllTools(): Tool[] { return Array.from(this.tools.values()); } diff --git a/src/extension/tools/computer.ts b/src/extension/tools/computer.ts index 3315ba3..5e66fd4 100644 --- a/src/extension/tools/computer.ts +++ b/src/extension/tools/computer.ts @@ -1,6 +1,7 @@ +import { ScreenshotResult } from '../../types/tools.types'; import { getPageSize, sleep } from '../utils'; -export async function key(tabId: number, key: string, coordinate?: [number, number]) { +export async function key(tabId: number, key: string, coordinate?: [number, number]): Promise { if (!coordinate) { coordinate = (await cursor_position(tabId)).coordinate; } @@ -50,7 +51,7 @@ export async function key(tabId: number, key: string, coordinate?: [number, numb return result; } -export async function type(tabId: number, text: string, coordinate?: [number, number]) { +export async function type(tabId: number, text: string, coordinate?: [number, number]): Promise { if (!coordinate) { coordinate = (await cursor_position(tabId)).coordinate; } @@ -62,7 +63,7 @@ export async function type(tabId: number, text: string, coordinate?: [number, nu }); } -export async function clear_input(tabId: number, coordinate?: [number, number]) { +export async function clear_input(tabId: number, coordinate?: [number, number]): Promise { if (!coordinate) { coordinate = (await cursor_position(tabId)).coordinate; } @@ -74,14 +75,14 @@ export async function clear_input(tabId: number, coordinate?: [number, number]) }); } -export async function mouse_move(tabId: number, coordinate: [number, number]) { +export async function mouse_move(tabId: number, coordinate: [number, number]): Promise { return await chrome.tabs.sendMessage(tabId, { type: 'computer:mouse_move', coordinate, }); } -export async function left_click(tabId: number, coordinate?: [number, number]) { +export async function left_click(tabId: number, coordinate?: [number, number]): Promise { if (!coordinate) { coordinate = (await cursor_position(tabId)).coordinate; } @@ -91,7 +92,7 @@ export async function left_click(tabId: number, coordinate?: [number, number]) { }); } -export async function left_click_drag(tabId: number, coordinate: [number, number]) { +export async function left_click_drag(tabId: number, coordinate: [number, number]): Promise { let from_coordinate = (await cursor_position(tabId)).coordinate; return await chrome.tabs.sendMessage(tabId, { type: 'computer:left_click_drag', @@ -100,7 +101,7 @@ export async function left_click_drag(tabId: number, coordinate: [number, number }); } -export async function right_click(tabId: number, coordinate?: [number, number]) { +export async function right_click(tabId: number, coordinate?: [number, number]): Promise { if (!coordinate) { coordinate = (await cursor_position(tabId)).coordinate; } @@ -110,7 +111,7 @@ export async function right_click(tabId: number, coordinate?: [number, number]) }); } -export async function double_click(tabId: number, coordinate?: [number, number]) { +export async function double_click(tabId: number, coordinate?: [number, number]): Promise { if (!coordinate) { coordinate = (await cursor_position(tabId)).coordinate; } @@ -120,13 +121,7 @@ export async function double_click(tabId: number, coordinate?: [number, number]) }); } -export async function screenshot(windowId: number): Promise<{ - image: { - type: 'base64'; - media_type: 'image/png' | 'image/jpeg'; - data: string; - }; -}> { +export async function screenshot(windowId: number): Promise { let dataUrl = await chrome.tabs.captureVisibleTab(windowId as number, { format: 'jpeg', // jpeg / png quality: 80, // 0-100 @@ -141,7 +136,7 @@ export async function screenshot(windowId: number): Promise<{ }; } -export async function scroll_to(tabId: number, coordinate: [number, number]) { +export async function scroll_to(tabId: number, coordinate: [number, number]): Promise { let from_coordinate = (await cursor_position(tabId)).coordinate; return await chrome.tabs.sendMessage(tabId, { type: 'computer:scroll_to', diff --git a/src/extension/tools/computer_web.ts b/src/extension/tools/computer_web.ts index 39637a1..f90b809 100644 --- a/src/extension/tools/computer_web.ts +++ b/src/extension/tools/computer_web.ts @@ -1,3 +1,4 @@ +import { ComputerUseParam, ComputerUseResult } from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { getWindowId, getTabId, sleep } from '../utils'; import * as computer from './computer'; @@ -5,7 +6,7 @@ import * as computer from './computer'; /** * Computer Web for general */ -export class ComputerWeb implements Tool { +export class ComputerWeb implements Tool { name: string; description: string; input_schema: InputSchema; @@ -67,44 +68,43 @@ export class ComputerWeb implements Tool { * @param {*} params { action: 'mouse_move', coordinate: [100, 200] } * @returns > { success: true, coordinate?: [], image?: { type: 'base64', media_type: 'image/jpeg', data: '/9j...' } } */ - async execute(context: ExecutionContext, params: unknown): Promise { - if (typeof params !== 'object' || params === null || !('action' in params)) { + async execute(context: ExecutionContext, params: ComputerUseParam): Promise { + if (typeof params !== 'object' || params === null || !params.action) { throw new Error('Invalid parameters. Expected an object with a "action" property.'); } - let { action, coordinate, text } = params as any; let tabId = await getTabId(context); let windowId = await getWindowId(context); let result; - switch (action as string) { + switch (params.action) { case 'key': - result = await computer.key(tabId, text, coordinate); + result = await computer.key(tabId, params.text as string, params.coordinate); await sleep(500); break; case 'type': - result = await computer.type(tabId, text, coordinate); + result = await computer.type(tabId, params.text as string, params.coordinate); await sleep(500); break; case 'clear_input': - result = await computer.clear_input(tabId, coordinate); + result = await computer.clear_input(tabId, params.coordinate); await sleep(100); break; case 'mouse_move': - result = await computer.mouse_move(tabId, coordinate); + result = await computer.mouse_move(tabId, params.coordinate as [number, number]); break; case 'left_click': - result = await computer.left_click(tabId, coordinate); + result = await computer.left_click(tabId, params.coordinate); await sleep(100); break; case 'left_click_drag': - result = await computer.left_click_drag(tabId, coordinate); + result = await computer.left_click_drag(tabId, params.coordinate as [number, number]); await sleep(100); break; case 'right_click': - result = await computer.right_click(tabId, coordinate); + result = await computer.right_click(tabId, params.coordinate); await sleep(100); break; case 'double_click': - result = await computer.double_click(tabId, coordinate); + result = await computer.double_click(tabId, params.coordinate); await sleep(100); break; case 'screenshot': @@ -115,12 +115,12 @@ export class ComputerWeb implements Tool { result = await computer.cursor_position(tabId); break; case 'scroll_to': - result = await computer.scroll_to(tabId, coordinate); + result = await computer.scroll_to(tabId, params.coordinate as [number, number]); await sleep(1000); break; default: throw Error( - `Invalid parameters. The "${action}" value is not included in the "action" enumeration.` + `Invalid parameters. The "${params.action}" value is not included in the "action" enumeration.` ); } return { success: true, ...result }; diff --git a/src/extension/tools/element_click.ts b/src/extension/tools/element_click.ts index 2b0ca05..effe5d5 100644 --- a/src/extension/tools/element_click.ts +++ b/src/extension/tools/element_click.ts @@ -3,7 +3,7 @@ import { Tool, InputSchema, ExecutionContext } from "../../types/action.types"; /** * Element click */ -export class ElementClick implements Tool { +export class ElementClick implements Tool { name: string; description: string; input_schema: InputSchema; @@ -23,7 +23,7 @@ export class ElementClick implements Tool { }; } - async execute(context: ExecutionContext, params: unknown): Promise { + async execute(context: ExecutionContext, params: any): Promise { if ( typeof params !== "object" || params === null || diff --git a/src/extension/tools/export_file.ts b/src/extension/tools/export_file.ts index ae78618..579aaa5 100644 --- a/src/extension/tools/export_file.ts +++ b/src/extension/tools/export_file.ts @@ -1,10 +1,11 @@ +import { ExportFileParam } from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { getTabId, open_new_tab, sleep } from '../utils'; /** * Export file */ -export class ExportFile implements Tool { +export class ExportFile implements Tool { name: string; description: string; input_schema: InputSchema; @@ -15,15 +16,15 @@ export class ExportFile implements Tool { this.input_schema = { type: 'object', properties: { - content: { - type: 'string', - description: 'Export file content', - }, fileType: { type: 'string', description: 'File format type', enum: ['txt', 'csv', 'md', 'html', 'js', 'xml', 'json', 'yml', 'sql'], }, + content: { + type: 'string', + description: 'Export file content', + }, filename: { type: 'string', description: 'File name', @@ -39,16 +40,12 @@ export class ExportFile implements Tool { * @param {*} params { fileType: 'csv', content: 'field1,field2\ndata1,data2' } * @returns > { success: true } */ - async execute(context: ExecutionContext, params: unknown): Promise { + async execute(context: ExecutionContext, params: ExportFileParam): Promise { if (typeof params !== 'object' || params === null || !('content' in params)) { throw new Error('Invalid parameters. Expected an object with a "content" property.'); } - let { fileType, filename, content } = params as any; - if (!fileType) { - fileType = 'txt'; - } let type = 'text/plain'; - switch (fileType) { + switch (params.fileType) { case 'csv': type = 'text/csv'; break; @@ -68,17 +65,20 @@ export class ExportFile implements Tool { type = 'application/json'; break; } - if (!filename) { - filename = new Date().getTime() + '.' + fileType; - } else if (!(filename + '').endsWith(fileType)) { - filename += '.' + fileType; + let filename: string; + if (!params.filename) { + filename = new Date().getTime() + '.' + params.fileType; + } else if (!(params.filename + '').endsWith(params.fileType)) { + filename = params.filename + '.' + params.fileType; + } else { + filename = params.filename; } let tabId = await getTabId(context); try { await chrome.scripting.executeScript({ target: { tabId: tabId as number }, func: exportFile, - args: [filename, type, content], + args: [filename, type, params.content], }); } catch (e) { let tab = await open_new_tab('https://www.google.com', true); @@ -86,7 +86,7 @@ export class ExportFile implements Tool { await chrome.scripting.executeScript({ target: { tabId: tabId as number }, func: exportFile, - args: [filename, type, content], + args: [filename, type, params.content], }); await sleep(1000); await chrome.tabs.remove(tabId); diff --git a/src/extension/tools/extract_content.ts b/src/extension/tools/extract_content.ts index 85ca110..39827e3 100644 --- a/src/extension/tools/extract_content.ts +++ b/src/extension/tools/extract_content.ts @@ -1,10 +1,11 @@ +import { ExtractContentResult } from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { getTabId, executeScript, injectScript, sleep } from '../utils'; /** * Extract Page Content */ -export class ExtractContent implements Tool { +export class ExtractContent implements Tool { name: string; description: string; input_schema: InputSchema; @@ -24,7 +25,7 @@ export class ExtractContent implements Tool { * @param {*} params {} * @returns > { tabId, result: { title, url, content }, success: true } */ - async execute(context: ExecutionContext, params: unknown): Promise { + async execute(context: ExecutionContext, params: any): Promise { let tabId = await getTabId(context); let tab = await chrome.tabs.get(tabId); await injectScript(tabId); @@ -36,9 +37,8 @@ export class ExtractContent implements Tool { title: tab.title, url: tab.url, content: content, - }, - success: true, - }; + } + } as ExtractContentResult; } } diff --git a/src/extension/tools/open_url.ts b/src/extension/tools/open_url.ts index c26a00e..c887548 100644 --- a/src/extension/tools/open_url.ts +++ b/src/extension/tools/open_url.ts @@ -1,10 +1,11 @@ +import { OpenUrlParam, OpenUrlResult } from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { getWindowId, open_new_tab } from '../utils'; /** * Open Url */ -export class OpenUrl implements Tool { +export class OpenUrl implements Tool { name: string; description: string; input_schema: InputSchema; @@ -34,11 +35,12 @@ export class OpenUrl implements Tool { * @param {*} params { url: 'https://www.google.com', newWindow: true } * @returns > { tabId, windowId, title, success: true } */ - async execute(context: ExecutionContext, params: unknown): Promise { - if (typeof params !== 'object' || params === null || !('url' in params)) { + async execute(context: ExecutionContext, params: OpenUrlParam): Promise { + if (typeof params !== 'object' || params === null || !params.url) { throw new Error('Invalid parameters. Expected an object with a "url" property.'); } - let { url, newWindow } = params as any; + let url = params.url; + let newWindow = params.newWindow; if (!newWindow && !context.variables.get('windowId') && !context.variables.get('tabId')) { // First mandatory opening of a new window newWindow = true; @@ -66,7 +68,6 @@ export class OpenUrl implements Tool { tabId, windowId, title: tab.title, - success: true, }; } } diff --git a/src/extension/tools/screenshot.ts b/src/extension/tools/screenshot.ts index 08316be..9b0c1a0 100644 --- a/src/extension/tools/screenshot.ts +++ b/src/extension/tools/screenshot.ts @@ -1,11 +1,12 @@ import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { screenshot } from './computer'; import { getWindowId } from '../utils'; +import { ScreenshotResult } from '../../types/tools.types'; /** * Current Page Screenshot */ -export class Screenshot implements Tool { +export class Screenshot implements Tool { name: string; description: string; input_schema: InputSchema; @@ -25,7 +26,7 @@ export class Screenshot implements Tool { * @param {*} params {} * @returns > { image: { type: 'base64', media_type: 'image/png', data } } */ - async execute(context: ExecutionContext, params: unknown): Promise { + async execute(context: ExecutionContext, params: unknown): Promise { let windowId = await getWindowId(context); return await screenshot(windowId); } diff --git a/src/extension/tools/tab_management.ts b/src/extension/tools/tab_management.ts index 9ee6e02..d98cd05 100644 --- a/src/extension/tools/tab_management.ts +++ b/src/extension/tools/tab_management.ts @@ -1,10 +1,16 @@ +import { + CloseTabInfo, + TabInfo, + TabManagementParam, + TabManagementResult, +} from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { getTabId, getWindowId, open_new_tab, sleep } from '../utils'; /** * Browser tab management */ -export class TabManagement implements Tool { +export class TabManagement implements Tool { name: string; description: string; input_schema: InputSchema; @@ -32,22 +38,25 @@ export class TabManagement implements Tool { /** * Tab management * - * @param {*} params { action: 'tab_all' | 'current_tab' | 'close_tab' | 'switch_tab [tabId]' } + * @param {*} params { action: 'tab_all' | 'current_tab' | 'close_tab' | 'switch_tab [tabId]' | `new_tab [url]` } * @returns > { result, success: true } */ - async execute(context: ExecutionContext, params: unknown): Promise { + async execute( + context: ExecutionContext, + params: TabManagementParam + ): Promise { if (typeof params !== 'object' || params === null || !('action' in params)) { throw new Error('Invalid parameters. Expected an object with a "action" property.'); } - let action = (params as any).action as string; + let action = params.action; let windowId = await getWindowId(context); - let result: any = null; + let result: TabManagementResult; if (action == 'tab_all') { result = []; let tabs = await chrome.tabs.query({ windowId: windowId }); for (let i = 0; i < tabs.length; i++) { let tab = tabs[i]; - let tabInfo: any = { + let tabInfo: TabInfo = { tabId: tab.id, windowId: tab.windowId, title: tab.title, @@ -61,12 +70,12 @@ export class TabManagement implements Tool { } else if (action == 'current_tab') { let tabId = await getTabId(context); let tab = await chrome.tabs.get(tabId); - result = { tabId, windowId: tab.windowId, title: tab.title, url: tab.url }; + let tabInfo: TabInfo = { tabId, windowId: tab.windowId, title: tab.title, url: tab.url }; + result = tabInfo; } else if (action == 'close_tab') { let closedTabId = await getTabId(context); await chrome.tabs.remove(closedTabId); await sleep(100); - let currentTabId = null; let tabs = await chrome.tabs.query({ active: true, currentWindow: true }); if (tabs.length == 0) { tabs = await chrome.tabs.query({ status: 'complete', currentWindow: true }); @@ -75,16 +84,18 @@ export class TabManagement implements Tool { if (!tab.active) { await chrome.tabs.update(tab.id as number, { active: true }); } - currentTabId = tab.id; + let newTabId = tab.id; context.variables.set('tabId', tab.id); context.variables.set('windowId', tab.windowId); - result = { closedTabId, currentTabId, currentTabTitle: tab.title }; + let closeTabInfo: CloseTabInfo = { closedTabId, newTabId, newTabTitle: tab.title }; + result = closeTabInfo; } else if (action.startsWith('switch_tab')) { let tabId = parseInt(action.replace('switch_tab', '').replace('[', '').replace(']', '')); let tab = await chrome.tabs.update(tabId, { active: true }); context.variables.set('tabId', tab.id); context.variables.set('windowId', tab.windowId); - result = { tabId, windowId: tab.windowId, title: tab.title, url: tab.url }; + let tabInfo: TabInfo = { tabId, windowId: tab.windowId, title: tab.title, url: tab.url }; + result = tabInfo; } else if (action.startsWith('new_tab')) { let url = action.replace('new_tab', '').replace('[', '').replace(']', '').replace(/"/g, ''); // First mandatory opening of a new window @@ -108,11 +119,16 @@ export class TabManagement implements Tool { context.variables.set('windowIds', [windowId] as Array); } } - result = { tabId: tab.id, windowId: tab.windowId, title: tab.title, url: tab.url }; + let tabInfo: TabInfo = { + tabId: tab.id, + windowId: tab.windowId, + title: tab.title, + url: tab.url, + }; + result = tabInfo; + } else { + throw Error('Unknown action: ' + action); } - return { - result, - success: true, - }; + return result; } } diff --git a/src/extension/tools/web_search.ts b/src/extension/tools/web_search.ts index 3e72a79..7fae487 100644 --- a/src/extension/tools/web_search.ts +++ b/src/extension/tools/web_search.ts @@ -1,10 +1,11 @@ +import { WebSearchParam, WebSearchResult } from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { MsgEvent, CountDownLatch, sleep, injectScript } from '../utils'; /** * Web Search */ -export class WebSearch implements Tool { +export class WebSearch implements Tool { name: string; description: string; input_schema: InputSchema; @@ -34,11 +35,13 @@ export class WebSearch implements Tool { * @param {*} params { url: 'https://www.google.com', query: 'ai agent', maxResults: 5 } * @returns > [{ title, url, content }] */ - async execute(context: ExecutionContext, params: unknown): Promise { - if (typeof params !== 'object' || params === null || !('query' in params)) { + async execute(context: ExecutionContext, params: WebSearchParam): Promise { + if (typeof params !== 'object' || params === null || !params.query) { throw new Error('Invalid parameters. Expected an object with a "query" property.'); } - let { url, query, maxResults } = params as any; + let url = params.url; + let query = params.query; + let maxResults = params.maxResults; if (!url) { url = 'https://www.google.com'; } @@ -46,7 +49,7 @@ export class WebSearch implements Tool { let searchs = [{ url: url as string, keyword: query as string }]; let searchInfo = await deepSearch(taskId, searchs, maxResults || 5); let links = searchInfo.result[0]?.links || []; - return links.filter((s: any) => s.content); + return links.filter((s: any) => s.content) as WebSearchResult[]; } } diff --git a/src/services/tools/computer_use.ts b/src/services/tools/computer_use.ts index cd2206d..b22d89a 100644 --- a/src/services/tools/computer_use.ts +++ b/src/services/tools/computer_use.ts @@ -1,16 +1,17 @@ -import { Tool, InputSchema, ExecutionContext } from "../../types/action.types"; +import { ComputerUseParam, ComputerUseResult } from '../../types/tools.types'; +import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; /** * Computer Use for fellou */ -export class ComputerUse implements Tool { +export class ComputerUse implements Tool { name: string; description: string; input_schema: InputSchema; constructor(computer_screen_size: [number, number]) { // TODO The screenshot is of the screen, but the plugin returns the relative position of the browser, not the screen, there is a problem! - this.name = "computer_use"; + this.name = 'computer_use'; this.description = `Use a mouse and keyboard to interact with a computer, and take screenshots. * This is a browser GUI interface where you do not have access to the address bar or bookmarks. You must operate the browser using inputs like screenshots, mouse, keyboard, etc. * Some operations may take time to process, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you clicked submit button, but it didn't work, try taking another screenshot. @@ -19,10 +20,10 @@ export class ComputerUse implements Tool { * If you tried clicking on a button or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click. * Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element.`; this.input_schema = { - type: "object", + type: 'object', properties: { action: { - type: "string", + type: 'string', description: `The action to perform. The available actions are: * \`key\`: Press a key or key-combination on the keyboard. - This supports pyautogui hotkey syntax. @@ -38,29 +39,29 @@ export class ComputerUse implements Tool { * \`screenshot\`: Take a screenshot of the screen. * \`scroll\`: Performs a scroll of the mouse scroll wheel, The coordinate parameter is ineffective, each time a scroll operation is performed.`, enum: [ - "key", - "type", - "mouse_move", - "left_click", - "left_click_drag", - "right_click", - "double_click", - "screenshot", - "cursor_position", - "scroll", + 'key', + 'type', + 'mouse_move', + 'left_click', + 'left_click_drag', + 'right_click', + 'double_click', + 'screenshot', + 'cursor_position', + 'scroll', ], }, coordinate: { - type: "array", + type: 'array', description: - "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to.", + '(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to.', }, text: { - type: "string", - description: "Required only by `action=type` and `action=key`", + type: 'string', + description: 'Required only by `action=type` and `action=key`', }, }, - required: ["action"], + required: ['action'], }; } @@ -70,168 +71,128 @@ export class ComputerUse implements Tool { * @param {*} params { action: 'mouse_move', coordinate: [100, 200] } * @returns { success: true, coordinate?: [], image?: { type: 'base64', media_type: 'image/jpeg', data: '/9j...' } } */ - async execute(context: ExecutionContext, params: unknown): Promise { - if ( - typeof params !== "object" || - params === null || - !("action" in params) - ) { - throw new Error( - 'Invalid parameters. Expected an object with a "action" property.' - ); + async execute(context: ExecutionContext, params: ComputerUseParam): Promise { + if (typeof params !== 'object' || params === null || !params.action) { + throw new Error('Invalid parameters. Expected an object with a "action" property.'); } - let { action, coordinate, text } = params as any; let result; - switch (action as string) { - case "key": - result = await key(text, coordinate); + switch (params.action) { + case 'key': + result = await key(params.text as string, params.coordinate); break; - case "type": - result = await type(text, coordinate); + case 'type': + result = await type(params.text as string, params.coordinate); break; - case "mouse_move": - result = await mouse_move(coordinate); + case 'mouse_move': + result = await mouse_move(params.coordinate as [number, number]); break; - case "left_click": - result = await left_click(coordinate); + case 'left_click': + result = await left_click(params.coordinate); break; - case "left_click_drag": - result = await left_click_drag(coordinate); + case 'left_click_drag': + result = await left_click_drag(params.coordinate as [number, number]); break; - case "right_click": - result = await right_click(coordinate); + case 'right_click': + result = await right_click(params.coordinate); break; - case "double_click": - result = await double_click(coordinate); + case 'double_click': + result = await double_click(params.coordinate); break; - case "screenshot": + case 'screenshot': result = await screenshot(); break; - case "cursor_position": + case 'cursor_position': result = await cursor_position(); break; - case "scroll": - result = await scroll(coordinate); + case 'scroll': + result = await scroll(params.coordinate); break; default: throw Error( - `Invalid parameters. The "${action}" value is not included in the "action" enumeration.` + `Invalid parameters. The "${params.action}" value is not included in the "action" enumeration.` ); } return { success: true, ...result }; } - } -export async function key( - key: string, - coordinate?: [number, number] -) { +export async function key(key: string, coordinate?: [number, number]) { if (!coordinate) { coordinate = (await cursor_position()).coordinate; } await mouse_move(coordinate); let mapping: { [key: string]: string } = { - space: " ", - escape: "esc", - return: "enter", - page_up: "pageup", - page_down: "pagedown", - back_space: "backspace", + space: ' ', + escape: 'esc', + return: 'enter', + page_up: 'pageup', + page_down: 'pagedown', + back_space: 'backspace', }; - let keys = key.replace(/\s+/g, " ").split(" "); + let keys = key.replace(/\s+/g, ' ').split(' '); for (let i = 0; i < keys.length; i++) { let _key = keys[i]; - if (_key.indexOf("+") > -1) { - let mapped_keys = _key.split("+").map((k) => mapping[k] || k); - await runComputeruseCommand("hotkey", mapped_keys); + if (_key.indexOf('+') > -1) { + let mapped_keys = _key.split('+').map((k) => mapping[k] || k); + await runComputeruseCommand('hotkey', mapped_keys); } else { let mapped_key = mapping[_key] || _key; - await runComputeruseCommand("press", [mapped_key]); + await runComputeruseCommand('press', [mapped_key]); } await new Promise((resolve: any) => setTimeout(() => resolve(), 100)); } } -export async function type( - text: string, - coordinate?: [number, number] -) { +export async function type(text: string, coordinate?: [number, number]) { if (coordinate) { await mouse_move(coordinate); } - await runComputeruseCommand("write", [text]); + await runComputeruseCommand('write', [text]); } export async function mouse_move(coordinate: [number, number]) { - await runComputeruseCommand("moveTo", coordinate); + await runComputeruseCommand('moveTo', coordinate); } export async function left_click(coordinate?: [number, number]) { if (!coordinate) { coordinate = (await cursor_position()).coordinate; } - await runComputeruseCommand("click", [ - coordinate[0], - coordinate[1], - 1, - 0, - "left", - ]); + await runComputeruseCommand('click', [coordinate[0], coordinate[1], 1, 0, 'left']); } -export async function left_click_drag( - coordinate: [number, number] -) { - await runComputeruseCommand("dragTo", [coordinate[0], coordinate[1], 0]); +export async function left_click_drag(coordinate: [number, number]) { + await runComputeruseCommand('dragTo', [coordinate[0], coordinate[1], 0]); } -export async function right_click( - coordinate?: [number, number] -) { +export async function right_click(coordinate?: [number, number]) { if (!coordinate) { coordinate = (await cursor_position()).coordinate; } - await runComputeruseCommand("click", [ - coordinate[0], - coordinate[1], - 1, - 0, - "right", - ]); + await runComputeruseCommand('click', [coordinate[0], coordinate[1], 1, 0, 'right']); } -export async function double_click( - coordinate?: [number, number] -) { +export async function double_click(coordinate?: [number, number]) { if (!coordinate) { coordinate = (await cursor_position()).coordinate; } - await runComputeruseCommand("click", [ - coordinate[0], - coordinate[1], - 2, - 0, - "left", - ]); + await runComputeruseCommand('click', [coordinate[0], coordinate[1], 2, 0, 'left']); } export async function screenshot(windowId?: number): Promise<{ image: { - type: "base64"; - media_type: "image/png" | "image/jpeg"; + type: 'base64'; + media_type: 'image/png' | 'image/jpeg'; data: string; }; }> { - let screenshot = (await runComputeruseCommand("screenshot")).result; - let dataUrl = screenshot.startsWith("data:") - ? screenshot - : "data:image/png;base64," + screenshot; - let data = dataUrl.substring(dataUrl.indexOf("base64,") + 7); + let screenshot = (await runComputeruseCommand('screenshot')).result; + let dataUrl = screenshot.startsWith('data:') ? screenshot : 'data:image/png;base64,' + screenshot; + let data = dataUrl.substring(dataUrl.indexOf('base64,') + 7); return { image: { - type: "base64", - media_type: dataUrl.indexOf("png") > -1 ? "image/png" : "image/jpeg", + type: 'base64', + media_type: dataUrl.indexOf('png') > -1 ? 'image/png' : 'image/jpeg', data: data, }, }; @@ -240,17 +201,17 @@ export async function screenshot(windowId?: number): Promise<{ export async function cursor_position(): Promise<{ coordinate: [number, number]; }> { - let response = await runComputeruseCommand("position"); + let response = await runComputeruseCommand('position'); return response.result; } export async function size(): Promise<[number, number]> { - let response = await runComputeruseCommand("size"); + let response = await runComputeruseCommand('size'); return response.result; } export async function scroll(coordinate?: [number, number]) { - await runComputeruseCommand("scroll", [2]); + await runComputeruseCommand('scroll', [2]); } export async function runComputeruseCommand( diff --git a/src/types/action.types.ts b/src/types/action.types.ts index 9ec81ad..f40d178 100644 --- a/src/types/action.types.ts +++ b/src/types/action.types.ts @@ -1,8 +1,8 @@ -export interface Tool { +export interface Tool { name: string; description: string; input_schema: InputSchema; - execute: (context: ExecutionContext, params: unknown) => Promise; + execute: (context: ExecutionContext, params: T) => Promise; } export interface InputSchema { @@ -25,12 +25,12 @@ export interface Propertie { export interface ExecutionContext { variables: Map; - tools: Map; + tools: Map>; } export interface Action { type: 'prompt' | 'script' | 'hybrid'; name: string; execute: (input: unknown, context: ExecutionContext) => Promise; - tools: Tool[]; + tools: Tool[]; } diff --git a/src/types/index.ts b/src/types/index.ts index 6c69093..ad2d55d 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -2,4 +2,5 @@ export * from './action.types'; export * from './workflow.types'; export * from './eko.types'; export * from './llm.types'; +export * from './tools.types'; export * from './framework.types'; diff --git a/src/types/tools.types.ts b/src/types/tools.types.ts new file mode 100644 index 0000000..8eb98f1 --- /dev/null +++ b/src/types/tools.types.ts @@ -0,0 +1,80 @@ +export interface ComputerUseParam { + action: string; + coordinate?: [number, number]; + text?: string; +} + +export interface ComputerUseResult { + success: boolean; + image?: ScreenshotImage; + [key: string]: any; +} + +export interface ExportFileParam { + content: string; + fileType: 'txt' | 'csv' | 'md' | 'html' | 'js' | 'xml' | 'json' | 'yml' | 'sql'; + filename?: string; +} + +export interface ExtractContentResult { + tabId: number; + result: { + title?: string; + url?: string; + content: string; + }; +} + +export interface OpenUrlParam { + url: string; + newWindow?: boolean; +} + +export interface OpenUrlResult { + tabId: number; + windowId: number; + title?: string; +} + +export interface ScreenshotResult { + image: ScreenshotImage; +} + +export interface ScreenshotImage { + type: 'base64'; + media_type: 'image/png' | 'image/jpeg'; + data: string; +} + +export interface TabManagementParam { + action: string; +} + +export type TabManagementResult = TabInfo | CloseTabInfo | TabInfo[]; + +export interface TabInfo { + tabId?: number; + windowId?: number; + title?: string; + url?: string; + active?: boolean; +} + +export interface CloseTabInfo { + closedTabId: number; + newTabId?: number; + newTabTitle?: string; +} + +export interface WebSearchParam { + url?: string; + query: string; + maxResults?: number; +} + +export interface WebSearchResult { + title: string; + url: string; + content: string; +} + \ No newline at end of file diff --git a/test/integration/workflow.generator.test.ts b/test/integration/workflow.generator.test.ts index a9a2dec..70990ff 100644 --- a/test/integration/workflow.generator.test.ts +++ b/test/integration/workflow.generator.test.ts @@ -13,7 +13,7 @@ import dotenv from 'dotenv'; dotenv.config(); // Mock browser tool base class to avoid duplicate code -class BrowserTool implements Tool { +class BrowserTool implements Tool { constructor( public name: string, public description: string, @@ -26,7 +26,7 @@ class BrowserTool implements Tool { } // Create mock browser tools -function createBrowserTools(): Tool[] { +function createBrowserTools(): Tool[] { return [ new BrowserTool( 'open_url', diff --git a/test/unit/tool-registry.test.ts b/test/unit/tool-registry.test.ts index 90b70a5..76ab39a 100644 --- a/test/unit/tool-registry.test.ts +++ b/test/unit/tool-registry.test.ts @@ -1,7 +1,7 @@ import { ToolRegistry } from '../../src/core/tool-registry'; import { Tool, InputSchema } from '../../src/types/action.types'; -class MockTool implements Tool { +class MockTool implements Tool { constructor( public name: string, public description: string = 'Mock tool description', @@ -20,7 +20,7 @@ class MockTool implements Tool { describe('ToolRegistry', () => { let registry: ToolRegistry; - let mockTool: Tool; + let mockTool: Tool; beforeEach(() => { registry = new ToolRegistry(); From 1e9c4bd43a7798c4ec76d5c1300153a348054d1d Mon Sep 17 00:00:00 2001 From: veasion Date: Sun, 22 Dec 2024 11:00:20 +0800 Subject: [PATCH 3/3] fix: claude proxy --- .../tools/{computer.ts => browser.ts} | 0 .../tools/{computer_web.ts => browser_use.ts} | 8 +-- src/extension/tools/element_click.ts | 57 +++++++++++++------ src/extension/tools/index.ts | 8 +-- src/extension/tools/screenshot.ts | 2 +- src/services/llm/claude-provider.ts | 3 +- test/integration/claude-provider.test.ts | 3 +- 7 files changed, 53 insertions(+), 28 deletions(-) rename src/extension/tools/{computer.ts => browser.ts} (100%) rename src/extension/tools/{computer_web.ts => browser_use.ts} (96%) diff --git a/src/extension/tools/computer.ts b/src/extension/tools/browser.ts similarity index 100% rename from src/extension/tools/computer.ts rename to src/extension/tools/browser.ts diff --git a/src/extension/tools/computer_web.ts b/src/extension/tools/browser_use.ts similarity index 96% rename from src/extension/tools/computer_web.ts rename to src/extension/tools/browser_use.ts index f90b809..8c2305b 100644 --- a/src/extension/tools/computer_web.ts +++ b/src/extension/tools/browser_use.ts @@ -1,18 +1,18 @@ import { ComputerUseParam, ComputerUseResult } from '../../types/tools.types'; import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; import { getWindowId, getTabId, sleep } from '../utils'; -import * as computer from './computer'; +import * as computer from './browser'; /** - * Computer Web for general + * Browser Use for general */ -export class ComputerWeb implements Tool { +export class BrowserUse implements Tool { name: string; description: string; input_schema: InputSchema; constructor() { - this.name = 'computer_web'; + this.name = 'browser_use'; this.description = `Use a mouse and keyboard to interact with a computer, and take screenshots. * This is a browser GUI interface where you do not have access to the address bar or bookmarks. You must operate the browser using inputs like screenshots, mouse, keyboard, etc. * Some operations may take time to process, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you clicked submit button, but it didn't work, try taking another screenshot. diff --git a/src/extension/tools/element_click.ts b/src/extension/tools/element_click.ts index effe5d5..de3e62a 100644 --- a/src/extension/tools/element_click.ts +++ b/src/extension/tools/element_click.ts @@ -1,4 +1,4 @@ -import { Tool, InputSchema, ExecutionContext } from "../../types/action.types"; +import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; /** * Element click @@ -9,31 +9,54 @@ export class ElementClick implements Tool { input_schema: InputSchema; constructor() { - this.name = "element_click"; - this.description = "click element"; + this.name = 'element_click'; + this.description = 'click element'; this.input_schema = { - type: "object", + type: 'object', properties: { element: { - type: "string", - description: "Element title", + type: 'string', + description: 'Element title', }, }, - required: ["element"], + required: ['element'], }; } async execute(context: ExecutionContext, params: any): Promise { - if ( - typeof params !== "object" || - params === null || - !("content" in params) - ) { - throw new Error( - 'Invalid parameters. Expected an object with a "content" property.' + if (typeof params !== 'object' || params === null || !('content' in params)) { + throw new Error('Invalid parameters. Expected an object with a "content" property.'); + } + // button, span, lable, a, img, input, textarea, strlen < 30 + // TODO .... + throw new Error('Not implemented'); + } +} + +function xpath(element: any) { + return (function (element) { + if (element.id !== '') { + return '//*[@id=\"' + element.id + '\"]'; + } + if (element == document.body) { + return '/html/' + element.tagName.toLowerCase(); + } + var ix = 1, + siblings = element.parentNode.childNodes; + for (var i = 0, l = siblings.length; i < l; i++) { + var sibling = siblings[i]; + if (sibling == element) { + return ( + arguments.callee(element.parentNode) + + '/' + + element.tagName.toLowerCase() + + '[' + + ix + + ']' ); + } else if (sibling.nodeType == 1 && sibling.tagName == element.tagName) { + ix++; } - // TODO .... - throw new Error('Not implemented') } -} \ No newline at end of file + })(arguments[0]); +} diff --git a/src/extension/tools/index.ts b/src/extension/tools/index.ts index 659cc67..b9403b9 100644 --- a/src/extension/tools/index.ts +++ b/src/extension/tools/index.ts @@ -1,5 +1,5 @@ -import * as computer from './computer'; -import { ComputerWeb } from './computer_web'; +import * as browser from './browser'; +import { BrowserUse } from './browser_use'; import { ElementClick } from './element_click'; import { ExportFile } from './export_file'; import { ExtractContent } from './extract_content'; @@ -9,8 +9,8 @@ import { TabManagement } from './tab_management'; import { WebSearch } from './web_search'; export { - computer, - ComputerWeb, + browser, + BrowserUse, ElementClick, ExportFile, ExtractContent, diff --git a/src/extension/tools/screenshot.ts b/src/extension/tools/screenshot.ts index 9b0c1a0..c52e79d 100644 --- a/src/extension/tools/screenshot.ts +++ b/src/extension/tools/screenshot.ts @@ -1,5 +1,5 @@ import { Tool, InputSchema, ExecutionContext } from '../../types/action.types'; -import { screenshot } from './computer'; +import { screenshot } from './browser'; import { getWindowId } from '../utils'; import { ScreenshotResult } from '../../types/tools.types'; diff --git a/src/services/llm/claude-provider.ts b/src/services/llm/claude-provider.ts index 3ee440c..63a3367 100644 --- a/src/services/llm/claude-provider.ts +++ b/src/services/llm/claude-provider.ts @@ -11,8 +11,9 @@ export class ClaudeProvider implements LLMProvider { private client: Anthropic; private defaultModel = 'claude-3-5-sonnet-20241022'; - constructor(apiKey: string) { + constructor(apiKey: string, baseURL?: string) { this.client = new Anthropic({ + baseURL, apiKey: apiKey, dangerouslyAllowBrowser: true }); diff --git a/test/integration/claude-provider.test.ts b/test/integration/claude-provider.test.ts index ec19dfb..e7e836c 100644 --- a/test/integration/claude-provider.test.ts +++ b/test/integration/claude-provider.test.ts @@ -5,6 +5,7 @@ import dotenv from 'dotenv'; dotenv.config(); const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY; +const ANTHROPIC_BASE_URL = process.env.ANTHROPIC_BASE_URL; if (!ANTHROPIC_API_KEY) { throw new Error('ANTHROPIC_API_KEY environment variable is required for integration tests'); } @@ -20,7 +21,7 @@ describeIntegration('ClaudeProvider Integration', () => { let provider: ClaudeProvider; beforeAll(() => { - provider = new ClaudeProvider(ANTHROPIC_API_KEY); + provider = new ClaudeProvider(ANTHROPIC_API_KEY, ANTHROPIC_BASE_URL); }); describe('generateText', () => {