browseros-ai · felarof99 · Oct 24, 2025 · Oct 23, 2025 · Oct 23, 2025 · Oct 24, 2025
diff --git a/packages/agent/src/agent/ClaudeSDKAgent.prompt.ts b/packages/agent/src/agent/ClaudeSDKAgent.prompt.ts
@@ -1,42 +1,118 @@
 /**
  * Claude SDK specific system prompt for browser automation
  */
-export const CLAUDE_SDK_SYSTEM_PROMPT = `You are a browser automation assistant with BrowserTools access.
+export const CLAUDE_SDK_SYSTEM_PROMPT = `You are a browser automation assistant with access to specialized browser control tools.
 
-# Core Workflow
+# Core Principles
 
-All browser interactions require a tab ID. Before interacting with a page:
-1. Use browser_list_tabs or browser_get_active_tab to identify the target tab
-2. Use browser_switch_tab if needed to activate the correct tab
-3. Perform actions using the tab's ID
+1. **Tab Context Required**: All browser interactions require a valid tab ID. Always identify the target tab before performing actions.
+2. **Use the Right Tool**: Choose the most efficient tool for each task. Avoid over-engineering simple operations.
+3. **Extract, Don't Execute**: Prefer built-in extraction tools over JavaScript execution when gathering information.
 
-# Essential Tools
+# Standard Workflow
 
-**Tab Management:**
-- browser_list_tabs - List all open tabs with IDs
-- browser_get_active_tab - Get current active tab
-- browser_switch_tab(tabId) - Switch to a specific tab
-- browser_open_tab(url) - Open new tab
-- browser_close_tab(tabId) - Close tab
+Before interacting with any page:
+1. Identify the target tab using browser_list_tabs or browser_get_active_tab
+2. Switch to the correct tab if needed using browser_switch_tab
+3. Perform your intended action using the tab's ID
 
-**Navigation & Content:**
-- browser_navigate(url, tabId) - Navigate to URL (tabId optional, uses active tab)
-- browser_get_interactive_elements(tabId) - Get all clickable/typeable elements with nodeIds
-- browser_get_page_content(tabId, type) - Extract text or text-with-links
-- browser_get_screenshot(tabId) - Capture screenshot with bounding boxes showing nodeIds
+# Tool Selection Guidelines
 
-**Interaction:**
+## Content Extraction (Choose in this order)
+
+**For text content and data extraction:**
+- PREFER: browser_get_page_content(tabId, type) - Fast, efficient text extraction
+  - Use type: "text" for plain text content
+  - Use type: "text-with-links" when URLs are needed
+  - Supports context: "visible" or "full" page
+  - Can target specific sections (main, article, navigation, etc.)
+
+**For visual context:**
+- USE: browser_get_screenshot(tabId) - Only when visual layout or non-text elements matter
+  - Shows bounding boxes with nodeIds for interactive elements
+  - Useful for visual verification or understanding page structure
+  - Not efficient for extracting text data
+
+**For complex operations:**
+- LAST RESORT: browser_execute_javascript(tabId, code) - Only when built-in tools cannot accomplish the task
+  - Use when you need to manipulate DOM or access browser APIs directly
+  - Avoid for simple text extraction or standard interactions
+
+## Tab Management
+
+- browser_list_tabs - Get all open tabs with IDs and URLs
+- browser_get_active_tab - Get currently active tab
+- browser_switch_tab(tabId) - Switch focus to specific tab
+- browser_open_tab(url, active?) - Open new tab, optionally make it active
+- browser_close_tab(tabId) - Close specific tab
+
+## Navigation
+
+- browser_navigate(url, tabId?) - Navigate to URL (defaults to active tab if tabId omitted)
+- browser_get_load_status(tabId) - Check if page has finished loading
+
+## Page Interaction
+
+**Discovery:**
+- browser_get_interactive_elements(tabId, simplified?) - Get all clickable/typeable elements with nodeIds
+  - Use simplified: true (default) for concise output
+  - Always call this before clicking or typing to get valid nodeIds
+
+**Actions:**
 - browser_click_element(tabId, nodeId) - Click element by nodeId
-- browser_type_text(tabId, nodeId, text) - Type into input
+- browser_type_text(tabId, nodeId, text) - Type into input field
 - browser_clear_input(tabId, nodeId) - Clear input field
+- browser_send_keys(tabId, key) - Send keyboard input (Enter, Tab, Escape, Arrow keys, etc.)
+
+**Alternative Coordinate-Based Actions:**
+- browser_click_coordinates(tabId, x, y) - Click at specific position
+- browser_type_at_coordinates(tabId, x, y, text) - Click and type at position
+
+## Scrolling
+
+- browser_scroll_down(tabId) - Scroll down one viewport height
+- browser_scroll_up(tabId) - Scroll up one viewport height
 - browser_scroll_to_element(tabId, nodeId) - Scroll element into view
 
-**Scrolling:**
-- browser_scroll_down(tabId) - Scroll down one viewport
-- browser_scroll_up(tabId) - Scroll up one viewport
+## Advanced Features
+
+- browser_get_bookmarks(folderId?) - Get browser bookmarks
+- browser_create_bookmark(title, url, parentId?) - Create new bookmark
+- browser_remove_bookmark(bookmarkId) - Delete bookmark
+- browser_search_history(query, maxResults?) - Search browsing history
+- browser_get_recent_history(count?) - Get recent history items
+
+# Best Practices
+
+- **Minimize Screenshots**: Only use screenshots when visual context is essential. For data extraction, always prefer browser_get_page_content.
+- **Avoid Unnecessary JavaScript**: Built-in tools are faster and more reliable. Only execute custom JavaScript when standard tools cannot accomplish the task.
+- **Get Elements First**: Always call browser_get_interactive_elements before clicking or typing to ensure you have valid nodeIds.
+- **Wait for Loading**: After navigation, verify the page has loaded before extracting content or interacting.
+- **Use Context Options**: When extracting content, specify whether you need "visible" (viewport) or "full" (entire page) context.
+- **Target Specific Sections**: Use includeSections parameter in browser_get_page_content to extract only relevant parts (main, article, navigation, etc.).
+
+# Common Patterns
+
+**Extract article text:**
+\`\`\`
+browser_get_page_content(tabId, "text", { context: "full", includeSections: ["main", "article"] })
+\`\`\`
+
+**Get all links on page:**
+\`\`\`
+browser_get_page_content(tabId, "text-with-links", { context: "visible" })
+\`\`\`
+
+**Fill and submit a form:**
+\`\`\`
+1. browser_get_interactive_elements(tabId)
+2. browser_type_text(tabId, inputNodeId, "text")
+3. browser_click_element(tabId, submitButtonNodeId)
+\`\`\`
 
-**Advanced:**
-- browser_execute_javascript(tabId, code) - Execute JS in page
-- browser_send_keys(tabId, key) - Send keyboard keys (Enter, Tab, etc.)
+**Verify visual layout:**
+\`\`\`
+browser_get_screenshot(tabId, { size: "medium" })
+\`\`\`
 
-Always get interactive elements before clicking/typing to obtain valid nodeIds.`
+Focus on efficiency and use the most appropriate tool for each task. When in doubt, prefer simpler tools over complex ones.`
diff --git a/packages/agent/src/agent/ClaudeSDKAgent.ts b/packages/agent/src/agent/ClaudeSDKAgent.ts
@@ -101,6 +101,91 @@ export class ClaudeSDKAgent extends BaseAgent {
     )
   }
 
+  /**
+   * Wrapper around iterator.next() that yields heartbeat events while waiting.
+   * An abort ends the generator quietly; a rejection from iterator.next() is
+   * rethrown to the consumer instead of being reported as an abort.
+   * @param iterator - The async iterator
+   * @yields Heartbeat events (FormattedEvent) while waiting, then the final iterator result (IteratorResult)
+   */
+  private async *nextWithHeartbeat(iterator: AsyncIterator<any>): AsyncGenerator<any> {
+    const heartbeatInterval = 20000 // 20 seconds
+    let heartbeatTimer: NodeJS.Timeout | null = null
+    let abortHandler: (() => void) | null = null
+
+    // Call iterator.next() once - this generator wraps a single next() call
+    const iteratorPromise = iterator.next()
+
+    // Rejects when the abort signal fires; never settles otherwise
+    const abortPromise = new Promise<never>((_, reject) => {
+      if (this.abortController) {
+        abortHandler = () => {
+          reject(new Error('Agent execution aborted by client'))
+        }
+        this.abortController.signal.addEventListener('abort', abortHandler, { once: true })
+      }
+    })
+
+    try {
+      // Loop until the iterator promise resolves, yielding heartbeats while waiting
+      while (true) {
+        if (this.abortController?.signal.aborted) {
+          logger.info('⚠️  Agent execution aborted during heartbeat wait')
+          return
+        }
+
+        // Create timeout promise for this iteration
+        const timeoutPromise = new Promise(resolve => {
+          heartbeatTimer = setTimeout(() => resolve({ type: 'heartbeat' }), heartbeatInterval)
+        })
+
+        type RaceResult = { type: 'result'; result: any } | { type: 'heartbeat' }
+        let race: RaceResult
+
+        try {
+          race = await Promise.race([
+            iteratorPromise.then(result => ({ type: 'result' as const, result })),
+            timeoutPromise.then(() => ({ type: 'heartbeat' as const })),
+            abortPromise
+          ])
+        } catch (raceError) {
+          // Only an abort ends the stream quietly. A rejection from iterator.next()
+          // is a real failure and must reach the caller, so rethrow it.
+          if (!this.abortController?.signal.aborted) throw raceError
+          logger.info('⚠️  Agent execution aborted (caught during iterator wait)')
+          if (iterator.return) {
+            await iterator.return(undefined).catch(() => {})
+          }
+          return
+        }
+
+        if (heartbeatTimer) {
+          clearTimeout(heartbeatTimer)
+          heartbeatTimer = null
+        }
+
+        if (race.type === 'heartbeat') {
+          // Heartbeat timeout occurred - yield processing event and continue waiting
+          yield EventFormatter.createProcessingEvent()
+          // Loop continues - will race the same iteratorPromise (still pending) vs new timeout
+        } else {
+          // Iterator result arrived - yield it and exit this generator
+          yield race.result
+          return
+        }
+      }
+    } finally {
+      if (heartbeatTimer) {
+        clearTimeout(heartbeatTimer)
+      }
+
+      // Clean up abort listener if it wasn't triggered
+      if (abortHandler && this.abortController && !this.abortController.signal.aborted) {
+        this.abortController.signal.removeEventListener('abort', abortHandler)
+      }
+    }
+  }
+
   /**
    * Execute a task using Claude SDK and stream formatted events
    *
@@ -137,10 +222,28 @@ export class ClaudeSDKAgent extends BaseAgent {
       // Call Claude SDK
       const iterator = query({ prompt: message, options })[Symbol.asyncIterator]()
 
-      // Stream events
+      // Stream events with heartbeat
       while (true) {
-        const result = await iterator.next()
-        if (result.done) break
+        // Check if execution was aborted
+        if (this.abortController?.signal.aborted) {
+          logger.info('⚠️  Agent execution aborted by client')
+          break
+        }
+
+        let result: IteratorResult<any> | null = null
+
+        // Iterate through heartbeat generator to get the actual result
+        for await (const item of this.nextWithHeartbeat(iterator)) {
+          if (item && item.done !== undefined) {
+            // This is the final result
+            result = item
+          } else {
+            // This is a heartbeat/processing event
+            yield item
+          }
+        }
+
+        if (!result || result.done) break
 
         const event = result.value
 

diff --git a/packages/agent/src/utils/EventFormatter.ts b/packages/agent/src/utils/EventFormatter.ts
@@ -7,7 +7,7 @@
  * Formatted event structure for WebSocket clients
  */
 export class FormattedEvent {
-  type: 'init' | 'thinking' | 'tool_use' | 'tool_result' | 'response' | 'completion' | 'error'
+  type: 'init' | 'thinking' | 'tool_use' | 'tool_result' | 'response' | 'completion' | 'error' | 'processing'
   content: string
   metadata?: {
     turnCount?: number
@@ -36,6 +36,13 @@ export class FormattedEvent {
  */
 export class EventFormatter {
 
+  /** Heartbeat factory: returns a FormattedEvent of type 'processing'. */
+  static createProcessingEvent(): FormattedEvent {
+    const eventType = 'processing'
+    const placeholderText = '⏳ Processing...'
+    return new FormattedEvent(eventType, placeholderText)
+  }
+
   /**
    * Format any Claude SDK event into a FormattedEvent
    */

diff --git a/packages/agent/src/websocket/server.ts b/packages/agent/src/websocket/server.ts
@@ -393,14 +393,31 @@ async function processMessage(
       eventCount++
       lastEventType = formattedEvent.type
 
-      // Send to client (SAME AS BEFORE)
-      ws.send(JSON.stringify(formattedEvent.toJSON()))
+      // Send to client - catch errors if client disconnected
+      try {
+        ws.send(JSON.stringify(formattedEvent.toJSON()))
 
-      logger.debug('📤 Event sent', {
-        sessionId,
-        type: formattedEvent.type,
-        eventCount
-      })
+        logger.debug('📤 Event sent', {
+          sessionId,
+          type: formattedEvent.type,
+          eventCount
+        })
+      } catch (sendError) {
+        // Client disconnected during streaming
+        logger.info('⚠️  Client disconnected during event streaming, stopping iterator', {
+          sessionId,
+          eventCount
+        })
+
+        // Cleanup iterator
+        if (iterator.return) {
+          await iterator.return(undefined).catch(() => {})
+        }
+
+        // Exit cleanly - don't throw, just return
+        // (throwing would trigger outer error handler which tries to sendError again)
+        return
+      }
     }
 
     logger.info('✅ Message processed successfully', {