browser-use · eDeveloperOZ · Feb 25, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 25, 2025
diff --git a/examples/try.py b/examples/try.py
@@ -40,11 +40,11 @@ def set_llm(llm_provider:str = None):
 		except Exception as e:
 			print(f"Error while getting API key: {e}")
 			api_key = None
-		return ChatAnthropic(model='claude-3-5-sonnet-20240620',  api_key=SecretStr(api_key))
+		return ChatAnthropic(model='claude-3-7-sonnet-20250219',  api_key=SecretStr(api_key))
 
-llm = set_llm('anthropic')
 llm = set_llm('google')
 llm = set_llm('OAI')
+llm = set_llm('anthropic')
 
 controller = Controller()
 
@@ -61,14 +61,14 @@ async def main():
 	)
 
 	await agent_greeting.run(max_steps=25)
-	task = input()
+	task = input("Enter the task: ")
 
 	agent_task = Agent(
 		task=task,
 		llm=llm,
 		controller=controller,
 		use_vision=False,
-		max_actions_per_step=1,
+		max_actions_per_step=4,
 		max_failures=5
 	)
 

diff --git a/gradio_app/src/models/llm_models.py b/gradio_app/src/models/llm_models.py
@@ -7,7 +7,7 @@
 # LLM model mappings
 LLM_MODELS = {
     "OpenAI": ["gpt-4o", "o3-mini"],
-    "Anthropic": ["claude-3-5-sonnet-20240620"],
+    "Anthropic": ["claude-3-5-sonnet-20240620", "claude-3-7-sonnet-20250219"],
     "Google": ["gemini-1.5-flash-002"],
     "alibaba": ["qwen-2.5-72b-instruct"]
 }

diff --git a/gradio_app/src/ui/interface.py b/gradio_app/src/ui/interface.py
@@ -35,7 +35,7 @@ def create_agent_tab(app_instance) -> List[gr.components.Component]:
                 max_actions = gr.Slider(
                     minimum=1,
                     maximum=20,
-                    value=3,
+                    value=5,
                     step=1,
                     label="Max Actions per Step"
                 )

diff --git a/mlx_use/agent/prompts.py b/mlx_use/agent/prompts.py
@@ -19,39 +19,34 @@ def __init__(self, action_description: str, current_date: datetime, max_actions_
 
     def important_rules(self) -> str:
         """Returns a string containing important rules for the system."""
-        return f"""
+        text = """
 1. RESPONSE FORMAT:
    You must ALWAYS respond with a valid JSON object that has EXACTLY two keys:
-     - "current_state": an object with three required fields:
-         - "evaluation_previous_goal": string evaluating if previous actions succeeded, failed, or unknown
-         - "memory": string describing task progress and important context to remember
-         - "next_goal": string describing the next immediate goal
-     - "action": an array of action objects. Each action object must be of the form:
-           {{"action_name": {{"parameter1": "<value>", ... }}}}
-   Do not include any additional keys, markdown formatting, or commentary.
-
-   For example:
-   {{
-     "current_state": {{
-       "evaluation_previous_goal": "Initialize Task", 
-       "memory": "Starting new task to open calculator app",
-       "next_goal": "Open the Calculator application"
-     }},
+     {
+     "current_state": {
+       "evaluation_previous_goal": "Success|Failed|Unknown - Use UI context elements to verify outcomes (e.g., results in context). Use action results to confirm execution when UI changes are delayed or unclear.",
+       "memory": "What you’ve done and need to remember",
+       "next_goal": "Next step to achieve"
+     },
      "action": [
-       {{"open_app": {{"app_name": "Calculator"}}}},
-       {{"click_element": {{"element_index": "0"}}}},
-       {{"input_text": {{"element_index": "0", "text": "5", "submit": true}}}}
+       {
+         "one_action_name": {
+           // action-specific parameter
+         }
+       },
+       // ... more actions in sequence
      ]
-   }}
+   }
+
+2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item.
+    - Always start with open_app to ensure the correct app is active.
+    - For stable UIs (e.g., Calculator), batch actions up to max_actions_per_step.
+    - For dynamic UIs (e.g., Mail), perform one action at a time due to potential refreshes.
 
-2. ACTION SEQUENCING:
-   - First ALWAYS open the required app using open_app.
-   - Then perform UI interactions.
-   - Use a maximum of {self.max_actions_per_step} actions per sequence.
-   - Actions are executed in the order they appear in the list.
 
 3. APP HANDLING:
    - App names are case-sensitive (e.g. 'Microsoft Excel', 'Calendar').
+   - Always use the correct app for the task. (e.g. calculator for calculations, mail for sending emails, browser for browsing, etc.)
    - Never assume apps are already open.
    - When opening a browser, always work a new tab.
    - Common app mappings:
@@ -60,73 +55,72 @@ def important_rules(self) -> str:
        * Messages may appear as 'Messages' or 'com.apple.MobileSMS'.
 
 4. ELEMENT INTERACTION:
-   - Only use indexes that exist in the provided element list.
-   - Each element has a unique index number (e.g. "0: Button: Submit").
-   - Elements refresh after each action.
-   - Use input_text with submit=True for text fields needing Enter submission.
-
-5. ERROR RECOVERY:
-   - If text input fails, ensure the element is a text field.
-   - If submit fails, try click_element on the submit button instead.
+   - Interactive elements: "[index][:]<type> [interactive]" (e.g., "1[:]<AXButton>").
+   - Context elements: "_[:]<type> [context]" (e.g., "_[:]<AXStaticText value='20'>").
+   - Use context elements to verify outcomes (e.g., check results after actions).
+   - Use attributes (description, title, value) to identify elements accurately.
 
-6. TASK COMPLETION:
+5. TASK COMPLETION:
    - Use the "done" action when the task is complete.
    - Don't hallucinate actions.
+   - After performing actions, verify the outcome using context elements in the UI tree.
+   - For tasks like calculations, always verify the result using context elements before marking as complete.
+   - For tasks like playing media, check the current track or playback status via AppleScript.
+   - If verification fails, attempt retries or alternative approaches before using "done".
    - Include all task results in the "done" action text.
    - If stuck after 3 attempts, use "done" with error details.
+   - Stable UIs (e.g., Calculator): Element indices remain consistent across actions; batch up to max_actions_per_step actions (e.g., click "5", "+", "3", "=").
+   - Dynamic UIs (e.g., Mail): Elements may refresh or reorder after actions; perform one action at a time.
+
+6. NAVIGATION & ERROR HANDLING:
+   - If an element isn't found, search for alternatives using descriptions or attributes.
+   - If stuck, try alternative approaches.
+   - If text input fails, ensure the element is a text field.
+   - If submit fails, try click_element on the submit button instead.
+   - If the UI tree fails with "Window not found" or error `-25212`, use open_app to open the app again.
+   - Before interacting, verify the element is enabled (check `enabled="True"` in attributes). If not, find an alternative or use AppleScript.
 
 7. APPLESCRIPT SUPPORT:
-   - You can execute AppleScript commands using the run_apple_script action.
-   - Use this for complex operations not possible through UI interactions.
-   - AppleScript format: {{"run_apple_script": {{"script": "your AppleScript code here"}}}}
-   - Common AppleScript examples:
-       * Text to speech: {{"run_apple_script": {{"script": "say \\"Hello World\\""}}}}
-       * Create folder: {{"run_apple_script": {{"script": "tell application \\"Notes\\" to make new folder with properties {{name:\\"My Folder\\"}}"}}}}
-       * Get app info: {{"run_apple_script": {{"script": "tell application \\"System Events\\" to get name of every process"}}}}
-       * Preform some excel formula: {{"run_apple_script": {{"script": "tell application \\"Microsoft Excel\\" to calculate \\"=SUMIFS(C:C, B:B, \\"NY\\", A:A, \\"=A2\\")\\""}}}} 
-    - These are only examples, you can use any AppleScript command to accomplish the step.
-    - YOU MUST USE PROPER APPLESCRIPT SYNTAX AND COMMANDS TO ACHIEVE THE TASK.
-   - ONLY USE APPLESCRIPT WHEN STANDARD UI INTERACTIONS ARE INSUFFICIENT.
-   - Ensure AppleScript commands are properly escaped with double quotes.
-   - For text-to-speech tasks, always use the "say" command through AppleScript.
+   - Use AppleScript for precise control (e.g., creating a note directly), for complex operations not possible through UI interactions, or when UI interactions fail after retries.
+   - Always use AppleScript with the correct command syntax.
+   - Examples: 
+        - Tell application to make new note: {"run_apple_script": {"script": "tell application \\"Notes\\" to make new note"}}
+        - Text-to-speech: {"run_apple_script": {"script": "say \\"Task complete\\""}}
+        - Rename a file in Finder: {"run_apple_script": {"script": "tell application \\"Finder\\" to set name of item 1 of desktop to \\"NewName\\""}}
 """
+        text += f'   - max_actions_per_step: {self.max_actions_per_step}'
+        return text
 
     def input_format(self) -> str:
         """Returns a string describing the expected input format."""
         return """
 INPUT STRUCTURE:
 1. Current App: Active macOS application (or "None" if none open)
 2. UI Elements: List in the format:
-   [index] ElementType: Description
-   Example:
-   [0] Button: Close
-   [1] TextField: Search (submit)
-3. Previous Results: Outcomes of the last executed actions
-NOTE: The UI tree now includes detailed accessibility attributes (e.g., AXARIAAtomic, AXARIALive, etc.) to improve element identification.
+   - Interactive: '[index][:]<type> [interactive]' (e.g., '1[:]<AXButton>').
+   - Context: '_[:]<type> [context]' (e.g., '_[:]<AXStaticText value="20">').
+3. Action Results: Feedback from the previous step's actions (e.g., "Clicked element 2 successfully").
+
+NOTE: The UI tree includes detailed accessibility attributes; use them to choose the correct element.
 """
 
     def get_system_message(self) -> SystemMessage:
         """Creates and returns a SystemMessage with formatted content."""
         time_str = self.current_date.strftime('%Y-%m-%d %H:%M')
 
-        AGENT_PROMPT = f"""You are a strict macOS automation agent that MUST ONLY interact with macOS apps through structured commands. Your role is to:
-1. ALWAYS open the required app using the open_app action first - never skip this step.
-2. NEVER use your own knowledge to calculate or process information - always use the appropriate macOS app.
-3. Analyze the provided ui tree elements indices and structure and use the appropriate actions to accomplish the task.
-4. Plan a sequence of actions to accomplish the given task through UI interactions only.
-5. Always use the actions as if you were a human interacting with the app.
-6. Only rely on the ui tree elements data to provide the best possible response.
-7. For calculations, ALWAYS use the Calculator app and perform operations through UI clicks.
-8. Never return direct answers without using UI interactions.
-9. Ensure the final state of the application matches the expected outcome before declaring the task complete.
+        AGENT_PROMPT = f"""
+You are a macOS automation agent that interacts with applications via their UI elements using the Accessibility API. Your role is to:
+1. Analyze the provided UI tree of the current application.
+2. Plan a sequence of actions to accomplish the given task.
+3. Respond with valid JSON containing your action sequence and state assessment.
 
-Current time: {time_str}
+Current date and time: {time_str}
 
 {self.input_format()}
 
 {self.important_rules()}
 
-AVAILABLE ACTIONS:
+Functions:
 {self.default_action_description}
 
 Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid.

diff --git a/mlx_use/agent/service.py b/mlx_use/agent/service.py
@@ -191,6 +191,7 @@ def get_last_pid(self) -> Optional[int]:
 
 	@time_execution_async("--step")
 	async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
+		await asyncio.sleep(1)
 		"""Execute one step of the task"""
 		logger.info(f"\n📍 Step {self.n_steps}")
 		state = None
@@ -205,7 +206,7 @@ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
 			if root:
 				state = root.get_clickable_elements_string()
 				# print the ui tree 
-				logger.debug(f"State: {state}")
+				logger.debug("State: %s", state)
 
 				# consider adding the full ui tree details, much more tokens!
 				# state = (
@@ -293,6 +294,9 @@ def _make_history_item(
 		result: list[ActionResult],
 	) -> None:
 		"""Create and store history item"""
+		logger.debug("Adding history item: state=%s, model_output=%s, result=%s",
+					 state, model_output.model_dump_json() if model_output else None, [r.model_dump() for r in result])
+
 		interacted_element = None
 		len_result = len(result)
 

diff --git a/mlx_use/controller/service.py b/mlx_use/controller/service.py
@@ -90,7 +90,11 @@ async def click_element(index: int, mac_tree_builder: MacUITreeBuilder):
 
 					click_successful = click(element_to_click)
 					if click_successful:
-						return ActionResult(extracted_content=f'Successfully clicked element with index {index}')
+						logger.debug(f'Successfully clicked element with index {index}')
+						return ActionResult(
+							extracted_content=f'Successfully clicked element with index {index}',
+							include_in_memory=True
+						)
 					else:
 						msg = f'❌ Click failed for element with index {index}'
 						logging.error(msg)
@@ -204,11 +208,41 @@ async def open_app(app_name: str):
 		)
 		async def run_apple_script(script: str):
 			logger.info(f'Running AppleScript: {script}')
+
+			# Wrap the original script in error handling and return value logic
+			wrapped_script = f'''
+				try
+					{script}
+					return "OK"
+				on error errMsg
+					return "ERROR: " & errMsg
+				end try
+			'''
+
 			try:
-				subprocess.run(['osascript', '-e', script])
-				return ActionResult(extracted_content=f'Successfully ran AppleScript: {script}')
+				result = subprocess.run(
+					['osascript', '-e', wrapped_script],
+					capture_output=True,
+					text=True
+				)
+
+				if result.returncode == 0:
+					output = result.stdout.strip()
+					if output == "OK":
+						return ActionResult(extracted_content="Success")
+					elif output.startswith("ERROR:"):
+						error_msg = output
+						logger.error(error_msg)
+						return ActionResult(extracted_content=error_msg, error=error_msg)
+					else:
+						return ActionResult(extracted_content=output)
+				else:
+					error_msg = f"AppleScript failed with return code {result.returncode}: {result.stderr.strip()}"
+					logger.error(error_msg)
+					return ActionResult(extracted_content=error_msg, error=error_msg)
+
 			except Exception as e:
-				error_msg = f'Failed to run AppleScript: {str(e)}'
+				error_msg = f"Failed to run AppleScript: {str(e)}"
 				logger.error(error_msg)
 				return ActionResult(extracted_content=error_msg, error=error_msg)
 

diff --git a/mlx_use/mac/element.py b/mlx_use/mac/element.py
@@ -75,24 +75,34 @@ def __repr__(self) -> str:
         return role_str
 
     def get_clickable_elements_string(self) -> str:
-        """Convert the UI tree to a string representation focusing on interactive elements"""
+        """Convert the UI tree to a string representation focusing on interactive and context elements"""
         formatted_text = []
 
         def process_node(node: 'MacElementNode', depth: int) -> None:
-            if node.highlight_index is not None:
-                # Include more information for interactive elements
-                attrs_str = ''
-                important_attrs = ['title', 'value', 'description', 'enabled']
-                for key in important_attrs:
-                    if key in node.attributes:
-                        attrs_str += f' {key}="{node.attributes[key]}"'
-
-                # Add actions if available
-                if node.actions:
-                    attrs_str += f' actions="{", ".join(node.actions)}"'
+            # Build attributes string
+            attrs_str = ''
+            important_attrs = ['title', 'value', 'description', 'enabled']
+            for key in important_attrs:
+                if key in node.attributes:
+                    attrs_str += f' {key}="{node.attributes[key]}"'
+
+            # Add actions if available
+            if node.actions:
+                attrs_str += f' actions="{", ".join(node.actions)}"'
 
+            # Include both interactive and context elements
+            if node.highlight_index is not None:
+                # Interactive element with numeric index
+                formatted_text.append(
+                    f'{node.highlight_index}[:]<{node.role}{attrs_str}> [interactive]'
+                )
+            # Check if this is a context element (non-interactive AXStaticText or read-only AXTextField)
+            elif (node.role in ['AXStaticText', 'AXTextField'] and 
+                  not node.is_interactive and 
+                  (node.parent is None or node.parent.role == 'AXWindow' or node.parent.is_interactive)):
+                # Context element with "_" index
                 formatted_text.append(
-                    f'{node.highlight_index}[:]<{node.role}{attrs_str}>'
+                    f'_[:]<{node.role}{attrs_str}> [context]'
                 )
 
             for child in node.children: