Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/upgraded version #22

Merged
merged 3 commits into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions examples/try.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ def set_llm(llm_provider:str = None):
except Exception as e:
print(f"Error while getting API key: {e}")
api_key = None
return ChatAnthropic(model='claude-3-5-sonnet-20240620', api_key=SecretStr(api_key))
return ChatAnthropic(model='claude-3-7-sonnet-20250219', api_key=SecretStr(api_key))

llm = set_llm('anthropic')
llm = set_llm('google')
llm = set_llm('OAI')
llm = set_llm('anthropic')

controller = Controller()

Expand All @@ -61,14 +61,14 @@ async def main():
)

await agent_greeting.run(max_steps=25)
task = input()
task = input("Enter the task: ")

agent_task = Agent(
task=task,
llm=llm,
controller=controller,
use_vision=False,
max_actions_per_step=1,
max_actions_per_step=4,
max_failures=5
)

Expand Down
2 changes: 1 addition & 1 deletion gradio_app/src/models/llm_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# LLM model mappings
LLM_MODELS = {
"OpenAI": ["gpt-4o", "o3-mini"],
"Anthropic": ["claude-3-5-sonnet-20240620"],
"Anthropic": ["claude-3-5-sonnet-20240620", "claude-3-7-sonnet-20250219"],
"Google": ["gemini-1.5-flash-002"],
"alibaba": ["qwen-2.5-72b-instruct"]
}
Expand Down
2 changes: 1 addition & 1 deletion gradio_app/src/ui/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def create_agent_tab(app_instance) -> List[gr.components.Component]:
max_actions = gr.Slider(
minimum=1,
maximum=20,
value=3,
value=5,
step=1,
label="Max Actions per Step"
)
Expand Down
124 changes: 59 additions & 65 deletions mlx_use/agent/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,39 +19,34 @@ def __init__(self, action_description: str, current_date: datetime, max_actions_

def important_rules(self) -> str:
"""Returns a string containing important rules for the system."""
return f"""
text = """
1. RESPONSE FORMAT:
You must ALWAYS respond with a valid JSON object that has EXACTLY two keys:
- "current_state": an object with three required fields:
- "evaluation_previous_goal": string evaluating if previous actions succeeded, failed, or unknown
- "memory": string describing task progress and important context to remember
- "next_goal": string describing the next immediate goal
- "action": an array of action objects. Each action object must be of the form:
{{"action_name": {{"parameter1": "<value>", ... }}}}
Do not include any additional keys, markdown formatting, or commentary.

For example:
{{
"current_state": {{
"evaluation_previous_goal": "Initialize Task",
"memory": "Starting new task to open calculator app",
"next_goal": "Open the Calculator application"
}},
{
"current_state": {
"evaluation_previous_goal": "Success|Failed|Unknown - Use UI context elements to verify outcomes (e.g., results in context). Use action results to confirm execution when UI changes are delayed or unclear.",
"memory": "What you’ve done and need to remember",
"next_goal": "Next step to achieve"
},
"action": [
{{"open_app": {{"app_name": "Calculator"}}}},
{{"click_element": {{"element_index": "0"}}}},
{{"input_text": {{"element_index": "0", "text": "5", "submit": true}}}}
{
"one_action_name": {
// action-specific parameter
}
},
// ... more actions in sequence
]
}}
}

2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item.
- Always start with open_app to ensure the correct app is active.
- For stable UIs (e.g., Calculator), batch actions up to max_actions_per_step.
- For dynamic UIs (e.g., Mail), perform one action at a time due to potential refreshes.

2. ACTION SEQUENCING:
- First ALWAYS open the required app using open_app.
- Then perform UI interactions.
- Use a maximum of {self.max_actions_per_step} actions per sequence.
- Actions are executed in the order they appear in the list.

3. APP HANDLING:
- App names are case-sensitive (e.g. 'Microsoft Excel', 'Calendar').
- Always use the correct app for the task. (e.g. calculator for calculations, mail for sending emails, browser for browsing, etc.)
- Never assume apps are already open.
- When opening a browser, always work in a new tab.
- Common app mappings:
Expand All @@ -60,73 +55,72 @@ def important_rules(self) -> str:
* Messages may appear as 'Messages' or 'com.apple.MobileSMS'.

4. ELEMENT INTERACTION:
- Only use indexes that exist in the provided element list.
- Each element has a unique index number (e.g. "0: Button: Submit").
- Elements refresh after each action.
- Use input_text with submit=True for text fields needing Enter submission.

5. ERROR RECOVERY:
- If text input fails, ensure the element is a text field.
- If submit fails, try click_element on the submit button instead.
- Interactive elements: "[index][:]<type> [interactive]" (e.g., "1[:]<AXButton>").
- Context elements: "_[:]<type> [context]" (e.g., "_[:]<AXStaticText value='20'>").
- Use context elements to verify outcomes (e.g., check results after actions).
- Use attributes (description, title, value) to identify elements accurately.

6. TASK COMPLETION:
5. TASK COMPLETION:
- Use the "done" action when the task is complete.
- Don't hallucinate actions.
- After performing actions, verify the outcome using context elements in the UI tree.
- For tasks like calculations, always verify the result using context elements before marking as complete.
- For tasks like playing media, check the current track or playback status via AppleScript.
- If verification fails, attempt retries or alternative approaches before using "done".
- Include all task results in the "done" action text.
- If stuck after 3 attempts, use "done" with error details.
- Stable UIs (e.g., Calculator): Element indices remain consistent across actions; batch up to max_actions_per_step actions (e.g., click "5", "+", "3", "=").
- Dynamic UIs (e.g., Mail): Elements may refresh or reorder after actions; perform one action at a time.

6. NAVIGATION & ERROR HANDLING:
- If an element isn't found, search for alternatives using descriptions or attributes.
- If stuck, try alternative approaches.
- If text input fails, ensure the element is a text field.
- If submit fails, try click_element on the submit button instead.
- If the UI tree fails with "Window not found" or error `-25212`, use open_app to open the app again.
- Before interacting, verify the element is enabled (check `enabled="True"` in attributes). If not, find an alternative or use AppleScript.

7. APPLESCRIPT SUPPORT:
- You can execute AppleScript commands using the run_apple_script action.
- Use this for complex operations not possible through UI interactions.
- AppleScript format: {{"run_apple_script": {{"script": "your AppleScript code here"}}}}
- Common AppleScript examples:
* Text to speech: {{"run_apple_script": {{"script": "say \\"Hello World\\""}}}}
* Create folder: {{"run_apple_script": {{"script": "tell application \\"Notes\\" to make new folder with properties {{name:\\"My Folder\\"}}"}}}}
* Get app info: {{"run_apple_script": {{"script": "tell application \\"System Events\\" to get name of every process"}}}}
* Perform an Excel formula: {{"run_apple_script": {{"script": "tell application \\"Microsoft Excel\\" to calculate \\"=SUMIFS(C:C, B:B, \\"NY\\", A:A, \\"=A2\\")\\""}}}}
- These are only examples, you can use any AppleScript command to accomplish the step.
- YOU MUST USE PROPER APPLESCRIPT SYNTAX AND COMMANDS TO ACHIEVE THE TASK.
- ONLY USE APPLESCRIPT WHEN STANDARD UI INTERACTIONS ARE INSUFFICIENT.
- Ensure AppleScript commands are properly escaped with double quotes.
- For text-to-speech tasks, always use the "say" command through AppleScript.
- Use AppleScript for precise control (e.g., creating a note directly) or when UI interactions fail after retries.
- Use this for complex operations not possible through UI interactions.
- Always use AppleScript with the correct command syntax.
- Examples:
- Tell application to make new note: {"run_apple_script": {"script": "tell application \"Notes\" to make new note"}}
- Text-to-speech: {"run_apple_script": {"script": "say \"Task complete\""}}
- Rename a file in Finder: {"run_apple_script": {"script": "tell application \"Finder\" to set name of item 1 of desktop to \"NewName\""}}
"""
text += f' - max_actions_per_step: {self.max_actions_per_step}'
return text

def input_format(self) -> str:
"""Returns a string describing the expected input format."""
return """
INPUT STRUCTURE:
1. Current App: Active macOS application (or "None" if none open)
2. UI Elements: List in the format:
[index] ElementType: Description
Example:
[0] Button: Close
[1] TextField: Search (submit)
3. Previous Results: Outcomes of the last executed actions
NOTE: The UI tree now includes detailed accessibility attributes (e.g., AXARIAAtomic, AXARIALive, etc.) to improve element identification.
- Interactive: '[index][:]<type> [interactive]' (e.g., '1[:]<AXButton>').
- Context: '_[:]<type> [context]' (e.g., '_[:]<AXStaticText value="20">').
3. Action Results: Feedback from the previous step's actions (e.g., "Clicked element 2 successfully").

NOTE: The UI tree includes detailed accessibility attributes; use them to choose the correct element.
"""

def get_system_message(self) -> SystemMessage:
"""Creates and returns a SystemMessage with formatted content."""
time_str = self.current_date.strftime('%Y-%m-%d %H:%M')

AGENT_PROMPT = f"""You are a strict macOS automation agent that MUST ONLY interact with macOS apps through structured commands. Your role is to:
1. ALWAYS open the required app using the open_app action first - never skip this step.
2. NEVER use your own knowledge to calculate or process information - always use the appropriate macOS app.
3. Analyze the provided ui tree elements indices and structure and use the appropriate actions to accomplish the task.
4. Plan a sequence of actions to accomplish the given task through UI interactions only.
5. Always use the actions as if you were a human interacting with the app.
6. Only rely on the ui tree elements data to provide the best possible response.
7. For calculations, ALWAYS use the Calculator app and perform operations through UI clicks.
8. Never return direct answers without using UI interactions.
9. Ensure the final state of the application matches the expected outcome before declaring the task complete.
AGENT_PROMPT = f"""
You are a macOS automation agent that interacts with applications via their UI elements using the Accessibility API. Your role is to:
1. Analyze the provided UI tree of the current application.
2. Plan a sequence of actions to accomplish the given task.
3. Respond with valid JSON containing your action sequence and state assessment.

Current time: {time_str}
Current date and time: {time_str}

{self.input_format()}

{self.important_rules()}

AVAILABLE ACTIONS:
Functions:
{self.default_action_description}

Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid.
Expand Down
6 changes: 5 additions & 1 deletion mlx_use/agent/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ def get_last_pid(self) -> Optional[int]:

@time_execution_async("--step")
async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
await asyncio.sleep(1)
"""Execute one step of the task"""
logger.info(f"\n📍 Step {self.n_steps}")
state = None
Expand All @@ -205,7 +206,7 @@ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
if root:
state = root.get_clickable_elements_string()
# print the ui tree
logger.debug(f"State: {state}")
print(f"State: {state}")

# consider adding the full ui tree details, much more tokens!
# state = (
Expand Down Expand Up @@ -293,6 +294,9 @@ def _make_history_item(
result: list[ActionResult],
) -> None:
"""Create and store history item"""
logger.debug("Adding history item: state=%s, model_output=%s, result=%s",
state, model_output.json() if model_output else None, [r.model_dump() for r in result])

interacted_element = None
len_result = len(result)

Expand Down
42 changes: 38 additions & 4 deletions mlx_use/controller/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,11 @@ async def click_element(index: int, mac_tree_builder: MacUITreeBuilder):

click_successful = click(element_to_click)
if click_successful:
return ActionResult(extracted_content=f'Successfully clicked element with index {index}')
logger.debug(f'Successfully clicked element with index {index}')
return ActionResult(
extracted_content=f'Successfully clicked element with index {index}',
include_in_memory=True
)
else:
msg = f'❌ Click failed for element with index {index}'
logging.error(msg)
Expand Down Expand Up @@ -204,11 +208,41 @@ async def open_app(app_name: str):
)
async def run_apple_script(script: str):
logger.info(f'Running AppleScript: {script}')

# Wrap the original script in error handling and return value logic
wrapped_script = f'''
try
{script}
return "OK"
on error errMsg
return "ERROR: " & errMsg
end try
'''

try:
subprocess.run(['osascript', '-e', script])
return ActionResult(extracted_content=f'Successfully ran AppleScript: {script}')
result = subprocess.run(
['osascript', '-e', wrapped_script],
capture_output=True,
text=True
)

if result.returncode == 0:
output = result.stdout.strip()
if output == "OK":
return ActionResult(extracted_content="Success")
elif output.startswith("ERROR:"):
error_msg = output
logger.error(error_msg)
return ActionResult(extracted_content=error_msg, error=error_msg)
else:
return ActionResult(extracted_content=output)
else:
error_msg = f"AppleScript failed with return code {result.returncode}: {result.stderr.strip()}"
logger.error(error_msg)
return ActionResult(extracted_content=error_msg, error=error_msg)

except Exception as e:
error_msg = f'Failed to run AppleScript: {str(e)}'
error_msg = f"Failed to run AppleScript: {str(e)}"
logger.error(error_msg)
return ActionResult(extracted_content=error_msg, error=error_msg)

Expand Down
36 changes: 23 additions & 13 deletions mlx_use/mac/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,34 @@ def __repr__(self) -> str:
return role_str

def get_clickable_elements_string(self) -> str:
"""Convert the UI tree to a string representation focusing on interactive elements"""
"""Convert the UI tree to a string representation focusing on interactive and context elements"""
formatted_text = []

def process_node(node: 'MacElementNode', depth: int) -> None:
if node.highlight_index is not None:
# Include more information for interactive elements
attrs_str = ''
important_attrs = ['title', 'value', 'description', 'enabled']
for key in important_attrs:
if key in node.attributes:
attrs_str += f' {key}="{node.attributes[key]}"'

# Add actions if available
if node.actions:
attrs_str += f' actions="{", ".join(node.actions)}"'
# Build attributes string
attrs_str = ''
important_attrs = ['title', 'value', 'description', 'enabled']
for key in important_attrs:
if key in node.attributes:
attrs_str += f' {key}="{node.attributes[key]}"'

# Add actions if available
if node.actions:
attrs_str += f' actions="{", ".join(node.actions)}"'

# Include both interactive and context elements
if node.highlight_index is not None:
# Interactive element with numeric index
formatted_text.append(
f'{node.highlight_index}[:]<{node.role}{attrs_str}> [interactive]'
)
# Check if this is a context element (non-interactive AXStaticText or read-only AXTextField)
elif (node.role in ['AXStaticText', 'AXTextField'] and
not node.is_interactive and
(node.parent is None or node.parent.role == 'AXWindow' or node.parent.is_interactive)):
# Context element with "_" index
formatted_text.append(
f'{node.highlight_index}[:]<{node.role}{attrs_str}>'
f'_[:]<{node.role}{attrs_str}> [context]'
)

for child in node.children:
Expand Down
Loading