Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions local_model.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Open an SSH tunnel: forward local port 18001 to port 18001 on the EC2 model host,
# so local clients can reach the model server at http://localhost:18001.
ssh -L 18001:localhost:18001 ec2-user@ec2-35-88-109-159.us-west-2.compute.amazonaws.com
27 changes: 27 additions & 0 deletions oldjeff.pem
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-----BEGIN RSA PRIVATE KEY-----
MIIEogIBAAKCAQEAxjnHtkAbblUvAl25H/wcOARtDFIUcI5ZyT2FpDvNijqEgfhy
bqLfSehfKY+N+qnOOXmxy1Wi1irbLEl/AhdTsJGZ9gNMEZdq66sq5hwIlpbweMsQ
swkclAScQHIMl9NaArLkOthcygnuswqTmvwPirNdmfm+Bt1IBrNCOrCA0elf4rUv
mpyQW6ANNW7RmVi8d8wXBhHrnij738ZHl7hC/noD9MEsWNfaGMQNng2rBvOgLgVY
OgxHIlp/74lnuflpPqmYnURKF3QkRJ82unY/TdFI71lKqypt93rn4pAw8HX9QqaC
X57mVWdlmQafCenQgHHWgJ9yYnQTam0s/+fNmwIDAQABAoIBAHLF3ca+k6Nsmw1p
qtjEJqqglWs+0yrgoUgN4SVYowfYHgULD2bT0yl97CuqPPDYBNnuhm1PJjuPENwx
qeJSE1j21QhGnHLLE1NlBi+6J5bZyl6GZSLksbFaggYmgvgdnc5WOiOARymMWrM7
+n8QVwdeF2Ih4k8jLKMEg+JrdAsVaE9AfqrpGMKYJb18fVjj1RGf0q14mh2mTSeJ
dj8H5h8N8Fb6mSR+t7XLLn5uNufe/9R204H2CUYuRPN+AOW9xSLJbokNLlY5To2R
G+uiM0xJVrWvbFPxAtaTiI/nO5Oac6OUKmKI/5hBdQQ5qTkFeHsDNn8IzcCd2gzS
rgn0CYECgYEA/1SvoxKX5l/34nlcCcMAmyG4zKCUvdQPfoAIqhn2FBrHumBoBO4n
z1bLKaeZ9zdpVfI/P5TKjTawsn4P0QwExgxpJ9oZaLNokeRzW5GAZ87Cp6wzXbX8
/1GSPyoCIyMY4YeHxYtss0O7CTtatLn+NnzsgHPO3H7RbgGf+E8ek2ECgYEAxr7H
iUwLRxiQ47XyMJKyVIuYcxh9dwpc/TTXvSRk9Abz7yvQENKEIodW+WfM+cHRj69S
fVnVsbA+azCbvIPk1NikTkIECeMK2fZ/XeoOSNi2gFinpVeICBCNylzHIdjRprh9
gv5n6eb8/8DJFy9RIA/h4w/vtU3yv6zomp/1vnsCgYAShYAgIi8mpPuEUC1e/+hB
WJbhMRzZEFL3aC44uJ1jI/YtDOU+xk/Y+IDQSroedsSLWYFBCXgP+lGjAQYAshB+
lVPjciy5rZn+S0Ya9FkOLq9sHk+zkooBs1cagd+Z0OfzJDOzHsQJ1PXyW33e8kcA
iNtXDg+JayGiCzgheQvMwQKBgG/ulpZ+24MpFMEKgeJVXFY9YJjB3DelAIYisrZ1
vt2o5M140XAIAB8qNhO1ID4xqILR7RVn+PBgIGdiMvPTHJe7g54HlBq1YjEroMQV
xAHG+9IBHDoEuDpCiHjGE+i+IiVRlm6mNYQIccjgnOCP55K1HzUwjoJ/6g2FpmMf
X9ntAoGAWXT0yWMm4sCGbo4vd/jyPQPFnqWHODdqCmkL5vYMDV+CJXLioByDtFlh
lDQErdAFqjQdaoYBg2wxUwdBKzvz4b9HUstxg6MAVRKm3w+OgeendzO1t8h0c7ru
P/216mpn4Au0kPe5AMhy1339Qdod86QEztDtRXIaWk/mczpxtkc=
-----END RSA PRIVATE KEY-----
4 changes: 3 additions & 1 deletion scripts/run-local.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ set -e

source ./shared.sh

export OPENAI_API_KEY=empty

# Default parameters
mode="azure" # Default to azure if no argument is provided
prepare_image=false
Expand All @@ -22,7 +24,7 @@ browser_port=8006
rdp_port=3390
start_client=true
agent="navi"
model="gpt-4-vision-preview"
model="Qwen/Qwen2.5-VL-72B-Instruct"
som_origin="oss"
a11y_backend="uia"
gpu_enabled=false
Expand Down
Empty file modified scripts/run.sh
100644 → 100755
Empty file.
20 changes: 11 additions & 9 deletions src/win-arena-container/client/desktop_env/controllers/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,15 +174,17 @@ def execute_python_windows_command(self, command: str) -> None:
}


try:
response = requests.post(self.http_server + "/execute_windows", headers=headers, json=payload, timeout=90)
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
logger.error("Failed to execute command. Status code: %d", response.status_code)
return response.json()
except requests.exceptions.RequestException as e:
logger.error("An error occurred while trying to execute the command: %s", e)
# try:
logger.info("execute_python_windows_command self.http_server: " + str(self.http_server))
response = requests.post(self.http_server + "/execute_windows", headers=headers, json=payload, timeout=90)
logger.info("execute_python_windows_command response: " + str(response))
if response.status_code == 200:
logger.info("Command executed successfully: %s", response.text)
else:
logger.error("Failed to execute command. Status code: %d", response.status_code)
return response.json()
# except requests.exceptions.RequestException as e:
# logger.error("An error occurred while trying to execute the command: %s", e)

def execute_python_command(self, command: str) -> None:
"""
Expand Down
3 changes: 3 additions & 0 deletions src/win-arena-container/client/hello_world.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

# Marker script injected onto the VM desktop: prints forever so the process
# stays alive and visible to the injection check. Sleep between iterations so
# the loop does not busy-spin a full CPU core on the VM (the original had no
# sleep and pegged the CPU).
import time

while True:
    print("Hello World")
    time.sleep(1)
15 changes: 14 additions & 1 deletion src/win-arena-container/client/lib_run_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import time
import traceback
from trajectory_recorder import TrajectoryRecorder
import pyfile_inject_execute

logger = logging.getLogger("desktopenv.experiment")

Expand All @@ -16,11 +17,17 @@
time_limit = data["time_limit"]

def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
logger.info("run_single_example start...")
agent.reset()
obs = env.reset(task_config=example)
done = False
step_idx = 0

# inject and execute
logger.info("pyfile_inject_execute.inject_and_execute starts")
pyfile_inject_execute.inject_and_execute(envV = env, controllerV=env.controller)
logger.info("pyfile_inject_execute.inject_and_execute ends")

#env.controller.start_recording()
start_time = datetime.datetime.now()

Expand All @@ -39,10 +46,12 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
continue

logger.info("Agent: Thinking...")
pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "before agent.predict")
response, actions, logs, computer_update_args = agent.predict(
instruction,
obs
)
pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "after agent.predict")

# update the computer object, used by navi's action space
if computer_update_args:
Expand All @@ -55,7 +64,9 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
elapsed_timestamp = f"{datetime.datetime.now() - start_time}"
logger.info("Step %d: %s", step_idx + 1, action)

pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "before env.step")
obs, reward, done, info = env.step(action, args.sleep_after_execution)
pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "after env.step")

logger.info("Reward: %.2f", reward)
logger.info("Done: %s", done)
Expand Down Expand Up @@ -89,4 +100,6 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl

# Record final results
recorder.record_end(result, start_time)
# env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
# env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))

logger.info("run_single_example end...")
15 changes: 12 additions & 3 deletions src/win-arena-container/client/mm_agents/navi/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
from typing import Dict, List
# from mm_agents.planner.computer import Computer, WindowManager
from mm_agents.navi.gpt.openAI_planner import OpenAI_Planner
from mm_agents.navi.gpt.gpt4v_planner import GPT4V_Planner
from mm_agents.navi.gpt import planner_messages
import copy
Expand Down Expand Up @@ -113,9 +114,12 @@ def __init__(

if model == 'phi3-v':
from mm_agents.navi.gpt.phi3_planner import Phi3_Planner
self.gpt4v_planner = Phi3_Planner(server='azure',model='phi3-v',temperature=temperature)
# self.gpt4v_planner = Phi3_Planner(server='azure',model='phi3-v',temperature=temperature)
self.gpt4v_planner = OpenAI_Planner(temperature=temperature)
else:
self.gpt4v_planner = GPT4V_Planner(server=self.server, model=self.model, temperature=temperature)
# self.gpt4v_planner = GPT4V_Planner(server=self.server, model=self.model, temperature=temperature)
self.gpt4v_planner = OpenAI_Planner(temperature=temperature)

if use_last_screen:
self.gpt4v_planner.system_prompt = planner_messages.planning_system_message_shortened_previmg

Expand Down Expand Up @@ -398,11 +402,16 @@ def predict(self, instruction: str, obs: Dict) -> List:
image_prompts = [last_image, image_prompt_resized]

# send to gpt
logger.info("Thinking...")
logger.info("Thinking... model:"+self.gpt4v_planner.model)
logger.info("OpenAI info 1: "+self.gpt4v_planner.gpt4v.model)
logger.info("OpenAI info 2: "+str(self.gpt4v_planner.gpt4v.client.base_url))
logger.info("OpenAI info 3: "+str(self.gpt4v_planner.gpt4v.client.api_key))
plan_result = self.gpt4v_planner.plan(image_prompts, user_question)

logs['plan_result'] = plan_result

logger.info("plan_result: "+str(plan_result))

# extract the textual memory block
memory_block = re.search(r'```memory\n(.*?)```', plan_result, re.DOTALL)
if memory_block:
Expand Down
25 changes: 16 additions & 9 deletions src/win-arena-container/client/mm_agents/navi/gpt/gpt4v_oai.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@

class GPT4VisionOAI:

def __init__(self, model="gpt-4o"):
# def __init__(self, model="gpt-4o"):
def __init__(self, model="ByteDance-Seed/UI-TARS-1.5-7B"):
self.model = model
#oad key from environment variable
self.api_key = os.getenv("OPENAI_API_KEY")
if self.api_key is None:
print("API key not found in environment variable.")
self.client = openai.OpenAI(api_key=self.api_key)
print("API key not found in environment variable. Setting to 'empty'.")
self.api_key = "empty"
self.client = openai.OpenAI(api_key=self.api_key,base_url="http://ec2-35-88-109-159.us-west-2.compute.amazonaws.com:18001/v1", max_retries=0)

def encode_image(self, image: Union[str, Image.Image]) -> str:
if isinstance(image, str):
Expand Down Expand Up @@ -55,7 +57,7 @@ def get_base64_payload(self, base64_image: str, detail="auto") -> dict:
}
}

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(30))
# @retry(wait=wait_random_exponential(min=1, max=1), stop=stop_after_attempt(30))
def process_images(self, system_prompt: str, question: str, images: Union[str, Image.Image, List[Union[str, Image.Image]]], detail="auto", max_tokens=300, temperature=1.0, only_text=True, format="JPEG") -> str:

if system_prompt==None:
Expand All @@ -73,14 +75,19 @@ def process_images(self, system_prompt: str, question: str, images: Union[str, I
base64_image = self.encode_image(image)
content.append(self.get_base64_payload(base64_image, detail=detail))

print("gpt4voai model: "+self.model)
print("gpt4voai client info 1: "+str(self.client.api_key))
print("gpt4voai client info 2: "+str(self.client.base_url))
print("gpt4voai client info 3: "+str(self.client.max_retries))

response = self.client.chat.completions.create(
# model="gpt-4-vision-preview",
model=self.model,
messages=[
{
"role": "system",
"content": system_prompt
},
# {
# "role": "system",
# "content": system_prompt
# },
{
"role": "user",
"content": content
Expand All @@ -102,7 +109,7 @@ def main():
system_prompt = "You are a helpful assistant."

# SINGLE RESOURCE
gpt4v_wrapper = GPT4VisionOAI(model="gpt-4-1106-vision-preview")
gpt4v_wrapper = GPT4VisionOAI(model="ByteDance-Seed/UI-TARS-1.5-7B")

# process a single image
start_time = time.time()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
import inspect
import re
from mm_agents.navi.gpt import gpt4v_azure, gpt4v_oai, system_messages, planner_messages
import tiktoken
import time
import json
import re

class OpenAI_Planner():
    """Planner that routes all vision-language calls through an
    OpenAI-compatible endpoint (gpt4v_oai.GPT4VisionOAI).

    NOTE(review): the ``server`` argument is accepted but ignored — the
    azure/oai dispatch below is commented out and the OpenAI-compatible
    client is always used. Confirm this is intentional.
    """

    def __init__(self, server="azure", model="ByteDance-Seed/UI-TARS-1.5-7B", temperature=1.0):
        # ``server`` is stored for interface compatibility only; it does not
        # affect which backend client is constructed (see class note).
        self.server = server
        self.model = model
        self.temperature = temperature

        # if self.server=="azure":
        #     self.gpt4v = gpt4v_azure.GPT4VisionAzure()
        # elif self.server=="oai":
        #     self.gpt4v = gpt4v_oai.GPT4VisionOAI(self.model)
        # else:
        #     raise ValueError(f"Server {server} not supported")
        # Always use the OpenAI-compatible client, regardless of ``server``.
        self.gpt4v = gpt4v_oai.GPT4VisionOAI(model=self.model)

        # set the initial system message
        self.system_prompt = planner_messages.planning_system_message

    def plan(self, images, user_query):
        """Send the system prompt, the user's query and the images to the
        model and return its raw text response."""
        response = self.gpt4v.process_images(self.system_prompt, user_query, images, max_tokens=4096, temperature=self.temperature, only_text=True)
        return response

    def describe_elements(self, screenshot, crops, descriptions=None) -> dict:
        """Describe each cropped UI element, then reformat the answer as JSON.

        Two model round-trips: first a free-text description of all crops
        (with the marked-up screenshot as the last image), then a second call
        that structures that text into ``{"<index>": "<description>"}``.

        Args:
            screenshot: full screenshot with the elements marked in red boxes.
            crops: list of per-element image crops.
            descriptions: unused in this implementation — TODO confirm
                whether any caller still passes it.

        Returns:
            dict mapping element index (str) to description, or ``{}`` when
            the model's second answer cannot be parsed as JSON.
        """
        n = len(crops)
        system_prompt = f"you will be presented with crops of interactive element in the screen and a screenshot marked with red bounding-boxes. Your task is to describe each element and infer its function. A single crop may contain multiple elements, if so, describe them all in a single paragraph. You must provide one description for each of the {n} elements provided."

        user_query = f"Given the element and screenshot. what could be the purpose of these {n} elements? The last image is the screenshot with the elements marked with red bounding boxes."

        print(system_prompt)
        print(user_query)

        # First round-trip: free-text descriptions (deterministic, temp 0).
        r = self.gpt4v.process_images(system_prompt, user_query, crops+[screenshot], max_tokens=4096, temperature=0.0, only_text=True)

        # display(Image.open(screenshot_tagged))
        print(r)


        # Second round-trip: ask the model to restructure its own answer into JSON.
        structuring_prompt = "Given descriptions of the elements/images, format into a json with the element index as key and values as summarized descriptions. if a single description references to multiple elements, break it down into the appropriate items. Make sure to remove from the descriptons any references to the element index. e.g input \n'Here's a description of the elements\n 1. The first icon looks liek a bell indicating... \n2. The second and third elements represent a magnifying glass...\n3. it appears to be a ball' output: ```json\n{'1':'A bell indicating...', '2':'A magnifying glass...','3':'A magnifying glass...', '4':'ball' ...}```."
        user_query= f"Structure the following text into a json with descriprions of the {n} elements/images. \n'" + r + "\n'"
        formatted = self.gpt4v.process_images(structuring_prompt, user_query,[], max_tokens=4096, temperature=0.0, only_text=True)
        print(formatted)
        try:
            # extract code block
            # (re.search may return None; the resulting AttributeError is
            # caught below along with JSON decode errors)
            formatted = re.search(r"```json\n(.*)```", formatted, re.DOTALL).group(1)
            result = json.loads(formatted)
        except Exception as e:
            print(f"{formatted}\n\n\nFailed to extract json from response: {e}")
            result = {}
        print(result)
        return result
86 changes: 86 additions & 0 deletions src/win-arena-container/client/pyfile_inject_execute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@

import os
import os.path
import logging
import requests
import json
from requests_toolbelt.multipart.encoder import MultipartEncoder

from desktop_env.controllers.python import PythonController
from desktop_env.envs.desktop_env import DesktopEnv


logger = logging.getLogger("inject_and_execute")

def inject_and_execute(envV: DesktopEnv, controllerV: PythonController):
    """Upload hello_world.py to the VM desktop and launch it detached.

    Sends the file to the VM-side server's /setup/upload endpoint, then asks
    the controller to start it with ``pythonw`` in a new console so no
    terminal window overlaps the desktop being evaluated.

    Args:
        envV: the desktop environment (unused here; kept for call symmetry
            with :func:`notify`).
        controllerV: controller whose ``vm_ip`` locates the VM-side server.
    """
    py_file = "hello_world.py"
    local_path: str = "./" + py_file
    path: str = "C:/Users/Docker/Desktop/" + py_file
    # pythonw + CREATE_NEW_CONSOLE: run detached, with no terminal window
    # overlapping the screen under test.
    action = "import subprocess; subprocess.Popen('pythonw " + path + "',creationflags=subprocess.CREATE_NEW_CONSOLE);"

    if not os.path.exists(local_path):
        print(f"Setup Upload - Invalid local path ({local_path}).")
        return

    http_server = f"http://{controllerV.vm_ip}:5000"

    # `with` closes the uploaded file handle deterministically
    # (the original left it open for the lifetime of the process).
    with open(local_path, "rb") as file_handle:
        form = MultipartEncoder({
            "file_path": path,
            "file_data": (os.path.basename(path), file_handle)
        })
        headers = {"Content-Type": form.content_type}
        logger.info("upload py file 4 injection: " + str(form))

        try:
            # f-string, not print("...%s", x): the latter printed the literal
            # "%s" followed by the URL as a second argument.
            print(f"REQUEST ADDRESS: {http_server}/setup/upload")
            print("REQUEST FORM: " + str(form))
            response = requests.post(http_server + "/setup" + "/upload", headers=headers, data=form)
            if response.status_code == 200:
                print("Command executed successfully: " + response.text)
            else:
                # report the actual status code (the original printed the body
                # under a "Status code" label)
                print(f"Failed to upload file. Status code: {response.status_code} ({response.text})")
        except requests.exceptions.RequestException as e:
            # str(e): concatenating the exception object itself raised
            # TypeError, masking the original network error.
            print("An error occurred while trying to send the request: " + str(e))

    # Execute the uploaded file through the controller's python-command channel.
    print("controllerV.execute_python_windows_command: " + str(action))
    controllerV.execute_python_command(action)
    # controllerV.execute_command_new_terminal(action)


def notify(envV: DesktopEnv, controllerV: PythonController, data: str):
    """Write a short status string to notify.txt on the VM desktop.

    Posts ``data`` to the VM-side server's /notify endpoint, which writes it
    to ``C:/Users/Docker/Desktop/notify.txt``. Best-effort: failures are
    printed, never raised.

    Args:
        envV: the desktop environment (unused here; kept for call symmetry
            with :func:`inject_and_execute`).
        controllerV: controller whose ``vm_ip`` locates the VM-side server.
        data: text to write into the notify file.
    """
    path: str = "C:/Users/Docker/Desktop/notify.txt"
    http_server = f"http://{controllerV.vm_ip}:5000"

    payload = json.dumps({"file_path": path, "file_data": data})
    headers = {'Content-Type': 'application/json'}

    try:
        # f-string, not print("...%s", x): the latter printed the literal
        # "%s" followed by the URL as a second argument.
        print(f"REQUEST ADDRESS: {http_server}/notify")
        print("REQUEST FORM: " + str(payload))
        response = requests.post(http_server + "/notify", headers=headers, data=payload, timeout=90)
        if response.status_code == 200:
            print("Command executed successfully: " + response.text)
        else:
            # report the actual status code (the original printed the body
            # under a "Status code" label)
            print(f"Failed to notify. Status code: {response.status_code} ({response.text})")
    except requests.exceptions.RequestException as e:
        # str(e): concatenating the exception object itself raised TypeError,
        # masking the original network error.
        print("An error occurred while trying to send the request: " + str(e))

Loading