Skip to content

Commit a9af593

Browse files
committed
fix: temp
1 parent 044912f commit a9af593

File tree

103 files changed

+1954
-106
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+1954
-106
lines changed

benchmark.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
from selenium.webdriver.chrome.service import Service
1010
from webdriver_manager.chrome import ChromeDriverManager
1111
from selenium.common.exceptions import WebDriverException
12+
import logging
13+
14+
# Set up logging
15+
logging.basicConfig(filename='webwand_test_log.txt', level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')
1216

1317
load_dotenv()
1418
api_key = os.getenv('OPENAI_API_KEY')
@@ -21,6 +25,7 @@ def setup_driver():
2125
chrome_options = Options()
2226
# Load the unpacked webwand chrome extension
2327
chrome_options.add_argument("--load-extension=./dist")
28+
chrome_options.add_argument("--window-size=1600,900")
2429
service = Service(ChromeDriverManager().install())
2530
driver = webdriver.Chrome(service=service, options=chrome_options)
2631
# Set script timeout to 240 seconds
@@ -35,7 +40,7 @@ def dispatch_event(driver, event_name, event):
3540
driver.execute_script(script)
3641

3742
def add_task_listener(driver, task_id, max_retries=3):
38-
print('add_task_listener', task_id)
43+
logging.info(f'Adding task listener for task {task_id}')
3944
"""
4045
Add event listeners for task history and screenshot events. Both events include task status.
4146
Then process those events as they are captured.
@@ -47,17 +52,17 @@ def add_task_listener(driver, task_id, max_retries=3):
4752
if (e.detail.type == 'history') {{
4853
console.log("event listener received history event");
4954
if (e.detail.status === 'success' || e.detail.status === 'error') {{
50-
callback({{status: e.detail.status, type: 'history', data: e.detail.data}});
5155
document.removeEventListener('TaskUpdate', eventListener);
5256
console.log("event listener removed");
57+
callback({{status: e.detail.status, type: 'history', data: e.detail.data}});
5358
}}
5459
// Does not do anything when the status is 'running' or 'idle'.
5560
// The status 'interrupted' will never be triggered automatically.
5661
}} else if (e.detail.type == 'screenshot') {{
5762
console.log("event listener received screenshot event");
58-
callback({{status: e.detail.status, type: 'screenshot', data: e.detail.data}});
5963
document.removeEventListener('TaskUpdate', eventListener);
6064
console.log("event listener removed");
65+
callback({{status: e.detail.status, type: 'screenshot', data: e.detail.data}});
6166
}} else {{
6267
throw new Error("Invalid event type received: " + e.detail.type);
6368
}}
@@ -67,7 +72,6 @@ def add_task_listener(driver, task_id, max_retries=3):
6772
console.log("added event listener");
6873
"""
6974

70-
completed = {'status': None}
7175
attempts = 0
7276

7377
def handle_event(event_data):
@@ -77,41 +81,37 @@ def handle_event(event_data):
7781
return
7882
if event_data['type'] == 'history':
7983
# Record history when task stops
80-
completed['status'] = event_data['status']
8184
write_history(task_id, event_data['data'])
85+
attempts = 0
8286
return
8387
if event_data['type'] == 'screenshot':
8488
write_screenshots(task_id, event_data['data'])
89+
attempts = 0
8590
# Task is still running. Continue to listen for events
8691
handle_event(driver.execute_async_script(script))
8792
else:
8893
raise ValueError(f"Unhandled event data type: {event_data['type']}")
89-
attempts = 0
90-
print("reset attempts to zero")
9194

9295
while attempts < max_retries:
9396
try:
9497
handle_event(driver.execute_async_script(script))
9598
break
9699
except WebDriverException as e:
97100
if "javascript error: document unloaded while waiting for result" in str(e):
98-
print(f"Document unloaded error: {e}")
99101
attempts += 1
102+
logging.warning(f'Document unloaded error during task {task_id} attempt {attempts}: {str(e)}')
100103
print(f"Attempt {attempts}: Document unloaded error. Retrying...")
104+
logging.info("Retrying...")
101105
if attempts == max_retries:
102-
print("Maximum retry attempts reached. Cannot recover from document unloaded error.")
106+
logging.error(f'Maximum retry attempts reached for task {task_id}.')
103107
else:
104-
print(f"Other WebDriver error: {e}")
108+
logging.error(f'WebDriver exception for task {task_id}: {str(e)}')
105109
break
106110
except Exception as e:
107-
print(f"Error while listening for updates: {e}")
111+
logging.error(f'Unhandled error for task {task_id}: {str(e)}')
108112
break
109-
110-
print("completed['status']", completed['status'])
111-
return completed['status']
112113

113114
def write_history(task_id, task_history):
114-
print('write_history', task_id)
115115
task_dir = os.path.join(results_dir, f"test{task_id}")
116116
os.makedirs(task_dir, exist_ok=True)
117117
file_path = os.path.join(task_dir, 'interact_messages.json')
@@ -120,22 +120,24 @@ def write_history(task_id, task_history):
120120
json.dump(task_history, file, indent=4)
121121

122122
def write_screenshots(task_id, image_data):
    """Decode a base64 screenshot and save it in the task's result directory.

    The file is written to ``<results_dir>/test<task_id>/`` as
    ``screenshot_<timestamp>.png``; the directory is created if needed.

    Args:
        task_id: Identifier of the task; used to build the directory name.
        image_data: Base64-encoded image content (presumably PNG bytes, as
            the file is saved with a ``.png`` suffix -- not verified here).
    """
    image_bytes = base64.b64decode(image_data)
    task_dir = os.path.join(results_dir, f"test{task_id}")
    os.makedirs(task_dir, exist_ok=True)
    # Use a nanosecond timestamp: with one-second resolution, two screenshots
    # captured within the same second would get the same name and the later
    # one would silently overwrite the earlier one. The value is still a
    # plain integer, so files keep sorting chronologically by that number.
    timestamp = time.time_ns()
    file_path = os.path.join(task_dir, f'screenshot_{timestamp}.png')
    with open(file_path, 'wb') as file:
        file.write(image_bytes)
    logging.info('Screenshot saved for task %s', task_id)
131131

132132
def run_webwand_task(driver, task_id, task_description):
    """Start a task in the extension and wait for its listener to finish.

    Dispatches the ``SetAPIKey``, ``SetTask`` and ``RunTask`` events in that
    order, then calls ``add_task_listener`` and logs how long the whole run
    took once that call returns.

    Args:
        driver: WebDriver instance the events are dispatched through.
        task_id: Identifier of the task; used for logging and passed on to
            ``add_task_listener``.
        task_description: Text of the task, sent with the ``SetTask`` event.
    """
    logging.info(f'Start running task {task_id} {task_description}')
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments that happen while a long task is running.
    start = time.perf_counter()
    dispatch_event(driver, 'SetAPIKey', {"value": api_key})
    dispatch_event(driver, 'SetTask', {"value": task_description})
    dispatch_event(driver, 'RunTask', {})
    add_task_listener(driver, task_id)
    elapsed = time.perf_counter() - start
    logging.info(f'The task {task_id} took {elapsed} seconds to complete.')
139141

140142
def click_extensions_icon(driver):
141143
# Simulate click to open side panel
@@ -162,10 +164,7 @@ def main():
162164
click_extensions_icon(driver)
163165
initial_load = False
164166

165-
task_status = run_webwand_task(driver, task_id, task['ques'])
166-
while task_status not in ['success', 'error']:
167-
print("wait task_status", task_status)
168-
time.sleep(3) # Wait for 3 seconds till the current task completes
167+
run_webwand_task(driver, task_id, task['ques'])
169168
driver.quit()
170169

171170
if __name__ == "__main__":

evaluator.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# Code from webVoyager https://github.com/MinorJerry/WebVoyager/blob/main/evaluation/auto_eval.py
2+
import argparse
3+
import os
4+
import json
5+
import time
6+
import re
7+
import base64
8+
9+
from openai import OpenAI
10+
11+
SYSTEM_PROMPT = """As an evaluator, you will be presented with three primary components to assist you in your role:
12+
13+
1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).
14+
15+
2. Result Screenshots: This is a visual representation of the screen showing the result or intermediate state of performing a web task. It serves as visual proof of the actions taken in response to the instruction.
16+
17+
3. Result Response: This is a textual response obtained after the execution of the web task. It serves as textual result in response to the instruction.
18+
19+
-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
20+
-- You SHOULD NOT make assumptions based on information not presented in the screenshot when comparing it to the instructions.
21+
-- Your primary responsibility is to conduct a thorough assessment of the web task instruction against the outcome depicted in the screenshot and in the response, evaluating whether the actions taken align with the given instructions.
22+
-- NOTE that the instruction may involve more than one task, for example, locating the garage and summarizing the review. Failing to complete either task, such as not providing a summary, should be considered unsuccessful.
23+
-- NOTE that the screenshot is authentic, but the response provided by LLM is generated at the end of web browsing, and there may be discrepancies between the text and the screenshots.
24+
-- Note the difference: 1) Result response may contradict the screenshot, then the content of the screenshot prevails, 2) The content in the Result response is not mentioned on the screenshot, choose to believe the content.
25+
26+
You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the task has been successfully accomplished, either as 'SUCCESS' or 'NOT SUCCESS'."""
27+
USER_PROMPT = """TASK: <task>
28+
Result Response: <answer>
29+
<num> screenshots at the end: """
30+
31+
32+
def encode_image(image_path):
    """Return the content of the file at *image_path* as base64 text.

    The file is read in binary mode; the base64 output is decoded to a
    ``str`` using UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
35+
36+
37+
def _select_final_screenshots(filenames, img_num):
    """Return the names of the last *img_num* numbered screenshots, oldest first."""
    # Accept both "screenshot<N>.png" and "screenshot_<N>.png"; the latter is
    # the naming used for timestamped screenshots.
    pattern_png = re.compile(r'screenshot_?(\d+)\.png')
    numbered = []
    for filename in filenames:
        found = pattern_png.search(filename)
        if found:
            numbered.append((int(found.group(1)), filename))
    if img_num <= 0:
        # A slice of [-0:] would return *every* screenshot, not none.
        return []
    numbered.sort()
    return [filename for _, filename in numbered[-img_num:]]


def _parse_verdict(verdict_text):
    """Map the model's reply to 1 (success), 0 (not success) or None (no verdict)."""
    if not verdict_text or 'SUCCESS' not in verdict_text:
        return None
    return 0 if 'NOT SUCCESS' in verdict_text else 1


def auto_eval_by_gpt4v(process_dir, openai_client, api_model, img_num):
    """Ask a vision model to judge whether the task recorded in *process_dir* succeeded.

    Reads ``interact_messages.json`` from *process_dir*, extracts the task
    text and the final ``ANSWER``, attaches the last screenshots found in the
    directory and sends everything to the chat-completions API.

    Args:
        process_dir: Directory holding ``interact_messages.json`` and the
            screenshot PNG files of one task run.
        openai_client: Client object exposing ``chat.completions.create``.
        api_model: Model name passed to the API.
        img_num: Maximum number of most recent screenshots to attach; values
            of zero or less attach none.

    Returns:
        ``1`` if the reply contains ``SUCCESS``, ``0`` if it contains
        ``NOT SUCCESS`` or if no answer could be found in the history, and
        ``None`` if the reply contains no verdict at all.

    Raises:
        ValueError: If the second message does not contain the expected
            ``Now given a task: ... Please interact with`` wording.
        Exception: Non-retryable API errors (bad request, authentication,
            permission, unknown model) are re-raised instead of being
            retried. All other API errors are retried indefinitely after a
            pause.
    """
    print(f'--------------------- {process_dir} ---------------------')
    res_files = sorted(os.listdir(process_dir))
    with open(os.path.join(process_dir, 'interact_messages.json'), encoding='utf-8') as fr:
        it_messages = json.load(fr)

    # Fewer than two messages means there is no task message to evaluate.
    if len(it_messages) < 2:
        print('Not find answer for ' + process_dir + ' only system messages')
        print()
        return 0

    task_info = it_messages[1]["content"]
    if isinstance(task_info, list):
        task_info = task_info[0]["text"]
    # DOTALL lets the task text span several lines.
    task_match = re.search(r"Now given a task:(.+?)Please interact with", task_info, re.DOTALL)
    if task_match is None:
        raise ValueError(
            'Cannot find "Now given a task: ... Please interact with" in the '
            'task message of ' + process_dir
        )
    task_content = task_match.group(1).strip()

    ans_info = it_messages[-1]["content"]
    answer_match = None
    if isinstance(ans_info, str) and 'Action: ANSWER' in ans_info:
        answer_match = re.search(r"ANSWER[; ]+\[?(.[^\]]*)\]?", ans_info)
    if answer_match is None:
        print('Not find answer for ' + process_dir)
        print()
        return 0
    answer_content = answer_match.group(1).strip()

    end_files = _select_final_screenshots(res_files, img_num)
    whole_content_img = []
    for png_name in end_files:
        b64_img = encode_image(os.path.join(process_dir, png_name))
        whole_content_img.append(
            {
                'type': 'image_url',
                'image_url': {"url": f"data:image/png;base64,{b64_img}"}
            }
        )

    # Fill all placeholders in a single pass so that placeholder-like text
    # inside the task or the answer is never substituted a second time.
    # <num> reports the number of screenshots actually attached.
    substitutions = {
        '<task>': task_content,
        '<answer>': answer_content,
        '<num>': str(len(end_files)),
    }
    user_prompt_tmp = re.sub(
        r'<task>|<answer>|<num>',
        lambda found: substitutions[found.group(0)],
        USER_PROMPT,
    )
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': user_prompt_tmp}
            ]
            + whole_content_img
            + [{'type': 'text', 'text': "Your verdict:\n"}]
        }
    ]

    # Errors that will fail again on every retry. Both the legacy name
    # (InvalidRequestError) and the current client's names are listed.
    fatal_errors = {
        'InvalidRequestError',
        'BadRequestError',
        'AuthenticationError',
        'PermissionDeniedError',
        'NotFoundError',
    }
    while True:
        try:
            print('Calling gpt4v API to get the auto evaluation......')
            openai_response = openai_client.chat.completions.create(
                model=api_model, messages=messages, max_tokens=1000, seed=42, temperature=0
            )
        except Exception as e:
            print(e)
            error_name = type(e).__name__
            if error_name in fatal_errors:
                raise
            time.sleep(15 if error_name == 'APIError' else 10)
            continue
        print('Prompt Tokens:', openai_response.usage.prompt_tokens, ';',
              'Completion Tokens:', openai_response.usage.completion_tokens)
        print('Cost:', openai_response.usage.prompt_tokens/1000 * 0.01
              + openai_response.usage.completion_tokens / 1000 * 0.03)

        print('API call complete...')
        break
    gpt_4v_res = openai_response.choices[0].message.content

    # Print the user message with the image payloads replaced by a short
    # placeholder so the log stays readable.
    redacted_content = [
        {'type': 'image_url', 'image_url': {"url": "data:image/png;base64, b64_img"}}
        if part['type'] == 'image_url' else part
        for part in messages[1]['content']
    ]
    print({'role': 'user', 'content': redacted_content})
    print(gpt_4v_res)

    auto_eval_res = _parse_verdict(gpt_4v_res)
    print('Auto_eval_res:', auto_eval_res)
    print()
    return auto_eval_res
135+
136+
137+
def main():
    """Evaluate every existing task result directory and print the verdicts per site.

    For each known site name, the directories
    ``<process_dir>/task<site>--<idx>`` with ``idx`` from 0 to 45 are checked;
    each one that exists is passed to ``auto_eval_by_gpt4v``. The collected
    verdicts of a site are printed when at least one directory was found.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--process_dir', type=str, default='results')
    arg_parser.add_argument('--lesson_dir', type=str, default='results')
    arg_parser.add_argument("--api_key", default="key", type=str, help="YOUR_OPENAI_API_KEY")
    arg_parser.add_argument("--api_model", default="gpt-4-vision-preview", type=str, help="api model name")
    arg_parser.add_argument("--max_attached_imgs", type=int, default=1)
    options = arg_parser.parse_args()

    client = OpenAI(api_key=options.api_key)
    site_names = [
        'Allrecipes', 'Amazon', 'Apple', 'ArXiv', 'BBC News', 'Booking', 'Cambridge Dictionary',
        'Coursera', 'ESPN', 'GitHub', 'Google Flights', 'Google Map', 'Google Search', 'Huggingface',
        'Wolfram Alpha',
    ]

    for site in site_names:
        # Lazily build the candidate paths so each directory is checked and
        # evaluated in index order, one at a time.
        candidate_dirs = (
            os.path.join(options.process_dir, 'task' + site + '--' + str(task_no))
            for task_no in range(46)
        )
        verdicts = [
            auto_eval_by_gpt4v(task_dir, client, options.api_model, options.max_attached_imgs)
            for task_dir in candidate_dirs
            if os.path.exists(task_dir)
        ]
        if verdicts:
            print(verdicts)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)