normal-computing
diff --git a/benchmark.py b/benchmark.py
Lines changed: 22 additions & 23 deletions
diff --git a/evaluator.py b/evaluator.py
Lines changed: 162 additions & 0 deletions
@@ -9,6 +9,10 @@
 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium.common.exceptions import WebDriverException
+import logging
+
+# Setup logging
+logging.basicConfig(filename='webwand_test_log.txt', level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')
 
 load_dotenv()
 api_key = os.getenv('OPENAI_API_KEY')
@@ -21,6 +25,7 @@ def setup_driver():
     chrome_options = Options()
     # Load the unpacked webwand chrome extension
     chrome_options.add_argument("--load-extension=./dist")
+    chrome_options.add_argument("--window-size=1600,900")
     service = Service(ChromeDriverManager().install())
     driver = webdriver.Chrome(service=service, options=chrome_options)
     # Set script timeout to 240 seconds
@@ -35,7 +40,7 @@ def dispatch_event(driver, event_name, event):
     driver.execute_script(script)
 
 def add_task_listener(driver, task_id, max_retries=3):
-    print('add_task_listener', task_id)
+    logging.info(f'Adding task listener for task {task_id}')
     """
     Add event listeners for task history and screenshot events. Both events include task status.
     Then process those events as they are captured.
@@ -47,17 +52,17 @@ def add_task_listener(driver, task_id, max_retries=3):
         if (e.detail.type == 'history') {{
             console.log("event listener received history event");
             if (e.detail.status === 'success' || e.detail.status === 'error') {{
-                callback({{status: e.detail.status, type: 'history', data: e.detail.data}});
                 document.removeEventListener('TaskUpdate', eventListener);
                 console.log("event listener removed");
+                callback({{status: e.detail.status, type: 'history', data: e.detail.data}});
             }}
             // Does not do anything when the status is 'running' or 'idle'. 
             // The status 'interrupted' will never be triggered automatically.
         }} else if (e.detail.type == 'screenshot') {{
             console.log("event listener received screenshot event");
-            callback({{status: e.detail.status, type: 'screenshot', data: e.detail.data}});
             document.removeEventListener('TaskUpdate', eventListener);
             console.log("event listener removed");
+            callback({{status: e.detail.status, type: 'screenshot', data: e.detail.data}});
         }} else {{
             throw new Error("Invalid event type received: " + e.detail.type);
         }}
@@ -67,7 +72,6 @@ def add_task_listener(driver, task_id, max_retries=3):
     console.log("added event listener");
     """
 
-    completed = {'status': None}
     attempts = 0
 
     def handle_event(event_data):
@@ -77,41 +81,37 @@ def handle_event(event_data):
             return
         if event_data['type'] == 'history':
             # Record history when task stops
-            completed['status'] = event_data['status']
             write_history(task_id, event_data['data'])
+            attempts = 0
             return
         if event_data['type'] == 'screenshot':
             write_screenshots(task_id, event_data['data'])
+            attempts = 0
             # Task is still running. Continue to listen for events
             handle_event(driver.execute_async_script(script))
         else:
             raise ValueError(f"Unhandled event data type: {event_data['type']}")
-        attempts = 0
-        print("reset attempts to zero")
 
     while attempts < max_retries:
         try:
             handle_event(driver.execute_async_script(script))
             break
         except WebDriverException as e:
             if "javascript error: document unloaded while waiting for result" in str(e):
-                print(f"Document unloaded error: {e}")
                 attempts += 1
+                logging.warning(f'Document unloaded error during task {task_id} attempt {attempts}: {str(e)}')
                 print(f"Attempt {attempts}: Document unloaded error. Retrying...")
+                logging.info("Retrying...")
                 if attempts == max_retries:
-                    print("Maximum retry attempts reached. Cannot recover from document unloaded error.")
+                    logging.error(f'Maximum retry attempts reached for task {task_id}.')
             else:
-                print(f"Other WebDriver error: {e}")
+                logging.error(f'WebDriver exception for task {task_id}: {str(e)}')
                 break
         except Exception as e:
-            print(f"Error while listening for updates: {e}")
+            logging.error(f'Unhandled error for task {task_id}: {str(e)}')
             break
-        
-    print("completed['status']", completed['status'])
-    return completed['status']
 
 def write_history(task_id, task_history):
-    print('write_history', task_id)
     task_dir = os.path.join(results_dir, f"test{task_id}")
     os.makedirs(task_dir, exist_ok=True)
     file_path = os.path.join(task_dir, 'interact_messages.json')
@@ -120,22 +120,24 @@ def write_history(task_id, task_history):
         json.dump(task_history, file, indent=4)
 
 def write_screenshots(task_id, image_data):
-    print('write_screenshots', task_id)
+    """Decode a base64 screenshot and save it as a PNG for the given task.
+
+    The file is written to ``<results_dir>/test<task_id>/screenshot_<ts>.png``;
+    the directory is created if it does not exist yet.
+
+    Args:
+        task_id: Identifier used to name the task's result directory.
+        image_data: Base64-encoded image payload.
+    """
     image_bytes = base64.b64decode(image_data)
     task_dir = os.path.join(results_dir, f"test{task_id}")
     os.makedirs(task_dir, exist_ok=True)
-    timestamp = int(time.time())
+    # Nanosecond resolution: with whole seconds, two screenshots captured
+    # within the same second would overwrite each other.
+    timestamp = time.time_ns()
     file_path = os.path.join(task_dir, f'screenshot_{timestamp}.png')
     with open(file_path, 'wb') as file:
         file.write(image_bytes)
+    logging.info(f'Screenshot saved for task {task_id}')
 
 def run_webwand_task(driver, task_id, task_description):
-    print('run_webwand_task', task_id, task_description)
+    """Dispatch one task to the extension, wait for it to stop and log its duration.
+
+    Sends the ``SetAPIKey``, ``SetTask`` and ``RunTask`` events through
+    ``dispatch_event`` and then calls ``add_task_listener`` for the task.
+
+    Args:
+        driver: WebDriver instance the events are dispatched through.
+        task_id: Identifier of the task, used for logging and by the listener.
+        task_description: Text sent as the value of the ``SetTask`` event.
+    """
+    logging.info(f'Start running task {task_id} {task_description}')
+    # A monotonic clock cannot jump backwards/forwards on system clock
+    # adjustments, so it is the right source for measuring an interval.
+    start = time.monotonic()
     dispatch_event(driver, 'SetAPIKey', {"value": api_key})
     dispatch_event(driver, 'SetTask', {"value": task_description})
     dispatch_event(driver, 'RunTask', {})
-    task_status = add_task_listener(driver, task_id)
-    return task_status
+    add_task_listener(driver, task_id)
+    end = time.monotonic()
+    logging.info(f'The task {task_id} took {end - start} seconds to complete.')
 
 def click_extensions_icon(driver):
     # Simulate click to open side panel
@@ -162,10 +164,7 @@ def main():
                 click_extensions_icon(driver)
                 initial_load = False
 
-            task_status = run_webwand_task(driver, task_id, task['ques'])
-            while task_status not in ['success', 'error']:
-                print("wait task_status", task_status)
-                time.sleep(3)  # Wait for 3 seconds till the current task completes
+            run_webwand_task(driver, task_id, task['ques'])
     driver.quit()
 
 if __name__ == "__main__":
 
@@ -0,0 +1,162 @@
+# Code from webVoyager https://github.com/MinorJerry/WebVoyager/blob/main/evaluation/auto_eval.py
+import argparse
+import os
+import json
+import time
+import re
+import base64
+
+from openai import OpenAI
+
+SYSTEM_PROMPT = """As an evaluator, you will be presented with three primary components to assist you in your role:
+
+1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).
+
+2. Result Screenshots: This is a visual representation of the screen showing the result or intermediate state of performing a web task. It serves as visual proof of the actions taken in response to the instruction.
+
+3. Result Response: This is a textual response obtained after the execution of the web task. It serves as textual result in response to the instruction.
+
+-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
+-- You SHOULD NOT make assumptions based on information not presented in the screenshot when comparing it to the instructions.
+-- Your primary responsibility is to conduct a thorough assessment of the web task instruction against the outcome depicted in the screenshot and in the response, evaluating whether the actions taken align with the given instructions.
+-- NOTE that the instruction may involve more than one task, for example, locating the garage and summarizing the review. Failing to complete either task, such as not providing a summary, should be considered unsuccessful.
+-- NOTE that the screenshot is authentic, but the response provided by LLM is generated at the end of web browsing, and there may be discrepancies between the text and the screenshots.
+-- Note the difference: 1) Result response may contradict the screenshot, then the content of the screenshot prevails, 2) The content in the Result response is not mentioned on the screenshot, choose to believe the content.
+
+You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the task has been successfully accomplished, either as 'SUCCESS' or 'NOT SUCCESS'."""
+USER_PROMPT = """TASK: <task>
+Result Response: <answer>
+<num> screenshots at the end: """
+
+
+def encode_image(image_path):
+    """Return the contents of the file at ``image_path`` as base64 text."""
+    with open(image_path, "rb") as handle:
+        raw_bytes = handle.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode('utf-8')
+
+
+def auto_eval_by_gpt4v(process_dir, openai_client, api_model, img_num):
+    """Ask a vision-capable chat model to grade the result of one web task.
+
+    Reads ``interact_messages.json`` from ``process_dir``, extracts the task
+    text from the second message and the final ``ANSWER`` from the last
+    message, attaches the last ``img_num`` screenshots found in the directory
+    and requests a verdict from ``api_model``.
+
+    Args:
+        process_dir: Directory holding ``interact_messages.json`` and the
+            screenshot PNG files of a single task.
+        openai_client: Client object exposing ``chat.completions.create``.
+        api_model: Name of the model to query.
+        img_num: Maximum number of trailing screenshots to attach.
+
+    Returns:
+        ``0`` when no answer is found or the reply contains 'NOT SUCCESS',
+        ``1`` when the reply contains 'SUCCESS', ``None`` when the reply
+        contains no verdict at all.
+
+    Raises:
+        ValueError: If the task description cannot be located in the history.
+        SystemExit: If the API rejects the request with a non-retryable error.
+    """
+    print(f'--------------------- {process_dir} ---------------------')
+    res_files = sorted(os.listdir(process_dir))
+    with open(os.path.join(process_dir, 'interact_messages.json')) as fr:
+        it_messages = json.load(fr)
+
+    # `<= 1` also covers an empty history, which would otherwise fail with an
+    # IndexError on it_messages[1] below.
+    if len(it_messages) <= 1:
+        print('Not find answer for ' + process_dir + ' only system messages')
+        print()
+        return 0
+
+    task_info = it_messages[1]["content"]
+    if isinstance(task_info, list):
+        task_info = task_info[0]["text"]
+    task_match = re.search(r"Now given a task:(.+?)Please interact with", task_info)
+    if task_match is None:
+        # Explicit error instead of `assert` (stripped under -O) followed by
+        # an AttributeError on a None match.
+        raise ValueError('Cannot find the task description in ' + process_dir)
+    task_content = task_match.group(1).strip()
+
+    ans_info = it_messages[-1]["content"]
+    matches_ans = None
+    if 'Action: ANSWER' in ans_info:
+        matches_ans = re.search(r"ANSWER[; ]+\[?(.[^\]]*)\]?", ans_info)
+    if matches_ans is None:
+        # Either there is no ANSWER action or nothing follows it.
+        print('Not find answer for ' + process_dir)
+        print()
+        return 0
+    answer_content = matches_ans.group(1).strip()
+
+    # The optional underscore accepts both `screenshot<N>.png` and the
+    # `screenshot_<timestamp>.png` names written by benchmark.py.
+    pattern_png = re.compile(r'screenshot_?(\d+)\.png')
+    numbered = []
+    for filename in res_files:
+        found = pattern_png.search(filename)
+        if found:
+            numbered.append((int(found.group(1)), filename))
+    numbered.sort()
+    # Guard img_num <= 0: a plain `[-0:]` slice would select every screenshot.
+    end_files = [name for _, name in numbered[-img_num:]] if img_num > 0 else []
+    whole_content_img = [
+        {
+            'type': 'image_url',
+            'image_url': {"url": f"data:image/png;base64,{encode_image(os.path.join(process_dir, name))}"}
+        }
+        for name in end_files
+    ]
+
+    user_prompt_tmp = USER_PROMPT.replace('<task>', task_content)
+    user_prompt_tmp = user_prompt_tmp.replace('<answer>', answer_content)
+    # Report the number of screenshots actually attached, which can be lower
+    # than img_num when the directory holds fewer files.
+    user_prompt_tmp = user_prompt_tmp.replace('<num>', str(len(whole_content_img)))
+    messages = [
+        {'role': 'system', 'content': SYSTEM_PROMPT},
+        {
+            'role': 'user',
+            'content': [
+                {'type': 'text', 'text': user_prompt_tmp}
+            ]
+            + whole_content_img
+            + [{'type': 'text', 'text': "Your verdict:\n"}]
+        }
+    ]
+
+    # Errors that retrying cannot fix. 'InvalidRequestError' is the pre-1.0
+    # SDK name; the remaining names are the ones used by the 1.x SDK.
+    fatal_errors = {
+        'InvalidRequestError', 'BadRequestError', 'AuthenticationError',
+        'PermissionDeniedError', 'NotFoundError',
+    }
+    # Transient failures are retried indefinitely, as before.
+    while True:
+        try:
+            print('Calling gpt4v API to get the auto evaluation......')
+            openai_response = openai_client.chat.completions.create(
+                model=api_model, messages=messages, max_tokens=1000, seed=42, temperature=0
+            )
+        except Exception as e:
+            print(e)
+            error_name = type(e).__name__
+            if error_name in fatal_errors:
+                # Still terminates the run, but with a non-zero exit status.
+                raise SystemExit(f'Unrecoverable OpenAI API error: {e}') from e
+            time.sleep(15 if error_name == 'APIError' else 10)
+        else:
+            print('Prompt Tokens:', openai_response.usage.prompt_tokens, ';',
+                  'Completion Tokens:', openai_response.usage.completion_tokens)
+            print('Cost:', openai_response.usage.prompt_tokens/1000 * 0.01
+                  + openai_response.usage.completion_tokens / 1000 * 0.03)
+            print('API call complete...')
+            break
+
+    # A reply without text content is treated as "no verdict".
+    gpt_4v_res = openai_response.choices[0].message.content or ''
+
+    # Print the request with image payloads replaced by a placeholder so the
+    # log is not flooded with base64 data.
+    printable_content = [
+        {'type': 'image_url', 'image_url': {"url": "data:image/png;base64, b64_img"}}
+        if part['type'] == 'image_url' else part
+        for part in messages[1]['content']
+    ]
+    print({'role': 'user', 'content': printable_content})
+    print(gpt_4v_res)
+
+    if 'NOT SUCCESS' in gpt_4v_res:
+        auto_eval_res = 0
+    elif 'SUCCESS' in gpt_4v_res:
+        auto_eval_res = 1
+    else:
+        auto_eval_res = None
+    print('Auto_eval_res:', auto_eval_res)
+    print()
+    return auto_eval_res
+
+
+def main():
+    """Evaluate every task result directory found under ``--process_dir``.
+
+    For each known website and each task index in ``range(0, 46)`` the
+    matching result directory, if present, is graded with
+    ``auto_eval_by_gpt4v``; the list of verdicts is printed per website.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--process_dir', type=str, default='results')
+    parser.add_argument('--lesson_dir', type=str, default='results')
+    # Fall back to the OPENAI_API_KEY environment variable before using the
+    # "key" placeholder, so the flag can be omitted when the variable is set.
+    parser.add_argument("--api_key", default=os.getenv('OPENAI_API_KEY') or "key", type=str,
+                        help="YOUR_OPENAI_API_KEY")
+    parser.add_argument("--api_model", default="gpt-4-vision-preview", type=str, help="api model name")
+    parser.add_argument("--max_attached_imgs", type=int, default=1)
+    args = parser.parse_args()
+
+    client = OpenAI(api_key=args.api_key)
+    webs = ['Allrecipes', 'Amazon', 'Apple', 'ArXiv', 'BBC News', 'Booking', 'Cambridge Dictionary',
+            'Coursera', 'ESPN', 'GitHub', 'Google Flights', 'Google Map', 'Google Search', 'Huggingface', 'Wolfram Alpha']
+
+    # 'task' is the original directory prefix. 'test' is the prefix
+    # benchmark.py uses when writing results (`test{task_id}`).
+    # NOTE(review): assumes task_id has the form `<web>--<n>` - confirm.
+    dir_prefixes = ('task', 'test')
+
+    for web in webs:
+        web_task_res = []
+        for idx in range(0, 46):
+            for prefix in dir_prefixes:
+                file_dir = os.path.join(args.process_dir, f'{prefix}{web}--{idx}')
+                if os.path.isdir(file_dir):
+                    response = auto_eval_by_gpt4v(file_dir, client, args.api_model, args.max_attached_imgs)
+                    web_task_res.append(response)
+                    # Grade each task once even if both spellings exist.
+                    break
+        if web_task_res:
+            print(web_task_res)
+
+
+if __name__ == '__main__':
+    main()