
Commit

update
yunx-z committed Dec 29, 2024
1 parent 5adf026 commit 9b61e15
Showing 22 changed files with 194 additions and 1,011 deletions.
25 changes: 10 additions & 15 deletions MLAgentBench/LLM.py
@@ -59,20 +59,15 @@
     # print(e)
     # print("Could not load anthropic API key claude_api_key.txt.")

-try:
-    import openai
-    # setup OpenAI API key
-    openai_api_key = os.getenv('MY_OPENAI_API_KEY')
-    openai_api_base = os.getenv('MY_AZURE_OPENAI_ENDPOINT')
-    openai_client = openai.AzureOpenAI(
-        azure_endpoint=openai_api_base,
-        api_key=openai_api_key,
-        api_version="2024-10-01-preview",
-    )
-except Exception as e:
-    pass
-    # print(e)
-    # print("Could not load OpenAI API key openai_api_key.txt.")
+import openai
+# setup OpenAI API key
+openai_api_key = os.getenv('MY_OPENAI_API_KEY')
+openai_api_base = os.getenv('MY_AZURE_OPENAI_ENDPOINT')
+openai_client = openai.AzureOpenAI(
+    azure_endpoint=openai_api_base,
+    api_key=openai_api_key,
+    api_version="2024-10-01-preview",
+)

 try:
     import vertexai
@@ -300,7 +295,7 @@ def complete_text_openai(prompt, stop_sequences=[], model="gpt-4o-mini", max_tok
         raw_request = {
             "model": model,
             "temperature": 1,
-            "max_completion_tokens": 32000,
+            "max_completion_tokens": 64000 if model.lower() == "o1-mini" else 32000,
             **kwargs
         }
     else:
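For context on how the two hunks above fit together, here is a minimal, self-contained sketch of how the module-level Azure client and the per-model token budget might be combined to serve a completion request. This is an assumption, not the repository's actual `complete_text_openai`: the `chat.completions.create` call, the user-message format, and the return-value handling are guesses based on the standard `openai` Python SDK, while the environment-variable names, `api_version`, and the o1-mini token limit are taken from the diff.

```python
# Hypothetical usage sketch -- NOT the repository's implementation.
import os
import openai

openai_client = openai.AzureOpenAI(
    azure_endpoint=os.getenv("MY_AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("MY_OPENAI_API_KEY"),
    api_version="2024-10-01-preview",
)

def complete_text_azure_sketch(prompt, model="gpt-4o-mini", **kwargs):
    # Same token-budget rule as the diff: o1-mini gets a larger completion budget.
    raw_request = {
        "model": model,
        "temperature": 1,
        "max_completion_tokens": 64000 if model.lower() == "o1-mini" else 32000,
        **kwargs,
    }
    # Assumed call shape; the real function may add stop sequences, retries, or logging.
    response = openai_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        **raw_request,
    )
    return response.choices[0].message.content
```

Note that with the `try`/`except` wrapper removed, a missing `MY_AZURE_OPENAI_ENDPOINT` or `MY_OPENAI_API_KEY` would now surface as an error when `LLM.py` is imported, rather than being silently swallowed as before.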
2 changes: 2 additions & 0 deletions MLAgentBench/benchmarks/base_competition/env/main.py
@@ -14,6 +14,8 @@
 parser.add_argument("-d", "--dataset_filepath", type=str, default="data/dev_data.jsonl")
 args = parser.parse_args()

+os.makedirs("output", exist_ok=True)  # `save_evals` assumes that the `output/` folder exists
+
 loaded_methods = all_method_handlers()
 curr_method = loaded_methods[args.method](args.method)
 start_time = time.time()
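The added `os.makedirs` call above matters because the evaluation pipeline writes its results under `output/`. A tiny hypothetical illustration (this is not the repository's `save_evals`) of why the folder has to exist before anything is written:

```python
# Hypothetical illustration only -- not the repository's `save_evals`.
import json
import os

os.makedirs("output", exist_ok=True)  # idempotent: safe to call on every run

def save_evals_sketch(results, path="output/results.json"):
    # Without the makedirs call above, open() would raise FileNotFoundError,
    # since open() does not create missing parent directories.
    with open(path, "w") as f:
        json.dump(results, f, indent=2)

save_evals_sketch({"runtime_seconds": 0.0})
```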
@@ -14,7 +14,7 @@ You have been provided with a starter kit that includes an end-to-end submission

 ## Test Method

-Simply run `python main.py -m {method_name}`. For example, to test the baseline method, execute `python main.py -m my_method`. [Describe what will happen for the evaluation pipeline]
+Simply run `python main.py -m {method_name}`. For example, to test the baseline method, execute `python main.py -m my_method`. [Describe what will happen for the evaluation pipeline in the **development** phase, not the **test** phase.]

 ## Competition Rules

31 changes: 0 additions & 31 deletions MLAgentBench/benchmarks/llm-merging/env/llm_merging/constants.py

This file was deleted.

18 changes: 0 additions & 18 deletions MLAgentBench/benchmarks/llm-merging/env/llm_merging/data.py

This file was deleted.

157 changes: 0 additions & 157 deletions MLAgentBench/benchmarks/llm-merging/env/llm_merging/evaluation.py

This file was deleted.

59 changes: 0 additions & 59 deletions MLAgentBench/benchmarks/llm-merging/env/llm_merging/main.py

This file was deleted.

