Merge branch 'miked/paper-repro'

allenai · Nov 14, 2024 · 745a795 · 745a795
2 parents 1405ea3 + 4f1b206
commit 745a795
Show file tree

Hide file tree

Showing 36 changed files with 144,705 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -23,6 +23,18 @@ When you submit a paper on the main page, reviews will be generated using SARG-B
 Note that the result page URL may start with `/survey/` (from the email) or `/result/` (from the list-results page).  The "result" page is best for local use; the survey page hides method names and randomizes review order.
 
 
+## Reproducing paper experiments
+
+The code for alignment and metrics is in `review_worker/paper_align_eval_repro.py`, and the configs for the experiments from Table 2 of the paper are in `review_worker/data/paper_align_eval/`, along with their outputs (generated reviews, alignments, and logs).  A cache file for the GPT requests is provided in `review_worker/data/gpt3_cache.sqlite.xz`.  To run the full experiments:
+
+1. Decompress the GPT cache: `unxz review_worker/data/gpt3_cache.sqlite.xz`
+2. Download the [aries dataset](https://github.com/allenai/aries), which has the paper texts and human reviewer data: `aws s3 sync --no-sign-request s3://ai2-s2-research-public/aries/ review_worker/data/aries/ && tar -C review_worker/data/aries/ -xf review_worker/data/aries/s2orc.tar.gz`
+3. Build and enter the `review_worker` container: `docker build -t marg review_worker && docker run -it --rm --entrypoint /bin/bash -v "$(realpath review_worker/data/)":/reviewgen/data marg`
+4. Run experiments: `for d in data/paper_align_eval/*; do python paper_align_eval_repro.py $d/align_config.json; done`
+5. Results are in the corresponding `output/` dir, e.g. `data/paper_align_eval/marg_s/output/`
+
+By default, following these steps uses the GPT cache from the paper experiments to ensure reproducibility; the cache can be disabled by deleting the `review_worker/data/gpt3_cache.sqlite` file or by modifying `align_config.json` so that `gpt3_cache_db_path` no longer points to that file.
+
 ## License
 
 Copyright 2023 The Allen Institute for Artificial Intelligence

diff --git a/review_worker/.dockerignore b/review_worker/.dockerignore
@@ -0,0 +1 @@
+data/
diff --git a/review_worker/aries/util/gpt3.py b/review_worker/aries/util/gpt3.py
@@ -18,11 +18,6 @@ class Gpt3CacheClient:
     def __init__(self, cache_db_path):
         self.cache_db = self._init_cache_db(cache_db_path)
 
-        if openai.api_key is None:
-            if "OPENAI_API_KEY" not in os.environ:
-                logger.error("Need OpenAI key in OPENAI_API_KEY")
-            openai.api_key = os.environ["OPENAI_API_KEY"]
-
         self.tokenizer = None
         self.tokenizers_by_model = dict()
 
@@ -154,6 +149,12 @@ def get_gpt3_result(self, *args, **kwargs):
         """Deprecated. Use prompt_completion() instead."""
         return self.prompt_completion(*args, **kwargs)
 
+    def _ensure_openai_keys(self):
+        if openai.api_key is None:
+            if "OPENAI_API_KEY" not in os.environ:
+                logger.error("Need OpenAI key in OPENAI_API_KEY")
+            openai.api_key = os.environ["OPENAI_API_KEY"]
+
     def prompt_completion(
         self,
         model,
@@ -209,6 +210,7 @@ def prompt_completion(
                 cache_json = dbrecs[0][0]
         if cache_json is None:
             logger.debug("UNCACHED prompt completion")
+            self._ensure_openai_keys()
             resp = openai.Completion.create(**db_keyvals)
             insert_keyvals = db_keyvals.copy()
             cache_json = json.dumps(resp)
@@ -295,6 +297,7 @@ def chat_completion(
                 cache_json = dbrecs[0][0]
         if cache_json is None:
             logger.debug("UNCACHED chat completion")
+            self._ensure_openai_keys()
 
             model_keyvals = db_keyvals.copy()
             del model_keyvals["messages_json"]

diff --git a/review_worker/data/gpt3_cache.sqlite.xz b/review_worker/data/gpt3_cache.sqlite.xz
diff --git a/review_worker/data/paper_align_eval/lizca/align_config.json b/review_worker/data/paper_align_eval/lizca/align_config.json
@@ -0,0 +1,13 @@
+{
+    "aries_base_path": "data/aries/",
+    "gpt3_cache_db_path": "data/gpt3_cache.sqlite",
+    "gpt_model": "gpt-4-0613",
+    "gpt_default_max_length": 2048,
+    "generated_comments_file": null,
+    "output_dir": "data/paper_align_eval/lizca/output/",
+    "model_type": "gpt_liang_etal",
+    "paper_chunk_size": 6500,
+    "prompts": {},
+    "seed": 1,
+    "_generated_comments_file": "data/paper_align_eval/lizca/output/all_results.jsonl"
+}
diff --git a/review_worker/data/paper_align_eval/lizca/output/all_aligns.json b/review_worker/data/paper_align_eval/lizca/output/all_aligns.json
diff --git a/review_worker/data/paper_align_eval/lizca/output/all_results.json b/review_worker/data/paper_align_eval/lizca/output/all_results.json
diff --git a/review_worker/data/paper_align_eval/lizca/output/logging_output.log b/review_worker/data/paper_align_eval/lizca/output/logging_output.log
diff --git a/review_worker/data/paper_align_eval/lizca/review_data.json b/review_worker/data/paper_align_eval/lizca/review_data.json
diff --git a/review_worker/data/paper_align_eval/marg_s/align_config.json b/review_worker/data/paper_align_eval/marg_s/align_config.json
diff --git a/review_worker/data/paper_align_eval/marg_s/output/all_aligns.json b/review_worker/data/paper_align_eval/marg_s/output/all_aligns.json
diff --git a/review_worker/data/paper_align_eval/marg_s/output/all_results.json b/review_worker/data/paper_align_eval/marg_s/output/all_results.json
diff --git a/review_worker/data/paper_align_eval/marg_s/output/logging_output.log b/review_worker/data/paper_align_eval/marg_s/output/logging_output.log
diff --git a/review_worker/data/paper_align_eval/marg_s/review_data.json b/review_worker/data/paper_align_eval/marg_s/review_data.json
diff --git a/review_worker/data/paper_align_eval/marg_s_nr/align_config.json b/review_worker/data/paper_align_eval/marg_s_nr/align_config.json
diff --git a/review_worker/data/paper_align_eval/marg_s_nr/output/all_aligns.json b/review_worker/data/paper_align_eval/marg_s_nr/output/all_aligns.json
diff --git a/review_worker/data/paper_align_eval/marg_s_nr/output/all_results.json b/review_worker/data/paper_align_eval/marg_s_nr/output/all_results.json
diff --git a/review_worker/data/paper_align_eval/marg_s_nr/output/logging_output.log b/review_worker/data/paper_align_eval/marg_s_nr/output/logging_output.log
diff --git a/review_worker/data/paper_align_eval/marg_s_nr/review_data.json b/review_worker/data/paper_align_eval/marg_s_nr/review_data.json
diff --git a/review_worker/data/paper_align_eval/marg_tp/align_config.json b/review_worker/data/paper_align_eval/marg_tp/align_config.json
@@ -0,0 +1,24 @@
+{
+    "aries_base_path": "data/aries/",
+    "gpt3_cache_db_path": "data/gpt3_cache.sqlite",
+    "gpt_model": "gpt-4-0613",
+    "gpt_default_max_length": 2048,
+    "generated_comments_file": null,
+    "output_dir": "data/paper_align_eval/marg_tp/output/",
+    "model_type": "gpt_generic_multi_agent",
+    "paper_chunk_size": 4096,
+    "master_chunk_type": "none",
+    "prompts": {
+        "master_system_prompt": "You are part of a group that needs to perform tasks that involve a scientific paper.  However, the paper is very long, so each agent has only been given part of it.  You are the leader in charge of interacting with the user and coordinating the group to accomplish tasks.  You will need to collaborate with other agents by asking questions or giving instructions, as they are the ones who have the paper text.\n\nCommunication protocol:\nTo broadcast a message other agents, write \"SEND MESSAGE: \" and then your message; alternatively, if you forget to include it until the end of your message, you can write \"SEND FULL MESSAGE\" and everything you just wrote will be sent.  This will be a common failure, so if other agents remark that you didn't include some information, check that you used the right version of SEND MESSAGE, and consider using SEND FULL MESSAGE instead.\n\nAdditional instructions:\nWhen you are given a task, your first step should be to draft a high-level plan with a list of steps, concisely describing how you will approach the task and your strategy for communicating with other agents.  Then, execute the plan.  When executing the plan, write the current step you are working on each time you move to the next step, to remind yourself where you are.  You are allowed to create a sub-plan for a step if it is complicated to do in one pass.\n\nYou should continue to pay attention to details in the original task instructions even after you draft your plan.  Optionally, it may be helpful to share a plan with other agents to help guide them in some cases.\n\nOther agents do not know anything about the task being performed, so it is your responsibility to convey any information about the task that is necessary for them to provide helpful responses.  You should make this part of your high-level plan.  
Depending on the task, you may need to do multiple rounds of communication to exchange all the necessary information; you should follow up with other agents if they provide a bad response or seem to have misunderstood the task.  In addition, because other agents can only communicate with you but not each other, you may need to help relay information between agents.\n\nBecause each agent has a different piece of the paper, communication is key for performing tasks that require understanding the full paper.  In addition, depending on the responses you receive, you may need to ask follow-up questions, clarify your requests, or engage in additional discussion to fully reason about the task.\n\nTo reduce communication errors, after you send a message you should write a short description of what you expect the response to look like.  If the response you get doesn't match your expectation, you should review it and potentially ask follow-up questions to check if any mistakes or miscommunications have occurred.  It could be the case that an agent (including yourself) has misread something or made a logic error.",
+        "master_chunk_prompt": "Information about agents: There are {num_agents} agents in the group, including yourself.  You are {agent_name}.  The other agent(s) are: {other_agent_names}.\n\nWrite \"Ready\" if you have understood the assignment and the protocol to communicate with other agents.  You will then be given tasks.",
+        "worker_system_prompt": "You are part of a group that needs to perform tasks that involve a scientific paper.  However, the paper is very long, so each agent has only been given part of it.  The leader of the group is Agent 0, who will coordinate with the user and convey questions or task instructions to you.\n\nSometimes you will need more information in order to understand a question or task or to interpret your portion of the paper; in these cases, you should send a message to request this information from other agents.  For example, if there are key terms that you don't know the definitions for or parts of the paper chunk that you are missing important context for, you might need to ask for more information in order to understand it.  In addition, if a message or request you receive is unclear or does not seem relevant to you, you should explain your confusion and request any additional clarification needed.\n\nCommunication protocol:\nTo send a message to the group leader, write \"SEND MESSAGE: \" and then your message.  Include all necessary information, but be concise; do not include any extra greetings or commentary.\n\nTo reduce communication errors, after you send a message you should write a short description of what you expect the response to look like.  If the response you get doesn't match your expectation, it is not necessarily wrong, but you should review it and potentially ask follow-up questions to ensure that no mistakes or miscommunications have occurred.",
+        "worker_chunk_prompt": "Your paper chunk is shown below:\n--- START PAPER CHUNK ---\n{source_paper_chunk}\n--- END PAPER CHUNK ---\n\nInformation about agents: There are {num_agents} agents in the group, including yourself.  You are {agent_name}.  The other agent(s) are: {other_agent_names}.\n\nWrite \"Ready\" if you have understood the assignment.  You will then receive messages.",
+        "task_prompt_set1_v1": "Task: Write a list of feedback comments, similar to the suggestions a reviewer might make.  Focus on major comments rather than minor comments; major comments are important things that affect the overall impact of the paper, whereas minor comments are small things like style/grammar or small details that don't matter much for whether the paper should be accepted to a venue.\n\nBe specific in your suggestions, including details about method or resource names and any particular steps the authors should follow.  However, don't suggest things that have already been included or addressed in the paper.\n\nYour review comments should have a clear purpose; obviously, it is always possible to simply say the authors should include more details or do more experiments, but in practice the authors have limited space to write and limited time to work, so each comment needs to have a clear purpose.",
+        "task_prompt_set1_v2": "Write the final list of review comments as a JSON list of strings.  Do not include any additional commentary or send any messages.",
+        "task_prompt_set2_v1": "Refine and improve the following list of review comments that were written about a scientific paper.  The goal is to have comments that are detailed and helpful, similar to those that a scientific paper reviewer might write.  The comments should not ask for things that are already in the paper, they should not be duplicated with other comments, they should include enough detail for an author to know clearly how to improve their paper, the purpose and value of each suggestion should be clearly justified, and so on.  Remove any bad comments.  You may need to incorporate additional information in the paper to refine some of these.\n\nHere is the review:\n{review_comments}",
+        "task_prompt_set2_v2": "Write the final list of review comments as a JSON list of strings.  Do not send any additional messages and do not include anything other than the JSON object in your response."
+    },
+    "experts": [],
+    "seed": 1,
+    "_generated_comments_file": "data/paper_align_eval/marg_tp/output/all_results.jsonl"
+}