diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index eae263e7..05243d5b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 minimum_pre_commit_version: "2.9.0"
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v4.4.0
     hooks:
       - id: check-yaml
         args: [--allow-multiple-documents]
@@ -20,15 +20,15 @@ repos:
       - id: reorder-python-imports
         args: [--py39-plus]
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 23.1.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8
-    rev: 3.9.2
+    rev: 6.0.0
     hooks:
       - id: flake8
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.991
+    rev: v1.1.1
     hooks:
       - id: mypy
         additional_dependencies: [numpy, httpx, pytest, structlog, types-PyYAML]
diff --git a/ice/agents/approval.py b/ice/agents/approval.py
index 1c9504f4..42689032 100644
--- a/ice/agents/approval.py
+++ b/ice/agents/approval.py
@@ -73,7 +73,6 @@ async def relevance(self, *, question, context, verbose=False, default=None):
         return score
 
     async def _check(self, prompt: str, candidate: str):
-
         approval_prompt = f"""Evaluate whether the following output is correct.
 
 Input:
diff --git a/ice/agents/openai.py b/ice/agents/openai.py
index 31da9d18..cadb5423 100644
--- a/ice/agents/openai.py
+++ b/ice/agents/openai.py
@@ -116,7 +116,7 @@ def _compute_relative_probs(
 
     def lookup_prob(choice: str):
         scores = 0.0
-        for (token, prob) in prediction.items():
+        for token, prob in prediction.items():
             if choice[len(choice_prefix) :].startswith(token):
                 scores += prob
         return scores
diff --git a/ice/agents/openai_reasoning.py b/ice/agents/openai_reasoning.py
index 0a655502..0f852751 100644
--- a/ice/agents/openai_reasoning.py
+++ b/ice/agents/openai_reasoning.py
@@ -119,7 +119,7 @@ async def _parse_and_aggregate_responses(
     # Parse the responses and aggregate the answers and reasonings
     answers: Counter[str] = Counter()
     reasonings: list[str] = []
-    for (i, response_text) in enumerate(response_texts):
+    for i, response_text in enumerate(response_texts):
         # Check if the response contains the answer prefix
         if answer_prefix not in response_text:
             # If not, request an explicit answer from the API
@@ -200,7 +200,6 @@ def _parse_answer_and_reasoning(
     def _format_result(
         self, answers: Counter[str], reasonings: list[str]
     ) -> tuple[dict[str, float], str]:
-
         # Join the reasonings with counts
         joined_reasonings = self._join_texts_with_counts(reasonings)
 
diff --git a/ice/agents/squad.py b/ice/agents/squad.py
index d19fe5ce..de301ee9 100644
--- a/ice/agents/squad.py
+++ b/ice/agents/squad.py
@@ -6,7 +6,6 @@
 
 class SquadAgent(Agent):
     def __init__(self, model_name: str = "z-uo/roberta-qasper"):
-
         self.nlp = pipeline(
             "question-answering", model=model_name, tokenizer=model_name
         )
diff --git a/ice/metrics/gold_paragraphs.py b/ice/metrics/gold_paragraphs.py
index 300698e9..9e5d85e6 100644
--- a/ice/metrics/gold_paragraphs.py
+++ b/ice/metrics/gold_paragraphs.py
@@ -85,7 +85,6 @@ def get_containing_paragraph(
 
 
 def get_gold_paragraph_df(question_short_name: str):
-
     gold_standards = get_question_gold_standards(question_short_name)
 
     entries = []
diff --git a/ice/metrics/nubia.py b/ice/metrics/nubia.py
index 2d899e02..1d1222f1 100644
--- a/ice/metrics/nubia.py
+++ b/ice/metrics/nubia.py
@@ -29,7 +29,6 @@ class NubiaResponse(BaseModel):
 
 
 async def _single_nubia(sample: Sample) -> list[NubiaResponse]:
-
     samples = list(product(sample.left, sample.right))
 
     async with httpx.AsyncClient(
diff --git a/ice/metrics/rouge.py b/ice/metrics/rouge.py
index e5630c62..609999dd 100644
--- a/ice/metrics/rouge.py
+++ b/ice/metrics/rouge.py
@@ -57,7 +57,7 @@ async def _compute_single(sample: Sample) -> RougeResult:
         )
         return RougeResult.parse_obj(result_dict)
 
-    return [await (_compute_single(s)) for s in sample]
+    return [await _compute_single(s) for s in sample]
 
 
 @diskcache()
diff --git a/ice/nn/bert_t5_t0_ensemble.py b/ice/nn/bert_t5_t0_ensemble.py
index 318bd538..98a2580f 100644
--- a/ice/nn/bert_t5_t0_ensemble.py
+++ b/ice/nn/bert_t5_t0_ensemble.py
@@ -163,6 +163,7 @@ def T0_classify(
 
 
 # Credit: https://stackoverflow.com/questions/39936527/python-removing-references-from-a-scientific-paper
+# Remove citations
 def remove_citations(s: str) -> str:
     return re.sub(r"\s\([A-Z][a-z]+,\s[A-Z][a-z]?\.[^\)]*,\s\d{4}\)", "", s)
 
diff --git a/ice/nn/bert_t5_t0_example.py b/ice/nn/bert_t5_t0_example.py
index 7c5abdd4..f00c909f 100644
--- a/ice/nn/bert_t5_t0_example.py
+++ b/ice/nn/bert_t5_t0_example.py
@@ -65,7 +65,6 @@ def extract_numbers(text: str) -> list[str]:
 
 
 def classify_example():
-
     abstract = """In this study we will examine the impact of the use of ..."""
     paragraph = """[..] The adherence rate is 88.2%."""
     numbers = extract_numbers(paragraph)
diff --git a/ice/recipes/combine_abstract_answers.py b/ice/recipes/combine_abstract_answers.py
index 702b2e99..7a3dbb0e 100644
--- a/ice/recipes/combine_abstract_answers.py
+++ b/ice/recipes/combine_abstract_answers.py
@@ -4,7 +4,6 @@
 
 
 def make_prompt(question: str, abstracts: list[Abstract], answers: list[str]) -> str:
-
     abstract_answers_str = "\n\n".join(
         [
             f"Title B{i}: {abstract.title}\nAbstract B{i}: {abstract.text}\nAnswer B{i}: {answer}"
diff --git a/ice/recipes/comparisons_qa.py b/ice/recipes/comparisons_qa.py
index 764f82d1..346d5dc8 100644
--- a/ice/recipes/comparisons_qa.py
+++ b/ice/recipes/comparisons_qa.py
@@ -47,7 +47,6 @@ async def run(
         num_paragraphs: int = 3,
         answer_prefix: str = DEFAULT_ANSWER_PREFIX,
     ):
-
         rank_paragraphs = RankParagraphs(mode=self.mode)
 
         top_paragraphs = await rank_paragraphs.run(
diff --git a/ice/recipes/consort_flow/baseline_elicit_answer.py b/ice/recipes/consort_flow/baseline_elicit_answer.py
index f43d31a9..d46c10c3 100644
--- a/ice/recipes/consort_flow/baseline_elicit_answer.py
+++ b/ice/recipes/consort_flow/baseline_elicit_answer.py
@@ -24,7 +24,6 @@ async def answer_like_elicit_qa(
     question: str,
     passage: str,
 ) -> str:
-
     prompt = elicit_qa_prompt(
         qa_question=question,
         excerpt=passage,
@@ -68,7 +67,6 @@ def elicit_qa_prompt(
     qa_question: str,
     excerpt: str,
 ) -> str:
-
     full_answer_prefix = "Answer:"
 
     return f"""Answer the question "{qa_question}" based on the excerpt from a research paper. \
diff --git a/ice/recipes/evaluate_result.py b/ice/recipes/evaluate_result.py
index 65e4f7d9..24880647 100644
--- a/ice/recipes/evaluate_result.py
+++ b/ice/recipes/evaluate_result.py
@@ -150,7 +150,6 @@ async def run(
         gold_result: Optional[str] = None,
         question: Optional[str] = None,
     ) -> ResultComparison:
-
         if self.mode == "test":
             model_results, gold_results, question = self.test_data()
             model_result = model_results[0]
diff --git a/ice/recipes/placebo_description.py b/ice/recipes/placebo_description.py
index b2b88a0c..f8fdea94 100644
--- a/ice/recipes/placebo_description.py
+++ b/ice/recipes/placebo_description.py
@@ -60,7 +60,6 @@ async def get_gold_experiments(self, paper: Paper) -> list[str]:
     async def placebo_for_experiment(
         self, paper: Paper, experiment: str, record=recorder
     ) -> str:
-
         # Generate the QA prompt
         qa_prompt = self.make_prompt(paper, experiment)
 
diff --git a/ice/recipes/placebo_dialogs.py b/ice/recipes/placebo_dialogs.py
index 101ed4f2..ce562956 100644
--- a/ice/recipes/placebo_dialogs.py
+++ b/ice/recipes/placebo_dialogs.py
@@ -73,14 +73,13 @@ async def ask(self, question: str, multiline=True, answer_prefix=""):
     async def multiple_choice(
         self, question: str, answers: list[str]
     ) -> tuple[dict[str, float], "DialogState"]:
-
         answer_prefix = longest_common_prefix(answers).rstrip()
         new_context = f"{self.context}\n\nQ: {question}\n\nA: {answer_prefix}"
         prediction = await self.agent.predict(context=new_context, default=" ")
 
         def lookup_prob(answer: str):
             scores = 0.0
-            for (token, prob) in prediction.items():
+            for token, prob in prediction.items():
                 if answer[len(answer_prefix) :].startswith(token):
                     scores += prob
             return scores
@@ -129,7 +128,6 @@ def make_initial_paragraph_context(
 
 
 class PlaceboDialogs(Recipe):
-
     verbose = False
 
     msg = SimpleNamespace(
@@ -367,7 +365,6 @@ async def aggregate_placebo_kind(
         return {"answer": answer, "quotes": quotes, "component_answers": answers}
 
     async def analyze_experiment(self, paper: Paper, experiment: Experiment):
-
         paragraphs = [
             paragraph
             for paragraph in paper.paragraphs
diff --git a/ice/recipes/primer/sequential_action.py b/ice/recipes/primer/sequential_action.py
index 7f72cc7b..50b8a840 100644
--- a/ice/recipes/primer/sequential_action.py
+++ b/ice/recipes/primer/sequential_action.py
@@ -259,7 +259,6 @@ async def sequential_action(
     log: list[str] = []
 
     for actions_left in range(max_actions, 0, -1):
-
         sufficient_info = await is_info_sufficient(question, log)
         if sufficient_info:
             break
diff --git a/ice/recipes/single_prompt.py b/ice/recipes/single_prompt.py
index e6b845f4..54304389 100644
--- a/ice/recipes/single_prompt.py
+++ b/ice/recipes/single_prompt.py
@@ -41,7 +41,6 @@ class SinglePrompt(Recipe):
     default_answer_classification: Optional[str]
 
     async def run(self, paper: Paper):
-
         # Get the full paper text and truncate it
         full_paper_text = get_paper_text(paper)
         paper_text = truncate_by_tokens(full_paper_text, max_tokens=self.max_tokens)
diff --git a/ice/recipes/synthesize.py b/ice/recipes/synthesize.py
index 201e3136..a4600984 100644
--- a/ice/recipes/synthesize.py
+++ b/ice/recipes/synthesize.py
@@ -95,7 +95,6 @@ def _get_reference(authors: list[str], year: Optional[int]) -> str:
 
 
 async def synthesize(question: str, abstracts: list[Abstract]) -> str:
-
     papers_str = "\n\n".join(
         [
             PAPER_FORMAT.format(
diff --git a/main.py b/main.py
index 0c553e88..f1283fd9 100644
--- a/main.py
+++ b/main.py
@@ -193,8 +193,7 @@ async def print_results(
     """
 
     results_json: list[dict] = []
 
-    for (document_id, final_result) in results_by_doc.items():
-
+    for document_id, final_result in results_by_doc.items():
         if json_out is not None:
             results_json.extend(recipe.to_json(final_result))
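
Note (not part of the patch): apart from one added comment in ice/nn/bert_t5_t0_ensemble.py, the Python hunks above are formatting-only. A minimal before/after sketch of the rewrites that recur throughout, using hypothetical names (black 23.1.0's 2023 stable style removes blank lines at the start of a block; the diff also drops the redundant parentheses around for-loop tuple targets and awaited calls):

    # Before (accepted by black 22.3.0); note the blank line opening the
    # body and the parenthesized loop target
    def summarize(counts: dict[str, int]) -> str:

        lines = []
        for (name, count) in counts.items():
            lines.append(f"{name}: {count}")
        return "\n".join(lines)

    # After (black 23.1.0): blank line and parentheses removed
    def summarize(counts: dict[str, int]) -> str:
        lines = []
        for name, count in counts.items():
            lines.append(f"{name}: {count}")
        return "\n".join(lines)

The rev bumps in .pre-commit-config.yaml are the kind of change `pre-commit autoupdate` produces; rerunning `pre-commit run --all-files` with the upgraded hooks is presumably what generated the mechanical hunks above.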