From 86fa0acf57074c320b44ee5c6284f2050b0aaece Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Sat, 7 Oct 2023 18:42:53 -0700 Subject: [PATCH] add instructions for MiniF2F and ProofNet --- README.md | 3 ++- common.py | 8 +++++-- docs/eval_MiniF2F_ProofNet.md | 22 +++++++++++++++++++ retrieval/confs/cli_lean3_random.yaml | 6 ++--- retrieval/confs/cli_lean4_novel_premises.yaml | 2 +- retrieval/datamodule.py | 21 +++++++----------- 6 files changed, 42 insertions(+), 20 deletions(-) create mode 100644 docs/eval_MiniF2F_ProofNet.md diff --git a/README.md b/README.md index 7702faf..2e2bcdf 100644 --- a/README.md +++ b/README.md @@ -307,7 +307,7 @@ python generator/main.py fit --config generator/confs/cli_lean4_random.yaml --mo python generator/main.py fit --config generator/confs/cli_lean4_novel_premises.yaml --model.ret_ckpt_path PATH_TO_RETRIEVER_CHECKPOINT --data.preds_path PATH_TO_PREDICTIONS_PICKLE ``` -### Evaluation on Theorem Proving +### Theorem Proving Evaluation on LeanDojo Benchmark (Lean 3 and Lean 4) After the tactic generator is trained, we combine it with best-first search to prove theorems by interacting with Lean. @@ -331,6 +331,7 @@ python prover/evaluate.py --data-path data/leandojo_benchmark/random/ --ckpt_pa python prover/evaluate.py --data-path data/leandojo_benchmark/novel_premises/ --ckpt_path PATH_TO_REPROVER_CHECKPOINT --indexed-corpus-path PATH_TO_INDEXED_CORPUS --split test --num-cpus 8 --with-gpus ``` +See [here](docs/eval_MiniF2F_ProofNet.md) if you want to evaluate on other Lean repos such as [miniF2F](https://github.com/facebookresearch/miniF2F) and [ProofNet](https://github.com/zhangir-azerbayev/ProofNet). ## Questions and Bugs diff --git a/common.py b/common.py index 45178d4..39f5369 100644 --- a/common.py +++ b/common.py @@ -156,10 +156,12 @@ def from_data(cls, file_data: Dict[str, Any]) -> "File": path = file_data["path"] premises = [] for p in file_data["premises"]: - if "user__.n" in p["full_name"] or p["code"] == "": + full_name = p["full_name"] + if full_name is None: + continue + if "user__.n" in full_name or p["code"] == "": # Ignore ill-formed premises (often due to errors in ASTs). continue - full_name = p["full_name"] if full_name.startswith("[") and full_name.endswith("]"): # Ignore mutual definitions. continue @@ -346,6 +348,8 @@ def get_all_pos_premises(annot_tac, corpus: Corpus) -> List[Premise]: p = corpus.locate_premise(def_path, Pos(*prov["def_pos"])) if p is not None: all_pos_premises.add(p) + else: + logger.warning(f"Cannot locate premise: {prov}") return list(all_pos_premises) diff --git a/docs/eval_MiniF2F_ProofNet.md b/docs/eval_MiniF2F_ProofNet.md new file mode 100644 index 0000000..4812cf3 --- /dev/null +++ b/docs/eval_MiniF2F_ProofNet.md @@ -0,0 +1,22 @@ +Evaluation on MiniF2F and ProofNet +---------------------------------- + +Here we evaluate models trained on LeanDojo Benchmark on MiniF2F or ProofNet. We use MiniF2F as an example, but the same procedure applies to ProofNet. + +First, use LeanDojo to extract data from MiniF2F. See the end of [this Jupyter notebook](https://github.com/lean-dojo/LeanDojo/blob/main/scripts/generate-benchmark-lean3.ipynb). Save the dataset to `data/leandojo_minif2f`. + + +For models without retrieval, run: +```bash +python prover/evaluate.py --data-path data/leandojo_minif2f/default/ --ckpt_path PATH_TO_MODEL_CHECKPOINT --split test --num-cpus 8 --with-gpus +``` + +For models with retrieval, first use the retriever to index the corpus (pre-computing the embeddings of all premises): +```bash +python retrieval/index.py --ckpt_path PATH_TO_RETRIEVER_CHECKPOINT --corpus-path data/leandojo_minif2f/corpus.jsonl --output-path PATH_TO_INDEXED_CORPUS +``` + +Then, run: +```bash +python prover/evaluate.py --data-path data/leandojo_minif2f/default/ --ckpt_path PATH_TO_REPROVER_CHECKPOINT --indexed-corpus-path PATH_TO_INDEXED_CORPUS --split test --num-cpus 8 --with-gpus +``` diff --git a/retrieval/confs/cli_lean3_random.yaml b/retrieval/confs/cli_lean3_random.yaml index 09380c9..7038eb7 100644 --- a/retrieval/confs/cli_lean3_random.yaml +++ b/retrieval/confs/cli_lean3_random.yaml @@ -25,13 +25,13 @@ trainer: - class_path: pytorch_lightning.callbacks.EarlyStopping init_args: monitor: Recall@10_val - patience: 3 + patience: 5 mode: max verbose: true model: model_name: google/byt5-small - lr: 1e-4 + lr: 2e-4 warmup_steps: 2000 num_retrieved: 100 @@ -40,7 +40,7 @@ data: corpus_path: data/leandojo_benchmark/corpus.jsonl num_negatives: 3 num_in_file_negatives: 1 - batch_size: 8 + batch_size: 12 eval_batch_size: 128 max_seq_len: 1024 num_workers: 4 diff --git a/retrieval/confs/cli_lean4_novel_premises.yaml b/retrieval/confs/cli_lean4_novel_premises.yaml index 9f71732..6971efb 100644 --- a/retrieval/confs/cli_lean4_novel_premises.yaml +++ b/retrieval/confs/cli_lean4_novel_premises.yaml @@ -25,7 +25,7 @@ trainer: - class_path: pytorch_lightning.callbacks.EarlyStopping init_args: monitor: Recall@10_val - patience: 3 + patience: 5 mode: max verbose: true diff --git a/retrieval/datamodule.py b/retrieval/datamodule.py index 5ba0e56..5c916c1 100644 --- a/retrieval/datamodule.py +++ b/retrieval/datamodule.py @@ -102,19 +102,14 @@ def __getitem__(self, idx: int) -> Example: premises_in_file = [] premises_outside_file = [] - try: - for p in self.corpus.get_premises(ex["context"].path): - if p == ex["pos_premise"]: - continue - if p.end < ex["context"].theorem_pos: - if ex["pos_premise"].path == ex["context"].path: - premises_in_file.append(p) - else: - premises_outside_file.append(p) - except Exception: - import pdb - - pdb.set_trace() + for p in self.corpus.get_premises(ex["context"].path): + if p == ex["pos_premise"]: + continue + if p.end < ex["context"].theorem_pos: + if ex["pos_premise"].path == ex["context"].path: + premises_in_file.append(p) + else: + premises_outside_file.append(p) for p in self.corpus.transitive_dep_graph.successors(ex["context"].path): if p == ex["pos_premise"].path: