
Commit

base benchmark
bugtig6351 committed Mar 27, 2024
1 parent 26ce4fa commit 37701c6
Showing 8 changed files with 593 additions and 591 deletions.
15 changes: 15 additions & 0 deletions benchmarks/ASQA/README.md
@@ -1,7 +1,13 @@
# ASQA BENCHMARK

## Description

This benchmark is designed to evaluate model outputs on the [ASQA dataset](https://huggingface.co/datasets/din0s/asqa). For `generate.py`, we follow [FLARE](https://github.com/jzbjyb/FLARE), using `gpt-3.5-turbo-instruct` in the no-retrieval setting.
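
`generate.py` itself is not included in this commit, so purely as an illustration of the no-retrieval setup, a call to `gpt-3.5-turbo-instruct` could look like the sketch below. The prompt template, decoding parameters, and OpenAI client usage are assumptions, not the benchmark's actual code.

```python
# Illustrative sketch only -- not the repository's generate.py.
# Assumes the openai>=1.0 SDK; the prompt and decoding settings are guesses.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def answer_without_retrieval(question: str) -> str:
    prompt = (
        "Answer the following ambiguous question with a comprehensive "
        "long-form answer.\n\n"
        f"Question: {question}\nAnswer:"
    )
    resp = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=256,
        temperature=0.0,
    )
    return resp.choices[0].text.strip()
```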

## Dataset

TODO

## Usage

1. Prepare your model outputs in the `output` directory. You can simply attach your `answers` to each example in the original ASQA dataset, as we did in the `dataset.json` file (see the sketch below).
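
A minimal sketch of this step, assuming one generated answer per example, a `dev` split, and the `output/dataset.jsonl` path that the `__main__` block of `asqa_benchmark.py` loads:

```python
# Minimal sketch: attach model outputs to the original ASQA examples.
# The split name, output path, and one-answer-per-example layout are assumptions.
import json

from datasets import load_dataset

ds = load_dataset("din0s/asqa", split="dev")
model_answers = ["..."] * len(ds)  # replace with your model's answers, in order

with open("benchmarks/ASQA/output/dataset.jsonl", "w") as f:
    for example, answer in zip(ds, model_answers):
        example["answers"] = answer  # the `answers` field described above
        f.write(json.dumps(example) + "\n")
```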
@@ -19,6 +25,15 @@ This benchmark is designed to evaluate model outputs on the [ASQA dataset](htt

`--api_key`: Your OpenAI API key.

## Metrics

The benchmark reports string Exact Match (computed against the dataset's `short_answers`), Rouge-L and Disambig F1 (computed against the `long_answers`), and a D-R Score, the geometric mean of Rouge-L and Disambig F1.

## Performance

| Model | Rouge-L | Exact Match | Disambig F1 | D-R Score |
|:---:|:---:|:---:|:---:|:---:|
| gpt-3.5-turbo-instruct | 29.8 | 34.0 | 30.6 | 30.2 |
| text-davinci-003 (FLARE) | 33.3 | 33.8 | 24.2 | 28.4 |
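
As a quick sanity check, the D-R Score column can be reproduced from the other two columns, mirroring the `DR_score` computation in `asqa_benchmark.py`:

```python
import math

# D-R Score = sqrt(Rouge-L * Disambig F1)
print(round(math.sqrt(29.8 * 30.6), 1))  # 30.2 -- gpt-3.5-turbo-instruct
print(round(math.sqrt(33.3 * 24.2), 1))  # 28.4 -- text-davinci-003 (FLARE)
```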

## Citations

``` bibtex
114 changes: 62 additions & 52 deletions benchmarks/ASQA/asqa_benchmark.py
@@ -1,3 +1,4 @@
from typing import Dict, Tuple, Any, Optional
from datasets import Dataset, load_dataset
import json
import math
@@ -11,15 +12,19 @@
logger = logging.getLogger(__name__)

class ASQABenchmark(BaseBenchmark):
"""Benchmark for ASQA dataset.
ASQA is a long-form question-answering dataset of ambiguous factoid questions, each paired with short answers and long-form reference answers. The benchmark evaluates the correctness of generated answers against these references.
"""

name = "asqa_benchmark"
metrics = [AnswerRougeCorrectness(rouge_type="rougeL"),
AnswerEMCorrectness(ignore_case=True),
AnswerDisambigF1Correctness()]

def __init__(self, output_dir: str, batch_size: int = 1) -> None:
self.output_dir = output_dir
self.batch_size = batch_size
def __init__(self) -> None:
"""Initialization."""
super().__init__()

def load_data(self, **kwargs):
"""Load ASQA dataset.
@@ -28,74 +33,79 @@ def load_data(self, **kwargs):
"""
print("Load ASQA dataset...")
super().load_data(**kwargs)
if "short_answers" not in dataset.features:
if "short_answers" not in self.dataset.features:
self.dataset = self.dataset.map(lambda example: {"short_answers": [ann["short_answers"] for ann in example["qa_pairs"]]})
if "long_answers" not in dataset.features:
if "long_answers" not in self.dataset.features:
self.dataset = self.dataset.map(lambda example: {"long_answers": [ann["long_answer"] for ann in example["annotations"]]})
print("ASQA dataset loaded.")

def evaluate(self, dataset_name:str = "result_dataset", result_name:str = "results") -> Dataset:
"""Evaluate the dataset and return the dataset with scores.
We use the `short_answers` to evaluate the string Exact Match correctness and the `long_answers` to evaluate the RougeL and DisambigF1 score. And then we calculate the `DR score` as the geometric mean of the RougeL and DisambigF1 scores.
def prepare_data(self, label_column: str, input_column: str):
"""Modify self.dataset for different metric.
Args:
dataset_name: The name of the dataset file to save.
result_name: The name of the result file to save.
input_column: The column name of the input text that already exists in self.dataset, e.g. `long_answers`.
label_column: The column name of the label text that the metric requires, e.g. `gt_answers`.
"""
if input_column not in self.dataset.column_names:
raise ValueError(f"The input column {input_column} is not in the dataset. Please check the column names.")

if label_column in self.dataset.column_names:
self.dataset = self.dataset.remove_columns(label_column)
self.dataset = self.dataset.add_column(label_column, self.dataset[input_column])

def _evaluate(self) -> Tuple[Dict[Any, Any], Dataset]:
"""Evaluate the dataset and return the dataset with scores.
We use the `short_answers` as the `gt_answers` to evaluate string Exact Match correctness and the `long_answers` to evaluate the RougeL and DisambigF1 scores. We then calculate the `DR score` as the geometric mean of the RougeL and DisambigF1 scores.
"""
print("Start evaluating...")
if not hasattr(self, "dataset"):
raise ValueError("Please load the dataset first.")

self.results = {}
ground_truths = {
"answer_disambig_f1": ("gt_answers", "long_answers"),
"answer_rouge_correctness": ("gt_answers", "long_answers"),
"answer_exact_match": ("gt_answers", "short_answers")
}
results = {}
scores = {}
for m in self.metrics:
if m == "AnswerRougeCorrectness":
print("Evaluating AnswerRougeCorrectness...")
metric = self.get_metric(m, rouge_type="rougeL")
ds = self.dataset.add_column("gt_answers", self.dataset["long_answers"])
elif m == "AnswerEMCorrectness":
print("Evaluating AnswerEMCorrectness...")
metric = self.get_metric(m, ignore_case=True)
ds = self.dataset.add_column("gt_answers", self.dataset["short_answers"])
elif m == "AnswerDisambigF1Correctness":
print("Evaluating AnswerDisambigF1Correctness...")
metric = self.get_metric(m)
ds = self.dataset.add_column("gt_answers", self.dataset["long_answers"])
score, ds = metric.compute(ds, self.batch_size)
self.results[metric.name] = score
scores[metric.name] = ds[metric.name]
scores = [{k:v[i] for k,v in scores.items()} for i in range(len(self.dataset))]
self.dataset = self.dataset.add_column("scores", scores)

print("Calculating DR score...")
def dr_score(d:dict):
d['scores']['DR_score'] = math.sqrt(d['scores']["answer_disambig_f1"] * d['scores']["answer_rouge_correctness"])
return d
self.dataset = self.dataset.map(dr_score)
self.results["DR_score"] = math.sqrt(self.results["answer_disambig_f1"] * self.results["answer_rouge_correctness"])
if m.name in ground_truths:
print(f"Evaluating {m.name}...")
self.prepare_data(*ground_truths[m.name])
results[m.name], self.dataset = m.compute(self.dataset, self.batch_size)
self.dataset = self.dataset.map(lambda example: {f"{m.name}.{ground_truths[m.name][0]}": ground_truths[m.name][1]}) # Add the ground truth column name
scores[m.name] = self.dataset[m.name]

if "gt_answers" in self.dataset.column_names:
self.dataset = self.dataset.remove_columns("gt_answers")
# scores = [{k:v[i] for k,v in scores.items()} for i in range(len(self.dataset))]
# self.dataset = self.dataset.add_column("scores", scores)

if "answer_rouge_correctness" in self.dataset.column_names and "answer_disambig_f1" in self.dataset.column_names and "DR_score" not in self.dataset.column_names:
print("Calculating DR score...")
def dr_score(d:dict):
d['DR_score'] = math.sqrt(d["answer_disambig_f1"] * d["answer_rouge_correctness"])
return d
self.dataset = self.dataset.map(dr_score)
results["DR_score"] = math.sqrt(results["answer_disambig_f1"] * results["answer_rouge_correctness"])

print("Evaluation finished.")
print(f"Results: {self.results}")

self._save_result(dataset_name, result_name)
return self.dataset

def _save_result(self, dataset_name:str, result_name:str) -> None:
"""Save the result to files."""
with open(os.path.join(self.output_dir, result_name)+".json", "w") as f:
json.dump(self.results, f, indent=4)
self.dataset.to_json(os.path.join(self.output_dir, dataset_name)+".jsonl")
print(f"Results saved to {self.output_dir}/results.json and {self.output_dir}/result_datasets.jsonl")

return results, self.dataset

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", type=str, default="benchmarks/ASQA/output")
args = parser.parse_args()

benchmark = ASQABenchmark(output_dir=args.output_dir)
benchmark = ASQABenchmark()

results = benchmark.evaluate(path="json", data_files=os.path.join(args.output_dir, "dataset.jsonl"), split="train")
print(f"Results:\n {results}")

dataset = benchmark.load_data(path="json", data_files=os.path.join(args.output_dir, "dataset.jsonl"), split="train")
benchmark.save_results(os.path.join(args.output_dir, "results.jsonl"))
benchmark.save_dataset(os.path.join(args.output_dir, "result_dataset.jsonl"))

dataset = benchmark.evaluate(dataset_name="result_dataset", result_name="results")
benchmark.dataset = benchmark.dataset.remove_columns("answer_exact_match")
benchmark.set_metric([AnswerEMCorrectness(ignore_case=False)])
results = benchmark.evaluate()
print(f"Results:\n {results}")
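
For reference, each record in `output/dataset.jsonl` only needs the fields this script actually touches: `qa_pairs` (with `short_answers`), `annotations` (with `long_answer`), and your model's `answers`. A hand-written illustration follows; the values, and any field names not referenced in the code above, are assumptions.

```python
# Illustrative dataset.jsonl record (values invented; only the fields the
# benchmark reads are shown). load_data derives "short_answers" from qa_pairs
# and "long_answers" from annotations; the metrics compare them with "answers".
example = {
    "ambiguous_question": "Who won the 2018 World Cup?",  # field name assumed
    "qa_pairs": [
        {
            "question": "Which country won the 2018 FIFA World Cup?",
            "short_answers": ["France"],
        }
    ],
    "annotations": [
        {
            "long_answer": "France won the 2018 FIFA World Cup, "
            "defeating Croatia 4-2 in the final."
        }
    ],
    "answers": "France won the 2018 FIFA World Cup.",  # your model's output
}
```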