
Commit

base benchmark
bugtig6351 committed Mar 27, 2024
1 parent 26ce4fa commit 37701c6
Showing 8 changed files with 593 additions and 591 deletions.
15 changes: 15 additions & 0 deletions benchmarks/ASQA/README.md
@@ -1,7 +1,13 @@
# ASQA BENCHMARK

## Description

This benchmark is designed to evaluate model outputs on the [ASQA dataset](https://huggingface.co/datasets/din0s/asqa). For `generate.py`, we follow [FLARE](https://github.com/jzbjyb/FLARE), using `gpt-3.5-turbo-instruct` in the no-retrieval setting.
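
`generate.py` itself is not included in this commit, so purely as an illustration of the no-retrieval setup, a call to `gpt-3.5-turbo-instruct` could look like the sketch below. The prompt template, decoding parameters, and OpenAI client usage are assumptions, not the benchmark's actual code.

```python
# Illustrative sketch only -- not the repository's generate.py.
# Assumes the openai>=1.0 SDK; the prompt and decoding settings are guesses.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def answer_without_retrieval(question: str) -> str:
    prompt = (
        "Answer the following ambiguous question with a comprehensive "
        "long-form answer.\n\n"
        f"Question: {question}\nAnswer:"
    )
    resp = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=256,
        temperature=0.0,
    )
    return resp.choices[0].text.strip()
```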

## Dataset

TODO

## Usage

1. Prepare your model outputs in the `output` directory. You can simply attach your `answers` to each example in the original ASQA dataset, as we did in the `dataset.json` file (see the sketch below).
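
A minimal sketch of this step, assuming one generated answer per example, a `dev` split, and the `output/dataset.jsonl` path that the `__main__` block of `asqa_benchmark.py` loads:

```python
# Minimal sketch: attach model outputs to the original ASQA examples.
# The split name, output path, and one-answer-per-example layout are assumptions.
import json

from datasets import load_dataset

ds = load_dataset("din0s/asqa", split="dev")
model_answers = ["..."] * len(ds)  # replace with your model's answers, in order

with open("benchmarks/ASQA/output/dataset.jsonl", "w") as f:
    for example, answer in zip(ds, model_answers):
        example["answers"] = answer  # the `answers` field described above
        f.write(json.dumps(example) + "\n")
```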
@@ -19,6 +25,15 @@ This benchmark is designed to evaluate model outputs on the [ASQA dataset](htt

`--api_key`: Your OpenAI API key.

## Metrics

The benchmark reports string Exact Match (computed against the dataset's `short_answers`), Rouge-L and Disambig F1 (computed against the `long_answers`), and a D-R Score, the geometric mean of Rouge-L and Disambig F1.

## Performance

| Model | Rouge-L | Exact Match | Disambig F1 | D-R Score |
|:---:|:---:|:---:|:---:|:---:|
| gpt-3.5-turbo-instruct | 29.8 | 34.0 | 30.6 | 30.2 |
| text-davinci-003 (FLARE) | 33.3 | 33.8 | 24.2 | 28.4 |
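
As a quick sanity check, the D-R Score column can be reproduced from the other two columns, mirroring the `DR_score` computation in `asqa_benchmark.py`:

```python
import math

# D-R Score = sqrt(Rouge-L * Disambig F1)
print(round(math.sqrt(29.8 * 30.6), 1))  # 30.2 -- gpt-3.5-turbo-instruct
print(round(math.sqrt(33.3 * 24.2), 1))  # 28.4 -- text-davinci-003 (FLARE)
```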

## Citations

``` bibtex
114 changes: 62 additions & 52 deletions benchmarks/ASQA/asqa_benchmark.py
@@ -1,3 +1,4 @@
from typing import Dict, Tuple, Any, Optional
from datasets import Dataset, load_dataset
import json
import math
@@ -11,15 +12,19 @@
logger = logging.getLogger(__name__)

class ASQABenchmark(BaseBenchmark):
"""Benchmark for ASQA dataset.
ASQA is a long-form question-answering dataset of ambiguous factoid questions, each paired with short answers and long-form reference answers. The benchmark evaluates the correctness of generated answers against these references.
"""

name = "asqa_benchmark"
metrics = [AnswerRougeCorrectness(rouge_type="rougeL"),
AnswerEMCorrectness(ignore_case=True),
AnswerDisambigF1Correctness()]

def __init__(self, output_dir: str, batch_size: int = 1) -> None:
self.output_dir = output_dir
self.batch_size = batch_size
def __init__(self) -> None:
"""Initialization."""
super().__init__()

def load_data(self, **kwargs):
"""Load ASQA dataset.
@@ -28,74 +33,79 @@ def load_data(self, **kwargs):
"""
print("Load ASQA dataset...")
super().load_data(**kwargs)
if "short_answers" not in dataset.features:
if "short_answers" not in self.dataset.features:
self.dataset = self.dataset.map(lambda example: {"short_answers": [ann["short_answers"] for ann in example["qa_pairs"]]})
if "long_answers" not in dataset.features:
if "long_answers" not in self.dataset.features:
self.dataset = self.dataset.map(lambda example: {"long_answers": [ann["long_answer"] for ann in example["annotations"]]})
print("ASQA dataset loaded.")

def evaluate(self, dataset_name:str = "result_dataset", result_name:str = "results") -> Dataset:
"""Evaluate the dataset and return the dataset with scores.
We use the `short_answers` to evaluate the string Exact Match correctness and the `long_answers` to evaluate the RougeL and DisambigF1 score. And then we calculate the `DR score` as the geometric mean of the RougeL and DisambigF1 scores.
def prepare_data(self, label_column: str, input_column: str):
"""Modify self.dataset for different metric.
Args:
dataset_name: The name of the dataset file to save.
result_name: The name of the result file to save.
input_column: The column name of the input text that already exists in self.dataset, e.g. `long_answers`.
label_column: The column name of the label text that the metric requires, e.g. `gt_answers`.
"""
if input_column not in self.dataset.column_names:
raise ValueError(f"The input column {input_column} is not in the dataset. Please check the column names.")

if label_column in self.dataset.column_names:
self.dataset = self.dataset.remove_columns(label_column)
self.dataset = self.dataset.add_column(label_column, self.dataset[input_column])

def _evaluate(self) -> Tuple[Dict[Any, Any], Dataset]:
"""Evaluate the dataset and return the dataset with scores.
We use the `short_answers` as the `gt_answers` to evaluate string Exact Match correctness and the `long_answers` to evaluate the RougeL and DisambigF1 scores. We then calculate the `DR score` as the geometric mean of the RougeL and DisambigF1 scores.
"""
print("Start evaluating...")
if not hasattr(self, "dataset"):
raise ValueError("Please load the dataset first.")

self.results = {}
ground_truths = {
"answer_disambig_f1": ("gt_answers", "long_answers"),
"answer_rouge_correctness": ("gt_answers", "long_answers"),
"answer_exact_match": ("gt_answers", "short_answers")
}
results = {}
scores = {}
for m in self.metrics:
if m == "AnswerRougeCorrectness":
print("Evaluating AnswerRougeCorrectness...")
metric = self.get_metric(m, rouge_type="rougeL")
ds = self.dataset.add_column("gt_answers", self.dataset["long_answers"])
elif m == "AnswerEMCorrectness":
print("Evaluating AnswerEMCorrectness...")
metric = self.get_metric(m, ignore_case=True)
ds = self.dataset.add_column("gt_answers", self.dataset["short_answers"])
elif m == "AnswerDisambigF1Correctness":
print("Evaluating AnswerDisambigF1Correctness...")
metric = self.get_metric(m)
ds = self.dataset.add_column("gt_answers", self.dataset["long_answers"])
score, ds = metric.compute(ds, self.batch_size)
self.results[metric.name] = score
scores[metric.name] = ds[metric.name]
scores = [{k:v[i] for k,v in scores.items()} for i in range(len(self.dataset))]
self.dataset = self.dataset.add_column("scores", scores)

print("Calculating DR score...")
def dr_score(d:dict):
d['scores']['DR_score'] = math.sqrt(d['scores']["answer_disambig_f1"] * d['scores']["answer_rouge_correctness"])
return d
self.dataset = self.dataset.map(dr_score)
self.results["DR_score"] = math.sqrt(self.results["answer_disambig_f1"] * self.results["answer_rouge_correctness"])
if m.name in ground_truths:
print(f"Evaluating {m.name}...")
self.prepare_data(*ground_truths[m.name])
results[m.name], self.dataset = m.compute(self.dataset, self.batch_size)
self.dataset = self.dataset.map(lambda example: {f"{m.name}.{ground_truths[m.name][0]}": ground_truths[m.name][1]}) # Add the ground truth column name
scores[m.name] = self.dataset[m.name]

if "gt_answers" in self.dataset.column_names:
self.dataset = self.dataset.remove_columns("gt_answers")
# scores = [{k:v[i] for k,v in scores.items()} for i in range(len(self.dataset))]
# self.dataset = self.dataset.add_column("scores", scores)

if "answer_rouge_correctness" in self.dataset.column_names and "answer_disambig_f1" in self.dataset.column_names and "DR_score" not in self.dataset.column_names:
print("Calculating DR score...")
def dr_score(d:dict):
d['DR_score'] = math.sqrt(d["answer_disambig_f1"] * d["answer_rouge_correctness"])
return d
self.dataset = self.dataset.map(dr_score)
results["DR_score"] = math.sqrt(results["answer_disambig_f1"] * results["answer_rouge_correctness"])

print("Evaluation finished.")
print(f"Results: {self.results}")

self._save_result(dataset_name, result_name)
return self.dataset

def _save_result(self, dataset_name:str, result_name:str) -> None:
"""Save the result to files."""
with open(os.path.join(self.output_dir, result_name)+".json", "w") as f:
json.dump(self.results, f, indent=4)
self.dataset.to_json(os.path.join(self.output_dir, dataset_name)+".jsonl")
print(f"Results saved to {self.output_dir}/results.json and {self.output_dir}/result_datasets.jsonl")

return results, self.dataset

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", type=str, default="benchmarks/ASQA/output")
args = parser.parse_args()

benchmark = ASQABenchmark(output_dir=args.output_dir)
benchmark = ASQABenchmark()

results = benchmark.evaluate(path="json", data_files=os.path.join(args.output_dir, "dataset.jsonl"), split="train")
print(f"Results:\n {results}")

dataset = benchmark.load_data(path="json", data_files=os.path.join(args.output_dir, "dataset.jsonl"), split="train")
benchmark.save_results(os.path.join(args.output_dir, "results.jsonl"))
benchmark.save_dataset(os.path.join(args.output_dir, "result_dataset.jsonl"))

dataset = benchmark.evaluate(dataset_name="result_dataset", result_name="results")
benchmark.dataset = benchmark.dataset.remove_columns("answer_exact_match")
benchmark.set_metric([AnswerEMCorrectness(ignore_case=False)])
results = benchmark.evaluate()
print(f"Results:\n {results}")
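
For reference, each record in `output/dataset.jsonl` only needs the fields this script actually touches: `qa_pairs` (with `short_answers`), `annotations` (with `long_answer`), and your model's `answers`. A hand-written illustration follows; the values, and any field names not referenced in the code above, are assumptions.

```python
# Illustrative dataset.jsonl record (values invented; only the fields the
# benchmark reads are shown). load_data derives "short_answers" from qa_pairs
# and "long_answers" from annotations; the metrics compare them with "answers".
example = {
    "ambiguous_question": "Who won the 2018 World Cup?",  # field name assumed
    "qa_pairs": [
        {
            "question": "Which country won the 2018 FIFA World Cup?",
            "short_answers": ["France"],
        }
    ],
    "annotations": [
        {
            "long_answer": "France won the 2018 FIFA World Cup, "
            "defeating Croatia 4-2 in the final."
        }
    ],
    "answers": "France won the 2018 FIFA World Cup.",  # your model's output
}
```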