Commit
Merge remote-tracking branch 'origin/main' into add_text_length
Wenshansilvia committed Sep 25, 2024
2 parents 9a38067 + 019ffe3 commit e2f9273
Showing 20 changed files with 140 additions and 280 deletions.
30 changes: 13 additions & 17 deletions benchmarks/ASQA/asqa_benchmark.py
@@ -18,6 +18,12 @@ class ASQABenchmark(BaseBenchmark):
AnswerEMCorrectness(ignore_case=True),
AnswerDisambigF1Correctness()]

ground_truths = {
"answer_disambig_f1": "long_answers",
"answer_rouge_correctness": "long_answers",
"answer_exact_match": "short_answers"
}

def __init__(self) -> None:
"""Initialization."""
super().__init__()
@@ -38,42 +44,32 @@ def _evaluate(self, ) -> Tuple[Dict[Any, Any], Dataset]:
if not self.is_existed("long_answers"):
self.dataset = self.dataset.map(lambda example: {"long_answers": [ann["long_answer"] for ann in example["annotations"]]})

ground_truths = {
"answer_disambig_f1": "long_answers",
"answer_rouge_correctness": "long_answers",
"answer_exact_match": "short_answers"
}

results = {}
for m in self.metrics:
if m.name in ground_truths:
if m.name in self.ground_truths:
print(f"Calculating {m.name}...")

if self.is_existed(m.name):
# Remove the metric column if it already exists
self.dataset = self.dataset.remove_columns(m.name)
if not self.is_existed(ground_truths[m.name]):
if not self.is_existed(self.ground_truths[m.name]):
# Check if the ground truth column exists
raise ValueError(f"The column {ground_truths[m.name]} is not in the dataset. Please check the column names.")
raise ValueError(f"The column {self.ground_truths[m.name]} is not in the dataset. Please check the column names.")

avg_scores, scores = m.compute(
self.dataset["answers"],
self.dataset[ground_truths[m.name]]
self.dataset[self.ground_truths[m.name]]
)
results[m.name] = avg_scores
self.dataset = self.dataset.add_column(m.name, scores)

print(f"{m.name}: {avg_scores}")

if self.is_existed("answer_rouge_correctness") and self.is_existed("answer_disambig_f1"):
if self.is_existed("DR_score"):
self.dataset = self.dataset.remove_columns("DR_score")
# Note that the DR score is the geometric mean of the dataset-level RougeL and DisambigF1 scores, i.e. sqrt(RougeL * DisambigF1) computed over the whole dataset rather than averaged per sample.
print("Calculating DR score...")
def dr_score(d:dict):
d['DR_score'] = np.sqrt(d["answer_disambig_f1"] * d["answer_rouge_correctness"])
return d
self.dataset = self.dataset.map(dr_score)
results["DR_score"] = np.average(self.dataset["DR_score"])
results["DR_score"] = np.sqrt(np.average(self.dataset["answer_disambig_f1"]) * np.average(self.dataset["answer_rouge_correctness"]))
print(f"DR_score: {results['DR_score']}")

return results, self.dataset
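For context, the change above replaces the per-sample DR computation with a dataset-level one, and the two generally give different numbers. A minimal sketch with made-up scores (the values below are illustrative only, not from the benchmark):

import numpy as np

# Hypothetical per-sample metric scores for a three-example dataset.
answer_disambig_f1 = np.array([0.50, 0.60, 0.70])
answer_rouge_correctness = np.array([0.40, 0.80, 0.60])

# Old behaviour: geometric mean per sample, then averaged over the dataset.
old_dr = np.average(np.sqrt(answer_disambig_f1 * answer_rouge_correctness))

# New behaviour: geometric mean of the dataset-level averages.
new_dr = np.sqrt(np.average(answer_disambig_f1) * np.average(answer_rouge_correctness))

print(old_dr, new_dr)  # ~0.596 vs 0.600 here; equal only in special cases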

11 changes: 0 additions & 11 deletions rageval/metrics/answer_correctness/_answer_accuracy.py
Expand Up @@ -111,14 +111,3 @@ def _compute_one(
) -> float:
"""Evaluating the correctness of answer."""
return answer == gt_answer

def _compute_batch(
self,
pred_answers,
ref_answers
) -> List[float]:
"""Evaluate the correctness of a batch of answers."""
return [
self._compute_one(answer, gt_answer)
for answer, gt_answer in zip(pred_answers, ref_answers)
]
21 changes: 6 additions & 15 deletions rageval/metrics/answer_correctness/_answer_bert_score.py
@@ -1,4 +1,3 @@
import re
from dataclasses import dataclass
from typing import List, Tuple

@@ -28,7 +27,7 @@
Functions:
_clean: clean special word in sentence.
_compute_single: compute bleu score for single prediction with its references
_compute_one: compute bleu score for single prediction with its references
_compute_batch: compute bleu score for a batch of predictions with their references
Examples:
@@ -57,10 +56,10 @@
>>> metric.mtype
'AnswerCorrectness'
>>> score, results = metric.compute(dataset["answers"], dataset["gt_answers"], 1)
>>> score
0.5511645078659058
>>> results[0]
0.7265544533729553
>>> round(score, 2)
0.55
>>> round(results[0], 1)
0.7
"""


@@ -112,19 +111,11 @@ def _info(self):
reference_urls=["https://openreview.net/forum?id=SkeHuCVFDr"]
)

def _compute_single(
def _compute_one(
self,
pred_answers: str,
ref_answers: List[str]
) -> float:
"""Compute the BERTscore for a pair of predictions and references."""
P, R, F1 = self.scorer.score([pred_answers] * len(ref_answers), ref_answers)
return F1.max().tolist()

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""Compute the BERTscore for a batch of predictions and references."""
return [self._compute_single(pred, refs) for pred, refs in zip(pred_answers, ref_answers)]
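The retained _compute_one scores the prediction against every reference and keeps the highest F1. A minimal standalone sketch using the bert_score package (model choice and sentences are illustrative, not taken from the repository):

from bert_score import BERTScorer

scorer = BERTScorer(lang="en")  # downloads a default English model on first use
pred = "Paris is the capital of France."
refs = ["The capital of France is Paris.", "France's capital city is Paris."]

# Repeat the prediction once per reference, score all pairs, keep the best F1.
P, R, F1 = scorer.score([pred] * len(refs), refs)
best_f1 = F1.max().item()
print(round(best_f1, 2))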
14 changes: 3 additions & 11 deletions rageval/metrics/answer_correctness/_answer_bleu.py
@@ -26,7 +26,7 @@
Functions:
_clean: clean special word in sentence.
_compute_single: compute bleu score for single prediction with its references
_compute_one: compute bleu score for single prediction with its references
Examples:
>>> from datasets import Dataset
@@ -119,13 +119,12 @@ def _clean_special_tokens(self, sentence: str, subword: str) -> str:
sentence = re.sub(subword, "", sentence)
return sentence

def _compute_single(
def _compute_one(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""Compute the bleu score of a batch of answers."""

scores = []
bleu = datasets.load_metric("bleu")
for output, gt_answers in zip(pred_answers, ref_answers):
@@ -162,13 +161,6 @@ def compute(
references.append(reference)
bleu_result = bleu.compute(predictions=predictions, references=references)
bleu_score = bleu_result['bleu']
scores = self._compute_single(pred_answers, ref_answers)
scores = self._compute_one(pred_answers, ref_answers)

return bleu_score, scores

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
pass
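For reference, the bleu object used above comes from datasets.load_metric("bleu"), which expects pre-tokenized inputs: each prediction as a list of tokens and each reference set as a list of token lists. A minimal sketch of that API (the sentences are made up; note that load_metric has been deprecated in recent datasets releases in favour of the separate evaluate package):

import datasets

bleu = datasets.load_metric("bleu")
predictions = [["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]]
references = [[["the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog"]]]

result = bleu.compute(predictions=predictions, references=references)
print(result["bleu"])  # corpus-level BLEU in [0, 1]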
71 changes: 28 additions & 43 deletions rageval/metrics/answer_correctness/_answer_chrf.py
@@ -1,7 +1,9 @@
from dataclasses import dataclass
from typing import List, Tuple
from typing import List, Tuple, Optional

import datasets
from sacrebleu.metrics import CHRF
import numpy as np

from rageval.metrics import Metric, add_attribute

@@ -116,12 +118,15 @@ def __init__(
Ensure all parent classes are initialized.
"""
super().__init__()
self.char_order = char_order
self.word_order = word_order
self.beta = beta
self.lowercase = lowercase
self.whitespace = whitespace
self.eps_smoothing = eps_smoothing

self.chrf = CHRF(
char_order=char_order,
word_order=word_order,
beta=beta,
lowercase=lowercase,
whitespace=whitespace,
eps_smoothing=eps_smoothing
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
@@ -138,7 +143,7 @@ def _info(self):
"gt_answers": datasets.Sequence(datasets.Value("string"))
}
),
codebase_urls=["https://github.com/huggingface/datasets/blob/main/metrics/chrf/chrf.py"],
codebase_urls=["https://github.com/mjpost/sacreBLEU#chrf--chrf"],
reference_urls=[
"https://aclanthology.org/W15-3049.pdf",
"https://aclanthology.org/W17-4770",
@@ -152,49 +157,29 @@ def _validate_data(
ref_answers: List[List[str]]
) -> None:
"""Validate the input dataset."""
super()._validate_data(pred_answers, ref_answers)
if not all(isinstance(answer, str) for answer in pred_answers):
raise ValueError("The type of pred_answers should be a string.")
if not all(isinstance(a, list) and all(isinstance(item, str) for item in a) for a in ref_answers):
raise ValueError("The type of ref_answers should be a list of strings.")

def _compute_one(
self,
pred_answer: str,
ref_answers: List[str]
) -> float:
"""Compute the metric for a single sentence against a single (or multiple) reference(s)."""
return self.chrf.sentence_score(pred_answer, ref_answers).score

def compute(
self,
pred_answers: List[str],
ref_answers: List[List[str]],
batch_size: int
batch_size: Optional[int] = None,
) -> Tuple[float, List[float]]:
"""Evaluate the predictions against references."""
"""Corpus score takes into account all the answers as two corpora and returns the F1 score of the corpus, which is not equal to the average of the chrF scores of the individual (pred, refs) pair."""
self._validate_data(pred_answers, ref_answers)
chrf = datasets.load_metric("chrf")
result = chrf.compute(
predictions=pred_answers,
references=ref_answers,
char_order=self.char_order,
word_order=self.word_order,
beta=self.beta,
lowercase=self.lowercase,
whitespace=self.whitespace,
eps_smoothing=self.eps_smoothing
)['score']
scores = [
chrf.compute(
predictions=[pred_answers[i]],
references=[ref_answers[i]],
char_order=self.char_order,
word_order=self.word_order,
beta=self.beta,
lowercase=self.lowercase,
whitespace=self.whitespace,
eps_smoothing=self.eps_smoothing
)['score']
for i in range(len(pred_answers))
]

return result, scores

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
pass
scores = self._compute_batch(pred_answers, ref_answers)
ref_answers = np.array(ref_answers)
ref_answers = ref_answers.T.tolist()
return self.chrf.corpus_score(pred_answers, ref_answers).score, scores
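A minimal sketch of the corpus-versus-sentence distinction described in the docstring, using sacrebleu's CHRF directly (the sentences are made up; each prediction has a single reference here for brevity):

from sacrebleu.metrics import CHRF

chrf = CHRF()
preds = ["the cat sat on the mat", "dogs bark at strangers"]
refs = [["a cat sat on the mat"], ["dogs often bark at strangers"]]

# Per-sample scores, as _compute_one returns.
sentence_scores = [chrf.sentence_score(p, r).score for p, r in zip(preds, refs)]

# Corpus-level score: sacrebleu expects references grouped into parallel
# streams, hence the transpose of the per-sample reference lists.
streams = [list(stream) for stream in zip(*refs)]
corpus_score = chrf.corpus_score(preds, streams).score

print(sum(sentence_scores) / len(sentence_scores), corpus_score)  # generally not equal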
4 changes: 2 additions & 2 deletions rageval/metrics/answer_correctness/_answer_claim_recall.py
@@ -145,8 +145,8 @@ def _compute_one(

def _compute_batch(
self,
pred_answers,
ref_answers
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""
Evaluate the correctness of a batch of answers.
19 changes: 2 additions & 17 deletions rageval/metrics/answer_correctness/_answer_disambig_f1.py
@@ -3,6 +3,7 @@
from collections import Counter
from dataclasses import dataclass
from typing import List
from tqdm import tqdm

import datasets
import numpy as np
@@ -176,20 +177,4 @@ def _compute_one(
ref_answers: List[str]
) -> float:
"""Evaluate the disambig f1 score of an answer."""
scores = []
for ref_answer in ref_answers:
score = self._f1_score(pred_answer, ref_answer)
scores.append(score)

return np.max(scores)

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""Evaluate the disambig f1 score of a batch of answers."""
return [
self._compute_one(pred_answer, gt_answer)
for pred_answer, gt_answer in zip(pred_answers, ref_answers)
]
return np.max([self._f1_score(pred_answer, ref_answer) for ref_answer in ref_answers])
11 changes: 0 additions & 11 deletions rageval/metrics/answer_correctness/_answer_edit_distance.py
@@ -123,14 +123,3 @@ def _compute_one(
dp[i][j] = min(dp[i][j], dp[i - 1][j - 1])

return dp[m][n] / m

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[str],
) -> List[float]:
"""Evaluate the similarity of a batch of answers."""
return [
self._compute_one(pred_answer, reference)
for pred_answer, reference in zip(pred_answers, ref_answers)
]
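For context on the value returned above, here is a self-contained sketch of the normalized edit distance (a standard Levenshtein DP divided by the prediction length, matching the retained dp[m][n] / m; the function and variable names are illustrative):

def normalized_edit_distance(pred: str, ref: str) -> float:
    m, n = len(pred), len(ref)
    # dp[i][j] = edit distance between pred[:i] and ref[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == ref[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[m][n] / m


print(normalized_edit_distance("kitten", "sitting"))  # 3 edits / 6 chars = 0.5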
12 changes: 0 additions & 12 deletions rageval/metrics/answer_correctness/_answer_exact_match.py
@@ -122,15 +122,3 @@ def _compute_one(self, pred_answer: str, short_answers: List[List[str]]) -> float:
else:
acc.append(False)
return np.average(acc)

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[List[str]]]
) -> List[float]:
"""Compute the correctness of a batch of answers."""

return [
self._compute_one(prediction, short_answers)
for prediction, short_answers in zip(pred_answers, ref_answers)
]
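The snippet above shows only the tail of _compute_one; roughly, it checks for each group of acceptable short answers whether any of them occurs in the prediction, then averages the hits. A rough standalone sketch of that pattern (the function name and example strings are invented):

import numpy as np

def short_answer_em(pred_answer: str, short_answers: list) -> float:
    # short_answers: one list of acceptable strings per sub-question
    acc = []
    for candidates in short_answers:
        acc.append(any(candidate in pred_answer for candidate in candidates))
    return float(np.average(acc))

print(short_answer_em(
    "Plato was born in Athens around 428 BC.",
    [["Athens"], ["428 BC", "427 BC"]],
))  # 1.0: both groups are matched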