Commit
Merge remote-tracking branch 'origin/main' into add_text_length
Wenshansilvia committed Sep 25, 2024
2 parents 9a38067 + 019ffe3 commit e2f9273
Showing 20 changed files with 140 additions and 280 deletions.
30 changes: 13 additions & 17 deletions benchmarks/ASQA/asqa_benchmark.py
@@ -18,6 +18,12 @@ class ASQABenchmark(BaseBenchmark):
AnswerEMCorrectness(ignore_case=True),
AnswerDisambigF1Correctness()]

ground_truths = {
"answer_disambig_f1": "long_answers",
"answer_rouge_correctness": "long_answers",
"answer_exact_match": "short_answers"
}

def __init__(self) -> None:
"""Initialization."""
super().__init__()
@@ -38,42 +44,32 @@ def _evaluate(self, ) -> Tuple[Dict[Any, Any], Dataset]:
if not self.is_existed("long_answers"):
self.dataset = self.dataset.map(lambda example: {"long_answers": [ann["long_answer"] for ann in example["annotations"]]})

ground_truths = {
"answer_disambig_f1": "long_answers",
"answer_rouge_correctness": "long_answers",
"answer_exact_match": "short_answers"
}

results = {}
for m in self.metrics:
if m.name in ground_truths:
if m.name in self.ground_truths:
print(f"Calculating {m.name}...")

if self.is_existed(m.name):
# Remove the metric column if it already exists
self.dataset = self.dataset.remove_columns(m.name)
if not self.is_existed(ground_truths[m.name]):
if not self.is_existed(self.ground_truths[m.name]):
# Check if the ground truth column exists
raise ValueError(f"The column {ground_truths[m.name]} is not in the dataset. Please check the column names.")
raise ValueError(f"The column {self.ground_truths[m.name]} is not in the dataset. Please check the column names.")

avg_scores, scores = m.compute(
self.dataset["answers"],
self.dataset[ground_truths[m.name]]
self.dataset[self.ground_truths[m.name]]
)
results[m.name] = avg_scores
self.dataset = self.dataset.add_column(m.name, scores)

print(f"{m.name}: {avg_scores}")

if self.is_existed("answer_rouge_correctness") and self.is_existed("answer_disambig_f1"):
if self.is_existed("DR_score"):
self.dataset = self.dataset.remove_columns("DR_score")
# Note that the DR score is the geometric mean of the dataset-level RougeL and DisambigF1 scores, i.e. sqrt(RougeL * DisambigF1) computed over the whole dataset rather than averaged per sample.
print("Calculating DR score...")
def dr_score(d:dict):
d['DR_score'] = np.sqrt(d["answer_disambig_f1"] * d["answer_rouge_correctness"])
return d
self.dataset = self.dataset.map(dr_score)
results["DR_score"] = np.average(self.dataset["DR_score"])
results["DR_score"] = np.sqrt(np.average(self.dataset["answer_disambig_f1"]) * np.average(self.dataset["answer_rouge_correctness"]))
print(f"DR_score: {results['DR_score']}")

return results, self.dataset
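For context, the change above replaces the per-sample DR computation with a dataset-level one, and the two generally give different numbers. A minimal sketch with made-up scores (the values below are illustrative only, not from the benchmark):

import numpy as np

# Hypothetical per-sample metric scores for a three-example dataset.
answer_disambig_f1 = np.array([0.50, 0.60, 0.70])
answer_rouge_correctness = np.array([0.40, 0.80, 0.60])

# Old behaviour: geometric mean per sample, then averaged over the dataset.
old_dr = np.average(np.sqrt(answer_disambig_f1 * answer_rouge_correctness))

# New behaviour: geometric mean of the dataset-level averages.
new_dr = np.sqrt(np.average(answer_disambig_f1) * np.average(answer_rouge_correctness))

print(old_dr, new_dr)  # ~0.596 vs 0.600 here; equal only in special cases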

11 changes: 0 additions & 11 deletions rageval/metrics/answer_correctness/_answer_accuracy.py
Expand Up @@ -111,14 +111,3 @@ def _compute_one(
) -> float:
"""Evaluating the correctness of answer."""
return answer == gt_answer

def _compute_batch(
self,
pred_answers,
ref_answers
) -> List[float]:
"""Evaluate the correctness of a batch of answers."""
return [
self._compute_one(answer, gt_answer)
for answer, gt_answer in zip(pred_answers, ref_answers)
]
21 changes: 6 additions & 15 deletions rageval/metrics/answer_correctness/_answer_bert_score.py
@@ -1,4 +1,3 @@
import re
from dataclasses import dataclass
from typing import List, Tuple

@@ -28,7 +27,7 @@
Functions:
_clean: clean special word in sentence.
_compute_single: compute bleu score for single prediction with its references
_compute_one: compute bleu score for single prediction with its references
_compute_batch: compute bleu score for a batch of predictions with their references
Examples:
@@ -57,10 +56,10 @@
>>> metric.mtype
'AnswerCorrectness'
>>> score, results = metric.compute(dataset["answers"], dataset["gt_answers"], 1)
>>> score
0.5511645078659058
>>> results[0]
0.7265544533729553
>>> round(score, 2)
0.55
>>> round(results[0], 1)
0.7
"""


@@ -112,19 +111,11 @@ def _info(self):
reference_urls=["https://openreview.net/forum?id=SkeHuCVFDr"]
)

def _compute_single(
def _compute_one(
self,
pred_answers: str,
ref_answers: List[str]
) -> float:
"""Compute the BERTscore for a pair of predictions and references."""
P, R, F1 = self.scorer.score([pred_answers] * len(ref_answers), ref_answers)
return F1.max().tolist()

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""Compute the BERTscore for a batch of predictions and references."""
return [self._compute_single(pred, refs) for pred, refs in zip(pred_answers, ref_answers)]
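The retained _compute_one scores the prediction against every reference and keeps the highest F1. A minimal standalone sketch using the bert_score package (model choice and sentences are illustrative, not taken from the repository):

from bert_score import BERTScorer

scorer = BERTScorer(lang="en")  # downloads a default English model on first use
pred = "Paris is the capital of France."
refs = ["The capital of France is Paris.", "France's capital city is Paris."]

# Repeat the prediction once per reference, score all pairs, keep the best F1.
P, R, F1 = scorer.score([pred] * len(refs), refs)
best_f1 = F1.max().item()
print(round(best_f1, 2))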
14 changes: 3 additions & 11 deletions rageval/metrics/answer_correctness/_answer_bleu.py
@@ -26,7 +26,7 @@
Functions:
_clean: clean special word in sentence.
_compute_single: compute bleu score for single prediction with its references
_compute_one: compute bleu score for single prediction with its references
Examples:
>>> from datasets import Dataset
@@ -119,13 +119,12 @@ def _clean_special_tokens(self, sentence: str, subword: str) -> str:
sentence = re.sub(subword, "", sentence)
return sentence

def _compute_single(
def _compute_one(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""Compute the bleu score of a batch of answers."""

scores = []
bleu = datasets.load_metric("bleu")
for output, gt_answers in zip(pred_answers, ref_answers):
@@ -162,13 +161,6 @@ def compute(
references.append(reference)
bleu_result = bleu.compute(predictions=predictions, references=references)
bleu_score = bleu_result['bleu']
scores = self._compute_single(pred_answers, ref_answers)
scores = self._compute_one(pred_answers, ref_answers)

return bleu_score, scores

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
pass
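For reference, the bleu object used above comes from datasets.load_metric("bleu"), which expects pre-tokenized inputs: each prediction as a list of tokens and each reference set as a list of token lists. A minimal sketch of that API (the sentences are made up; note that load_metric has been deprecated in recent datasets releases in favour of the separate evaluate package):

import datasets

bleu = datasets.load_metric("bleu")
predictions = [["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]]
references = [[["the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog"]]]

result = bleu.compute(predictions=predictions, references=references)
print(result["bleu"])  # corpus-level BLEU in [0, 1]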
71 changes: 28 additions & 43 deletions rageval/metrics/answer_correctness/_answer_chrf.py
@@ -1,7 +1,9 @@
from dataclasses import dataclass
from typing import List, Tuple
from typing import List, Tuple, Optional

import datasets
from sacrebleu.metrics import CHRF
import numpy as np

from rageval.metrics import Metric, add_attribute

@@ -116,12 +118,15 @@ def __init__(
Ensure all parent classes are initialized.
"""
super().__init__()
self.char_order = char_order
self.word_order = word_order
self.beta = beta
self.lowercase = lowercase
self.whitespace = whitespace
self.eps_smoothing = eps_smoothing

self.chrf = CHRF(
char_order=char_order,
word_order=word_order,
beta=beta,
lowercase=lowercase,
whitespace=whitespace,
eps_smoothing=eps_smoothing
)

def __repr__(self) -> str:
""":return: Formatted string representation of the metric."""
@@ -138,7 +143,7 @@ def _info(self):
"gt_answers": datasets.Sequence(datasets.Value("string"))
}
),
codebase_urls=["https://github.com/huggingface/datasets/blob/main/metrics/chrf/chrf.py"],
codebase_urls=["https://github.com/mjpost/sacreBLEU#chrf--chrf"],
reference_urls=[
"https://aclanthology.org/W15-3049.pdf",
"https://aclanthology.org/W17-4770",
@@ -152,49 +157,29 @@ def _validate_data(
ref_answers: List[List[str]]
) -> None:
"""Validate the input dataset."""
super()._validate_data(pred_answers, ref_answers)
if not all(isinstance(answer, str) for answer in pred_answers):
raise ValueError("The type of pred_answers should be a string.")
if not all(isinstance(a, list) and all(isinstance(item, str) for item in a) for a in ref_answers):
raise ValueError("The type of ref_answers should be a list of strings.")

def _compute_one(
self,
pred_answer: str,
ref_answers: List[str]
) -> float:
"""Compute the metric for a single sentence against a single (or multiple) reference(s)."""
return self.chrf.sentence_score(pred_answer, ref_answers).score

def compute(
self,
pred_answers: List[str],
ref_answers: List[List[str]],
batch_size: int
batch_size: Optional[int] = None,
) -> Tuple[float, List[float]]:
"""Evaluate the predictions against references."""
"""Corpus score takes into account all the answers as two corpora and returns the F1 score of the corpus, which is not equal to the average of the chrF scores of the individual (pred, refs) pair."""
self._validate_data(pred_answers, ref_answers)
chrf = datasets.load_metric("chrf")
result = chrf.compute(
predictions=pred_answers,
references=ref_answers,
char_order=self.char_order,
word_order=self.word_order,
beta=self.beta,
lowercase=self.lowercase,
whitespace=self.whitespace,
eps_smoothing=self.eps_smoothing
)['score']
scores = [
chrf.compute(
predictions=[pred_answers[i]],
references=[ref_answers[i]],
char_order=self.char_order,
word_order=self.word_order,
beta=self.beta,
lowercase=self.lowercase,
whitespace=self.whitespace,
eps_smoothing=self.eps_smoothing
)['score']
for i in range(len(pred_answers))
]

return result, scores

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
pass
scores = self._compute_batch(pred_answers, ref_answers)
ref_answers = np.array(ref_answers)
ref_answers = ref_answers.T.tolist()
return self.chrf.corpus_score(pred_answers, ref_answers).score, scores
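A minimal sketch of the corpus-versus-sentence distinction described in the docstring, using sacrebleu's CHRF directly (the sentences are made up; each prediction has a single reference here for brevity):

from sacrebleu.metrics import CHRF

chrf = CHRF()
preds = ["the cat sat on the mat", "dogs bark at strangers"]
refs = [["a cat sat on the mat"], ["dogs often bark at strangers"]]

# Per-sample scores, as _compute_one returns.
sentence_scores = [chrf.sentence_score(p, r).score for p, r in zip(preds, refs)]

# Corpus-level score: sacrebleu expects references grouped into parallel
# streams, hence the transpose of the per-sample reference lists.
streams = [list(stream) for stream in zip(*refs)]
corpus_score = chrf.corpus_score(preds, streams).score

print(sum(sentence_scores) / len(sentence_scores), corpus_score)  # generally not equal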
4 changes: 2 additions & 2 deletions rageval/metrics/answer_correctness/_answer_claim_recall.py
@@ -145,8 +145,8 @@ def _compute_one(

def _compute_batch(
self,
pred_answers,
ref_answers
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""
Evaluate the correctness of a batch of answers.
19 changes: 2 additions & 17 deletions rageval/metrics/answer_correctness/_answer_disambig_f1.py
@@ -3,6 +3,7 @@
from collections import Counter
from dataclasses import dataclass
from typing import List
from tqdm import tqdm

import datasets
import numpy as np
@@ -176,20 +177,4 @@ def _compute_one(
ref_answers: List[str]
) -> float:
"""Evaluate the disambig f1 score of an answer."""
scores = []
for ref_answer in ref_answers:
score = self._f1_score(pred_answer, ref_answer)
scores.append(score)

return np.max(scores)

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[str]]
) -> List[float]:
"""Evaluate the disambig f1 score of a batch of answers."""
return [
self._compute_one(pred_answer, gt_answer)
for pred_answer, gt_answer in zip(pred_answers, ref_answers)
]
return np.max([self._f1_score(pred_answer, ref_answer) for ref_answer in ref_answers])
11 changes: 0 additions & 11 deletions rageval/metrics/answer_correctness/_answer_edit_distance.py
@@ -123,14 +123,3 @@ def _compute_one(
dp[i][j] = min(dp[i][j], dp[i - 1][j - 1])

return dp[m][n] / m

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[str],
) -> List[float]:
"""Evaluate the similarity of a batch of answers."""
return [
self._compute_one(pred_answer, reference)
for pred_answer, reference in zip(pred_answers, ref_answers)
]
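For context on the value returned above, here is a self-contained sketch of the normalized edit distance (a standard Levenshtein DP divided by the prediction length, matching the retained dp[m][n] / m; the function and variable names are illustrative):

def normalized_edit_distance(pred: str, ref: str) -> float:
    m, n = len(pred), len(ref)
    # dp[i][j] = edit distance between pred[:i] and ref[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == ref[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[m][n] / m


print(normalized_edit_distance("kitten", "sitting"))  # 3 edits / 6 chars = 0.5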
12 changes: 0 additions & 12 deletions rageval/metrics/answer_correctness/_answer_exact_match.py
@@ -122,15 +122,3 @@ def _compute_one(self, pred_answer: str, short_answers: List[List[str]]) -> float:
else:
acc.append(False)
return np.average(acc)

def _compute_batch(
self,
pred_answers: List[str],
ref_answers: List[List[List[str]]]
) -> List[float]:
"""Compute the correctness of a batch of answers."""

return [
self._compute_one(prediction, short_answers)
for prediction, short_answers in zip(pred_answers, ref_answers)
]
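The snippet above shows only the tail of _compute_one; roughly, it checks for each group of acceptable short answers whether any of them occurs in the prediction, then averages the hits. A rough standalone sketch of that pattern (the function name and example strings are invented):

import numpy as np

def short_answer_em(pred_answer: str, short_answers: list) -> float:
    # short_answers: one list of acceptable strings per sub-question
    acc = []
    for candidates in short_answers:
        acc.append(any(candidate in pred_answer for candidate in candidates))
    return float(np.average(acc))

print(short_answer_em(
    "Plato was born in Athens around 428 BC.",
    [["Athens"], ["428 BC", "427 BC"]],
))  # 1.0: both groups are matched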