diff --git a/benchmarks/ASQA/asqa_benchmark.py b/benchmarks/ASQA/asqa_benchmark.py index 355777a..b0cb559 100644 --- a/benchmarks/ASQA/asqa_benchmark.py +++ b/benchmarks/ASQA/asqa_benchmark.py @@ -18,6 +18,12 @@ class ASQABenchmark(BaseBenchmark): AnswerEMCorrectness(ignore_case=True), AnswerDisambigF1Correctness()] + ground_truths = { + "answer_disambig_f1": "long_answers", + "answer_rouge_correctness": "long_answers", + "answer_exact_match": "short_answers" + } + def __init__(self) -> None: """Initialization.""" super().__init__() @@ -38,27 +44,21 @@ def _evaluate(self, ) -> Tuple[Dict[Any, Any], Dataset]: if not self.is_existed("long_answers"): self.dataset = self.dataset.map(lambda example: {"long_answers": [ann["long_answer"] for ann in example["annotations"]]}) - ground_truths = { - "answer_disambig_f1": "long_answers", - "answer_rouge_correctness": "long_answers", - "answer_exact_match": "short_answers" - } - results = {} for m in self.metrics: - if m.name in ground_truths: + if m.name in self.ground_truths: print(f"Calculating {m.name}...") if self.is_existed(m.name): # Remove the metric column if it already exists self.dataset = self.dataset.remove_columns(m.name) - if not self.is_existed(ground_truths[m.name]): + if not self.is_existed(self.ground_truths[m.name]): # Check if the ground truth column exists - raise ValueError(f"The column {ground_truths[m.name]} is not in the dataset. Please check the column names.") + raise ValueError(f"The column {self.ground_truths[m.name]} is not in the dataset. Please check the column names.") avg_scores, scores = m.compute( self.dataset["answers"], - self.dataset[ground_truths[m.name]] + self.dataset[self.ground_truths[m.name]] ) results[m.name] = avg_scores self.dataset = self.dataset.add_column(m.name, scores) @@ -66,14 +66,10 @@ def _evaluate(self, ) -> Tuple[Dict[Any, Any], Dataset]: print(f"{m.name}: {avg_scores}") if self.is_existed("answer_rouge_correctness") and self.is_existed("answer_disambig_f1"): - if self.is_existed("DR_score"): - self.dataset = self.dataset.remove_columns("DR_score") + # Note that the DR score is the geometric mean of the dataset-level RougeL and DisambigF1 scores, i.e. sqrt(RougeL * DisambigF1) computed over the whole dataset rather than averaged per sample.
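Before the replacement code, a quick standalone sketch of the distinction the comment draws (not part of the patch; the numbers are invented and only numpy is assumed):

    import numpy as np

    # Invented per-sample scores for two examples.
    disambig_f1 = np.array([0.4, 0.8])
    rouge_l = np.array([0.5, 0.7])

    # Dataset-level DR, as computed below: geometric mean of the two column averages.
    dr_dataset = np.sqrt(disambig_f1.mean() * rouge_l.mean())   # sqrt(0.6 * 0.6) = 0.6

    # Per-sample DR averaged afterwards (the behaviour this patch removes).
    dr_per_sample = np.sqrt(disambig_f1 * rouge_l).mean()       # ~0.598

In general the two quantities differ, which is why the comment spells out the dataset-level formula explicitly.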
print("Calculating DR score...") - def dr_score(d:dict): - d['DR_score'] = np.sqrt(d["answer_disambig_f1"] * d["answer_rouge_correctness"]) - return d - self.dataset = self.dataset.map(dr_score) - results["DR_score"] = np.average(self.dataset["DR_score"]) + results["DR_score"] = np.sqrt(np.average(self.dataset["answer_disambig_f1"]) * np.average(self.dataset["answer_rouge_correctness"])) + print(f"DR_score: {results['DR_score']}") return results, self.dataset diff --git a/rageval/metrics/answer_correctness/_answer_accuracy.py b/rageval/metrics/answer_correctness/_answer_accuracy.py index f9d55f3..740847a 100644 --- a/rageval/metrics/answer_correctness/_answer_accuracy.py +++ b/rageval/metrics/answer_correctness/_answer_accuracy.py @@ -111,14 +111,3 @@ def _compute_one( ) -> float: """Evaluating the correctness of answer.""" return answer == gt_answer - - def _compute_batch( - self, - pred_answers, - ref_answers - ) -> List[float]: - """Evaluate the correctness of a batch of answers.""" - return [ - self._compute_one(answer, gt_answer) - for answer, gt_answer in zip(pred_answers, ref_answers) - ] diff --git a/rageval/metrics/answer_correctness/_answer_bert_score.py b/rageval/metrics/answer_correctness/_answer_bert_score.py index 1234b83..e509357 100644 --- a/rageval/metrics/answer_correctness/_answer_bert_score.py +++ b/rageval/metrics/answer_correctness/_answer_bert_score.py @@ -1,4 +1,3 @@ -import re from dataclasses import dataclass from typing import List, Tuple @@ -28,7 +27,7 @@ Functions: _clean: clean special word in sentence. - _compute_single: compute bleu score for single prediction with its references + _compute_one: compute bleu score for single prediction with its references _compute_batch: compute bleu score for a batch of predictions with their references Examples: @@ -57,10 +56,10 @@ >>> metric.mtype 'AnswerCorrectness' >>> score, results = metric.compute(dataset["answers"], dataset["gt_answers"], 1) - >>> score - 0.5511645078659058 - >>> results[0] - 0.7265544533729553 + >>> round(score, 2) + 0.55 + >>> round(results[0], 1) + 0.7 """ @@ -112,7 +111,7 @@ def _info(self): reference_urls=["https://openreview.net/forum?id=SkeHuCVFDr"] ) - def _compute_single( + def _compute_one( self, pred_answers: str, ref_answers: List[str] @@ -120,11 +119,3 @@ def _compute_single( """Compute the BERTscore for a pair of predictions and references.""" P, R, F1 = self.scorer.score([pred_answers] * len(ref_answers), ref_answers) return F1.max().tolist() - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> List[float]: - """Compute the BERTscore for a batch of predictions and references.""" - return [self._compute_single(pred, refs) for pred, refs in zip(pred_answers, ref_answers)] diff --git a/rageval/metrics/answer_correctness/_answer_bleu.py b/rageval/metrics/answer_correctness/_answer_bleu.py index 6537b96..666fdcb 100644 --- a/rageval/metrics/answer_correctness/_answer_bleu.py +++ b/rageval/metrics/answer_correctness/_answer_bleu.py @@ -26,7 +26,7 @@ Functions: _clean: clean special word in sentence. 
- _compute_single: compute bleu score for single prediction with its references + _compute_one: compute bleu score for single prediction with its references Examples: >>> from datasets import Dataset @@ -119,13 +119,12 @@ def _clean_special_tokens(self, sentence: str, subword: str) -> str: sentence = re.sub(subword, "", sentence) return sentence - def _compute_single( + def _compute_one( self, pred_answers: List[str], ref_answers: List[List[str]] ) -> List[float]: """Compute the bleu score of a batch of answers.""" - scores = [] bleu = datasets.load_metric("bleu") for output, gt_answers in zip(pred_answers, ref_answers): @@ -162,13 +161,6 @@ def compute( references.append(reference) bleu_result = bleu.compute(predictions=predictions, references=references) bleu_score = bleu_result['bleu'] - scores = self._compute_single(pred_answers, ref_answers) + scores = self._compute_one(pred_answers, ref_answers) return bleu_score, scores - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> List[float]: - pass diff --git a/rageval/metrics/answer_correctness/_answer_chrf.py b/rageval/metrics/answer_correctness/_answer_chrf.py index 912d43a..a031340 100644 --- a/rageval/metrics/answer_correctness/_answer_chrf.py +++ b/rageval/metrics/answer_correctness/_answer_chrf.py @@ -1,7 +1,9 @@ from dataclasses import dataclass -from typing import List, Tuple +from typing import List, Tuple, Optional import datasets +from sacrebleu.metrics import CHRF +import numpy as np from rageval.metrics import Metric, add_attribute @@ -116,12 +118,15 @@ def __init__( Ensure all parent classes are initialized. """ super().__init__() - self.char_order = char_order - self.word_order = word_order - self.beta = beta - self.lowercase = lowercase - self.whitespace = whitespace - self.eps_smoothing = eps_smoothing + + self.chrf = CHRF( + char_order=char_order, + word_order=word_order, + beta=beta, + lowercase=lowercase, + whitespace=whitespace, + eps_smoothing=eps_smoothing + ) def __repr__(self) -> str: """:return: Formatted string representation of the metric.""" @@ -138,7 +143,7 @@ def _info(self): "gt_answers": datasets.Sequence(datasets.Value("string")) } ), - codebase_urls=["https://github.com/huggingface/datasets/blob/main/metrics/chrf/chrf.py"], + codebase_urls=["https://github.com/mjpost/sacreBLEU#chrf--chrf"], reference_urls=[ "https://aclanthology.org/W15-3049.pdf", "https://aclanthology.org/W17-4770", @@ -152,49 +157,29 @@ def _validate_data( ref_answers: List[List[str]] ) -> None: """Validate the input dataset.""" + super()._validate_data(pred_answers, ref_answers) if not all(isinstance(answer, str) for answer in pred_answers): raise ValueError("The type of pred_answers should be a string.") if not all(isinstance(a, list) and all(isinstance(item, str) for item in a) for a in ref_answers): raise ValueError("The type of ref_answers should be a list of strings.") + def _compute_one( + self, + pred_answer: str, + ref_answers: List[str] + ) -> float: + """Compute the metric for a single sentence against a single (or multiple) reference(s).""" + return self.chrf.sentence_score(pred_answer, ref_answers).score + def compute( self, pred_answers: List[str], ref_answers: List[List[str]], - batch_size: int + batch_size: Optional[int] = None, ) -> Tuple[float, List[float]]: - """Evaluate the predictions against references.""" + """Corpus score takes into account all the answers as two corpora and returns the F1 score of the corpus, which is not equal to the average of the chrF scores of the 
individual (pred, refs) pair.""" self._validate_data(pred_answers, ref_answers) - chrf = datasets.load_metric("chrf") - result = chrf.compute( - predictions=pred_answers, - references=ref_answers, - char_order=self.char_order, - word_order=self.word_order, - beta=self.beta, - lowercase=self.lowercase, - whitespace=self.whitespace, - eps_smoothing=self.eps_smoothing - )['score'] - scores = [ - chrf.compute( - predictions=[pred_answers[i]], - references=[ref_answers[i]], - char_order=self.char_order, - word_order=self.word_order, - beta=self.beta, - lowercase=self.lowercase, - whitespace=self.whitespace, - eps_smoothing=self.eps_smoothing - )['score'] - for i in range(len(pred_answers)) - ] - - return result, scores - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> List[float]: - pass + scores = self._compute_batch(pred_answers, ref_answers) + ref_answers = np.array(ref_answers) + ref_answers = ref_answers.T.tolist() + return self.chrf.corpus_score(pred_answers, ref_answers).score, scores diff --git a/rageval/metrics/answer_correctness/_answer_claim_recall.py b/rageval/metrics/answer_correctness/_answer_claim_recall.py index 36af3d7..bcd2d48 100644 --- a/rageval/metrics/answer_correctness/_answer_claim_recall.py +++ b/rageval/metrics/answer_correctness/_answer_claim_recall.py @@ -145,8 +145,8 @@ def _compute_one( def _compute_batch( self, - pred_answers, - ref_answers + pred_answers: List[str], + ref_answers: List[List[str]] ) -> List[float]: """ Evaluate the correctness of a batch of answers. diff --git a/rageval/metrics/answer_correctness/_answer_disambig_f1.py b/rageval/metrics/answer_correctness/_answer_disambig_f1.py index 262071b..3aa2a38 100644 --- a/rageval/metrics/answer_correctness/_answer_disambig_f1.py +++ b/rageval/metrics/answer_correctness/_answer_disambig_f1.py @@ -3,6 +3,7 @@ from collections import Counter from dataclasses import dataclass from typing import List +from tqdm import tqdm import datasets import numpy as np @@ -176,20 +177,4 @@ def _compute_one( ref_answers: List[str] ) -> float: """Evaluate the disambig f1 score of an answer.""" - scores = [] - for ref_answer in ref_answers: - score = self._f1_score(pred_answer, ref_answer) - scores.append(score) - - return np.max(scores) - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> List[float]: - """Evaluate the disambig f1 score of a batch of answers.""" - return [ - self._compute_one(pred_answer, gt_answer) - for pred_answer, gt_answer in zip(pred_answers, ref_answers) - ] + return np.max([self._f1_score(pred_answer, ref_answer) for ref_answer in ref_answers]) diff --git a/rageval/metrics/answer_correctness/_answer_edit_distance.py b/rageval/metrics/answer_correctness/_answer_edit_distance.py index 191c3de..4f610bf 100644 --- a/rageval/metrics/answer_correctness/_answer_edit_distance.py +++ b/rageval/metrics/answer_correctness/_answer_edit_distance.py @@ -123,14 +123,3 @@ def _compute_one( dp[i][j] = min(dp[i][j], dp[i - 1][j - 1]) return dp[m][n] / m - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[str], - ) -> List[float]: - """Evaluate the similarity of a batch of answers.""" - return [ - self._compute_one(pred_answer, reference) - for pred_answer, reference in zip(pred_answers, ref_answers) - ] diff --git a/rageval/metrics/answer_correctness/_answer_exact_match.py b/rageval/metrics/answer_correctness/_answer_exact_match.py index 9381359..24fd183 100644 --- 
a/rageval/metrics/answer_correctness/_answer_exact_match.py +++ b/rageval/metrics/answer_correctness/_answer_exact_match.py @@ -122,15 +122,3 @@ def _compute_one(self, pred_answer: str, short_answers: List[List[str]]) -> floa else: acc.append(False) return np.average(acc) - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[List[str]]] - ) -> List[float]: - """Compute the correctness of a batch of answers.""" - - return [ - self._compute_one(prediction, short_answers) - for prediction, short_answers in zip(pred_answers, ref_answers) - ] diff --git a/rageval/metrics/answer_correctness/_answer_f1.py b/rageval/metrics/answer_correctness/_answer_f1.py index 8e86cb0..0cfc98f 100644 --- a/rageval/metrics/answer_correctness/_answer_f1.py +++ b/rageval/metrics/answer_correctness/_answer_f1.py @@ -2,10 +2,10 @@ import string from collections import Counter from dataclasses import dataclass -from typing import List import datasets import numpy as np +from typing import Union, Iterable, List from rageval.metrics import Metric, add_attribute @@ -63,13 +63,14 @@ class AnswerF1Correctness(Metric): ALIAS = ['answer_f1'] - def __init__(self): + def __init__(self, normalize: bool = True): """ Explicitly initialize AnswerF1Correctness. Ensure all parent classes are initialized. """ super().__init__() + self.normalize = normalize def __repr__(self) -> str: """:return: Formatted string representation of the metric.""" @@ -90,31 +91,22 @@ def _info(self): reference_urls=[] ) - def _normalize_text(self, s: str) -> str: + def _normalize_text(self, s: str) -> List[str]: def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text) - def white_space_fix(text): - return ' '.join(text.split()) - def remove_punc(text): exclude = set(string.punctuation) return ''.join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() - return white_space_fix(remove_articles(remove_punc(lower(s)))) + return remove_articles(remove_punc(lower(s))).split() - def _f1_score(self, pred: str, ref: str) -> float: + def _f1_score(self, preds: Iterable, refs: Iterable) -> float: """Compute the f1 score between pred and ref.""" - normalized_prediction = self._normalize_text(pred) - normalized_ground_truth = self._normalize_text(ref) - - prediction_tokens = normalized_prediction.split() - ground_truth_tokens = normalized_ground_truth.split() - - pred_counter = Counter(prediction_tokens) - ref_counter = Counter(ground_truth_tokens) + pred_counter = Counter(preds) + ref_counter = Counter(refs) tp = sum((pred_counter & ref_counter).values()) fp = sum((pred_counter - ref_counter).values()) @@ -129,24 +121,14 @@ def _f1_score(self, pred: str, ref: str) -> float: def _compute_one( self, - pred_answer: str, - ref_answers: List[str] + pred_answer: Union[str, Iterable], + ref_answers: Union[List[str], Iterable] ) -> float: """Evaluate the f1 score of an answer.""" - scores = [] - for ref_answer in ref_answers: - score = self._f1_score(pred_answer, ref_answer) - scores.append(score) + if self.normalize: + pred_answer = self._normalize_text(pred_answer) + ref_answers = [self._normalize_text(ref_answer) for ref_answer in ref_answers] - return np.max(scores) + scores = [self._f1_score(pred_answer, ref_answer) for ref_answer in ref_answers] - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> List[float]: - """Evaluate the f1 score of a batch of answers.""" - return [ - self._compute_one(pred_answer, ref_answer) - for pred_answer, ref_answer in 
zip(pred_answers, ref_answers) - ] + return np.max(scores) diff --git a/rageval/metrics/answer_correctness/_answer_lcs_ratio.py b/rageval/metrics/answer_correctness/_answer_lcs_ratio.py index e8a230b..60aaf8f 100644 --- a/rageval/metrics/answer_correctness/_answer_lcs_ratio.py +++ b/rageval/metrics/answer_correctness/_answer_lcs_ratio.py @@ -5,8 +5,7 @@ from rageval.metrics import Metric, add_attribute _DESCRIPTION = """\ -The AnswerLCSRatio is to measure the similarity between answer and gt_answer by calculating the longest common \ -subsequence. +The AnswerLCSRatio is to measure the similarity between answer and gt_answer by calculating the longest common subsequence. This is a very traditional method, but to this day, some work is still being carried out using it, such as \ https://ieeexplore.ieee.org/abstract/document/10172590. @@ -118,14 +117,3 @@ def _compute_one( pre = tmp return dp[-1] / m - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[str] - ) -> List[float]: - """Evaluate the similarity of a batch of answers.""" - return [ - self._compute_one(pred_answer, ref_answer) - for pred_answer, ref_answer in zip(pred_answers, ref_answers) - ] diff --git a/rageval/metrics/answer_correctness/_answer_rouge_correctness.py b/rageval/metrics/answer_correctness/_answer_rouge_correctness.py index f1810e2..175910a 100644 --- a/rageval/metrics/answer_correctness/_answer_rouge_correctness.py +++ b/rageval/metrics/answer_correctness/_answer_rouge_correctness.py @@ -117,15 +117,3 @@ def _compute_one(self, pred_answer: str, ref_answers: List[str]) -> float: """Evaluate the ROUGE between a single answer and groundtruth answers.""" score = self.scorer.score_multi(ref_answers, pred_answer) return score[self.rouge_type].fmeasure - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> List[float]: - """Evaluate the ROUGE of a batch of answers.""" - results = [ - self._compute_one(pred_answer, ref_answer) - for pred_answer, ref_answer in zip(pred_answers, ref_answers) - ] - return results diff --git a/rageval/metrics/answer_correctness/_answer_ter.py b/rageval/metrics/answer_correctness/_answer_ter.py index 95bbfe0..e2a3440 100644 --- a/rageval/metrics/answer_correctness/_answer_ter.py +++ b/rageval/metrics/answer_correctness/_answer_ter.py @@ -2,7 +2,8 @@ from typing import List, Tuple import datasets -from datasets import Dataset +from sacrebleu.metrics import TER +import numpy as np from rageval.metrics import Metric, add_attribute @@ -48,7 +49,9 @@ >>> metric = rl.metrics.AnswerTERCorrectness() >>> metric.mtype 'AnswerCorrectness' - >>> score, results = metric.compute(dataset['answers'], dataset['gt_answers'], 1) + >>> score, results = metric.compute(dataset['answers'], dataset['gt_answers']) + >>> assert score == 110.00000000000001 + >>> assert results[0] == 25.0 """ _CITATION = """\ @@ -104,10 +107,12 @@ def __init__( Ensure all parent classes are initialized. 
""" super().__init__() - self.normalized = normalized - self.ignore_punct = ignore_punct - self.support_zh_ja_chars = support_zh_ja_chars - self.case_sensitive = case_sensitive + self.ter = TER( + normalized=normalized, + no_punct=ignore_punct, + asian_support=support_zh_ja_chars, + case_sensitive=case_sensitive + ) def __repr__(self) -> str: """:return: Formatted string representation of the metric.""" @@ -124,7 +129,7 @@ def _info(self): "gt_answers": datasets.Sequence(datasets.Value("string")) } ), - codebase_urls=["https://github.com/huggingface/datasets/blob/main/metrics/ter/ter.py"], + codebase_urls=["https://github.com/mjpost/sacreBLEU#ter"], reference_urls=["https://aclanthology.org/2006.amta-papers.25", "https://www.aclweb.org/anthology/W18-6319"] ) @@ -134,43 +139,28 @@ def _validate_data( ref_answers: List[List[str]] ) -> None: """Validate the input predictions and references.""" + super()._validate_data(pred_answers, ref_answers) if not all(isinstance(pred_answer, str) for pred_answer in pred_answers): raise ValueError("The type of pred_answers should be a list of strings.") if not all(isinstance(reference_list, list) and all(isinstance(reference, str) for reference in reference_list) for reference_list in ref_answers): raise ValueError("The type of ref_answers should be a list of lists of strings.") + def _compute_one( + self, + pred_answer: str, + ref_answers: List[str] + ) -> float: + """Compute the TER score of a single answer.""" + return self.ter.sentence_score(pred_answer, ref_answers).score + def compute( self, pred_answers: List[str], ref_answers: List[List[str]], - batch_size: int, - ) -> Tuple[float, Dataset]: + ) -> Tuple[float, List[float]]: """Evaluate the dataset.""" - ter = datasets.load_metric("ter") - result = ter.compute( - predictions=pred_answers, - references=ref_answers, - normalized=self.normalized, - ignore_punct=self.ignore_punct, - support_zh_ja_chars=self.support_zh_ja_chars, - case_sensitive=self.case_sensitive - ) - scores = [ - ter.compute( - predictions=[pred_answers[i]], - references=[ref_answers[i]], - normalized=self.normalized, - ignore_punct=self.ignore_punct, - support_zh_ja_chars=self.support_zh_ja_chars, - case_sensitive=self.case_sensitive - )['score'] - for i in range(len(pred_answers)) - ] - return result, scores - - def _compute_batch( - self, - pred_answers: List[str], - ref_answers: List[List[str]] - ) -> list: - pass + self._validate_data(pred_answers, ref_answers) + scores = self._compute_batch(pred_answers, ref_answers) + ref_answers = np.array(ref_answers) + ref_answers = ref_answers.T.tolist() + return self.ter.corpus_score(pred_answers, ref_answers).score, scores diff --git a/rageval/metrics/answer_groundedness/_answer_citation_precision.py b/rageval/metrics/answer_groundedness/_answer_citation_precision.py index 198fa21..8a121e2 100644 --- a/rageval/metrics/answer_groundedness/_answer_citation_precision.py +++ b/rageval/metrics/answer_groundedness/_answer_citation_precision.py @@ -192,7 +192,6 @@ def _compute_batch( Then, average over all statements in the LLM answer. Finally, average over all scores of each answer. 
""" - results = [] for answer, context in tqdm(zip(answers, contexts)): citation_correct, citation_total = self._compute_one(answer, context) diff --git a/rageval/metrics/answer_groundedness/_answer_citation_recall.py b/rageval/metrics/answer_groundedness/_answer_citation_recall.py index 8a16356..128a987 100644 --- a/rageval/metrics/answer_groundedness/_answer_citation_recall.py +++ b/rageval/metrics/answer_groundedness/_answer_citation_recall.py @@ -179,9 +179,4 @@ def _compute_batch( Then, average over all statements in the LLM answer. Finally, average over all scores of each answer. """ - - results = [] - for answer, context in tqdm(zip(answers, contexts)): - r = self._compute_one(answer, context) - results.append(r) - return results + return super()._compute_batch(pred_answers=answers, ref_answers=contexts) diff --git a/rageval/metrics/base.py b/rageval/metrics/base.py index 296d681..dd6999f 100644 --- a/rageval/metrics/base.py +++ b/rageval/metrics/base.py @@ -49,7 +49,7 @@ def __init__( @abstractmethod def name(self) -> str: """The metric name.""" - ... + ... # pragma: no cover def _info(self) -> MetricInfo: """Construct the MetricInfo object. See `datasets.MetricInfo` for details. @@ -61,7 +61,7 @@ def _info(self) -> MetricInfo: info: (datasets.MetricInfo) The metrics information """ - raise NotImplementedError + raise NotImplementedError # pragma: no cover def _validate_data( self, @@ -77,7 +77,7 @@ def compute( self, pred_answers: Optional[Iterable] = None, ref_answers: Optional[Iterable] = None, - batch_size: int = None, + batch_size: Optional[int] = None, *args: Optional[Iterable], ) -> Tuple[float, List[float]]: """ @@ -85,47 +85,46 @@ def compute( Return average scores of all inputs and a score list for each example. """ - if ref_answers: - self._validate_data(pred_answers, ref_answers, *args) - scores = [] - length = len(pred_answers) - if batch_size: - for start in tqdm(range(0, length, batch_size)): - end = start + batch_size - end = end if end < length else length - score = self._compute_batch( - pred_answers[start:end], - ref_answers[start:end], - *[arg[start:end] for arg in args], - ) - scores.extend(score) - else: - scores = self._compute_batch(pred_answers, ref_answers, *args) - else: - scores = [] - length = len(pred_answers) - if batch_size: - for start in tqdm(range(0, length, batch_size)): - end = start + batch_size - end = end if end < length else length - score = self._compute_batch( - pred_answers[start:end], - *[arg[start:end] for arg in args], - ) - scores.extend(score) - else: - scores = self._compute_batch(pred_answers, *args) + self._validate_data(pred_answers, ref_answers, *args) + # scores = [] + # length = len(pred_answers) + # if batch_size: + # for start in tqdm(range(0, length, batch_size)): + # end = start + batch_size + # end = end if end < length else length + # score = self._compute_batch( + # pred_answers[start:end], + # ref_answers[start:end], + # *[arg[start:end] for arg in args], + # ) + # scores.extend(score) + # else: + scores = self._compute_batch(pred_answers, ref_answers, *args) return np.average(scores), scores @abstractmethod + def _compute_one( + self, + pred_answers: Optional[Iterable] = None, + ref_answers: Optional[Iterable] = None, + *args: Optional[Iterable] + ) -> float: + ... # pragma: no cover + def _compute_batch( self, pred_answers: Optional[Iterable] = None, ref_answers: Optional[Iterable] = None, *args: Optional[Iterable] ) -> List[float]: - ... 
+ """Compute the metric for a batch of predictions and references.""" + scores = [] + for pred, refs in tqdm(zip(pred_answers, ref_answers), + desc=f"Computing {self.name}", + total=len(pred_answers)): + scores.append(self._compute_one(pred, refs)) + return scores @dataclass @@ -140,4 +139,4 @@ def __init__(self, model: Callable): @abstractmethod def parse_llm_result(self, prompts: List[str], result: LLMResult): """Parse the LLM Result based on the Prompt.""" - ... + ... # pragma: no cover diff --git a/rageval/metrics/context_adequacy/_context_recall.py b/rageval/metrics/context_adequacy/_context_recall.py index 11d9051..d3e0176 100644 --- a/rageval/metrics/context_adequacy/_context_recall.py +++ b/rageval/metrics/context_adequacy/_context_recall.py @@ -62,7 +62,7 @@ >>> assert 0 <= score <= 1 """ -_CITATION = """\ +_CITATION = """ @misc{ragas, author= {explodinggradients}, year = {2023}, diff --git a/tests/units/test_answer_chrf.py b/tests/units/test_answer_chrf.py index f1442cc..b15ccf3 100644 --- a/tests/units/test_answer_chrf.py +++ b/tests/units/test_answer_chrf.py @@ -26,7 +26,7 @@ def testset(sample): @pytest.mark.slow -def test_case_on_answer_ter(testset): +def test_case_on_answer_chrf(testset): metric = AnswerCHRFCorrectness() assert metric.name == "answer_chrf" assert metric.mtype == 'AnswerCorrectness' diff --git a/tests/units/test_answer_f1.py b/tests/units/test_answer_f1.py index d5b4915..f6091b4 100644 --- a/tests/units/test_answer_f1.py +++ b/tests/units/test_answer_f1.py @@ -32,8 +32,10 @@ def testset(sample): @pytest.mark.slow def test_case_on_answer_f1(testset): - metric = AnswerF1Correctness() + metric = AnswerF1Correctness(normalize=True) assert metric.name == "answer_f1" assert metric.mtype == 'AnswerCorrectness' score, results = metric.compute(testset['answers'], testset['gt_answers'], 1) assert 0 <= score <= 1 + score = metric._compute_one(testset['answers'][0], testset['gt_answers'][0]) + assert 0 <= score <= 1 diff --git a/tests/units/test_answer_ter.py b/tests/units/test_answer_ter.py index 6c54e06..c425f98 100644 --- a/tests/units/test_answer_ter.py +++ b/tests/units/test_answer_ter.py @@ -32,5 +32,7 @@ def test_case_on_answer_ter(testset): metric = AnswerTERCorrectness() assert metric.name == "answer_ter" assert metric.mtype == 'AnswerCorrectness' - score, results = metric.compute(testset['answers'], testset['gt_answers'], 1) + score, results = metric.compute(testset['answers'], testset['gt_answers']) + assert score == 110.00000000000001 + assert results[0] == 25.0