
Commit

fix comments
Wenshansilvia committed Sep 25, 2024
1 parent e2f9273 commit 08e4ef6
Showing 8 changed files with 64 additions and 49 deletions.
25 changes: 25 additions & 0 deletions README.md
@@ -46,6 +46,31 @@ The search task is to retrieve relevant documents from the knowledge base.

* [Context Recall](./rageval/metrics/_context_recall.py): also known as *Context Recall* in [RAGAS framework](https://github.com/explodinggradients/ragas).

## Setup Evaluator LLMs

Some metric evaluations rely on LLMs as evaluators. You can either call OpenAI's API directly or deploy an open-source model as an OpenAI-compatible RESTful API for evaluation.

- OpenAI

```python
import os

os.environ["OPENAI_API_KEY"] = "<your_openai_api_key>"
```


- Open source LLMs

Please use [vllm](https://github.com/vllm-project/vllm) to set up an OpenAI-compatible API server for open-source LLMs. For example, the following command deploys a Llama-3-8B model hosted on Hugging Face (a client-side sketch follows the command):

```bash
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--tensor-parallel-size 8 \
--dtype auto \
--api-key sk-123456789 \
--gpu-memory-utilization 0.9 \
--port 5000
```
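
Once the server is up, the deployed model can be queried through any OpenAI-compatible client. Below is a minimal sketch using the `openai` Python package (v1+); the base URL, API key, and model name simply mirror the vLLM command above and are illustrative rather than part of this repository's API.

```python
from openai import OpenAI

# Point an OpenAI-compatible client at the local vLLM server started above.
client = OpenAI(
    base_url="http://localhost:5000/v1",  # matches --port 5000
    api_key="sk-123456789",               # matches --api-key
)

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)
```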

## Benchmark Results

### 1. [ASQA benchmark](benchmarks/ASQA/README.md)
2 changes: 1 addition & 1 deletion rageval/metrics/__init__.py
@@ -23,7 +23,7 @@

# Metrics about the answer informativeness
##from .answer_informative._claim_num import ClaimNum
from .answer_informative._text_length import TextLength
from .answer_informativeness._text_length import TextLength
##from .answer_informative._repetitiveness import Repetitiveness
##from .answer_informative._pairwise_accuracy import PairwiseAccuracy

rageval/metrics/answer_informativeness/_text_length.py
@@ -1,6 +1,8 @@
from dataclasses import dataclass
from typing import List
from typing import List, Optional, Iterable
import numpy as np
from transformers import AutoTokenizer


import datasets

@@ -36,12 +38,12 @@
>>> tokenize_model = rl.models.Tokenizer("Qwen/Qwen2-0.5B-Instruct")
>>> metric = rl.metrics.TextLength(tokenize_model=tokenize_model)
>>> metric.mtype
'answer_informative'
'answer_informativeness'
"""


@dataclass
@add_attribute('mtype', 'answer_informative')
@add_attribute('mtype', 'answer_informativeness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextLength(Metric):
"""Estimates the text length of answers."""
@@ -56,7 +58,7 @@ def __init__(self, tokenize_model: str = "Qwen/Qwen2-0.5B-Instruct"):
Ensure all parent classes are initialized.
"""
self.tokenize_model = tokenize_model
self.tokenizer = AutoTokenizer.from_pretrained(tokenize_model)
super().__init__()

def __repr__(self) -> str:
@@ -81,15 +83,8 @@ def _info(self):
def _compute_one(
self,
answer: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the text length of answer."""
length = len(self.tokenize_model.tokenizer(answer, return_tensors="pt")['input_ids'][0])
length = len(self.tokenizer(answer, return_tensors="pt")['input_ids'][0])
return length

def _compute_batch(
self,
pred_answers,
) -> List[float]:
"""Evaluate the text length of a batch of answers."""
results = [self._compute_one(answer) for answer in pred_answers]
return results
52 changes: 23 additions & 29 deletions rageval/metrics/base.py
@@ -65,18 +65,19 @@ def _info(self) -> MetricInfo:

def _validate_data(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObjects: Optional[Iterable] = None,
BObjects: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> None:
"""Validate the of the input dataset."""
if len(pred_answers) != len(ref_answers) or any(len(pred_answers) != len(arg) for arg in args):
raise ValueError("The length of predictions and references should be the same.")
if (AObjects and BObjects):
if len(AObjects) != len(BObjects) or any(len(AObjects) != len(arg) for arg in args):
raise ValueError("The length of predictions and references should be the same.")

def compute(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObjects: Optional[Iterable] = None,
BObjects: Optional[Iterable] = None,
batch_size: Optional[int] = None,
*args: Optional[Iterable],
) -> Tuple[float, List[float]]:
@@ -85,45 +86,38 @@ def compute(
Return the average score over all inputs and a list of per-example scores.
"""
self._validate_data(pred_answers, ref_answers, *args)
# scores = []
# length = len(pred_answers)
# if batch_size:
# for start in tqdm(range(0, length, batch_size)):
# end = start + batch_size
# end = end if end < length else length
# score = self._compute_batch(
# pred_answers[start:end],
# ref_answers[start:end],
# *[arg[start:end] for arg in args],
# )
# scores.extend(score)
# else:
scores = self._compute_batch(pred_answers, ref_answers, *args)
self._validate_data(AObjects, BObjects, *args)
scores = self._compute_batch(AObjects, BObjects, *args)

return np.average(scores), scores

@abstractmethod
def _compute_one(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObject: Optional[Iterable] = None,
BObject: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> float:
... # pragma: no cover

def _compute_batch(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObjects: Optional[Iterable] = None,
BObjects: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> List[float]:
"""Compute the metric for a batch of predictions and references."""
scores = []
for pred, refs in tqdm(zip(pred_answers, ref_answers),
desc=f"Computing {self.name}",
total=len(pred_answers)):
scores.append(self._compute_one(pred, refs))
if (AObjects and BObjects): # if both columns exist
for AObject, BObject in tqdm(zip(AObjects, BObjects),
desc=f"Computing {self.name}",
total=len(AObjects)):
scores.append(self._compute_one(AObject, BObject))
else:
for AObject in tqdm(AObjects,
desc=f"Computing {self.name}",
total=len(AObjects)):
scores.append(self._compute_one(AObject))
return scores
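
For context, this refactor generalizes the former `pred_answers`/`ref_answers` pair into two generic columns (`AObjects`/`BObjects`), so single-input metrics such as `TextLength` can share the same batch loop as pairwise metrics. The standalone sketch below illustrates that dispatch logic with hypothetical names; it is not the library's actual API.

```python
from typing import Callable, Iterable, List, Optional

def compute_batch(
    score_one: Callable[..., float],
    a_objects: Iterable,
    b_objects: Optional[Iterable] = None,
) -> List[float]:
    """Score each example, pairing the two columns only when both are given."""
    if b_objects is not None:
        return [score_one(a, b) for a, b in zip(a_objects, b_objects)]
    return [score_one(a) for a in a_objects]

# Single-column metric (e.g., answer length in whitespace tokens):
lengths = compute_batch(lambda ans: float(len(ans.split())), ["a b c", "d e"])
# Two-column metric (e.g., exact match between paired answers):
matches = compute_batch(lambda a, b: float(a == b), ["x", "y"], ["x", "z"])
```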


13 changes: 7 additions & 6 deletions tests/units/test_text_length.py → test_text_length.py
@@ -5,7 +5,7 @@
import rageval as rl


@pytest.fixture(scope='module')
#@pytest.fixture(scope='module')
def sample():
test_case = {
#"questions": [
@@ -20,17 +20,18 @@ def sample():
return test_case


@pytest.fixture(scope='module')
#@pytest.fixture(scope='module')
def testset(sample):
ds = Dataset.from_dict(sample)
return ds


@pytest.mark.slow
#@pytest.mark.slow
def test_case_on_text_length(testset):
tokenize_model = rl.models.Tokenizer("Qwen/Qwen2-0.5B-Instruct")
metric = TextLength(tokenize_model=tokenize_model)
metric = TextLength(tokenize_model="Qwen/Qwen2-0.5B-Instruct")
assert metric.name == "text_length"
score, results = metric.compute(testset["answers"], batch_size = 1)
score, results = metric.compute(testset["answers"])
print(score, results)
assert score == 75.0

test_case_on_text_length(testset(sample()))
