
Commit

fix comments
Wenshansilvia committed Sep 25, 2024
1 parent e2f9273 commit 08e4ef6
Showing 8 changed files with 64 additions and 49 deletions.
25 changes: 25 additions & 0 deletions README.md
@@ -46,6 +46,31 @@ The search task is to retrieve relevant documents from the knowledge base.

* [Context Recall](./rageval/metrics/_context_recall.py): also known as *Context Recall* in [RAGAS framework](https://github.com/explodinggradients/ragas).

## Setup Evaluator LLMs

Some metric evaluations rely on LLMs as evaluators. You can either call OpenAI's API directly or deploy an open-source model as an OpenAI-compatible RESTful API for evaluation.

- OpenAI

```python
import os

os.environ["OPENAI_API_KEY"] = "<your_openai_api_key>"
```


- Open source LLMs

Please use [vllm](https://github.com/vllm-project/vllm) to set up an OpenAI-compatible API server for open-source LLMs. For example, the following command deploys a Llama-3-8B model hosted on Hugging Face (a client-side sketch follows the command):

```bash
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--tensor-parallel-size 8 \
--dtype auto \
--api-key sk-123456789 \
--gpu-memory-utilization 0.9 \
--port 5000
```
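
Once the server is up, the deployed model can be queried through any OpenAI-compatible client. Below is a minimal sketch using the `openai` Python package (v1+); the base URL, API key, and model name simply mirror the vLLM command above and are illustrative rather than part of this repository's API.

```python
from openai import OpenAI

# Point an OpenAI-compatible client at the local vLLM server started above.
client = OpenAI(
    base_url="http://localhost:5000/v1",  # matches --port 5000
    api_key="sk-123456789",               # matches --api-key
)

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)
```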

## Benchmark Results

### 1. [ASQA benchmark](benchmarks/ASQA/README.md)
2 changes: 1 addition & 1 deletion rageval/metrics/__init__.py
@@ -23,7 +23,7 @@

# Metrics about the answer informativeness
##from .answer_informative._claim_num import ClaimNum
from .answer_informative._text_length import TextLength
from .answer_informativeness._text_length import TextLength
##from .answer_informative._repetitiveness import Repetitiveness
##from .answer_informative._pairwise_accuracy import PairwiseAccuracy

rageval/metrics/answer_informativeness/_text_length.py
@@ -1,6 +1,8 @@
from dataclasses import dataclass
from typing import List
from typing import List, Optional, Iterable
import numpy as np
from transformers import AutoTokenizer


import datasets

@@ -36,12 +38,12 @@
>>> tokenize_model = rl.models.Tokenizer("Qwen/Qwen2-0.5B-Instruct")
>>> metric = rl.metrics.TextLength(tokenize_model=tokenize_model)
>>> metric.mtype
'answer_informative'
'answer_informativeness'
"""


@dataclass
@add_attribute('mtype', 'answer_informative')
@add_attribute('mtype', 'answer_informativeness')
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextLength(Metric):
"""Estimates the text length of answers."""
@@ -56,7 +58,7 @@ def __init__(self, tokenize_model: str = "Qwen/Qwen2-0.5B-Instruct"):
Ensure all parent classes are initialized.
"""
self.tokenize_model = tokenize_model
self.tokenizer = AutoTokenizer.from_pretrained(tokenize_model)
super().__init__()

def __repr__(self) -> str:
@@ -81,15 +83,8 @@ def _info(self):
def _compute_one(
self,
answer: str,
*args: Optional[Iterable],
) -> float:
"""Evaluating the text length of answer."""
length = len(self.tokenize_model.tokenizer(answer, return_tensors="pt")['input_ids'][0])
length = len(self.tokenizer(answer, return_tensors="pt")['input_ids'][0])
return length

def _compute_batch(
self,
pred_answers,
) -> List[float]:
"""Evaluate the text length of a batch of answers."""
results = [self._compute_one(answer) for answer in pred_answers]
return results
52 changes: 23 additions & 29 deletions rageval/metrics/base.py
@@ -65,18 +65,19 @@ def _info(self) -> MetricInfo:

def _validate_data(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObjects: Optional[Iterable] = None,
BObjects: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> None:
"""Validate the of the input dataset."""
if len(pred_answers) != len(ref_answers) or any(len(pred_answers) != len(arg) for arg in args):
raise ValueError("The length of predictions and references should be the same.")
if (AObjects and BObjects):
if len(AObjects) != len(BObjects) or any(len(AObjects) != len(arg) for arg in args):
raise ValueError("The length of predictions and references should be the same.")

def compute(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObjects: Optional[Iterable] = None,
BObjects: Optional[Iterable] = None,
batch_size: Optional[int] = None,
*args: Optional[Iterable],
) -> Tuple[float, List[float]]:
@@ -85,45 +86,38 @@ def compute(
Return the average score over all inputs and a list of per-example scores.
"""
self._validate_data(pred_answers, ref_answers, *args)
# scores = []
# length = len(pred_answers)
# if batch_size:
# for start in tqdm(range(0, length, batch_size)):
# end = start + batch_size
# end = end if end < length else length
# score = self._compute_batch(
# pred_answers[start:end],
# ref_answers[start:end],
# *[arg[start:end] for arg in args],
# )
# scores.extend(score)
# else:
scores = self._compute_batch(pred_answers, ref_answers, *args)
self._validate_data(AObjects, BObjects, *args)
scores = self._compute_batch(AObjects, BObjects, *args)

return np.average(scores), scores

@abstractmethod
def _compute_one(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObject: Optional[Iterable] = None,
BObject: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> float:
... # pragma: no cover

def _compute_batch(
self,
pred_answers: Optional[Iterable] = None,
ref_answers: Optional[Iterable] = None,
AObjects: Optional[Iterable] = None,
BObjects: Optional[Iterable] = None,
*args: Optional[Iterable]
) -> List[float]:
"""Compute the metric for a batch of predictions and references."""
scores = []
for pred, refs in tqdm(zip(pred_answers, ref_answers),
desc=f"Computing {self.name}",
total=len(pred_answers)):
scores.append(self._compute_one(pred, refs))
if (AObjects and BObjects): # if both columns exist
for AObject, BObject in tqdm(zip(AObjects, BObjects),
desc=f"Computing {self.name}",
total=len(AObjects)):
scores.append(self._compute_one(AObject, BObject))
else:
for AObject in tqdm(AObjects,
desc=f"Computing {self.name}",
total=len(AObjects)):
scores.append(self._compute_one(AObject))
return scores
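
For context, this refactor generalizes the former `pred_answers`/`ref_answers` pair into two generic columns (`AObjects`/`BObjects`), so single-input metrics such as `TextLength` can share the same batch loop as pairwise metrics. The standalone sketch below illustrates that dispatch logic with hypothetical names; it is not the library's actual API.

```python
from typing import Callable, Iterable, List, Optional

def compute_batch(
    score_one: Callable[..., float],
    a_objects: Iterable,
    b_objects: Optional[Iterable] = None,
) -> List[float]:
    """Score each example, pairing the two columns only when both are given."""
    if b_objects is not None:
        return [score_one(a, b) for a, b in zip(a_objects, b_objects)]
    return [score_one(a) for a in a_objects]

# Single-column metric (e.g., answer length in whitespace tokens):
lengths = compute_batch(lambda ans: float(len(ans.split())), ["a b c", "d e"])
# Two-column metric (e.g., exact match between paired answers):
matches = compute_batch(lambda a, b: float(a == b), ["x", "y"], ["x", "z"])
```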


13 changes: 7 additions & 6 deletions tests/units/test_text_length.py → test_text_length.py
@@ -5,7 +5,7 @@
import rageval as rl


@pytest.fixture(scope='module')
#@pytest.fixture(scope='module')
def sample():
test_case = {
#"questions": [
@@ -20,17 +20,18 @@ def sample():
return test_case


@pytest.fixture(scope='module')
#@pytest.fixture(scope='module')
def testset(sample):
ds = Dataset.from_dict(sample)
return ds


@pytest.mark.slow
#@pytest.mark.slow
def test_case_on_text_length(testset):
tokenize_model = rl.models.Tokenizer("Qwen/Qwen2-0.5B-Instruct")
metric = TextLength(tokenize_model=tokenize_model)
metric = TextLength(tokenize_model="Qwen/Qwen2-0.5B-Instruct")
assert metric.name == "text_length"
score, results = metric.compute(testset["answers"], batch_size = 1)
score, results = metric.compute(testset["answers"])
print(score, results)
assert score == 75.0

test_case_on_text_length(testset(sample()))
