
Commit df441c1

feat: add ability for ragas to read from a list
We want ragas to read from both a file path as well as a list of samples.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
1 parent 5eb2310 commit df441c1

File tree

1 file changed: +23 -8 lines changed


src/instructlab/eval/ragas.py

Lines changed: 23 additions & 8 deletions
@@ -1,11 +1,15 @@
 # Standard
+from pathlib import Path
 from typing import List, TypedDict

 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd

 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass

     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.

         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
@@ -47,6 +51,19 @@ def run(
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
+        if not dataset:
+            raise ValueError(
+                "no dataset was provided, please specify the `dataset` argument"
+            )
+        if isinstance(dataset, Path):
+            input_ds = EvaluationDataset.from_pandas(
+                pd.read_json(dataset, lines=True, orient="records")
+            )
+        elif isinstance(dataset, list):
+            input_ds = EvaluationDataset.from_list(dataset)
+        else:
+            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
@@ -57,17 +74,15 @@ def run(
             timeout=3600,
         )

-        # we will be using gpt-4o for the foreseeable future, we hardcode this
-        # for consistency of answers
-        input_ds = EvaluationDataset.from_list(dataset)
-
         # default set of metrics
         metrics = [
             RubricsScore(
                 rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
             )
         ]

+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
             dataset=input_ds,
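
For reference, a minimal usage sketch of the updated run() signature (not part of this commit). It assumes the evaluator class defined in ragas.py is named RagasEvaluator and is importable as instructlab.eval.ragas, and that each Sample record carries ragas-style keys such as user_input, response, and reference; none of those details are shown in this diff. Running it also requires an OPENAI_API_KEY, since the critic model is gpt-4o.

    # Usage sketch only: class name, import path, and Sample fields are assumptions.
    from pathlib import Path

    from instructlab.eval.ragas import RagasEvaluator  # hypothetical import path

    evaluator = RagasEvaluator()

    # Option 1: pass the samples in memory as a list of dicts.
    samples = [
        {
            "user_input": "What is the capital of France?",
            "response": "The capital of France is Paris.",
            "reference": "Paris",
        }
    ]
    result_from_list = evaluator.run(dataset=samples)

    # Option 2: point run() at a JSONL file where each line is one record with the
    # same keys; run() loads it with pd.read_json(..., lines=True, orient="records")
    # and wraps the resulting DataFrame via EvaluationDataset.from_pandas.
    result_from_file = evaluator.run(dataset=Path("eval_samples.jsonl"))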
