
Commit 04fe42a

adds basic ragas eval
Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
1 parent: ce8880f

File tree: 2 files changed, +80 −0 lines

requirements.txt
src/instructlab/eval/ragas.py

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -10,3 +10,5 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+
+ragas

src/instructlab/eval/ragas.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
from .evaluator import Evaluator
from typing import List, TypedDict
from ragas.metrics import RubricsScore
from ragas.evaluation import EvaluationResult, EvaluationDataset, RunConfig, evaluate
from langchain_community.chat_models import ChatOpenAI


class Sample(TypedDict):
    # question
    user_input: str

    # model answer
    response: str

    # golden answer
    reference: str


DEFAULT_RUBRICS = {
    "score1_description": "The response is completely incorrect, inaccurate, and/or not factual.",
    "score2_description": "The response is mostly incorrect, inaccurate, and/or not factual.",
    "score3_description": "The response is somewhat correct, accurate, and/or factual.",
    "score4_description": "The response is mostly correct, accurate, and factual.",
    "score5_description": "The response is completely correct, accurate, and factual.",
}


class RagasEvaluator(Evaluator):
    # most basic implementation: we assume the user brings existing model responses
    name = "ragas"

    def __init__(self):
        pass

    def run(
        self, dataset: List[Sample], run_config: RunConfig | None = None
    ) -> EvaluationResult:
        """
        Evaluates the quality of model responses against a graded rubric.

        Args:
            dataset (List[Sample]): List of samples, each containing the question,
                model response, and golden answer.
            run_config (RunConfig | None, optional): Ragas run configuration controlling
                retries and timeouts. Defaults to None.

        Returns:
            EvaluationResult: The rubric scores for the provided samples.
        """
        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
            # are horrible and will result in half of our evaluation results being NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
                seed=42,
                timeout=3600,
            )

        input_ds = EvaluationDataset.from_list(dataset)

        # default set of metrics
        metrics = [
            RubricsScore(
                rubrics=DEFAULT_RUBRICS,
            )
        ]

        # we will be using gpt-4o for the foreseeable future, we hardcode this
        # for consistency of answers
        critic_lm = ChatOpenAI(model="gpt-4o")
        results = evaluate(
            dataset=input_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results
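
For reference, a minimal usage sketch of the new evaluator (not part of this commit). It assumes the package is importable as instructlab.eval.ragas, the dependencies added in requirements.txt are installed, and an OPENAI_API_KEY is available in the environment for the gpt-4o critic model:

# hypothetical usage sketch, not from this commit
from instructlab.eval.ragas import RagasEvaluator, Sample

samples: list[Sample] = [
    {
        "user_input": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "reference": "Paris is the capital of France.",
    },
]

evaluator = RagasEvaluator()
result = evaluator.run(dataset=samples)

# ragas EvaluationResult objects can be converted to a pandas DataFrame for inspection
print(result.to_pandas())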
