Commit

adds basic ragas eval
Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
RobotSail committed Dec 6, 2024
1 parent ce8880f commit 04fe42a
Showing 2 changed files with 80 additions and 0 deletions.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,3 +10,5 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+
+ragas
78 changes: 78 additions & 0 deletions src/instructlab/eval/ragas.py
@@ -0,0 +1,78 @@
from typing import List, TypedDict

from langchain_community.chat_models import ChatOpenAI
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
from ragas.metrics import RubricsScore

from .evaluator import Evaluator


class Sample(TypedDict):
    # question
    user_input: str

    # model answer
    response: str

    # golden answer
    reference: str


DEFAULT_RUBRICS = {
    "score1_description": "The response is completely incorrect, inaccurate, and/or not factual.",
    "score2_description": "The response is mostly incorrect, inaccurate, and/or not factual.",
    "score3_description": "The response is somewhat correct, accurate, and/or factual.",
    "score4_description": "The response is mostly correct, accurate, and factual.",
    "score5_description": "The response is completely correct, accurate, and factual.",
}


class RagasEvaluator(Evaluator):
    # most basic implementation: we assume the user brings the existing model responses
    name = "ragas"

    def __init__(self):
        pass

    def run(
        self, dataset: List[Sample], run_config: RunConfig | None = None
    ) -> EvaluationResult:
        """
        Evaluates the quality of model responses against a graded rubric.

        Args:
            dataset (List[Sample]): Samples to grade; each one holds a question
                (user_input), the model's answer (response), and a golden
                answer (reference).
            run_config (RunConfig | None, optional): Ragas run configuration
                controlling timeouts, retries, and seeding. Defaults to None.

        Returns:
            EvaluationResult: Rubric scores for the evaluated samples.
        """
        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
            # are horrible and will result in half of our evaluation results being NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
                seed=42,
                timeout=3600,
            )

        input_ds = EvaluationDataset.from_list(dataset)

        # default set of metrics
        metrics = [
            RubricsScore(
                rubrics=DEFAULT_RUBRICS,
            )
        ]

        # we will be using gpt-4o for the foreseeable future; we hardcode this
        # for consistency of answers
        critic_lm = ChatOpenAI(model="gpt-4o")
        results = evaluate(
            dataset=input_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results
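
For context, a minimal usage sketch of the new evaluator (not part of this commit): it grades one hand-written question/answer pair against the default rubric. The example data is made up, and it assumes the dependencies added above are installed and that OPENAI_API_KEY is exported, since the critic model is OpenAI's gpt-4o.

# Usage sketch (illustrative only, not part of this commit).
# Assumes OPENAI_API_KEY is set for the gpt-4o critic model.
from typing import List

from instructlab.eval.ragas import RagasEvaluator, Sample

dataset: List[Sample] = [
    {
        "user_input": "What is the capital of France?",  # question
        "response": "The capital of France is Paris.",   # model answer
        "reference": "Paris is the capital of France.",  # golden answer
    },
]

evaluator = RagasEvaluator()
result = evaluator.run(dataset=dataset)
print(result)  # aggregate rubric score reported by ragas

Because no RunConfig is passed, the call falls back to the long timeout/retry defaults defined in the evaluator, which is the intended behavior for rate-limited OpenAI tiers.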
