Commit
Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
Showing 2 changed files with 80 additions and 0 deletions.
@@ -10,3 +10,5 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+
+ragas
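For context, the hunk above adds ragas as a new dependency of this requirements file. A quick sanity check after installing the updated requirements might look like the following; this snippet is not part of the commit and only assumes that ragas exposes the metric used by the new evaluator below:

# Not part of the commit: verify the newly added dependency resolves.
import ragas
from ragas.metrics import RubricsScore  # used by the new evaluator below

print(ragas.__version__)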
@@ -0,0 +1,78 @@
from .evaluator import Evaluator
from typing import List, TypedDict
from ragas.metrics import RubricsScore
from ragas.evaluation import EvaluationResult, EvaluationDataset, RunConfig, evaluate
from langchain_community.chat_models import ChatOpenAI


class Sample(TypedDict):
    # question
    user_input: str

    # model answer
    response: str

    # golden answer
    reference: str


DEFAULT_RUBRICS = {
    "score1_description": "The response is completely incorrect, inaccurate, and/or not factual.",
    "score2_description": "The response is mostly incorrect, inaccurate, and/or not factual.",
    "score3_description": "The response is somewhat correct, accurate, and/or factual.",
    "score4_description": "The response is mostly correct, accurate, and factual.",
    "score5_description": "The response is completely correct, accurate, and factual.",
}


class RagasEvaluator(Evaluator):
    # most basic implementation, we just assume that the user will bring the existing model responses
    name = "ragas"

    def __init__(self):
        pass

    def run(
        self, dataset: List[Sample], run_config: RunConfig | None = None
    ) -> EvaluationResult:
""" | ||
Evaluates the quality of model responses against a graded rubric. | ||
Args: | ||
dataset (List[Sample]): List of model questions and ans | ||
run_config (RunConfig | None, optional): _description_. Defaults to None. | ||
Returns: | ||
EvaluationResult: _description_ | ||
""" | ||
        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
            # are horrible and will result in half of our evaluation results being NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
                seed=42,
                timeout=3600,
            )

        # we will be using gpt-4o for the foreseeable future, we hardcode this
        # for consistency of answers
        input_ds = EvaluationDataset.from_list(dataset)

        # default set of metrics
        metrics = [
            RubricsScore(
                rubrics=DEFAULT_RUBRICS,
            )
        ]

        critic_lm = ChatOpenAI(model="gpt-4o")
        results = evaluate(
            dataset=input_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results
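For illustration only, here is a minimal usage sketch of the new evaluator. It is not part of the commit; the import path, the sample data, and the assumption that OPENAI_API_KEY is set in the environment are all hypothetical:

# Hypothetical usage sketch, not part of the commit.
# Assumes OPENAI_API_KEY is exported, since RagasEvaluator builds a ChatOpenAI judge.
from ragas_evaluator import RagasEvaluator, Sample  # hypothetical module path

dataset: list[Sample] = [
    {
        "user_input": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "reference": "Paris is the capital of France.",
    },
]

evaluator = RagasEvaluator()
result = evaluator.run(dataset=dataset)

# EvaluationResult supports to_pandas() for inspecting per-sample rubric scores.
print(result.to_pandas())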