adds basic ragas eval #193
Changes from all commits: 3443ffa, 8568b13, 58880c3, 04117dd, c6b5a70, ab3d168
@@ -10,3 +10,4 @@ pandas
pandas-stubs
lm-eval>=0.4.4
httpx
ragas
@@ -0,0 +1,264 @@
# SPDX-License-Identifier: Apache-2.0
# Standard
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, TypedDict

# Third Party
from langchain_community.chat_models import ChatOpenAI
from openai import Client as OpenAIClient
from openai.types.chat import ChatCompletionMessageParam
from pandas import DataFrame, read_json
from pydantic import BaseModel, ConfigDict, Field
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
from ragas.metrics import Metric
from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
    DEFAULT_WITH_REFERENCE_RUBRICS,
    RubricsScore,
)

# Local
from .evaluator import Evaluator
from .logger_config import setup_logger

logger = setup_logger(__name__)


class Sample(TypedDict):
    """
    TypedDict of a sample that we accept when doing eval with Ragas.
    We specifically use TypedDict here to be flexible with the input data we accept.
    """

    # question
    user_input: str

    # model answer
    response: Optional[str]

    # golden answer
    reference: str


# default system prompt we'll use when none is provided. Make it private as we don't intend this to be a public object
_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
Your primary goal is to answer queries with the most up-to-date and factual information available.
Focus on delivering clear, concise, and correct responses.
If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
Your responses should prioritize accuracy over all other considerations."""

DEFAULT_SEED = 1337
DEFAULT_JUDGE_MODEL = "gpt-4o"


class ModelConfig(BaseModel):
    model_config = ConfigDict(protected_namespaces=())

    # name of the model to use.
    model_name: str

    # The system prompt to be used when applying the chat template.
    system_prompt: str = _DEFAULT_SYSTEM_PROMPT

    # "model randomness" aka likelihood of sampling something other than the likeliest token
    temperature: float = Field(default=0.0, le=1.0, ge=0.0)

    # Max amount of tokens to generate.
    max_tokens: int = 768

    # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
    seed: int = DEFAULT_SEED


class RagasEvaluator(Evaluator):
    # most basic implementation, we just assume that the user will bring the existing model responses
    name = "ragas"

    def __init__(
        self,
        student_model: ModelConfig | None = None,

Review thread on `student_model: ModelConfig | None = None,`:

- Can any of these actually be `None`?
- Yes, because the user isn't required to pass them in at initialization time.
- It might streamline the code if these are required in the constructor so you don't have to do checks below. Although you might be implementing to the standard of the …
- I see your point, but it'd be fine to leave it as-is. This is also how a lot of other libraries, including Ragas, implement similar functionality.

        run_config: RunConfig | None = None,
        student_openai_client: OpenAIClient | None = None,
        judge_model_name: str = DEFAULT_JUDGE_MODEL,
        judge_openai_api_key: str | None = None,
    ):
        self.student_model = student_model
        self.run_config = run_config
        self.student_openai_client = student_openai_client
        self.judge_model_name = judge_model_name
        self.judge_openai_api_key = judge_openai_api_key

    @staticmethod
    def _validate_dataset(df: DataFrame):
        """
        Validates whether or not the given `df` is a valid dataset of `Sample` objects.

        Args:
            df (DataFrame): DataFrame containing the dataset to be evaluated.
        """
        # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict
        # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required.
        # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required
        required_keys = {"user_input", "reference"}
        missing_keys = required_keys - set(df.columns)
        if missing_keys:
            raise ValueError(
                f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}"
            )

    def run(

Review thread on `def run(`:

- So for this a user is expected to bring a list of `Sample` objects, which hold the input, prediction, and ground truth? Are we going to provide a way to build this list of Samples from given files or lists of each category, or is this more so just for use with self-built scripts that import the `Sample` object and build?
- Updated it such that `dataset` can also be a path to a JSONL file of records matching `Sample`.

        self,
        dataset: List[Sample] | Path,
        student_model: ModelConfig | None = None,
        run_config: RunConfig | None = None,
        student_openai_client: OpenAIClient | None = None,
        judge_model_name: str | None = None,
        judge_openai_api_key: str | None = None,
    ) -> EvaluationResult:
        """
        Evaluates the quality of model responses against a graded rubric.

        When the `dataset` lacks the `response` field, `student_model` must be provided
        in order to generate the answers.

        Args:
            dataset (List[Sample] | Path):
                Can be either a list of `Sample` objects or a path to a jsonl file containing
                records matching `Sample`.
            student_model (ModelConfig | None, optional):
                When this parameter is provided, we'll attempt to use the described model in order to
                generate the responses to the given list of questions.
            run_config (RunConfig | None, optional):
                Configuration to use when running evaluations. If none is provided, then
                a default one is created containing extremely permissive settings when handling
                timeouts. This is because by default, OpenAI tier-1 usage accounts have very low
                rate limits, resulting in heavy throttling during evaluations.
            student_openai_client (openai.Client | None, optional):
                The client to use when generating responses from the student model; it must be compatible with the OpenAI API.

Review thread on `student_openai_client`:

- Is this the client for the student model or the judge model?
- This is for the student model. In this PR we are making the opinionated stance that the judge model needs to be GPT-4o for consistent results.
- That makes sense; could consider renaming the API variables so the intention is clear: …
- So the user doesn't accidentally create a client to 4o-mini thinking that they literally need a client pointing to ChatGPT.
- Yes, I think that's a good suggestion now that we've decoupled the OpenAI config from being in the student model. Good catch!

                This field is required when `student_model` is provided.
            judge_model_name (str | None, optional):
                Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified.
            judge_openai_api_key (str | None, optional):
                The API key to use for evaluating the given dataset. When this isn't provided, `OPENAI_API_KEY` is read instead.

        Returns:
            EvaluationResult: The results of all evaluations performed by Ragas
        """
        judge_model_name = (
            judge_model_name if judge_model_name else self.judge_model_name
        )
        judge_openai_api_key = (
            judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key
        )
        student_model = student_model if student_model else self.student_model
        run_config = run_config if run_config else self.run_config
        student_openai_client = (
            student_openai_client
            if student_openai_client
            else self.student_openai_client
        )

        # ensure we are in the dataframe format
        input_df = None
        if isinstance(dataset, list):
            input_df = DataFrame(dataset)
        elif isinstance(dataset, Path):
            input_df = read_json(dataset, orient="records", lines=True)

Review thread on `input_df = read_json(dataset, orient="records", lines=True)`:

- I think there's an implicit requirement here that the dataset referred to by the path is well-formed (shaped like `list[Sample]`). Could consider doing a quick check to make sure the required columns are present in the df and failing here if they aren't.
- Sure, I don't see a reason not to.

        else:
            raise TypeError(f"invalid type of dataset: {type(dataset)}")

        # this should never happen, but pylint is not smart enough to detect it
        if TYPE_CHECKING:
            assert input_df is not None

        # ensure the dataset is in the format we expect
        self._validate_dataset(input_df)

        need_to_generate_questions = "response" not in input_df.columns
        if need_to_generate_questions:
            logger.debug(
                "`response` is missing in the input dataframe columns, generating responses from the model is required."
            )
            if not student_model or not student_openai_client:
                raise ValueError(
                    "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference"
                )

        # if the student model was provided then we always generate regardless
        if student_model:
            if not student_openai_client:
                raise ValueError(
                    "`student_model` was specified but `student_openai_client` was not provided"
                )
            input_df = self._generate_answers_from_model(
                input_df, student_model, student_openai_client
            )

        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
            # are horrible and will result in half of our evaluation results being NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
                seed=DEFAULT_SEED,
                timeout=3600,
            )

        metrics = self._get_metrics()
        evaluation_ds = EvaluationDataset.from_pandas(input_df)

        # we will be using gpt-4o for the foreseeable future, we hardcode this
        # for consistency of answers
        critic_lm = ChatOpenAI(model=judge_model_name, api_key=judge_openai_api_key)
        results = evaluate(
            dataset=evaluation_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results

    def _generate_answers_from_model(
        self,
        questions: DataFrame,
        student_model: ModelConfig,
        student_openai_client: OpenAIClient,
    ) -> DataFrame:
        """
        Given a DataFrame containing a `user_input` column, generates responses from the given model
        and returns a new DataFrame containing its answers in the `response` column.
        """
        # initialize response column to write into
        updated_df = questions.copy()
        updated_df["response"] = ""

        for i, qna in updated_df.iterrows():
            messages: List[ChatCompletionMessageParam] = [
                {
                    "role": "system",
                    "content": student_model.system_prompt,
                },
                {"role": "user", "content": qna["user_input"]},
            ]
            response = student_openai_client.chat.completions.create(
                messages=messages,
                model=student_model.model_name,
                # specify the seed so we can at least try to have some reproducibility when the clients support it
                seed=42,
                max_tokens=student_model.max_tokens,
                temperature=student_model.temperature,
            )
            updated_df.at[i, "response"] = response.choices[0].message.content
        return updated_df

    @staticmethod
    def _get_metrics() -> List[Metric]:
        # default set of metrics
        return [
            RubricsScore(
                rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
            )
        ]
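
For reference, a minimal usage sketch of the evaluator added in this diff (not part of the PR itself). The module path, endpoint URL, model names, and dataset contents below are placeholder assumptions; the sketch only exercises the `RagasEvaluator`, `ModelConfig`, and `Sample` definitions shown above, covering the two paths the `run()` docstring describes: a dataset that already carries `response` values, and one where answers must first be generated from a student model through an OpenAI-compatible client.

# Usage sketch only -- module path, endpoint, and model names are assumptions.
from openai import Client

# assumed import location for the new module; adjust to the real package path
from your_package.ragas_evaluator import ModelConfig, RagasEvaluator, Sample

# Case 1: responses are already present, so only the judge model (gpt-4o by default)
# is used. The judge API key is read from OPENAI_API_KEY when not passed explicitly.
graded_dataset: list[Sample] = [
    {
        "user_input": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "reference": "Paris",
    },
]
evaluator = RagasEvaluator()
result = evaluator.run(dataset=graded_dataset)
print(result)

# Case 2: no `response` column, so answers are generated from the student model
# via an OpenAI-compatible client (for example, a locally served model) before judging.
student_client = Client(base_url="http://localhost:8000/v1", api_key="EMPTY")
student_model = ModelConfig(model_name="my-student-model")  # placeholder model name
questions_only = [
    {"user_input": "What is the capital of France?", "reference": "Paris"},
]
result = evaluator.run(
    dataset=questions_only,
    student_model=student_model,
    student_openai_client=student_client,
)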