From ab3d168d434f0098af1be8ebf6f4fa8ce2e4a40a Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Tue, 7 Jan 2025 16:07:43 -0500 Subject: [PATCH] chore: decouple tests into more atomic units Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/eval/ragas.py | 112 +++++++++++----- tests/test_ragas.py | 238 ++++++++++++++++++++-------------- 2 files changed, 220 insertions(+), 130 deletions(-) diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py index 9515a95..f0445da 100644 --- a/src/instructlab/eval/ragas.py +++ b/src/instructlab/eval/ragas.py @@ -1,13 +1,14 @@ # # SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path -from typing import List, Optional, TypedDict +from typing import TYPE_CHECKING, List, Optional, TypedDict # Third Party from langchain_community.chat_models import ChatOpenAI from openai import Client as OpenAIClient +from openai.types.chat import ChatCompletionMessageParam from pandas import DataFrame, read_json -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict, Field from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate from ragas.metrics import Metric from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private @@ -17,6 +18,9 @@ # Local from .evaluator import Evaluator +from .logger_config import setup_logger + +logger = setup_logger(__name__) class Sample(TypedDict): @@ -56,7 +60,7 @@ class ModelConfig(BaseModel): system_prompt: str = _DEFAULT_SYSTEM_PROMPT # "model randomness" aka likelihood of sampling something other than the likeliest token - temperature: float = 0.0 + temperature: float = Field(default=0.0, le=1.0, ge=0.0) # Max amount of tokens to generate. max_tokens: int = 768 @@ -64,13 +68,6 @@ class ModelConfig(BaseModel): # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes. seed: int = DEFAULT_SEED - @field_validator("temperature") - @classmethod - def check_temperature(cls, v: float) -> float: - if not 0.0 <= v <= 1.0: - raise ValueError("temperature must be between 0.0 and 1.0") - return v - class RagasEvaluator(Evaluator): # most basic implementation, we just assume that the user will bring the existing model responses @@ -80,18 +77,42 @@ def __init__( self, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, - openai_client: OpenAIClient | None = None, + student_openai_client: OpenAIClient | None = None, + judge_model_name: str = DEFAULT_JUDGE_MODEL, + judge_openai_api_key: str | None = None, ): self.student_model = student_model self.run_config = run_config - self.openai_client = openai_client + self.student_openai_client = student_openai_client + self.judge_model_name = judge_model_name + self.judge_openai_api_key = judge_openai_api_key + + @staticmethod + def _validate_dataset(df: DataFrame): + """ + Validates whether or not the given `df` is a valid dataset of `Sample` objects. + + Args: + df (DataFrame): DataFrame containing the dataset to be evaluated. + """ + # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict + # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required. 
+ # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required + required_keys = {"user_input", "reference"} + missing_keys = required_keys - set(df.columns) + if missing_keys: + raise ValueError( + f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}" + ) def run( self, dataset: List[Sample] | Path, student_model: ModelConfig | None = None, run_config: RunConfig | None = None, - openai_client: OpenAIClient | None = None, + student_openai_client: OpenAIClient | None = None, + judge_model_name: str | None = None, + judge_openai_api_key: str | None = None, ) -> EvaluationResult: """ Evaluates the quality of model responses against a graded rubric. @@ -111,21 +132,31 @@ def run( a default one is created containing extremely permissive settings when handling timeouts. This is because by default, OpenAI tier-1 usage accounts have very high rate limits resulting in heavy throttling during evaluations. - openai_client (openai.Client | None, optional): + student_openai_client (openai.Client | None, optional): The client to use when generating questions from the student model, must be compatible with the OpenAI API. This field is required when `student_model` is provided. + judge_model_name (str | None, optional): + Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified. + judge_openai_api_key (str | None, optional): + The API key to use for evaluating the given dataset. When this isn't provided, `OPENAI_API_KEY` is read instead. + Returns: EvaluationResult: The results of all evaluations performed by Ragas """ + judge_model_name = ( + judge_model_name if judge_model_name else self.judge_model_name + ) + judge_openai_api_key = ( + judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key + ) student_model = student_model if student_model else self.student_model run_config = run_config if run_config else self.run_config - openai_client = openai_client if openai_client else self.openai_client - - if not dataset: - raise ValueError( - "no dataset was provided, please specify the `dataset` argument" - ) + student_openai_client = ( + student_openai_client + if student_openai_client + else self.student_openai_client + ) # ensure we are in the dataframe format input_df = None @@ -137,22 +168,30 @@ def run( raise TypeError(f"invalid type of dataset: {type(dataset)}") # this should never happen, but pylint is not smart enough to detect it - assert input_df is not None + if TYPE_CHECKING: + assert input_df is not None + + # ensure the dataset is in the format we expect it + self._validate_dataset(input_df) need_to_generate_questions = "response" not in input_df.columns - if need_to_generate_questions and (not student_model or not openai_client): - raise ValueError( - "provided dataset doesn't contain the model `response`, but either `student_model` or `openai_client` wasn't provided for inference" + if need_to_generate_questions: + logger.debug( + "`response` is missing in the input dataframe columns, generating questions from the model is required." 
) + if not student_model or not student_openai_client: + raise ValueError( + "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference" + ) # if the student model was provided then we always generate regardless if student_model: - if not openai_client: + if not student_openai_client: raise ValueError( - "`student_model` was specified but `openai_client` was not provided" + "`student_model` was specified but `student_openai_client` was not provided" ) input_df = self._generate_answers_from_model( - input_df, student_model, openai_client + input_df, student_model, student_openai_client ) if not run_config: @@ -170,7 +209,8 @@ def run( # we will be using gpt-4o for the foreseeable future, we hardcode this # for consistency of answers - critic_lm = ChatOpenAI(model=DEFAULT_JUDGE_MODEL) + + critic_lm = ChatOpenAI(model=judge_model_name, api_key=judge_openai_api_key) results = evaluate( dataset=evaluation_ds, batch_size=4, @@ -185,7 +225,7 @@ def _generate_answers_from_model( self, questions: DataFrame, student_model: ModelConfig, - openai_client: OpenAIClient, + student_openai_client: OpenAIClient, ) -> DataFrame: """ Given a DataFrame containing `user_input` columns, generates responses from the given model @@ -196,11 +236,14 @@ def _generate_answers_from_model( updated_df["response"] = "" for i, qna in updated_df.iterrows(): - messages = [ - student_model.system_prompt, - qna["user_input"], + messages: List[ChatCompletionMessageParam] = [ + { + "role": "system", + "content": student_model.system_prompt, + }, + {"role": "user", "content": qna["user_input"]}, ] - response = openai_client.chat.completions.create( + response = student_openai_client.chat.completions.create( messages=messages, model=student_model.model_name, # specify the seed so we can at least try to have some reproducibility when the clients support it @@ -211,7 +254,8 @@ def _generate_answers_from_model( updated_df.at[i, "response"] = response.choices[0].message.content return updated_df - def _get_metrics(self) -> List[Metric]: + @staticmethod + def _get_metrics() -> List[Metric]: # default set of metrics return [ RubricsScore( diff --git a/tests/test_ragas.py b/tests/test_ragas.py index ebabb2b..1d3bb8f 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -1,4 +1,4 @@ -# # SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 # Standard from pathlib import Path from unittest.mock import MagicMock, patch @@ -8,102 +8,55 @@ from pandas import DataFrame from ragas.callbacks import ChainRun from ragas.dataset_schema import EvaluationDataset, EvaluationResult -import pandas as pd # First Party from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig class TestRagasEvaluator(unittest.TestCase): - def test_generate_answers_from_model(self): - # mock the OpenAI client to always return "london" for chat completions - user_input = "What is the capital of France?" - model_response = "London" - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.choices = [MagicMock(message=MagicMock(content=model_response))] - mock_client.chat.completions.create.return_value = mock_response - - # get answers - questions = pd.DataFrame({"user_input": [user_input]}) - student_model = ModelConfig( + def setUp(self): + # Common setup data for all tests + self.student_model_response = "Paris" + self.user_question = "What is the capital of France?" + self.golden_answer = "The capital of France is Paris." 
+ self.metric = "mocked-metric" + self.metric_score = 4.0 + self.base_ds = [ + { + "user_input": self.user_question, + "reference": self.golden_answer, + } + ] + self.student_model = ModelConfig( model_name="super-jeeves-8x700B", ) - evaluator = RagasEvaluator() - result_df = evaluator._generate_answers_from_model( - questions, student_model, mock_client - ) - - # what we expect to see - expected_df = questions.copy() - expected_df["response"] = [model_response] - - # perform the assertions - pd.testing.assert_frame_equal(result_df, expected_df) - mock_client.chat.completions.create.assert_called_once_with( - messages=[student_model.system_prompt, user_input], - model=student_model.model_name, - seed=42, - max_tokens=student_model.max_tokens, - temperature=student_model.temperature, - ) + self.run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) @patch("instructlab.eval.ragas.ChatOpenAI") - @patch("instructlab.eval.ragas.read_json") @patch("instructlab.eval.ragas.evaluate") @patch.object(RagasEvaluator, "_generate_answers_from_model") @patch.object(RagasEvaluator, "_get_metrics") - def test_run( + def test_run_with_dataset( self, mock_get_metrics: MagicMock, mock_generate_answers_from_model: MagicMock, mock_evaluate: MagicMock, - mock_read_json: MagicMock, mock_ChatOpenAI: MagicMock, ): - ######################################################################## - # SETUP EVERYTHING WE NEED FOR THE TESTS - ######################################################################## - - # These are the variables which will control the flow of the test. - # Since we have to re-construct some Ragas components under the hood, - - student_model_response = "Paris" - user_question = "What is the capital of France?" - golden_answer = "The capital of France is Paris." - metric = "mocked-metric" - metric_score = 4.0 - base_ds = [{"user_input": user_question, "reference": golden_answer}] - student_model = ModelConfig( - model_name="super-jeeves-8x700B", - ) - run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) - - # The following section takes care of mocking function return calls. - # Ragas is tricky because it has some complex data structures under the hood, - # so what we have to do is configure the intermediate outputs that we expect - # to receive from Ragas. - - mock_get_metrics.return_value = [metric] + """ + Test case 1: Directly passing a Python list/dict dataset to `RagasEvaluator.run()`. + """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] interim_df = DataFrame( { - "user_input": [user_question], - "response": [student_model_response], - "reference": [golden_answer], + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], } ) - mock_generate_answers_from_model.return_value = interim_df.copy() + mock_generate_answers_from_model.return_value = interim_df mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.choices = [ - MagicMock(message=MagicMock(content=student_model_response)) - ] - mock_client.chat.completions.create.return_value = mock_response - - # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it. - # It isn't functionally used for our purposes though. 
- _unimportant_ragas_traces = { "default": ChainRun( run_id="42", @@ -115,39 +68,86 @@ def test_run( ) } mock_evaluate.return_value = EvaluationResult( - scores=[{metric: metric_score}], + scores=[{self.metric: self.metric_score}], dataset=mocked_evaluation_ds, ragas_traces=_unimportant_ragas_traces, ) - ######################################################################## - # Test case: directly passing a dataset - ######################################################################## + # Instantiate evaluator evaluator = RagasEvaluator() + + # Run test result = evaluator.run( - dataset=base_ds, - student_model=student_model, - run_config=run_config, - openai_client=mock_client, + dataset=self.base_ds, + student_model=self.student_model, + run_config=self.run_config, + student_openai_client=MagicMock(), # We pass a mock client ) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_generate_answers_from_model.assert_called_once() mock_evaluate.assert_called_once() - mock_ChatOpenAI.assert_called_once_with(model="gpt-4o") + # we didn't provide an API key, so it expects to get `api_key=None` + mock_ChatOpenAI.assert_called_once_with(model="gpt-4o", api_key=None) - ######################################################################## - # Test case: passing a dataset in via Path to JSONL file - ######################################################################## + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run_with_dataset_via_path( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, + ): + """ + Test case 2: Passing a Path to a JSONL file (containing the dataset) to `RagasEvaluator.run()`. 
+ """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] + interim_df = DataFrame( + { + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{self.metric: self.metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + mock_read_json.return_value = DataFrame(self.base_ds) + + # Instantiate evaluator evaluator = RagasEvaluator() - mock_read_json.return_value = DataFrame(base_ds) + + # Run test result = evaluator.run( dataset=Path("dummy_path.jsonl"), - student_model=student_model, - run_config=run_config, - openai_client=mock_client, + student_model=self.student_model, + run_config=self.run_config, + student_openai_client=MagicMock(), ) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_read_json.assert_called_once_with( Path("dummy_path.jsonl"), orient="records", lines=True @@ -155,17 +155,63 @@ def test_run( mock_generate_answers_from_model.assert_called() mock_evaluate.assert_called() - ######################################################################## - # Test case: using the instance attributes - ######################################################################## + @patch("instructlab.eval.ragas.ChatOpenAI") + @patch("instructlab.eval.ragas.read_json") + @patch("instructlab.eval.ragas.evaluate") + @patch.object(RagasEvaluator, "_generate_answers_from_model") + @patch.object(RagasEvaluator, "_get_metrics") + def test_run_with_instance_attributes( + self, + mock_get_metrics: MagicMock, + mock_generate_answers_from_model: MagicMock, + mock_evaluate: MagicMock, + mock_read_json: MagicMock, + mock_ChatOpenAI: MagicMock, + ): + """ + Test case 3: Using `RagasEvaluator` instance attributes for `student_model`, `run_config`, + and `student_openai_client` instead of passing them explicitly. 
+ """ + # Prepare mocks + mock_get_metrics.return_value = [self.metric] + interim_df = DataFrame( + { + "user_input": [self.user_question], + "response": [self.student_model_response], + "reference": [self.golden_answer], + } + ) + mock_generate_answers_from_model.return_value = interim_df + mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) + _unimportant_ragas_traces = { + "default": ChainRun( + run_id="42", + parent_run_id=None, + name="root", + inputs={"system": "null", "user": "null"}, + outputs={"assistant": "null"}, + metadata={"user_id": 1337}, + ) + } + mock_evaluate.return_value = EvaluationResult( + scores=[{self.metric: self.metric_score}], + dataset=mocked_evaluation_ds, + ragas_traces=_unimportant_ragas_traces, + ) + + mock_read_json.return_value = DataFrame(self.base_ds) + + # Instantiate evaluator with instance-level configs evaluator = RagasEvaluator( - student_model=student_model, - openai_client=mock_client, - run_config=run_config, + student_model=self.student_model, + student_openai_client=MagicMock(), + run_config=self.run_config, ) - mock_read_json.return_value = DataFrame(base_ds) + + # Run test result = evaluator.run(dataset=Path("dummy_path.jsonl")) + # Assertions self.assertIsInstance(result, EvaluationResult) mock_read_json.assert_called_with( Path("dummy_path.jsonl"), orient="records", lines=True
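Reviewer note (not part of the patch): the new `_validate_dataset` guard only checks for the two keys the patch hardcodes as required, `user_input` and `reference`. A minimal sketch of what it rejects, calling the private static method directly purely for illustration:

from pandas import DataFrame

from instructlab.eval.ragas import RagasEvaluator

# `reference` is deliberately missing, so validation should fail
bad_df = DataFrame([{"user_input": "What is the capital of France?"}])
try:
    RagasEvaluator._validate_dataset(bad_df)
except ValueError as err:
    print(err)  # invalid dataset provided, missing the following keys: reference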
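Reviewer note (not part of the patch): with the `field_validator` replaced by `Field(default=0.0, le=1.0, ge=0.0)`, out-of-range temperatures are now rejected by pydantic itself at construction time. A small sketch of the expected behaviour:

from pydantic import ValidationError

from instructlab.eval.ragas import ModelConfig

try:
    ModelConfig(model_name="super-jeeves-8x700B", temperature=1.5)
except ValidationError as err:
    # pydantic reports the violated constraint on the `temperature` field
    print(err)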
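Reviewer note (not part of the patch): an end-to-end usage sketch of the renamed parameters (`student_openai_client`, `judge_model_name`, `judge_openai_api_key`). The endpoint URL, API key, and student model name below are placeholders, and the judge still reads `OPENAI_API_KEY` from the environment when `judge_openai_api_key` is not passed:

from openai import Client as OpenAIClient

from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig

# hypothetical OpenAI-compatible endpoint serving the student model
student_client = OpenAIClient(base_url="http://localhost:8000/v1", api_key="placeholder")

evaluator = RagasEvaluator(
    student_model=ModelConfig(model_name="my-student-model"),  # placeholder name
    run_config=RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30),
    student_openai_client=student_client,
    judge_model_name="gpt-4o",  # the default judge when omitted
)

# `response` is absent from the samples, so answers are generated from the
# student model before the judge scores them against the rubric.
result = evaluator.run(
    dataset=[
        {
            "user_input": "What is the capital of France?",
            "reference": "The capital of France is Paris.",
        }
    ]
)
print(result.scores)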