Commit d581e7d

chore: add unit tests for ragas evaluator

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>

1 parent 3a9e3f2 commit d581e7d
File tree

src/instructlab/eval/ragas.py
tests/test_ragas.py

2 files changed: +168, -7 lines changed
src/instructlab/eval/ragas.py

Lines changed: 7 additions & 7 deletions
@@ -1,3 +1,4 @@
+# # SPDX-License-Identifier: Apache-2.0
 # Standard
 from pathlib import Path
 from typing import List, Optional, TypedDict
@@ -53,7 +54,7 @@ class ModelConfig(BaseModel):
 
     # name of the model to use.
     model_name: str
-
+
     # The system prompt to be used when applying the chat template.
     system_prompt: str = _DEFAULT_SYSTEM_PROMPT
 
@@ -67,7 +68,7 @@ class ModelConfig(BaseModel):
     # Max amount of tokens to generate.
     max_tokens: int = 768
 
-    # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
+    # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
     seed: int = DEFAULT_SEED
 
     @field_validator("temperature")
@@ -126,15 +127,14 @@ def run(
                 "no dataset was provided, please specify the `dataset` argument"
             )
 
-        if type(dataset) not in (list, Path):
-            raise TypeError(f"invalid type of dataset: {type(dataset)}")
-
         # ensure we are in the dataframe format
         input_df = None
         if isinstance(dataset, list):
            input_df = DataFrame(dataset)
         elif isinstance(dataset, Path):
            input_df = read_json(dataset, orient="records", lines=True)
+        else:
+            raise TypeError(f"invalid type of dataset: {type(dataset)}")
 
         # this should never happen, but pylint is not smart enough to detect it
         assert input_df is not None
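
Note on the hunk above: the up-front type() check is replaced by an else branch on the isinstance chain, so unsupported dataset types are rejected at the same point where supported ones are converted. A minimal standalone sketch of that dispatch (the helper name load_dataset is illustrative, not part of the module):

# Illustrative sketch of the dispatch shown above; not the actual run() method.
from pathlib import Path
from pandas import DataFrame, read_json

def load_dataset(dataset):
    # a list of {"user_input": ..., "reference": ...} records becomes a DataFrame
    if isinstance(dataset, list):
        return DataFrame(dataset)
    # a Path is read as JSONL (one record per line)
    if isinstance(dataset, Path):
        return read_json(dataset, orient="records", lines=True)
    # anything else is rejected, mirroring the new else branch
    raise TypeError(f"invalid type of dataset: {type(dataset)}")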
@@ -192,8 +192,8 @@ def _generate_answers_from_model(
 
         for i, qna in updated_df.iterrows():
             messages = [
-                student_model.system_prompt,
-                qna["user_input"],
+                {"role": "system", "content": student_model.system_prompt},
+                {"role": "user", "content": qna["user_input"]},
             ]
             response = client.chat.completions.create(
                 messages=messages,
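
The hunk above switches the prompt from bare strings to the role/content message objects that OpenAI-compatible chat completion endpoints expect. A minimal sketch of the resulting payload shape, with placeholder values:

# Placeholder values; the real code takes these from ModelConfig.system_prompt and the dataset row.
system_prompt = "You are a helpful assistant."
user_input = "What is the capital of France?"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_input},
]
# this list is what gets passed to client.chat.completions.create(messages=messages, ...)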

tests/test_ragas.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
+# # SPDX-License-Identifier: Apache-2.0
+# Standard
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+import unittest
+
+# Third Party
+from pandas import DataFrame
+from ragas.callbacks import ChainRun
+from ragas.dataset_schema import EvaluationDataset, EvaluationResult
+import pandas as pd
+
+# First Party
+from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig, Sample
+
+
+class TestRagasEvaluator(unittest.TestCase):
+    @patch("instructlab.eval.ragas.get_openai_client")
+    def test_generate_answers_from_model(self, mock_get_openai_client):
+        # mock the OpenAI client to always return "london" for chat completions
+        mock_client = MagicMock()
+        mock_response = MagicMock()
+        mock_response.choices[0].message.content = "London"
+        mock_client.chat.completions.create.return_value = mock_response
+        mock_get_openai_client.return_value = mock_client
+
+        # get answers
+        questions = pd.DataFrame({"user_input": ["What is the capital of France?"]})
+        student_model = ModelConfig(
+            base_url="https://api.openai.com",
+            model_name="gpt-3.5-turbo",
+            api_key="test-api-key",
+        )
+        evaluator = RagasEvaluator()
+        result_df = evaluator._generate_answers_from_model(questions, student_model)
+
+        # what we expect to see
+        expected_df = questions.copy()
+        expected_df["response"] = ["London"]
+
+        # perform the assertions
+        pd.testing.assert_frame_equal(result_df, expected_df)
+        mock_get_openai_client.assert_called_once_with(
+            model_api_base=student_model.base_url, api_key=student_model.api_key
+        )
+        mock_client.chat.completions.create.assert_called_once_with(
+            messages=[student_model.system_prompt, "What is the capital of France?"],
+            model=student_model.model_name,
+            seed=42,
+            max_tokens=student_model.max_tokens,
+            temperature=student_model.temperature,
+        )
+
+    @patch("instructlab.eval.ragas.read_json")
+    @patch("instructlab.eval.ragas.evaluate")
+    @patch("instructlab.eval.ragas.ChatOpenAI")
+    @patch.object(RagasEvaluator, "_generate_answers_from_model")
+    @patch.object(RagasEvaluator, "_get_metrics")
+    def test_run(
+        self,
+        mock_get_metrics: MagicMock,
+        mock_generate_answers_from_model: MagicMock,
+        mock_ChatOpenAI: MagicMock,
+        mock_evaluate: MagicMock,
+        mock_read_json: MagicMock,
+    ):
+        ########################################################################
+        # SETUP EVERYTHING WE NEED FOR THE TESTS
+        ########################################################################
+
+        # These are the variables which will control the flow of the test.
+        # Since we have to re-construct some Ragas components under the hood,
+
+        student_model_response = "Paris"
+        user_question = "What is the capital of France?"
+        golden_answer = "The capital of France is Paris."
+        base_ds = [{"user_input": user_question, "reference": golden_answer}]
+        mocked_metric = "mocked-metric"
+        mocked_metric_score = 4.0
+
+        # The following section takes care of mocking function return calls.
+        # Ragas is tricky because it has some complex data structures under the hood,
+        # so what we have to do is configure the intermediate outputs that we expect
+        # to receive from Ragas.
+
+        mock_get_metrics.return_value = [mocked_metric]
+        interim_df = DataFrame(
+            {
+                "user_input": [user_question],
+                "response": [student_model_response],
+                "reference": [golden_answer],
+            }
+        )
+        mock_generate_answers_from_model.return_value = interim_df.copy()
+        mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
+        mock_ChatOpenAI.return_value = MagicMock()
+
+        # Ragas requires this value to instantiate an EvaluationResult object, so we must provide it.
+        # It isn't functionally used for our purposes though.
+
+        _unimportant_ragas_traces = {
+            "default": ChainRun(
+                run_id="42",
+                parent_run_id=None,
+                name="root",
+                inputs={"system": "null", "user": "null"},
+                outputs={"assistant": "null"},
+                metadata={"user_id": 1337},
+            )
+        }
+        mock_evaluate.return_value = EvaluationResult(
+            scores=[{mocked_metric: mocked_metric_score}],
+            dataset=mocked_evaluation_ds,
+            ragas_traces=_unimportant_ragas_traces,
+        )
+
+        ########################################################################
+        # Run the tests
+        ########################################################################
+
+        # Configure all other inputs that Ragas does not depend on for proper mocking
+        student_model = ModelConfig(
+            base_url="https://api.openai.com",
+            model_name="pt-3.5-turbo",
+            api_key="test-api-key",
+        )
+        run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
+        evaluator = RagasEvaluator()
+
+        ########################################################################
+        # Test case: directly passing a dataset
+        ########################################################################
+        result = evaluator.run(
+            dataset=base_ds, student_model=student_model, run_config=run_config
+        )
+
+        self.assertIsInstance(result, EvaluationResult)
+        mock_generate_answers_from_model.assert_called_once()
+        mock_evaluate.assert_called_once()
+        mock_ChatOpenAI.assert_called_once_with(model="gpt-4o")
+
+        ########################################################################
+        # Test case: passing a dataset in via Path to JSONL file
+        ########################################################################
+        mock_read_json.return_value = DataFrame(base_ds)
+        result = evaluator.run(
+            dataset=Path("dummy_path.jsonl"),
+            student_model=student_model,
+            run_config=run_config,
+        )
+
+        self.assertIsInstance(result, EvaluationResult)
+        mock_read_json.assert_called_once_with(
+            Path("dummy_path.jsonl"), orient="records", lines=True
+        )
+        mock_generate_answers_from_model.assert_called()
+        mock_evaluate.assert_called()
+
+
+if __name__ == "__main__":
+    unittest.main()
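
Taken together, the two test cases also document the intended call pattern for the evaluator. A usage sketch mirroring the tests (the endpoint, API key, and file path are the illustrative values from the tests, not working credentials):

from pathlib import Path

from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig

student_model = ModelConfig(
    base_url="https://api.openai.com",  # any OpenAI-compatible endpoint
    model_name="gpt-3.5-turbo",
    api_key="test-api-key",  # placeholder; supply a real key
)
run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
evaluator = RagasEvaluator()

# Records can be passed directly...
result = evaluator.run(
    dataset=[
        {
            "user_input": "What is the capital of France?",
            "reference": "The capital of France is Paris.",
        }
    ],
    student_model=student_model,
    run_config=run_config,
)

# ...or via a Path to a JSONL file with the same fields.
result = evaluator.run(
    dataset=Path("dummy_path.jsonl"),
    student_model=student_model,
    run_config=run_config,
)

Because the module ends in unittest.main(), the tests themselves can be run directly with python tests/test_ragas.py or via python -m unittest tests/test_ragas.py.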
