feat: renamed sample operations and cleaned up test warnings (#1407)

jjmachan · web-flow · commit b46827cf0ec0 · 2024-10-02T11:09:22.000+05:30
This is a continuation of #1394. As part of this, all the pydantic issues were cleaned up.
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,4 +59,5 @@ build-backend = "setuptools.build_meta"
 write_to = "src/ragas/_version.py"
 
 [tool.pytest.ini_options]
-addopts = "-n 4"
+addopts = "-n 4"
+asyncio_default_fixture_loop_scope = "function"
diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py
@@ -13,16 +13,24 @@
 
 
 class BaseEvalSample(BaseModel):
-    def dict(self, **kwargs):
-        row = super().dict(**kwargs)
-        row = {k: v for k, v in row.items() if v is not None}
-        return row
+    def to_dict(self) -> t.Dict:
+        """
+        Get the dictionary representation of the sample without attributes that are None.
+        """
+        return self.model_dump(exclude_none=True)
 
-    def features(self):
-        return set(self.dict().keys())
+    def get_features(self) -> t.List[str]:
+        """
+        Get the features of the sample that are not None.
+        """
+        return list(self.to_dict().keys())
 
 
 class SingleTurnSample(BaseEvalSample):
+    """
+    Represents evaluation samples for single-turn interactions.
+    """
+
     user_input: t.Optional[str] = None
     retrieved_contexts: t.Optional[t.List[str]] = None
     reference_contexts: t.Optional[t.List[str]] = None
@@ -68,7 +76,7 @@ def validate_user_input(
         return messages
 
     def to_messages(self):
-        return [m.dict() for m in self.user_input]
+        return [m.model_dump() for m in self.user_input]
 
     def pretty_repr(self):
         lines = []
@@ -98,7 +106,7 @@ def get_sample_type(self):
         return type(self.samples[0])
 
     def _to_list(self) -> t.List[t.Dict]:
-        rows = [sample.dict() for sample in self.samples]
+        rows = [sample.model_dump() for sample in self.samples]
 
         if self.get_sample_type() == MultiTurnSample:
             for sample in rows:
@@ -130,7 +138,7 @@ def to_pandas(self) -> PandasDataframe:
         return pd.DataFrame(data)
 
     def features(self):
-        return self.samples[0].features()
+        return self.samples[0].get_features()
 
     @classmethod
     def from_list(cls, mapping: t.List[t.Dict]):
diff --git a/src/ragas/experimental/metrics/_faithfulness.py b/src/ragas/experimental/metrics/_faithfulness.py
@@ -248,5 +248,5 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
     async def _single_turn_ascore(
         self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py
@@ -48,9 +48,9 @@ def __init__(self, metric: Metric, **kwargs: t.Any):
             t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
         if isinstance(self.metric, MetricWithEmbeddings):
             embeddings = get_or_init(kwargs, "embeddings", OpenAIEmbeddings)
-            t.cast(MetricWithEmbeddings, self.metric).embeddings = (
-                LangchainEmbeddingsWrapper(embeddings)
-            )
+            t.cast(
+                MetricWithEmbeddings, self.metric
+            ).embeddings = LangchainEmbeddingsWrapper(embeddings)
         self.metric.init(run_config)
 
         assert isinstance(
@@ -132,7 +132,7 @@ def _validate(self, input: SingleTurnSample) -> None:
         # validate each example
         required_columns = self.metric.required_columns.get("SINGLE_TURN", [])
         for col in required_columns:
-            if col not in input.features():
+            if col not in input.get_features():
                 raise ValueError(
                     f'"{col}" is required in each example'
                     f"for the metric[{self.metric.name}] you have chosen."
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
@@ -219,7 +219,7 @@ def _create_statements_prompt(self, question: str, text: str) -> PromptValue:
     async def _single_turn_ascore(
         self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         score = await self._ascore(row, callbacks)
         return score
 
diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py
@@ -160,7 +160,7 @@ def _create_question_gen_prompt(self, row: t.Dict) -> PromptValue:
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py
@@ -60,7 +60,7 @@ def __post_init__(self: t.Self):
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py
@@ -143,7 +143,7 @@ def _compute_score(
     async def _single_turn_ascore(
         self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_context_entities_recall.py b/src/ragas/metrics/_context_entities_recall.py
@@ -175,7 +175,7 @@ async def get_entities(
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(
diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
@@ -154,7 +154,7 @@ def _calculate_average_precision(
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(
diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
@@ -177,7 +177,7 @@ def _compute_score(self, response: t.Any) -> float:
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
@@ -224,7 +224,7 @@ class ContextRecall(LLMContextRecall):
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     @deprecated(since="0.2", removal="0.3", alternative="LLMContextRecall")
diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py
@@ -120,7 +120,7 @@ def __post_init__(self):
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        return await self._ascore(sample.dict(), callbacks)
+        return await self._ascore(sample.to_dict(), callbacks)
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM is not set"
@@ -254,7 +254,7 @@ def __post_init__(self):
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        return await self._ascore(sample.dict(), callbacks)
+        return await self._ascore(sample.to_dict(), callbacks)
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM is not set"
diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py
@@ -248,7 +248,7 @@ def _compute_score(self, answers: StatementFaithfulnessAnswers):
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_instance_specific_rubrics.py b/src/ragas/metrics/_instance_specific_rubrics.py
@@ -72,7 +72,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _multi_turn_ascore(
@@ -144,7 +144,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _multi_turn_ascore(
diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py
@@ -226,7 +226,7 @@ def _compute_score(self, answers: t.Dict) -> float:
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_simple_criteria.py b/src/ragas/metrics/_simple_criteria.py
@@ -209,7 +209,7 @@ def _compute_score(
     async def _single_turn_ascore(
         self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
@@ -168,7 +168,7 @@ class SummarizationScore(MetricWithLLM, SingleTurnMetric):
     async def _single_turn_ascore(
         self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
-        row = sample.dict()
+        row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
     async def _ascore(self, row: Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
@@ -189,7 +189,9 @@ def single_turn_score(
         callbacks: Callbacks = None,
     ) -> float:
         callbacks = callbacks or []
-        rm, group_cm = new_group(self.name, inputs=sample.dict(), callbacks=callbacks)
+        rm, group_cm = new_group(
+            self.name, inputs=sample.model_dump(), callbacks=callbacks
+        )
         try:
             loop = asyncio.get_event_loop()
             score = loop.run_until_complete(
@@ -211,7 +213,7 @@ async def single_turn_ascore(
         timeout: t.Optional[float] = None,
     ) -> float:
         callbacks = callbacks or []
-        row = sample.dict()
+        row = sample.model_dump()
         rm, group_cm = new_group(self.name, inputs=row, callbacks=callbacks)
         try:
             score = await asyncio.wait_for(
@@ -242,7 +244,9 @@ def multi_turn_score(
         callbacks: Callbacks = None,
     ) -> float:
         callbacks = callbacks or []
-        rm, group_cm = new_group(self.name, inputs=sample.dict(), callbacks=callbacks)
+        rm, group_cm = new_group(
+            self.name, inputs=sample.model_dump(), callbacks=callbacks
+        )
         try:
             loop = asyncio.get_event_loop()
             score = loop.run_until_complete(
@@ -264,7 +268,9 @@ async def multi_turn_ascore(
         timeout: t.Optional[float] = None,
     ) -> float:
         callbacks = callbacks or []
-        rm, group_cm = new_group(self.name, inputs=sample.dict(), callbacks=callbacks)
+        rm, group_cm = new_group(
+            self.name, inputs=sample.model_dump(), callbacks=callbacks
+        )
         try:
             score = await asyncio.wait_for(
                 self._multi_turn_ascore(sample=sample, callbacks=group_cm),
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
@@ -57,7 +57,7 @@ def validate_required_columns(ds: EvaluationDataset, metrics: list[Metric]):
     metric_type = get_supported_metric_type(ds)
     for m in metrics:
         required_columns = set(m.required_columns.get(metric_type, []))
-        available_columns = ds.features()
+        available_columns = set(ds.features())
         if not required_columns.issubset(available_columns):
             raise ValueError(
                 f"The metric [{m.name}] that is used requires the following "
diff --git a/tests/unit/llms/test_prompt_old.py b/tests/unit/llms/test_prompt_old.py
@@ -1,6 +1,8 @@
 import importlib
 import pkgutil
 
+import pytest
+
 import ragas
 from ragas.llms.prompt import Prompt
 
@@ -77,6 +79,7 @@
 ]
 
 
+@pytest.mark.skip(reason="Prompt is deprecated")
 def test_prompt_object():
     for testcase in TESTCASES:
         prompt = Prompt(**testcase)
@@ -103,6 +106,7 @@ def test_prompt_object():
             ), "get_example_str should return a string"
 
 
+@pytest.mark.skip(reason="Prompt is deprecated")
 def test_prompt_object_names():
     "ensure that all prompt objects have unique names"
 
@@ -123,6 +127,7 @@ def test_prompt_object_names():
                 prompt_object_names.append(obj.name)
 
 
+@pytest.mark.skip(reason="Prompt is deprecated")
 def test_save_and_load(tmp_path):
     for testcase in TESTCASES:
         prompt = Prompt(**testcase)
diff --git a/tests/unit/test_dataset_schema.py b/tests/unit/test_dataset_schema.py
@@ -1,3 +1,5 @@
+import typing as t
+
 import pytest
 from pydantic import ValidationError
 
@@ -13,7 +15,7 @@ def test_evaluation_dataset():
 
     assert dataset.get_sample_type() == SingleTurnSample
     assert len(hf_dataset) == 2
-    assert dataset.features() == {"response", "user_input"}
+    assert dataset.features() == ["user_input", "response"]
     assert len(dataset) == 2
     assert dataset[0] == single_turn_sample
 
@@ -22,10 +24,23 @@ def test_single_type_evaluation_dataset():
     single_turn_sample = SingleTurnSample(user_input="What is X", response="Y")
     multi_turn_sample = MultiTurnSample(
         user_input=[{"content": "What is X"}],
-        response="Y",  # type: ignore
+        response="Y",  # type: ignore (this type error is what we want to test)
     )
 
     with pytest.raises(ValidationError) as exc_info:
         EvaluationDataset(samples=[single_turn_sample, multi_turn_sample])
 
     assert "All samples must be of the same type" in str(exc_info.value)
+
+
+def test_base_eval_sample():
+    from ragas.dataset_schema import BaseEvalSample
+
+    class FakeSample(BaseEvalSample):
+        user_input: str
+        response: str
+        reference: t.Optional[str] = None
+
+    fake_sample = FakeSample(user_input="What is X", response="Y")
+    assert fake_sample.to_dict() == {"user_input": "What is X", "response": "Y"}
+    assert fake_sample.get_features() == ["user_input", "response"]
diff --git a/tests/unit/test_import.py b/tests/unit/test_import.py
@@ -35,8 +35,13 @@ def test_import_in_debug_mode():
     """
     import os
 
-    os.environ["RAGAS_DEBUG"] = "True"
-
     from ragas.utils import get_debug_mode
 
+    get_debug_mode.cache_clear()
+
+    os.environ["RAGAS_DEBUG"] = "True"
+
     assert get_debug_mode() is True
+
+    del os.environ["RAGAS_DEBUG"]
+    get_debug_mode.cache_clear()
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py