changes to align with spec (#3702)
A collection of small changes to the ECI and IP evaluators to match the spec more closely. Changes include:
- Refactored non-content-safety evaluator response parsing to prefix result keys with the metric name. This includes special logic to capitalize the 'eci' prefix (see the sketch after this list).
- Changed the ECI simulator scenario enum member to just 'ECI'.
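A minimal sketch of the new key naming, assuming the internal ECI metric name is the lowercase string "eci" and that the service response has already been parsed into a dict with "label" and "reasoning" fields; the helper name and sample values are illustrative only:

# Illustrative sketch only; mirrors the prefixing behavior described above.
def _prefixed_result(metric_name: str, parsed: dict) -> dict:
    # "eci" is special-cased to the uppercase prefix "ECI"; other metrics keep their name.
    prefix = "ECI" if metric_name == "eci" else metric_name
    return {
        f"{prefix}_label": parsed.get("label", ""),
        f"{prefix}_reasoning": parsed.get("reasoning", ""),
    }

_prefixed_result("eci", {"label": False, "reasoning": "Unrelated."})
# -> {"ECI_label": False, "ECI_reasoning": "Unrelated."}
_prefixed_result("protected_material", {"label": False, "reasoning": "No material found."})
# -> {"protected_material_label": False, "protected_material_reasoning": "No material found."}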
MilesHolland authored Sep 4, 2024
1 parent 5d9c715 commit c97470b
Showing 6 changed files with 34 additions and 19 deletions.
21 changes: 18 additions & 3 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -207,7 +207,7 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
:return: The parsed annotation result.
:rtype: List[List[Dict]]
"""

# non-numeric metrics
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
@@ -216,12 +216,27 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
metric_prefix = _get_metric_prefix(metric_name)
# Use label instead of score since these are assumed to be boolean results.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else ""
result[metric_prefix + "_reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
return result
return _parse_content_harm_response(batch_response, metric_name)


def _get_metric_prefix(metric_name: str) -> str:
"""Get the prefix for the evaluation metric. This is usually the metric name.
:param metric_name: The evaluation metric to use.
:type metric_name: str
:return: The prefix for the evaluation metric.
:rtype: str
"""
if metric_name == _InternalEvaluationMetrics.ECI:
return "ECI"
return metric_name


def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
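For context on the unchanged parsing lines in the hunk above: the service returns JSON-style booleans, which ast.literal_eval cannot parse directly, hence the "true" to "True" replacement before evaluation. A standalone sketch (the sample payload is invented for illustration):

from ast import literal_eval

raw = '{"label": true, "reasoning": "Some reason."}'  # invented sample payload
# literal_eval(raw) would raise ValueError: 'true' is valid JSON but not a Python literal.
parsed = literal_eval(raw.replace("true", "True"))
print(parsed)  # {'label': True, 'reasoning': 'Some reason.'}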
6 changes: 3 additions & 3 deletions src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -65,8 +65,8 @@ class ECIEvaluator:
.. code-block:: python
{
"label": "False",
"reasoning": "Some reason."
"ECI_label": "False",
"ECI_reasoning": "Some reason."
}
"""

@@ -81,7 +81,7 @@ def __call__(self, *, question: str, answer: str, **kwargs):
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The ECI score.
:return: The ECI result.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)
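A hedged usage sketch of the updated ECI evaluator showing the renamed output keys. The import path, constructor arguments, and project_scope fields are assumptions based on typical promptflow-evals usage, not taken from this diff:

from azure.identity import DefaultAzureCredential
from promptflow.evals.evaluators._eci._eci import ECIEvaluator  # internal module; import path assumed

# Field names below follow typical promptflow-evals project scopes; adjust to your workspace.
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
credential = DefaultAzureCredential()
eci_eval = ECIEvaluator(project_scope, credential=credential)
result = eci_eval(question="What shape has 4 equal sides?", answer="Rhombus")
print(result)  # expected keys after this change: "ECI_label", "ECI_reasoning"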
@@ -19,7 +19,7 @@ async def __call__(self, *, question: str, answer: str, **kwargs):
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The evaluation score computation based on the Content Safety metric (self.metric).
:return: The evaluation result computation based on the Content Safety metric (self.metric).
:rtype: Any
"""
# Validate inputs
@@ -70,8 +70,8 @@ class ProtectedMaterialsEvaluator:
.. code-block:: python
{
"label": "False",
"reasoning": "This question does not contain any protected material."
"protected_material_label": "False",
"protected_material_reasoning": "This question does not contain any protected material."
}
"""

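Analogous to the ECI sketch above, a hedged sketch of what a ProtectedMaterialsEvaluator call returns after this change; the import path and constructor arguments are assumptions:

from promptflow.evals.evaluators import ProtectedMaterialsEvaluator  # import path assumed

# Same assumed project_scope / credential setup as in the ECI sketch above.
ip_eval = ProtectedMaterialsEvaluator(project_scope, credential=credential)
good = ip_eval(question="What shape has 4 equal sides?", answer="Rhombus")
print(good)  # expected keys: "protected_material_label", "protected_material_reasoning"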
@@ -23,4 +23,4 @@ class _UnstableAdverarialScenario(Enum):
Values listed here are subject to potential change, and/or migration to the main enum over time.
"""

ADVERSARIAL_CONTENT_ECI = "adv_politics"
ECI = "adv_politics"
@@ -348,7 +348,7 @@ async def callback(

outputs = asyncio.run(
simulator(
scenario=_UnstableAdverarialScenario.ADVERSARIAL_CONTENT_ECI,
scenario=_UnstableAdverarialScenario.ECI,
max_conversation_turns=1,
max_simulation_results=1,
target=callback,
@@ -368,8 +368,8 @@ def test_protected_materials_evaluator(self, project_scope, azure_cred):
answer="Rhombus",
)
assert good_result is not None
assert not good_result["label"]
assert "material was not found" in good_result["reasoning"]
assert not good_result["protected_material_label"]
assert "material was not found" in good_result["protected_material_reasoning"]

problem_answer = ip_eval(
question="-",
@@ -399,11 +399,11 @@ def test_protected_materials_evaluator(self, project_scope, azure_cred):
),
)
assert problem_answer is not None
assert problem_answer["label"]
assert "material was found" in problem_answer["reasoning"]
assert problem_answer["protected_material_label"]
assert "material was found" in problem_answer["protected_material_reasoning"]
assert problem_question is not None
assert problem_question["label"]
assert "material was found" in problem_question["reasoning"]
assert problem_question["protected_material_label"]
assert "material was found" in problem_question["protected_material_reasoning"]

@pytest.mark.skipif(
not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
@@ -415,5 +415,5 @@ def test_eci_evaluator(self, project_scope, azure_cred):
answer="Rhombus",
)
assert unrelated_result is not None
assert not unrelated_result["label"]
assert "geometry question" in unrelated_result["reasoning"]
assert not unrelated_result["ECI_label"]
assert "geometry question" in unrelated_result["ECI_reasoning"]
