From c97470ba151f06d14aca684f49c3a5a0950b4008 Mon Sep 17 00:00:00 2001
From: MilesHolland <108901744+MilesHolland@users.noreply.github.com>
Date: Wed, 4 Sep 2024 15:06:32 -0400
Subject: [PATCH] changes to align with spec (#3702)

A collection of small changes to the ECI and IP (protected material) evaluators
to match the spec more closely. Changes include:
- Refactored non-content-safety evaluator response parsing to add a metric
  prefix to the result keys. This includes special logic to capitalize the
  'eci' prefix.
- Changed the ECI simulator scenario enum value to just 'ECI'.
---
 .../promptflow/evals/_common/rai_service.py   | 21 ++++++++++++++++---
 .../promptflow/evals/evaluators/_eci/_eci.py  |  6 +++---
 .../_protected_materials.py                   |  6 +++---
 .../evals/synthetic/adversarial_scenario.py   |  2 +-
 .../evals/e2etests/test_adv_simulator.py      |  2 +-
 .../evals/e2etests/test_builtin_evaluators.py | 16 +++++++-------
 6 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/promptflow-evals/promptflow/evals/_common/rai_service.py b/src/promptflow-evals/promptflow/evals/_common/rai_service.py
index 4d8a8fa7441..07456e7a22c 100644
--- a/src/promptflow-evals/promptflow/evals/_common/rai_service.py
+++ b/src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -207,7 +207,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
-
+    # non-numeric metrics
     if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
@@ -216,12 +216,27 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
         result = {}
-        result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        metric_prefix = _get_metric_prefix(metric_name)
+        # Use label instead of score since these are assumed to be boolean results.
+        result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else ""
+        result[metric_prefix + "_reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
         return result
     return _parse_content_harm_response(batch_response, metric_name)
 
 
+def _get_metric_prefix(metric_name: str) -> str:
+    """Get the prefix for the evaluation metric. This is usually the metric name.
+
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :return: The prefix for the evaluation metric.
+    :rtype: str
+    """
+    if metric_name == _InternalEvaluationMetrics.ECI:
+        return "ECI"
+    return metric_name
+
+
 def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py b/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
index f90186bdd32..4f120bdabd4 100644
--- a/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
+++ b/src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -65,8 +65,8 @@ class ECIEvaluator:
     .. code-block:: python
 
         {
-            "label": "False",
-            "reasoning": "Some reason."
+            "ECI_label": "False",
+            "ECI_reasoning": "Some reason."
} """ @@ -81,7 +81,7 @@ def __call__(self, *, question: str, answer: str, **kwargs): :paramtype question: str :keyword answer: The answer to be evaluated. :paramtype answer: str - :return: The ECI score. + :return: The ECI result. :rtype: dict """ return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_protected_materials/_protected_materials.py b/src/promptflow-evals/promptflow/evals/evaluators/_protected_materials/_protected_materials.py index 94c2044bcb9..4a40f149486 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_protected_materials/_protected_materials.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_protected_materials/_protected_materials.py @@ -19,7 +19,7 @@ async def __call__(self, *, question: str, answer: str, **kwargs): :paramtype question: str :keyword answer: The answer to be evaluated. :paramtype answer: str - :return: The evaluation score computation based on the Content Safety metric (self.metric). + :return: The evaluation result computation based on the Content Safety metric (self.metric). :rtype: Any """ # Validate inputs @@ -70,8 +70,8 @@ class ProtectedMaterialsEvaluator: .. code-block:: python { - "label": "False", - "reasoning": "This question does not contain any protected material." + "protected_material_label": "False", + "protected_material_reasoning": "This question does not contain any protected material." } """ diff --git a/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py b/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py index 07668ab0484..2e76c548359 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py @@ -23,4 +23,4 @@ class _UnstableAdverarialScenario(Enum): Values listed here are subject to potential change, and/or migration to the main enum over time. 
""" - ADVERSARIAL_CONTENT_ECI = "adv_politics" + ECI = "adv_politics" diff --git a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py index 51057ac21d2..6403b4f566d 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py @@ -348,7 +348,7 @@ async def callback( outputs = asyncio.run( simulator( - scenario=_UnstableAdverarialScenario.ADVERSARIAL_CONTENT_ECI, + scenario=_UnstableAdverarialScenario.ECI, max_conversation_turns=1, max_simulation_results=1, target=callback, diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py index a001ce12f47..e7d118bc070 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py @@ -368,8 +368,8 @@ def test_protected_materials_evaluator(self, project_scope, azure_cred): answer="Rhombus", ) assert good_result is not None - assert not good_result["label"] - assert "material was not found" in good_result["reasoning"] + assert not good_result["protected_material_label"] + assert "material was not found" in good_result["protected_material_reasoning"] problem_answer = ip_eval( question="-", @@ -399,11 +399,11 @@ def test_protected_materials_evaluator(self, project_scope, azure_cred): ), ) assert problem_answer is not None - assert problem_answer["label"] - assert "material was found" in problem_answer["reasoning"] + assert problem_answer["protected_material_label"] + assert "material was found" in problem_answer["protected_material_reasoning"] assert problem_question is not None - assert problem_question["label"] - assert "material was found" in problem_question["reasoning"] + assert problem_question["protected_material_label"] + assert "material was found" in problem_question["protected_material_reasoning"] @pytest.mark.skipif( not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT." @@ -415,5 +415,5 @@ def test_eci_evaluator(self, project_scope, azure_cred): answer="Rhombus", ) assert unrelated_result is not None - assert not unrelated_result["label"] - assert "geometry question" in unrelated_result["reasoning"] + assert not unrelated_result["ECI_label"] + assert "geometry question" in unrelated_result["ECI_reasoning"]