changes to align with spec (#3702)
A collection of small changes to the ECI and IP evaluators to match the spec more closely. Changes include:
- Refactored non-content-safety evaluator response parsing to prefix result keys with the metric name. This includes special logic to capitalize the 'eci' prefix (see the sketch after this list).
- Changed the ECI simulator scenario enum member to just 'ECI'.
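A minimal sketch of the new key naming, assuming the internal ECI metric name is the lowercase string "eci" and that the service response has already been parsed into a dict with "label" and "reasoning" fields; the helper name and sample values are illustrative only:

# Illustrative sketch only; mirrors the prefixing behavior described above.
def _prefixed_result(metric_name: str, parsed: dict) -> dict:
    # "eci" is special-cased to the uppercase prefix "ECI"; other metrics keep their name.
    prefix = "ECI" if metric_name == "eci" else metric_name
    return {
        f"{prefix}_label": parsed.get("label", ""),
        f"{prefix}_reasoning": parsed.get("reasoning", ""),
    }

_prefixed_result("eci", {"label": False, "reasoning": "Unrelated."})
# -> {"ECI_label": False, "ECI_reasoning": "Unrelated."}
_prefixed_result("protected_material", {"label": False, "reasoning": "No material found."})
# -> {"protected_material_label": False, "protected_material_reasoning": "No material found."}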
MilesHolland authored Sep 4, 2024
1 parent 5d9c715 commit c97470b
Showing 6 changed files with 34 additions and 19 deletions.
21 changes: 18 additions & 3 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -207,7 +207,7 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
:return: The parsed annotation result.
:rtype: List[List[Dict]]
"""

# non-numeric metrics
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
@@ -216,12 +216,27 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
metric_prefix = _get_metric_prefix(metric_name)
# Use label instead of score since these are assumed to be boolean results.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else ""
result[metric_prefix + "_reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
return result
return _parse_content_harm_response(batch_response, metric_name)


def _get_metric_prefix(metric_name: str) -> str:
"""Get the prefix for the evaluation metric. This is usually the metric name.
:param metric_name: The evaluation metric to use.
:type metric_name: str
:return: The prefix for the evaluation metric.
:rtype: str
"""
if metric_name == _InternalEvaluationMetrics.ECI:
return "ECI"
return metric_name


def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
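For context on the unchanged parsing lines in the hunk above: the service returns JSON-style booleans, which ast.literal_eval cannot parse directly, hence the "true" to "True" replacement before evaluation. A standalone sketch (the sample payload is invented for illustration):

from ast import literal_eval

raw = '{"label": true, "reasoning": "Some reason."}'  # invented sample payload
# literal_eval(raw) would raise ValueError: 'true' is valid JSON but not a Python literal.
parsed = literal_eval(raw.replace("true", "True"))
print(parsed)  # {'label': True, 'reasoning': 'Some reason.'}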
6 changes: 3 additions & 3 deletions src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -65,8 +65,8 @@ class ECIEvaluator:
.. code-block:: python
{
"label": "False",
"reasoning": "Some reason."
"ECI_label": "False",
"ECI_reasoning": "Some reason."
}
"""

@@ -81,7 +81,7 @@ def __call__(self, *, question: str, answer: str, **kwargs):
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The ECI score.
:return: The ECI result.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)
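A hedged usage sketch of the updated ECI evaluator showing the renamed output keys. The import path, constructor arguments, and project_scope fields are assumptions based on typical promptflow-evals usage, not taken from this diff:

from azure.identity import DefaultAzureCredential
from promptflow.evals.evaluators._eci._eci import ECIEvaluator  # internal module; import path assumed

# Field names below follow typical promptflow-evals project scopes; adjust to your workspace.
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
credential = DefaultAzureCredential()
eci_eval = ECIEvaluator(project_scope, credential=credential)
result = eci_eval(question="What shape has 4 equal sides?", answer="Rhombus")
print(result)  # expected keys after this change: "ECI_label", "ECI_reasoning"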
@@ -19,7 +19,7 @@ async def __call__(self, *, question: str, answer: str, **kwargs):
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The evaluation score computation based on the Content Safety metric (self.metric).
:return: The evaluation result computation based on the Content Safety metric (self.metric).
:rtype: Any
"""
# Validate inputs
@@ -70,8 +70,8 @@ class ProtectedMaterialsEvaluator:
.. code-block:: python
{
"label": "False",
"reasoning": "This question does not contain any protected material."
"protected_material_label": "False",
"protected_material_reasoning": "This question does not contain any protected material."
}
"""

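Analogous to the ECI sketch above, a hedged sketch of what a ProtectedMaterialsEvaluator call returns after this change; the import path and constructor arguments are assumptions:

from promptflow.evals.evaluators import ProtectedMaterialsEvaluator  # import path assumed

# Same assumed project_scope / credential setup as in the ECI sketch above.
ip_eval = ProtectedMaterialsEvaluator(project_scope, credential=credential)
good = ip_eval(question="What shape has 4 equal sides?", answer="Rhombus")
print(good)  # expected keys: "protected_material_label", "protected_material_reasoning"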
@@ -23,4 +23,4 @@ class _UnstableAdverarialScenario(Enum):
Values listed here are subject to potential change, and/or migration to the main enum over time.
"""

ADVERSARIAL_CONTENT_ECI = "adv_politics"
ECI = "adv_politics"
@@ -348,7 +348,7 @@ async def callback(

outputs = asyncio.run(
simulator(
scenario=_UnstableAdverarialScenario.ADVERSARIAL_CONTENT_ECI,
scenario=_UnstableAdverarialScenario.ECI,
max_conversation_turns=1,
max_simulation_results=1,
target=callback,
@@ -368,8 +368,8 @@ def test_protected_materials_evaluator(self, project_scope, azure_cred):
answer="Rhombus",
)
assert good_result is not None
assert not good_result["label"]
assert "material was not found" in good_result["reasoning"]
assert not good_result["protected_material_label"]
assert "material was not found" in good_result["protected_material_reasoning"]

problem_answer = ip_eval(
question="-",
@@ -399,11 +399,11 @@ def test_protected_materials_evaluator(self, project_scope, azure_cred):
),
)
assert problem_answer is not None
assert problem_answer["label"]
assert "material was found" in problem_answer["reasoning"]
assert problem_answer["protected_material_label"]
assert "material was found" in problem_answer["protected_material_reasoning"]
assert problem_question is not None
assert problem_question["label"]
assert "material was found" in problem_question["reasoning"]
assert problem_question["protected_material_label"]
assert "material was found" in problem_question["protected_material_reasoning"]

@pytest.mark.skipif(
not is_replay(), reason="API not fully released yet. Don't run in live mode unless connected to INT."
@@ -415,5 +415,5 @@ def test_eci_evaluator(self, project_scope, azure_cred):
answer="Rhombus",
)
assert unrelated_result is not None
assert not unrelated_result["label"]
assert "geometry question" in unrelated_result["reasoning"]
assert not unrelated_result["ECI_label"]
assert "geometry question" in unrelated_result["ECI_reasoning"]
