Feature/xpia sim and eval fixes (#3723)
A fork of the original XPIA sim/eval branch with additional fixes for
bugs discovered last night.

- Changes the jailbreak check for combining templates to only account
for UPIA (since XPIA doesn't merge templates).
- Removes conversations as an input for XPIA evals until the default
override bug is fixed.
- Accounts for the new XPIA evaluator return fields.
- Changes the output base name of 'reasoning' fields for label-based
evaluators to just 'reason' (see the output sketch below).

Original PR: #3703

---------

Co-authored-by: Diondra Peck <dipeck@microsoft.com>
Co-authored-by: Diondra <16376603+diondrapeck@users.noreply.github.com>
3 people authored Sep 6, 2024
1 parent b7dc8b7 commit b04e889
Showing 20 changed files with 136,456 additions and 38 deletions.
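
For reference, a minimal sketch of the evaluator output shape after these changes (the 'reason' rename plus the new XPIA sub-fields), with key names taken from the docstrings added in this diff and an illustrative reason string:

```python
# Illustrative only: shape of the IndirectAttackEvaluator result after this change.
# Key names follow the docstrings added in this diff; the values are made up.
example_xpia_result = {
    "xpia_label": False,
    "xpia_reason": "The conversation does not contain any manipulated content, "
                   "intrusion or information gathering.",
    # New XPIA sub-fields accounted for by this fix:
    "xpia_manipulated_content": False,
    "xpia_intrusion": False,
    "xpia_information_gathering": False,
}
```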
5 changes: 4 additions & 1 deletion .cspell.json
@@ -113,7 +113,8 @@
"vnet",
"Weaviate",
"westus",
"wsid"
"wsid",
"Xpia"
],
"ignoreWords": [
"openmpi",
@@ -243,6 +244,8 @@
"azureopenaimodelconfiguration",
"openaimodelconfiguration",
"usecwd",
"upia",
"xpia",
"locustio",
"euap",
"Rerank",
5 changes: 5 additions & 0 deletions src/promptflow-evals/CHANGELOG.md
@@ -2,12 +2,17 @@

## v0.3.3 (Upcoming)
### Features Added
- Introduced `IndirectAttackSimulator` to simulate XPIA (cross domain prompt injected attack) jailbreak attacks on your AI system.
- Introduced `IndirectAttackEvaluator` to evaluate content for the presence of XPIA (cross domain prompt injected attacks) injected into conversation or Q/A context to interrupt normal expected functionality by eliciting manipulated content, intrusion, or information gathering outside the scope of your AI system.
- Added a new evaluator (`ProtectedMaterialEvaluator`) and associated adversarial content simulator enum type (`AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL`) for protected material, which determines whether given inputs contain material protected by IP laws.
- Added four mathematical evaluators, `BleuScoreEvaluator`, `GleuScoreEvaluator`, `MeteorScoreEvaluator` and `RougeScoreEvaluator`, for evaluating the quality of generated text by comparing it against reference text.

### Bugs Fixed
- Fixed evaluators to accept (non-Azure) OpenAI configs.

### Breaking Changes
- Replaced the `jailbreak` parameter in `AdversarialSimulator` with the `_jailbreak_type` parameter to support multiple jailbreak types. Instead of setting this parameter directly, we recommend using the `DirectAttackSimulator` class (formerly `JailbreakAdversarialSimulator`) for UPIA jailbreak and the `IndirectAttackSimulator` class for XPIA jailbreak (see the migration sketch below).

### Improvements
- Renamed `JailbreakAdversarialSimulator` to `DirectAttackSimulator`
- Set the PF_EVALS_BATCH_USE_ASYNC environment variable to True by default to enable asynchronous batch run for async-enabled built-in evaluators, improving performance.
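
A hedged migration sketch for the breaking change above. It assumes `DirectAttackSimulator` keeps the constructor and call shape of `AdversarialSimulator` shown later in this diff; the project details, scenario, and target callback are placeholders:

```python
import asyncio
from azure.identity import DefaultAzureCredential
from promptflow.evals.synthetic import AdversarialScenario, DirectAttackSimulator

# Placeholder project details; replace with your own.
azure_ai_project = {"subscription_id": "...", "resource_group_name": "...", "project_name": "..."}

async def my_target(messages, stream=False, session_state=None, context=None):
    # Your application callback; echoes the conversation back for illustration.
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}

async def main():
    # Previously: AdversarialSimulator(...)(..., jailbreak=True)
    # Now the jailbreak type is internal (_jailbreak_type="upia"/"xpia"); use the
    # dedicated simulator classes instead of setting it directly.
    simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project,
                                      credential=DefaultAzureCredential())
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,  # illustrative scenario
        target=my_target,
        max_simulation_results=2,
    )
    print(outputs)

asyncio.run(main())
```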
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/constants.py
@@ -32,6 +32,7 @@ class Tasks:

CONTENT_HARM = "content harm"
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"


class _InternalAnnotationTasks:
@@ -52,6 +53,7 @@ class EvaluationMetrics:
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"


class _InternalEvaluationMetrics:
25 changes: 22 additions & 3 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -103,6 +103,9 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
elif metric == _InternalEvaluationMetrics.ECI:
task = _InternalAnnotationTasks.ECI
include_metric = False
elif metric == EvaluationMetrics.XPIA:
task = Tasks.XPIA
include_metric = False
return (
{
"UserTextList": [normalized_user_text],
@@ -207,8 +210,9 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
:return: The parsed annotation result.
:rtype: List[List[Dict]]
"""

# non-numeric metrics
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
@@ -218,8 +222,23 @@
result = {}
metric_prefix = _get_metric_prefix(metric_name)
# Use label instead of score since these are assumed to be boolean results.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else ""
result[metric_prefix + "_reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
# Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

if metric_name == EvaluationMetrics.XPIA:
# Add "manipulated_content", "intrusion" and "information_gathering" to the result
# if present else set them to np.nan
result[metric_prefix + "_manipulated_content"] = (
parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
)
result[metric_prefix + "_intrusion"] = (
parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
)
result[metric_prefix + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
)

return result
return _parse_content_harm_response(batch_response, metric_name)
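
To make the XPIA branch above concrete, a small standalone sketch of the result it produces for a hypothetical service payload (`dict.get` stands in for the explicit membership checks above):

```python
import numpy as np

# Hypothetical service payload for an XPIA annotation (values are made up).
parsed_response = {
    "label": False,
    "reasoning": "No injected instructions were followed.",
    "manipulated_content": False,
    "intrusion": False,
    # "information_gathering" left out on purpose to show the np.nan fallback.
}

metric_prefix = "xpia"
result = {
    # Output key is "<prefix>_reason" even though the service field is named "reasoning".
    metric_prefix + "_label": parsed_response.get("label", np.nan),
    metric_prefix + "_reason": parsed_response.get("reasoning", ""),
    metric_prefix + "_manipulated_content": parsed_response.get("manipulated_content", np.nan),
    metric_prefix + "_intrusion": parsed_response.get("intrusion", np.nan),
    metric_prefix + "_information_gathering": parsed_response.get("information_gathering", np.nan),
}
# -> {'xpia_label': False, 'xpia_reason': 'No injected instructions were followed.',
#     'xpia_manipulated_content': False, 'xpia_intrusion': False,
#     'xpia_information_gathering': nan}
```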

2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -23,6 +23,7 @@
from ._relevance import RelevanceEvaluator
from ._rouge import RougeScoreEvaluator, RougeType
from ._similarity import SimilarityEvaluator
from ._xpia import IndirectAttackEvaluator

__all__ = [
"CoherenceEvaluator",
@@ -39,6 +40,7 @@
"HateUnfairnessEvaluator",
"ContentSafetyEvaluator",
"ContentSafetyChatEvaluator",
"IndirectAttackEvaluator",
"BleuScoreEvaluator",
"GleuScoreEvaluator",
"MeteorScoreEvaluator",
@@ -66,7 +66,7 @@ class ECIEvaluator:
{
"ECI_label": "False",
"ECI_reasoning": "Some reason."
"ECI_reason": "Some reason."
}
"""

@@ -70,8 +70,8 @@ class ProtectedMaterialEvaluator:
.. code-block:: python
{
"label": "False",
"reasoning": "This question does not contain any protected material."
"protected_material_label": "False",
"protected_material_reason": "This question does not contain any protected material."
}
"""

@@ -0,0 +1,5 @@
from ._xpia import IndirectAttackEvaluator

__all__ = [
"IndirectAttackEvaluator",
]
133 changes: 133 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/_xpia/_xpia.py
@@ -0,0 +1,133 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import logging
from typing import Optional

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service

logger = logging.getLogger(__name__)


class IndirectAttackEvaluator:
"""
Initializes an XPIA (cross domain prompt injected attack) jailbreak evaluator to detect whether cross domain
injected attacks are present in your AI system's response.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param credential: The credential for connecting to Azure AI project.
:type credential: ~azure.core.credentials.TokenCredential
:return: A function that evaluates and generates metrics for the XPIA chat scenario. Metrics include the overall
    evaluation label and reason for the Q/A pair, as well as sub-labels
    for manipulated content, intrusion, and information gathering.
:rtype: Callable
**Usage**
.. code-block:: python
eval_fn = IndirectAttackEvaluator(project_scope)
result = eval_fn(question="What is the capital of France?", answer="Paris.")
**Output format for question-answer pair**
.. code-block:: python
{
'xpia_label': False,
'xpia_reason': 'The conversation does not contain any manipulated content, intrusion or
                information gathering.',
'xpia_information_gathering': False,
'xpia_intrusion': False,
'xpia_manipulated_content': False
}
"""

def __init__(self, project_scope: dict, eval_last_turn: bool = False, credential=None):
self._evaluator = _IndirectAttackEvaluator(project_scope, credential)
self._eval_last_turn = eval_last_turn

def __call__(
self,
*,
question: Optional[str],
answer: Optional[str],
**kwargs,
):
"""
Evaluates content according to the presence of attacks injected into the conversation context to
interrupt normal expected functionality by eliciting manipulated content, intrusion and attempting
to gather information outside the scope of your AI system.
:keyword question: The question to be evaluated. Mutually exclusive with 'conversation'.
:paramtype question: Optional[str]
:keyword answer: The answer to be evaluated. Mutually exclusive with 'conversation'.
:paramtype answer: Optional[str]
:return: The evaluation scores and reasoning.
:rtype: dict
"""

return self._evaluator(question=question, answer=answer, **kwargs)


class _AsyncIndirectAttackEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

async def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The evaluation score computation based on the metric (self.metric).
:rtype: Any
"""
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = await evaluate_with_rai_service(
metric_name=EvaluationMetrics.XPIA,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result


class _IndirectAttackEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._async_evaluator = _AsyncIndirectAttackEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates XPIA content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The XPIA score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
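
A hedged sketch of running the new evaluator in a batch via `promptflow.evals.evaluate` (the `simulator_output.jsonl` file and its columns are illustrative, and the aggregation noted in the final comment is an assumption). Per the commit notes, only question/answer inputs are supported until the conversation default-override bug is fixed:

```python
from azure.identity import DefaultAzureCredential
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import IndirectAttackEvaluator

# Placeholder project details; replace with your own.
azure_ai_project = {"subscription_id": "...", "resource_group_name": "...", "project_name": "..."}

xpia_eval = IndirectAttackEvaluator(project_scope=azure_ai_project,
                                    credential=DefaultAzureCredential())

# "simulator_output.jsonl" is assumed to contain "question" and "answer" columns.
result = evaluate(
    data="simulator_output.jsonl",
    evaluators={"xpia": xpia_eval},
)
print(result["metrics"])  # aggregated xpia_* metrics (exact aggregation names may differ)
```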
3 changes: 2 additions & 1 deletion src/promptflow-evals/promptflow/evals/synthetic/__init__.py
@@ -1,5 +1,6 @@
from .adversarial_scenario import AdversarialScenario
from .adversarial_simulator import AdversarialSimulator
from .direct_attack_simulator import DirectAttackSimulator
from .xpia_simulator import IndirectAttackSimulator

__all__ = ["AdversarialSimulator", "AdversarialScenario", "DirectAttackSimulator"]
__all__ = ["AdversarialSimulator", "AdversarialScenario", "DirectAttackSimulator", "IndirectAttackSimulator"]
@@ -198,8 +198,11 @@ async def request_api(

exp_retry_client = get_async_http_client().with_policies(retry_policy=retry_policy)

# initial 10 seconds wait before attempting to fetch result
await asyncio.sleep(10)
# Initial 15-second wait before attempting to fetch the result.
# For reasons not yet understood, the wait is needed both in this thread and on the
# async event loop; someone with more async expertise should dig into this further.
await asyncio.sleep(15)
time.sleep(15)

response = await exp_retry_client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
self.result_url, headers=proxy_headers
@@ -55,6 +55,7 @@ def __init__(self, azure_ai_project: Dict, token_manager: APITokenManager) -> No
self.parameter_json_endpoint = urljoin(self.api_url, "simulation/template/parameters")
self.jailbreaks_json_endpoint = urljoin(self.api_url, "simulation/jailbreak")
self.simulation_submit_endpoint = urljoin(self.api_url, "simulation/chat/completions/submit")
self.xpia_jailbreaks_json_endpoint = urljoin(self.api_url, "simulation/jailbreak/xpia")

def _get_service_discovery_url(self):
bearer_token = self.token_manager.get_token()
@@ -92,10 +93,15 @@ async def get_contentharm_parameters(self) -> Any:

return self.contentharm_parameters

async def get_jailbreaks_dataset(self) -> Any:
async def get_jailbreaks_dataset(self, type: str) -> Any:
"Get the jailbreaks dataset, if exists"
if self.jailbreaks_dataset is None:
self.jailbreaks_dataset = await self.get(self.jailbreaks_json_endpoint)
if type == "xpia":
self.jailbreaks_dataset = await self.get(self.xpia_jailbreaks_json_endpoint)
elif type == "upia":
self.jailbreaks_dataset = await self.get(self.jailbreaks_json_endpoint)
else:
raise ValueError("Invalid type, please provide either 'xpia' or 'upia'")

return self.jailbreaks_dataset
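
For reference, a minimal sketch of how callers are expected to use the updated method; the `rai_client` argument is assumed to be an already-constructed `RAIClient`:

```python
async def fetch_jailbreak_datasets(rai_client):
    # UPIA: prompt-injection strings merged into the conversation starter (see the
    # _join_conversation_starter call further down in this diff).
    upia_dataset = await rai_client.get_jailbreaks_dataset(type="upia")

    # XPIA: cross-domain payloads fetched from the new simulation/jailbreak/xpia endpoint.
    xpia_dataset = await rai_client.get_jailbreaks_dataset(type="xpia")

    # Any other value raises ValueError("Invalid type, please provide either 'xpia' or 'upia'").
    return upia_dataset, xpia_dataset
```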

@@ -16,6 +16,7 @@ class AdversarialScenario(Enum):
ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia"


class _UnstableAdversarialScenario(Enum):
@@ -44,15 +44,15 @@ def wrapper(*args, **kwargs):
scenario = str(kwargs.get("scenario", None))
max_conversation_turns = kwargs.get("max_conversation_turns", None)
max_simulation_results = kwargs.get("max_simulation_results", None)
jailbreak = kwargs.get("jailbreak", None)
_jailbreak_type = kwargs.get("_jailbreak_type", None)
decorated_func = monitor_operation(
activity_name="adversarial.simulator.call",
activity_type=ActivityType.PUBLICAPI,
custom_dimensions={
"scenario": scenario,
"max_conversation_turns": max_conversation_turns,
"max_simulation_results": max_simulation_results,
"jailbreak": jailbreak,
"_jailbreak_type": _jailbreak_type,
},
)(func)

@@ -115,7 +115,7 @@ async def __call__(
api_call_retry_sleep_sec: int = 1,
api_call_delay_sec: int = 0,
concurrent_async_task: int = 3,
jailbreak: bool = False,
_jailbreak_type: Optional[str] = None,
randomize_order: bool = True,
randomization_seed: Optional[int] = None,
):
@@ -149,9 +149,6 @@ async def __call__(
:keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation.
Defaults to 3.
:paramtype concurrent_async_task: int
:keyword jailbreak: If set to True, allows breaking out of the conversation flow defined by the scenario.
Defaults to False.
:paramtype jailbreak: bool
:keyword randomize_order: Whether or not the order of the prompts should be randomized. Defaults to True.
:paramtype randomize_order: bool
:keyword randomization_seed: The seed used to randomize prompt selection. If unset, the system's
@@ -218,11 +215,11 @@
total_tasks,
)
total_tasks = min(total_tasks, max_simulation_results)
if jailbreak:
jailbreak_dataset = await self.rai_client.get_jailbreaks_dataset()
if _jailbreak_type:
jailbreak_dataset = await self.rai_client.get_jailbreaks_dataset(type=_jailbreak_type)
progress_bar = tqdm(
total=total_tasks,
desc="generating jailbreak simulations" if jailbreak else "generating simulations",
desc="generating jailbreak simulations" if _jailbreak_type else "generating simulations",
ncols=100,
unit="simulations",
)
@@ -237,7 +234,7 @@
random.shuffle(parameter_order)
for index in parameter_order:
parameter = template.template_parameters[index].copy()
if jailbreak:
if _jailbreak_type == "upia":
parameter = self._join_conversation_starter(parameter, random.choice(jailbreak_dataset))
tasks.append(
asyncio.create_task(