diff --git a/.vscode/cspell.json b/.vscode/cspell.json index 17bb451e85aa..896770ccfe9b 100644 --- a/.vscode/cspell.json +++ b/.vscode/cspell.json @@ -1890,6 +1890,12 @@ "deidentify", "deidentified" ] + }, + { + "filename": "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py", + "words": [ + "stringized" + ] } ], "allowCompoundWords": true diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index d07747891513..76cd337242be 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -27,8 +27,8 @@ from ._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, - OpenAIModelConfiguration, EvaluatorConfig, + OpenAIModelConfiguration, ) __all__ = [ diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py index 53d512ab9056..c2fb55ed4c2c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py @@ -3,6 +3,8 @@ # --------------------------------------------------------- from enum import Enum +from azure.core import CaseInsensitiveEnumMeta + class CommonConstants: """Define common constants.""" @@ -43,7 +45,7 @@ class _InternalAnnotationTasks: ECI = "eci" -class EvaluationMetrics: +class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Evaluation metrics to aid the RAI service in determining what metrics to request, and how to present them back to the user.""" @@ -56,7 +58,7 @@ class EvaluationMetrics: XPIA = "xpia" -class _InternalEvaluationMetrics: +class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Evaluation metrics that are not publicly supported. These metrics are experimental and subject to potential change or migration to the main enum over time. 
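The constants.py hunks above turn EvaluationMetrics and _InternalEvaluationMetrics into str-based enums built on azure.core's CaseInsensitiveEnumMeta. The sketch below illustrates what that combination buys, under the assumption that the enum members keep lowercase string values; HarmMetric and its members are hypothetical stand-ins, and only the (str, Enum) + CaseInsensitiveEnumMeta pattern is taken from the diff.

from enum import Enum

from azure.core import CaseInsensitiveEnumMeta


class HarmMetric(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    """Illustrative stand-in for the metric enums defined in constants.py."""

    VIOLENCE = "violence"
    SELF_HARM = "self_harm"


# str mixin: members still compare equal to the plain metric strings used elsewhere,
# so existing `metric == "violence"`-style checks keep working after the enum change.
assert HarmMetric.VIOLENCE == "violence"

# CaseInsensitiveEnumMeta: attribute and item lookup ignore case.
assert HarmMetric["Violence"] is HarmMetric.VIOLENCE
assert HarmMetric.self_harm is HarmMetric.SELF_HARM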
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py index fb84db1a2cba..20e710019824 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py @@ -3,19 +3,20 @@ # --------------------------------------------------------- import asyncio import importlib.metadata +import math import re import time -import math from ast import literal_eval -from typing import Dict, List +from typing import Dict, List, Optional, Union, cast from urllib.parse import urlparse import jwt from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException -from azure.ai.evaluation._http_utils import get_async_http_client +from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client from azure.ai.evaluation._model_configurations import AzureAIProject from azure.core.credentials import TokenCredential +from azure.core.pipeline.policies import AsyncRetryPolicy from .constants import ( CommonConstants, @@ -52,7 +53,13 @@ def get_common_headers(token: str) -> Dict: } -async def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None: +def get_async_http_client_with_timeout() -> AsyncHttpPipeline: + return get_async_http_client().with_policies( + retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT) + ) + + +async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None: """Check if the Responsible AI service is available in the region and has the required capability, if relevant. :param rai_svc_url: The Responsible AI service URL. @@ -67,9 +74,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability: svc_liveness_url = rai_svc_url + "/checkannotation" async with get_async_http_client() as client: - response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg - svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT - ) + response = await client.get(svc_liveness_url, headers=headers) if response.status_code != 200: msg = f"RAI service is not available in this region. 
Status Code: {response.status_code}" @@ -153,16 +158,14 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st url = rai_svc_url + "/submitannotation" headers = get_common_headers(token) - async with get_async_http_client() as client: - response = await client.post( # pylint: disable=too-many-function-args,unexpected-keyword-arg - url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT - ) + async with get_async_http_client_with_timeout() as client: + http_response = await client.post(url, json=payload, headers=headers) - if response.status_code != 202: - print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text)) - response.raise_for_status() + if http_response.status_code != 202: + print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text())) + http_response.raise_for_status() - result = response.json() + result = http_response.json() operation_id = result["location"].split("/")[-1] return operation_id @@ -189,10 +192,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre token = await fetch_or_reuse_token(credential, token) headers = get_common_headers(token) - async with get_async_http_client() as client: - response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg - url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT - ) + async with get_async_http_client_with_timeout() as client: + response = await client.get(url, headers=headers) if response.status_code == 200: return response.json() @@ -208,7 +209,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre def parse_response( # pylint: disable=too-many-branches,too-many-statements batch_response: List[Dict], metric_name: str -) -> Dict: +) -> Dict[str, Union[str, float]]: """Parse the annotation response from Responsible AI service for a content harm evaluation. :param batch_response: The annotation response from Responsible AI service. @@ -216,7 +217,7 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements :param metric_name: The evaluation metric to use. :type metric_name: str :return: The parsed annotation result. - :rtype: List[List[Dict]] + :rtype: Dict[str, Union[str, float]] """ # non-numeric metrics if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}: @@ -248,7 +249,7 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements return _parse_content_harm_response(batch_response, metric_name) -def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict: +def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]: """Parse the annotation response from Responsible AI service for a content harm evaluation. :param batch_response: The annotation response from Responsible AI service. @@ -256,7 +257,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) - :param metric_name: The evaluation metric to use. :type metric_name: str :return: The parsed annotation result. 
- :rtype: List[List[Dict]] + :rtype: Dict[str, Union[str, float]] """ # Fix the metric name if it's "hate_fairness" # Eventually we will remove this fix once the RAI service is updated @@ -264,7 +265,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) - if key == EvaluationMetrics.HATE_FAIRNESS: key = EvaluationMetrics.HATE_UNFAIRNESS - result = {key: math.nan, key + "_score": math.nan, key + "_reason": ""} + result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""} response = batch_response[0] if metric_name not in response: @@ -336,14 +337,13 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st """ headers = get_common_headers(token) - async with get_async_http_client() as client: - response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg + async with get_async_http_client_with_timeout() as client: + response = await client.get( f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/" f"resourceGroups/{azure_ai_project['resource_group_name']}/" f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?" f"api-version=2023-08-01-preview", headers=headers, - timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT, ) if response.status_code != 200: @@ -360,7 +360,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st return f"{base_url.scheme}://{base_url.netloc}" -async def get_rai_svc_url(project_scope: dict, token: str) -> str: +async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str: """Get the Responsible AI service URL :param project_scope: The Azure AI project scope details. @@ -384,7 +384,7 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str: return rai_url -async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str: +async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str: """Get token. Fetch a new token if the current token is near expiry :param credential: The Azure authentication credential. @@ -394,29 +394,26 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) - :type token: str :return: The Azure authentication token. """ - acquire_new_token = True - try: - if token: - # Decode the token to get its expiration time + if token: + # Decode the token to get its expiration time + try: decoded_token = jwt.decode(token, options={"verify_signature": False}) + except jwt.PyJWTError: + pass + else: exp_time = decoded_token["exp"] current_time = time.time() - # Check if the token is near expiry + # Return current token if not near expiry if (exp_time - current_time) >= 300: - acquire_new_token = False - except Exception: # pylint: disable=broad-exception-caught - pass - - if acquire_new_token: - token = credential.get_token("https://management.azure.com/.default").token + return token - return token + return credential.get_token("https://management.azure.com/.default").token async def evaluate_with_rai_service( query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential -): +) -> Dict[str, Union[str, float]]: """ "Evaluate the content safety of the response using Responsible AI service :param query: The query to evaluate. @@ -431,7 +428,7 @@ async def evaluate_with_rai_service( :type credential: ~azure.core.credentials.TokenCredential :return: The parsed annotation result. 
- :rtype: List[List[Dict]] + :rtype: Dict[str, Union[str, float]] """ # Get RAI service URL from discovery service and check service availability @@ -441,7 +438,7 @@ async def evaluate_with_rai_service( # Submit annotation request and fetch result operation_id = await submit_request(query, response, metric_name, rai_svc_url, token) - annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token) + annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token)) result = parse_response(annotation_response, metric_name) return result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py index 461f413900f5..1ab18c0623a4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py @@ -4,19 +4,27 @@ import math import threading -from typing import List, Union +from typing import Any, List, Literal, Mapping, Type, TypeVar, Union, cast, get_args, get_origin import nltk +from typing_extensions import NotRequired, Required, TypeGuard -from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE +from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from azure.ai.evaluation._model_configurations import ( + AzureAIProject, + AzureOpenAIModelConfiguration, + OpenAIModelConfiguration, +) from . import constants _nltk_data_download_lock = threading.Lock() +T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any]) -def get_harm_severity_level(harm_score: int) -> str: + +def get_harm_severity_level(harm_score: int) -> Union[str, float]: """Generate harm severity level based on harm score. :param harm_score: The harm score to be evaluated. 
@@ -71,12 +79,20 @@ def nltk_tokenize(text: str) -> List[str]: return list(tokens) +def _is_aoi_model_config(val: object) -> TypeGuard[AzureOpenAIModelConfiguration]: + return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("azure_endpoint", "azure_deployment")) + + +def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]: + return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("model",)) + + def parse_model_config_type( model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], ) -> None: - if "azure_endpoint" in model_config or "azure_deployment" in model_config: + if _is_aoi_model_config(model_config): model_config["type"] = AZURE_OPENAI_TYPE - else: + elif _is_openai_model_config(model_config): model_config["type"] = OPENAI_TYPE @@ -87,16 +103,170 @@ def construct_prompty_model_config( ) -> dict: parse_model_config_type(model_config) - if model_config["type"] == AZURE_OPENAI_TYPE: + if _is_aoi_model_config(model_config): model_config["api_version"] = model_config.get("api_version", default_api_version) - prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}} + prompty_model_config: dict = {"configuration": model_config, "parameters": {"extra_headers": {}}} # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient # https://github.com/encode/httpx/discussions/2959 prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"}) - if model_config["type"] == AZURE_OPENAI_TYPE and user_agent: + if _is_aoi_model_config(model_config) and user_agent: prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent}) return prompty_model_config + + +def validate_azure_ai_project(o: object) -> AzureAIProject: + fields = {"subscription_id": str, "resource_group_name": str, "project_name": str} + + if not isinstance(o, dict): + msg = "azure_ai_project must be a dictionary" + raise EvaluationException( + message=msg, + internal_message=msg, + target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, + category=ErrorCategory.MISSING_FIELD, + blame=ErrorBlame.USER_ERROR, + ) + + missing_fields = set(fields.keys()) - o.keys() + + if missing_fields: + msg = "azure_ai_project must contain keys: " + ", ".join(f'"{field}"' for field in missing_fields) + raise EvaluationException( + message=msg, + internal_message=msg, + target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, + category=ErrorCategory.MISSING_FIELD, + blame=ErrorBlame.USER_ERROR, + ) + + for field_name, expected_type in fields.items(): + if isinstance(o[field_name], expected_type): + continue + + msg = f"Expected azure_ai_project field {field_name!r} to be of type {expected_type}." + + raise EvaluationException( + message=f"{msg} Got {type(o[field_name])}.", + internal_message=msg, + target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, + category=ErrorCategory.MISSING_FIELD, + blame=ErrorBlame.USER_ERROR, + ) + + return cast(AzureAIProject, o) + + +def validate_model_config(config: dict) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]: + try: + return _validate_typed_dict(config, AzureOpenAIModelConfiguration) + except TypeError: + try: + return _validate_typed_dict(config, OpenAIModelConfiguration) + except TypeError as e: + msg = "Model config validation failed."
+ raise EvaluationException( + message=msg, internal_message=msg, category=ErrorCategory.MISSING_FIELD, blame=ErrorBlame.USER_ERROR + ) from e + + +def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict: + """Do very basic runtime validation that an object is a typed dict + + .. warning:: + + This validation is very basic, robust enough to cover some very simple TypedDicts. + Ideally, validation of this kind should be delegated to something more robust. + + You will very quickly run into limitations trying to apply this function more broadly: + * Doesn't support stringized annotations at all + * Very limited support for generics, and "special form" (NoReturn, NotRequired, Required, etc...) types. + * Error messages are poor, especially if there is any nesting. + + :param object o: The object to check + :param Type[T_TypedDict] t: The TypedDict to validate against + :raises NotImplementedError: Several forms of validation are unsupported + * Checking against stringized annotations + * Checking a generic that is not one of a few basic forms + :raises TypeError: If a value does not match the specified annotation + :raises ValueError: If one of t's annotations is not a string, a type, or a generic/special form (e.g. NotRequired, Required, etc...) + :returns: The object passed in + :rtype: T_TypedDict + """ + if not isinstance(o, dict): + raise TypeError(f"Expected type 'dict', got type '{type(o)}'.") + + annotations = t.__annotations__ + is_total = getattr(t, "__total__", False) + unknown_keys = set(o.keys()) - annotations.keys() + + if unknown_keys: + raise TypeError(f"dict contains unknown keys: {list(unknown_keys)!r}") + + required_keys = { + k + for k in annotations + if (is_total and get_origin(annotations[k]) is not NotRequired) + or (not is_total and get_origin(annotations[k]) is Required) + } + + missing_keys = required_keys - o.keys() + + if missing_keys: + raise TypeError(f"Missing required keys: {list(missing_keys)!r}.") + + def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool: + if isinstance(annotation, str): + raise NotImplementedError("Missing support for validating against stringized annotations.") + + if (origin := get_origin(annotation)) is not None: + if origin is tuple: + validate_annotation(v, tuple) + tuple_args = get_args(annotation) + if len(cast(tuple, v)) != len(tuple_args): + raise TypeError(f"Expected a {len(tuple_args)}-tuple, got a {len(cast(tuple, v))}-tuple.") + for tuple_val, tuple_arg in zip(cast(tuple, v), tuple_args): + validate_annotation(tuple_val, tuple_arg) + elif origin is dict: + validate_annotation(v, dict) + dict_key_ann, dict_val_ann = get_args(annotation) + for dict_key, dict_val in cast(dict, v).items(): + validate_annotation(dict_val, dict_val_ann) + validate_annotation(dict_key, dict_key_ann) + elif origin is list: + validate_annotation(v, list) + list_val_ann = get_args(annotation)[0] + for list_val in cast(list, v): + validate_annotation(list_val, list_val_ann) + elif origin is Union: + for generic_arg in get_args(annotation): + try: + validate_annotation(v, generic_arg) + return True + except TypeError: + pass + raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}") + elif origin is Literal: + literal_args = get_args(annotation) + if not any(type(literal) is type(v) and literal == v for literal in literal_args): + raise TypeError(f"Expected value to be one of {list(literal_args)!r}.
Received type {type(v)}") + elif any(origin is g for g in (NotRequired, Required)): + validate_annotation(v, get_args(annotation)[0]) + else: + raise NotImplementedError(f"Validation not implemented for generic {origin}.") + return True + + if isinstance(annotation, type): + if not isinstance(v, annotation): + raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}.") + return True + + raise ValueError("Annotation to validate against should be a str, type, or generic.") + + for k, v in o.items(): + validate_annotation(v, annotations[k]) + + return cast(T_TypedDict, o) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py index fe8b9df6230a..4148ff405cff 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py @@ -1,6 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Literal class EvaluationMetrics: @@ -65,6 +66,6 @@ class EvaluationRunProperties: OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT" OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60 -AZURE_OPENAI_TYPE = "azure_openai" +AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai" -OPENAI_TYPE = "openai" +OPENAI_TYPE: Literal["openai"] = "openai" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py index c57eee00b903..089ab12dd40d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py @@ -2,6 +2,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- import os +import types +from typing import Optional, Type, Union from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS from promptflow._utils.user_agent_utils import ClientUserAgentUtil @@ -30,12 +32,12 @@ class BatchRunContext: ] """ - def __init__(self, client) -> None: + def __init__(self, client: Union[CodeClient, ProxyClient]) -> None: self.client = client self._is_batch_timeout_set_by_system = False self._is_otel_timeout_set_by_system = False - def __enter__(self): + def __enter__(self) -> None: if isinstance(self.client, CodeClient): ClientUserAgentUtil.append_user_agent(USER_AGENT) inject_openai_api() @@ -56,7 +58,12 @@ def __enter__(self): # For addressing the issue of asyncio event loop closed on Windows set_event_loop_policy() - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + exc_tb: Optional[types.TracebackType], + ) -> None: if isinstance(self.client, CodeClient): recover_openai_api() diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py index 2ee07a8e10e7..a72fe153b767 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py @@ -5,8 +5,9 @@ import json import logging import os +from concurrent.futures import Future from pathlib import Path -from typing import Callable, Dict, Optional, Union +from typing import Any, Callable, Dict, Optional, Union, cast import pandas as pd from promptflow.contracts.types import AttrDict @@ -22,25 +23,31 @@ class CodeRun: def __init__( - self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs # pylint: disable=unused-argument - ): + self, + *, + run: Future, + input_data, + evaluator_name: Optional[str] = None, + aggregator: Callable[["CodeRun"], Future], + **kwargs, # pylint: disable=unused-argument + ) -> None: self.run = run self.evaluator_name = evaluator_name if evaluator_name is not None else "" self.input_data = input_data - self.aggregated_metrics = aggregated_metrics + self.aggregated_metrics = aggregator(self) - def get_result_df(self, exclude_inputs=False): + def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame: batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT) - result_df = self.run.result(timeout=batch_run_timeout) + result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout)) if exclude_inputs: result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")]) return result_df - def get_aggregated_metrics(self): + def get_aggregated_metrics(self) -> Dict[str, Any]: try: batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT) - aggregated_metrics = ( - self.aggregated_metrics.result(timeout=batch_run_timeout) + aggregated_metrics: Optional[Any] = ( + cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout)) if self.aggregated_metrics is not None else None ) @@ -104,10 +111,10 @@ def _calculate_metric( verify_integrity=True, ) - def _calculate_aggregations(self, evaluator, run): + @staticmethod + def _calculate_aggregations(evaluator: Callable, run: 
CodeRun) -> Any: try: if _has_aggregator(evaluator): - aggregate_input = None evaluator_output = run.get_result_df(exclude_inputs=True) if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output": aggregate_input = evaluator_output["output"].tolist() @@ -152,21 +159,26 @@ def run( column_mapping=column_mapping, evaluator_name=evaluator_name, ) - run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None) - aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run) - run.aggregated_metrics = aggregation_future - return run + + return CodeRun( + run=eval_future, + input_data=data, + evaluator_name=evaluator_name, + aggregator=lambda code_run: self._thread_pool.submit( + self._calculate_aggregations, evaluator=flow, run=code_run + ), + ) def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame: result_df = run.get_result_df(exclude_inputs=not all_results) return result_df - def get_metrics(self, run: CodeRun) -> Optional[None]: + def get_metrics(self, run: CodeRun) -> Dict[str, Any]: try: aggregated_metrics = run.get_aggregated_metrics() print("Aggregated metrics") print(aggregated_metrics) except Exception as ex: # pylint: disable=broad-exception-caught LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex) - return None + return {} return aggregated_metrics diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py index 397cbf4e8c82..8b6c4cf8c339 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py @@ -10,10 +10,11 @@ import time import types import uuid -from typing import Any, Dict, Optional, Set, Type +from typing import Any, Dict, List, Optional, Set, Type from urllib.parse import urlparse from promptflow._sdk.entities import Run +from typing_extensions import Self from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._http_utils import get_http_client @@ -27,6 +28,7 @@ # Handle optional import. The azure libraries are only present if # promptflow-azure is installed. try: + from azure.ai.ml import MLClient from azure.ai.ml.entities._credentials import AccountKeyConfiguration # pylint: disable=ungrouped-imports from azure.ai.ml.entities._datastore.datastore import Datastore from azure.storage.blob import BlobServiceClient @@ -121,8 +123,8 @@ def __init__( self._run_name = run_name self._promptflow_run = promptflow_run self._status = RunStatus.NOT_STARTED - self._url_base = None - self.info = None + self._url_base: Optional[str] = None + self._info: Optional[RunInfo] = None @property def status(self) -> RunStatus: @@ -134,6 +136,20 @@ def status(self) -> RunStatus: """ return self._status + @property + def info(self) -> RunInfo: + if self._info is None: + msg = "Run info is missing" + raise EvaluationException( + message=msg, + internal_message=msg, + target=ErrorTarget.EVAL_RUN, + category=ErrorCategory.UNKNOWN, + blame=ErrorBlame.UNKNOWN, + ) + + return self._info + def _get_scope(self) -> str: """ Return the scope information for the workspace. 
@@ -161,11 +177,11 @@ def _start_run(self) -> None: ) self._url_base = None self._status = RunStatus.BROKEN - self.info = RunInfo.generate(self._run_name) + self._info = RunInfo.generate(self._run_name) else: self._url_base = urlparse(self._tracking_uri).netloc if self._promptflow_run is not None: - self.info = RunInfo( + self._info = RunInfo( self._promptflow_run.name, self._promptflow_run._experiment_name, # pylint: disable=protected-access self._promptflow_run.name, @@ -182,7 +198,7 @@ def _start_run(self) -> None: body["run_name"] = self._run_name response = self.request_with_retry(url=url, method="POST", json_dict=body) if response.status_code != 200: - self.info = RunInfo.generate(self._run_name) + self._info = RunInfo.generate(self._run_name) LOGGER.warning( "The run failed to start: %s: %s." "The results will be saved locally, but will not be logged to Azure.", @@ -192,7 +208,7 @@ def _start_run(self) -> None: self._status = RunStatus.BROKEN else: parsed_response = response.json() - self.info = RunInfo( + self._info = RunInfo( run_id=parsed_response["run"]["info"]["run_id"], experiment_id=parsed_response["run"]["info"]["experiment_id"], run_name=parsed_response["run"]["info"]["run_name"], @@ -235,7 +251,7 @@ def _end_run(self, reason: str) -> None: LOGGER.warning("Unable to terminate the run.") self._status = RunStatus.TERMINATED - def __enter__(self): + def __enter__(self) -> Self: """The Context Manager enter call. :return: The instance of the class. @@ -249,7 +265,7 @@ def __exit__( exc_type: Optional[Type[BaseException]], exc_value: Optional[BaseException], exc_tb: Optional[types.TracebackType], - ) -> Optional[bool]: + ) -> None: """The context manager exit call. :param exc_type: The exception type @@ -408,7 +424,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART return # First we will list the files and the appropriate remote paths for them. root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name) - remote_paths = {"paths": []} + remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []} local_paths = [] # Go over the artifact folder and upload all artifacts. 
for root, _, filenames in os.walk(artifact_folder): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 832195262f88..30cdc549e916 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -4,14 +4,15 @@ import inspect import os import re -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union import pandas as pd from promptflow._sdk._constants import LINE_NUMBER from promptflow.client import PFClient +from promptflow.entities import Run -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._common.math import list_sum +from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from .._constants import ( CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, @@ -24,16 +25,24 @@ from .._user_agent import USER_AGENT from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient from ._utils import ( + EvaluateResult, _apply_column_mapping, _log_metrics_and_instance_results, _trace_destination_from_project_scope, _write_output, ) +TClient = TypeVar("TClient", ProxyClient, CodeClient) + + +class __EvaluatorInfo(TypedDict): + result: pd.DataFrame + metrics: Dict[str, Any] + # pylint: disable=line-too-long def _aggregate_content_safety_metrics( - df: pd.DataFrame, evaluators: Dict[str, Type] + df: pd.DataFrame, evaluators: Dict[str, Callable] ) -> Tuple[List[str], Dict[str, float]]: """Find and aggregate defect rates for content safety metrics. Returns both a list of columns that were used to calculate defect rates and the defect rates themselves. @@ -114,7 +123,7 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s return label_cols, defect_rates -def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]: +def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]: """Aggregate metrics from the evaluation results. On top of naively calculating the mean of most metrics, this function also identifies certain columns that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped. @@ -123,7 +132,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st :param df: The dataframe of evaluation results. :type df: ~pandas.DataFrame :param evaluators: A dictionary mapping of strings to evaluator classes. - :type evaluators: Dict[str, Type] + :type evaluators: Dict[str, Callable] :return: The aggregated metrics. :rtype: Dict[str, float] """ @@ -278,7 +287,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj def _validate_columns( df: pd.DataFrame, - evaluators: Dict[str, Any], + evaluators: Dict[str, Callable], target: Optional[Callable], column_mapping: Dict[str, Dict[str, str]], ) -> None: @@ -288,7 +297,7 @@ def _validate_columns( :param df: The data frame to be validated. :type df: pd.DataFrame :param evaluators: The dictionary of evaluators. - :type evaluators: Dict[str, Any] + :type evaluators: Dict[str, Callable] :param target: The callable to be applied to data set. 
:type target: Optional[Callable] :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping @@ -327,7 +336,7 @@ def _apply_target_to_data( initial_data: pd.DataFrame, evaluation_name: Optional[str] = None, _run_name: Optional[str] = None, -) -> Tuple[pd.DataFrame, Set[str]]: +) -> Tuple[pd.DataFrame, Set[str], Run]: """ Apply the target function to the data set and return updated data and generated columns. @@ -349,7 +358,7 @@ def _apply_target_to_data( # We are manually creating the temporary directory for the flow # because the way tempdir remove temporary directories will # hang the debugger, because promptflow will keep flow directory. - run = pf_client.run( + run: Run = pf_client.run( flow=target, display_name=evaluation_name, data=data, @@ -357,7 +366,7 @@ def _apply_target_to_data( stream=True, name=_run_name, ) - target_output = pf_client.runs.get_details(run, all_results=True) + target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True) # Remove input and output prefix generated_columns = { col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS) @@ -379,16 +388,18 @@ def _apply_target_to_data( return target_output, generated_columns, run -def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: +def _process_column_mappings( + column_mapping: Dict[str, Optional[Dict[str, str]]], +) -> Dict[str, Dict[str, str]]: """Process column_mapping to replace ${target.} with ${data.} :param column_mapping: The configuration for evaluators. - :type column_mapping: Dict[str, Dict[str, str]] + :type column_mapping: Dict[str, Optional[Dict[str, str]]] :return: The processed configuration. :rtype: Dict[str, Dict[str, str]] """ - processed_config = {} + processed_config: Dict[str, Dict[str, str]] = {} unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}") @@ -557,26 +568,27 @@ def evaluate( def _evaluate( # pylint: disable=too-many-locals,too-many-statements *, + evaluators: Dict[str, Callable], evaluation_name: Optional[str] = None, target: Optional[Callable] = None, - data: Optional[str] = None, - evaluators: Optional[Dict[str, Callable]] = None, + data: str, evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None, azure_ai_project: Optional[AzureAIProject] = None, output_path: Optional[str] = None, **kwargs, -): +) -> EvaluateResult: input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name) # Process evaluator config to replace ${target.} with ${data.} if evaluator_config is None: evaluator_config = {} # extract column mapping dicts into dictionary mapping evaluator name to column mapping - column_mapping = { - evaluator_name: evaluator_configuration.get("column_mapping", None) - for evaluator_name, evaluator_configuration in evaluator_config.items() - } - column_mapping = _process_column_mappings(column_mapping) + column_mapping = _process_column_mappings( + { + evaluator_name: evaluator_configuration.get("column_mapping", None) + for evaluator_name, evaluator_configuration in evaluator_config.items() + } + ) _validate_columns(input_data_df, evaluators, target, column_mapping) # Target Run @@ -587,9 +599,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements user_agent=USER_AGENT, ) - trace_destination = pf_client._config.get_trace_destination() # pylint: disable=protected-access - target_run = None - target_generated_columns = set() + trace_destination: Optional[str] = 
pf_client._config.get_trace_destination() # pylint: disable=protected-access + target_run: Optional[Run] = None # Create default configuration for evaluators that directly maps # input data names to keyword inputs of the same name in the evaluators. @@ -628,39 +639,47 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements # Also ignore columns that are already in config, since they've been covered by target mapping. if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys(): column_mapping["default"][col] = f"${{data.{col}}}" + + def get_evaluators_info( + batch_run_client: TClient, *, data: Union[str, os.PathLike, pd.DataFrame] + ) -> Dict[str, __EvaluatorInfo]: + with BatchRunContext(batch_run_client): + runs = { + evaluator_name: batch_run_client.run( + flow=evaluator, + run=target_run, + evaluator_name=evaluator_name, + column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)), + data=data, + stream=True, + name=kwargs.get("_run_name"), + ) + for evaluator_name, evaluator in evaluators.items() + } + + # get_details needs to be called within BatchRunContext scope in order to have user agent populated + return { + evaluator_name: { + "result": batch_run_client.get_details(run, all_results=True), + "metrics": batch_run_client.get_metrics(run), + } + for evaluator_name, run in runs.items() + } + # Batch Run - evaluators_info = {} use_pf_client = kwargs.get("_use_pf_client", True) if use_pf_client: - # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud. - # The root cause is still unclear, but it seems related to a conflict between the async run uploader - # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs. - batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT)) - # Ensure the absolute path is passed to pf.run, as relative path doesn't work with # multiple evaluators. If the path is already absolute, abspath will return the original path. data = os.path.abspath(data) + + # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud. + # The root cause is still unclear, but it seems related to a conflict between the async run uploader + # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+ evaluators_info = get_evaluators_info(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data) else: - batch_run_client = CodeClient() data = input_data_df - - with BatchRunContext(batch_run_client): - for evaluator_name, evaluator in evaluators.items(): - evaluators_info[evaluator_name] = {} - evaluators_info[evaluator_name]["run"] = batch_run_client.run( - flow=evaluator, - run=target_run, - evaluator_name=evaluator_name, - column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)), - data=data, - stream=True, - name=kwargs.get("_run_name"), - ) - - # get_details needs to be called within BatchRunContext scope in order to have user agent populated - for evaluator_name, evaluator_info in evaluators_info.items(): - evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True) - evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"]) + evaluators_info = get_evaluators_info(CodeClient(), data=input_data_df) # Concatenate all results evaluators_result_df = None @@ -707,7 +726,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements evaluation_name, ) - result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url} + result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url} if output_path: _write_output(output_path, result) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py index d0497df3f770..87fb1b5c9593 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_telemetry/__init__.py @@ -6,7 +6,7 @@ import inspect import json import logging -from typing import Callable, Dict, TypeVar +from typing import Callable, Dict, Literal, Optional, Union, cast import pandas as pd from promptflow._sdk.entities._flows import FlexFlow as flex_flow @@ -16,31 +16,30 @@ from promptflow.core import Prompty as prompty_core from typing_extensions import ParamSpec +from azure.ai.evaluation._model_configurations import AzureAIProject + from ..._user_agent import USER_AGENT -from .._utils import _trace_destination_from_project_scope +from .._utils import EvaluateResult, _trace_destination_from_project_scope LOGGER = logging.getLogger(__name__) P = ParamSpec("P") -R = TypeVar("R") -def _get_evaluator_type(evaluator: Dict[str, Callable]): +def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]: """ Get evaluator type for telemetry. :param evaluator: The evaluator object :type evaluator: Dict[str, Callable] :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety". 
- :rtype: str + :rtype: Literal["content-safety", "built-in", "custom"] """ - built_in = False - content_safety = False - module = inspect.getmodule(evaluator) - built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.") - if built_in: - content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety") + module_name = module.__name__ if module else "" + + built_in = module_name.startswith("azure.ai.evaluation._evaluators.") + content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety") if content_safety: return "content-safety" @@ -98,22 +97,22 @@ def _get_evaluator_properties(evaluator, evaluator_name): # cspell:ignore isna -def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]: +def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]: """Decorator to log evaluate activity :param func: The function to be decorated :type func: Callable :returns: The decorated function - :rtype: Callable[P, R] + :rtype: Callable[P, EvaluateResult] """ @functools.wraps(func) - def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult: from promptflow._sdk._telemetry import ActivityType, log_activity from promptflow._sdk._telemetry.telemetry import get_telemetry_logger - evaluators = kwargs.get("evaluators", []) - azure_ai_project = kwargs.get("azure_ai_project", None) + evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {} + azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None)) pf_client = PFClient( config=( @@ -127,7 +126,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access evaluate_target = bool(kwargs.get("target", None)) evaluator_config = bool(kwargs.get("evaluator_config", None)) - custom_dimensions = { + custom_dimensions: Dict[str, Union[str, bool]] = { "track_in_cloud": track_in_cloud, "evaluate_target": evaluate_target, "evaluator_config": evaluator_config, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py index 4e87fced2d85..421c60001fee 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py @@ -6,20 +6,22 @@ import os import re import tempfile -from collections import namedtuple from pathlib import Path -from typing import Dict +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union import pandas as pd +from promptflow.client import PFClient +from promptflow.entities import Run from azure.ai.evaluation._constants import ( DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, - Prefixes, EvaluationRunProperties, + Prefixes, ) from azure.ai.evaluation._evaluate._eval_run import EvalRun from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from azure.ai.evaluation._model_configurations import AzureAIProject LOGGER = logging.getLogger(__name__) @@ -28,14 +30,26 @@ "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$" ) -AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]) + +class AzureMLWorkspace(NamedTuple): + subscription_id: str + 
resource_group_name: str + workspace_name: str -def is_none(value): +class EvaluateResult(TypedDict): + metrics: Dict[str, float] + studio_url: Optional[str] + rows: List[Dict] + + +def is_none(value) -> bool: return value is None or str(value).lower() == "none" -def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint: disable=name-too-long +def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-long + trace_provider: str, +) -> AzureMLWorkspace: match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider) if not match or len(match.groups()) != 5: raise EvaluationException( @@ -52,7 +66,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint: subscription_id = match.group(1) resource_group_name = match.group(3) workspace_name = match.group(5) - return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name) + return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name) def load_jsonl(path): @@ -60,7 +74,7 @@ def load_jsonl(path): return [json.loads(line) for line in f.readlines()] -def _azure_pf_client_and_triad(trace_destination): +def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]: from promptflow.azure._cli._utils import _get_azure_pf_client ws_triad = extract_workspace_triad_from_trace_provider(trace_destination) @@ -74,12 +88,12 @@ def _azure_pf_client_and_triad(trace_destination): def _log_metrics_and_instance_results( - metrics, - instance_results, - trace_destination, - run, - evaluation_name, -) -> str: + metrics: Dict[str, Any], + instance_results: pd.DataFrame, + trace_destination: Optional[str], + run: Run, + evaluation_name: Optional[str], +) -> Optional[str]: if trace_destination is None: LOGGER.error("Unable to log traces as trace destination was not defined.") return None @@ -99,7 +113,6 @@ def _log_metrics_and_instance_results( ml_client=azure_pf_client.ml_client, promptflow_run=run, ) as ev_run: - artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN with tempfile.TemporaryDirectory() as tmpdir: @@ -144,7 +157,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str: return studio_url -def _trace_destination_from_project_scope(project_scope: dict) -> str: +def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str: subscription_id = project_scope["subscription_id"] resource_group_name = project_scope["resource_group_name"] workspace_name = project_scope["project_name"] @@ -157,9 +170,9 @@ def _trace_destination_from_project_scope(project_scope: dict) -> str: return trace_destination -def _write_output(path, data_dict): +def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None: p = Path(path) - if os.path.isdir(path): + if p.is_dir(): p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f: @@ -167,7 +180,7 @@ def _write_output(path, data_dict): def _apply_column_mapping( - source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False + source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False ) -> pd.DataFrame: """ Apply column mapping to source_df based on mapping_config. 
@@ -217,7 +230,7 @@ def _apply_column_mapping( return result_df -def _has_aggregator(evaluator): +def _has_aggregator(evaluator: object) -> bool: return hasattr(evaluator, "__aggregate__") @@ -240,11 +253,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int: return default_value -def set_event_loop_policy(): +def set_event_loop_policy() -> None: import asyncio import platform if platform.system().lower() == "windows": # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it - asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index bd14e12004c0..d71ae34ea520 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -3,6 +3,7 @@ # --------------------------------------------------------- import os from typing import Optional + from typing_extensions import override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -50,7 +51,7 @@ def __call__( query: Optional[str] = None, response: Optional[str] = None, conversation: Optional[dict] = None, - **kwargs + **kwargs, ): """Evaluate coherence. Accepts either a query and response for a single evaluation, or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of @@ -65,6 +66,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[Dict] :return: The relevance score. - :rtype: dict + :rtype: Dict[str, float] """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py index c0251d6e1865..411f8a8d5455 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py @@ -2,19 +2,55 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from typing import List, Dict, Callable, Any import inspect - -from abc import ABC +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final from promptflow._utils.async_utils import async_run_allowing_running_loop +from typing_extensions import ParamSpec, TypeAlias -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget from azure.ai.evaluation._common.math import list_mean +from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException + +P = ParamSpec("P") +T = TypeVar("T") +T_EvalValue = TypeVar("T_EvalValue") + + +class DerivedEvalInput(TypedDict, total=False): + """The eval input generated by EvaluatorBase._derive_conversation_starter.""" + + query: Dict[str, Any] + response: Dict[str, Any] + context: str + + +AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]] +"""TypeAlias that models the return value of EvaluatorBase._aggregate_results + + .. code-block:: python + + foo: AggregateResult[float] = { + "evaluation_per_turn": { + "gpt_coherence": [1.0, 2.0, 3.0] + }, + "gpt_coherence": 2.0 + } +""" + +DoEvalResult: TypeAlias = Dict[str, T] +"""TypeAlias that models the return value of EvaluatorBase._do_eval + + .. code-block:: python + + foo: DoEvalResult[float] = { + "gpt_coherence": 2.0 + } +""" # TODO exception target pass down? -class EvaluatorBase(ABC): +class EvaluatorBase(ABC, Generic[T_EvalValue]): """Base class for all evaluators that are capable of accepting either a group of single values, or conversation as input. All such evaluators need to implement two functions of their own: - _convert_conversation_to_eval_input @@ -51,7 +87,7 @@ def __init__( # This needs to be overridden just to change the function header into something more informative, # and to be able to add a more specific docstring. The actual function contents should just be # super().__call__() - def __call__(self, **kwargs) -> Dict: + def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for one main reason - to overwrite the method headers and docstring to include additional inputs as needed. The actual behavior of this function shouldn't change beyond adding more inputs to the @@ -64,9 +100,8 @@ def __call__(self, **kwargs) -> Dict: """ return async_run_allowing_running_loop(self._async_evaluator, **kwargs) - # Probably the only thing that can't be simplified. Each evaluator, or at least each family - # of evaluators, will need to implement their own version of this function. - async def _do_eval(self, eval_input: Any) -> Dict: + @abstractmethod + async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]: """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator. In the default case, all required inputs are assumed to be within eval_input, as user-friendly typing is handled above this function in favor of polymorphic simplicity. This function must be @@ -76,12 +111,7 @@ async def _do_eval(self, eval_input: Any) -> Dict: :type eval_input: Any :return: A single evaluation result :rtype: Dict - """ - raise EvaluationException( - message="Not implemented", - internal_message="BaseConversationEval's _do_eval method called somehow. 
This should be overridden.", - ) # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~ @@ -103,7 +133,7 @@ def _derive_singleton_inputs(self) -> List[str]: singletons.append(param) return singletons - def _derive_conversation_converter(self) -> Callable: + def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]: """Produce the function that will be used to convert conversations to a list of evaluable inputs. This uses the inputs derived from the _derive_singleton_inputs function to determine which aspects of a conversation ought to be extracted. @@ -115,12 +145,12 @@ def _derive_conversation_converter(self) -> Callable: include_query = "query" in self._singleton_inputs include_response = "response" in self._singleton_inputs - def converter(conversation: Dict) -> List: - messages = conversation["messages"] + def converter(conversation: Dict) -> List[DerivedEvalInput]: + messages = cast(List[Dict[str, Any]], conversation["messages"]) global_context = conversation.get("context", None) # Extract queries, responses from conversation - queries = [] - responses = [] + queries: List[Dict[str, Any]] = [] + responses: List[Dict[str, Any]] = [] # Convert conversation slice into queries and responses. # Assume that 'user' role is asking queries and 'assistant' role is responding. @@ -147,7 +177,7 @@ def converter(conversation: Dict) -> List: if response_context and not include_response: context["response_context"] = response_context - eval_input = {} + eval_input: DerivedEvalInput = {} if include_query: eval_input["query"] = query if include_response: @@ -159,7 +189,7 @@ def converter(conversation: Dict) -> List: return converter - def _convert_kwargs_to_eval_input(self, **kwargs) -> List: + def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]: """Convert an arbitrary input into a list of inputs for evaluators. It is assumed that evaluators generally make use of their inputs in one of two ways. Either they receive a collection of keyname inputs that are all single values @@ -211,7 +241,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> List: target=ErrorTarget.CONVERSATION, ) - def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict: + def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]: """Aggregate the evaluation results of each conversation turn into a single result. Exact implementation might need to vary slightly depending on the results produced. @@ -227,8 +257,8 @@ def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict: :rtype: Dict """ - aggregated = {} - evaluation_per_turn = {} + aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {} + evaluation_per_turn: Dict[str, List[T_EvalValue]] = {} # Go over each turn, and rotate the results into a # metric: List[values] format for the evals_per_turn dictionary. @@ -241,13 +271,13 @@ def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict: # Find and average all numeric values for metric, values in evaluation_per_turn.items(): if all(isinstance(value, (int, float)) for value in values): - aggregated[metric] = list_mean(values) + aggregated[metric] = list_mean(cast(List[Union[int, float]], values)) # Slap the per-turn results back in. 
aggregated["evaluation_per_turn"] = evaluation_per_turn return aggregated - async def _real_call(self, **kwargs): + async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]: """The asynchronous call where real end-to-end evaluation logic is performed. :keyword kwargs: The inputs to evaluate. @@ -270,9 +300,8 @@ async def _real_call(self, **kwargs): # Otherwise, aggregate results. return self._aggregate_results(per_turn_results=per_turn_results) - # ~~~ METHODS THAT SHOULD NEVER BE OVERRIDDEN BY CHILDREN~~~ - - def _to_async(self): + @final + def _to_async(self) -> "AsyncEvaluatorBase": return self._async_evaluator diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index bea728245a53..21b03c6ac316 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -9,16 +9,16 @@ from promptflow.core import AsyncPrompty from typing_extensions import override -from ..._common.utils import construct_prompty_model_config +from ..._common.utils import construct_prompty_model_config, validate_model_config +from . import EvaluatorBase try: from ..._user_agent import USER_AGENT except ImportError: - USER_AGENT = None -from . import EvaluatorBase + USER_AGENT = "None" -class PromptyEvaluatorBase(EvaluatorBase): +class PromptyEvaluatorBase(EvaluatorBase[float]): """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators make use of a prompty file, and return their results as a dictionary, with a single key-value pair linking the result name to a float value (unless multi-turn evaluation occurs, in which case the @@ -39,13 +39,13 @@ class PromptyEvaluatorBase(EvaluatorBase): LLM_CALL_TIMEOUT = 600 DEFAULT_OPEN_API_VERSION = "2024-02-15-preview" - def __init__(self, *, result_key: str, prompty_file: str, model_config: Dict, eval_last_turn: bool = False): + def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False): self._result_key = result_key self._prompty_file = prompty_file super().__init__(eval_last_turn=eval_last_turn) prompty_model_config = construct_prompty_model_config( - model_config, + validate_model_config(model_config), self.DEFAULT_OPEN_API_VERSION, USER_AGENT, ) @@ -56,7 +56,7 @@ def __init__(self, *, result_key: str, prompty_file: str, model_config: Dict, ev # defining a default here. @override - async def _do_eval(self, eval_input: Dict) -> Dict: + async def _do_eval(self, eval_input: Dict) -> Dict[str, float]: """Do a relevance evaluation. :param eval_input: The input to the evaluator. Expected to contain diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py index 4267c05abb7d..c2c8cf49f093 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py @@ -1,18 +1,20 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- +from typing import Dict, Optional, Union -from typing import Dict, Optional from typing_extensions import override -from azure.core.credentials import TokenCredential -from azure.ai.evaluation._common.constants import EvaluationMetrics +from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service +from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import EvaluationException +from azure.core.credentials import TokenCredential + from . import EvaluatorBase -class RaiServiceEvaluatorBase(EvaluatorBase): +class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]): """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation. This includes content safety evaluators, protected material evaluators, and others. These evaluators are all assumed to be of the "query and response or conversation" input variety. @@ -30,14 +32,14 @@ class RaiServiceEvaluatorBase(EvaluatorBase): @override def __init__( self, - eval_metric: EvaluationMetrics, + eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics], azure_ai_project: dict, credential: TokenCredential, eval_last_turn: bool = False, ): super().__init__(eval_last_turn=eval_last_turn) self._eval_metric = eval_metric - self._azure_ai_project = azure_ai_project + self._azure_ai_project = validate_azure_ai_project(azure_ai_project) self._credential = credential @override @@ -47,7 +49,7 @@ def __call__( query: Optional[str] = None, response: Optional[str] = None, conversation: Optional[dict] = None, - **kwargs + **kwargs, ): """Evaluate either a query and response or a conversation. Must supply either a query AND response, or a conversation, but not both. @@ -61,12 +63,12 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[Dict] :return: The evaluation result. - :rtype: Dict + :rtype: Dict[str, Union[str, float]] """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) @override - async def _do_eval(self, eval_input: Dict): + async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: """Perform the evaluation using the Azure AI RAI service. The exact evaluation performed is determined by the evaluation metric supplied by the child class initializer. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index c1d4d2c1bc94..c7d13bd2892b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -2,19 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- from concurrent.futures import as_completed +from typing import Callable, Dict, List, Union from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor -try: - from ._hate_unfairness import HateUnfairnessEvaluator - from ._self_harm import SelfHarmEvaluator - from ._sexual import SexualEvaluator - from ._violence import ViolenceEvaluator -except ImportError: - from _hate_unfairness import HateUnfairnessEvaluator - from _self_harm import SelfHarmEvaluator - from _sexual import SexualEvaluator - from _violence import ViolenceEvaluator +from ._hate_unfairness import HateUnfairnessEvaluator +from ._self_harm import SelfHarmEvaluator +from ._sexual import SexualEvaluator +from ._violence import ViolenceEvaluator class ContentSafetyEvaluator: @@ -68,7 +63,7 @@ class ContentSafetyEvaluator: def __init__(self, credential, azure_ai_project: dict, parallel: bool = True): self._parallel = parallel - self._evaluators = [ + self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ ViolenceEvaluator(credential, azure_ai_project), SexualEvaluator(credential, azure_ai_project), SelfHarmEvaluator(credential, azure_ai_project), @@ -86,9 +81,9 @@ def __call__(self, *, query: str, response: str, **kwargs): :keyword parallel: Whether to evaluate in parallel. :paramtype parallel: bool :return: The scores for content-safety. - :rtype: dict + :rtype: Dict[str, Union[str, float]] """ - results = {} + results: Dict[str, Union[str, float]] = {} if self._parallel: with ThreadPoolExecutor() as executor: futures = { diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py index b4587ce01af3..d19c9ead4fca 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py @@ -4,27 +4,28 @@ import logging import math from concurrent.futures import as_completed -from typing import Dict, List +from typing import Callable, Dict, List, TypedDict, Union, cast from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor +from azure.ai.evaluation._common.constants import HarmSeverityLevel from azure.ai.evaluation._common.math import list_mean_nan_safe from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException -try: - from ._hate_unfairness import HateUnfairnessEvaluator - from ._self_harm import SelfHarmEvaluator - from ._sexual import SexualEvaluator - from ._violence import ViolenceEvaluator -except ImportError: - from _hate_unfairness import HateUnfairnessEvaluator - from _self_harm import SelfHarmEvaluator - from _sexual import SexualEvaluator - from _violence import ViolenceEvaluator +from ._hate_unfairness import HateUnfairnessEvaluator +from ._self_harm import SelfHarmEvaluator +from ._sexual import SexualEvaluator +from ._violence import ViolenceEvaluator logger = logging.getLogger(__name__) +class _EvaluationPerTurn(TypedDict): + severity: List[str] + score: List[float] + reason: List[str] + + class ContentSafetyChatEvaluator: """ Initialize a content safety chat evaluator configured to evaluate content safetry metrics for chat scenario. 
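
The composite content-safety evaluator keeps the fan-out/merge pattern shown in the hunk above: each sub-evaluator is submitted to a thread pool and its per-metric dict is merged into a single `Dict[str, Union[str, float]]`. A standalone sketch of that pattern follows (the stand-in evaluator functions and their scores are hypothetical, and the SDK uses promptflow's context-propagating `ThreadPoolExecutorWithContext` rather than the plain standard-library executor used here):

.. code-block:: python

    # Sketch of the parallel fan-out/merge used by ContentSafetyEvaluator.__call__,
    # with hypothetical stand-in sub-evaluators instead of the RAI-service-backed ones.
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import Callable, Dict, List, Union

    def fake_violence(*, query: str, response: str) -> Dict[str, Union[str, float]]:
        return {"violence": "Very low", "violence_score": 0.0}

    def fake_sexual(*, query: str, response: str) -> Dict[str, Union[str, float]]:
        return {"sexual": "Very low", "sexual_score": 1.0}

    evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [fake_violence, fake_sexual]

    results: Dict[str, Union[str, float]] = {}
    with ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(evaluator, query="What is 2+2?", response="4"): evaluator
            for evaluator in evaluators
        }
        for future in as_completed(futures):
            results.update(future.result())

    print(results)  # merged metric/score pairs from every sub-evaluator
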
@@ -88,24 +89,30 @@ class ContentSafetyChatEvaluator: } """ - def __init__(self, credential, azure_ai_project: dict, eval_last_turn: bool = False, parallel: bool = True): + def __init__( + self, + credential, + azure_ai_project: dict, + eval_last_turn: bool = False, + parallel: bool = True, + ): self._eval_last_turn = eval_last_turn self._parallel = parallel - self._evaluators = [ + self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ ViolenceEvaluator(azure_ai_project, credential), SexualEvaluator(azure_ai_project, credential), SelfHarmEvaluator(azure_ai_project, credential), HateUnfairnessEvaluator(azure_ai_project, credential), ] - def __call__(self, *, conversation, **kwargs): + def __call__(self, *, conversation: list, **kwargs): """ Evaluates content-safety metrics for "chat" scenario. :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys. :paramtype conversation: List[Dict] :return: The scores for Chat scenario. - :rtype: dict + :rtype: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] """ self._validate_conversation(conversation) @@ -142,7 +149,7 @@ def __call__(self, *, conversation, **kwargs): } for future in as_completed(future_to_evaluator): - result = future.result() + result: Dict[str, Union[str, float]] = future.result() current_turn_result.update(result) else: # Sequential execution @@ -155,7 +162,13 @@ def __call__(self, *, conversation, **kwargs): aggregated = self._aggregate_results(per_turn_results) return aggregated - def _evaluate_turn(self, turn_num, queries, responses, evaluator): + def _evaluate_turn( + self, + turn_num: int, + queries: List[str], + responses: List[str], + evaluator: Callable[..., Dict[str, Union[str, float]]], + ) -> Dict[str, Union[str, float]]: try: query = queries[turn_num] if turn_num < len(queries) else "" response = responses[turn_num] if turn_num < len(responses) else "" @@ -172,41 +185,48 @@ def _evaluate_turn(self, turn_num, queries, responses, evaluator): ) return {} - def _aggregate_results(self, per_turn_results: List[Dict]): - scores = {} - reasons = {} - levels = {} + def _aggregate_results( + self, per_turn_results: List[Dict[str, Union[str, float]]] + ) -> Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]: + scores: Dict[str, List[float]] = {} + reasons: Dict[str, List[str]] = {} + levels: Dict[str, List[str]] = {} for turn in per_turn_results: for metric, value in turn.items(): if "_score" in metric: if metric not in scores: scores[metric] = [] - scores[metric].append(value) + scores[metric].append(cast(float, value)) elif "_reason" in metric: if metric not in reasons: reasons[metric] = [] - reasons[metric].append(value) + reasons[metric].append(cast(str, value)) else: if metric not in levels: levels[metric] = [] - levels[metric].append(value) + levels[metric].append(cast(str, value)) - aggregated = {} - evaluation_per_turn = {} + aggregated: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] = {} + evaluation_per_turn: Dict[str, _EvaluationPerTurn] = {} for metric, values in levels.items(): score_key = f"{metric}_score" reason_key = f"{metric}_reason" aggregated_score = list_mean_nan_safe(scores[score_key]) - aggregated[metric] = self._get_harm_severity_level(aggregated_score) + harm_severity_level = self._get_harm_severity_level(aggregated_score) + aggregated[metric] = ( + harm_severity_level.value if isinstance(harm_severity_level, HarmSeverityLevel) else harm_severity_level + ) aggregated[score_key] = aggregated_score # Prepare 
per-turn evaluations - evaluation_per_turn[metric] = {"severity": values} - evaluation_per_turn[metric]["score"] = scores[score_key] - evaluation_per_turn[metric]["reason"] = reasons[reason_key] + evaluation_per_turn[metric] = { + "severity": values, + "score": scores[score_key], + "reason": reasons[reason_key], + } aggregated["evaluation_per_turn"] = evaluation_per_turn @@ -284,12 +304,12 @@ def _validate_conversation(self, conversation: List[Dict]): blame=ErrorBlame.USER_ERROR, ) - def _get_harm_severity_level(self, harm_score: float) -> str: + def _get_harm_severity_level(self, harm_score: float) -> Union[HarmSeverityLevel, float]: HARM_SEVERITY_LEVEL_MAPPING = { - "Very low": [0, 1], - "Low": [2, 3], - "Medium": [4, 5], - "High": [6, 7], + HarmSeverityLevel.VeryLow: (0, 1), + HarmSeverityLevel.Low: (2, 3), + HarmSeverityLevel.Medium: (4, 5), + HarmSeverityLevel.High: (6, 7), } if math.isnan(harm_score) or harm_score is None: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index cda53fa057a7..2c666f7e4ccf 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from typing_extensions import override + from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 2948c49e84e1..89cd6247502c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from typing_extensions import override + from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index a752a49b3c52..4e4aaef0d406 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- from typing_extensions import override + from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index 606c256750d9..8197a76c4e87 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from typing_extensions import override + from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index 59e3f616fbb0..e158544f582a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from typing_extensions import override + from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py index 83751461e3f2..dbd77a949b3e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py @@ -15,6 +15,16 @@ def __init__(self): pass async def __call__(self, *, response: str, ground_truth: str, **kwargs): + """ + Evaluate F1 score. + + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str + :return: The F1 score. 
+ :rtype: Dict[str, float] + """ # Validate inputs if not (response and response.strip() and response != "None") or not ( ground_truth and ground_truth.strip() and ground_truth != "None" @@ -34,7 +44,7 @@ async def __call__(self, *, response: str, ground_truth: str, **kwargs): return {"f1_score": f1_result} @classmethod - def _compute_f1_score(cls, response: str, ground_truth: str) -> str: + def _compute_f1_score(cls, response: str, ground_truth: str) -> float: import re import string @@ -76,11 +86,9 @@ def lower(text): return white_space_fix(remove_articles(remove_punctuation(lower(text)))) - prediction_tokens = normalize_text(response) - reference_tokens = normalize_text(ground_truth) tokenizer = QASplitTokenizer() - prediction_tokens = tokenizer(prediction_tokens) - reference_tokens = tokenizer(reference_tokens) + prediction_tokens = tokenizer(normalize_text(response)) + reference_tokens = tokenizer(normalize_text(ground_truth)) common_tokens = Counter(prediction_tokens) & Counter(reference_tokens) num_common_tokens = sum(common_tokens.values()) @@ -131,7 +139,7 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs): :keyword ground_truth: The ground truth to be evaluated. :paramtype ground_truth: str :return: The F1 score. - :rtype: dict + :rtype: Dict[str, float] """ return async_run_allowing_running_loop( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 928b780522e1..22f472c2756c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -4,6 +4,7 @@ import os from typing import Optional + from typing_extensions import override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -51,7 +52,7 @@ def __call__( query: Optional[str] = None, response: Optional[str] = None, conversation: Optional[dict] = None, - **kwargs + **kwargs, ): """ Evaluate fluency. Accepts either a query and response for a single evaluation, @@ -67,6 +68,6 @@ def __call__( to be dictionaries with keys "content" and "role". :paramtype conversation: Optional[Dict] :return: The fluency score. - :rtype: dict + :rtype: Dict[str, float] """ return super().__call__(query=query, response=response, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 8625ae94efab..16482e514f18 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -3,6 +3,7 @@ # --------------------------------------------------------- import os from typing import Optional + from typing_extensions import override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -51,7 +52,7 @@ def __call__( response: Optional[str] = None, context: Optional[str] = None, conversation: Optional[dict] = None, - **kwargs + **kwargs, ): """Evaluate groundedless. Accepts either a response and context a single evaluation, or a conversation for a multi-turn evaluation. 
If the conversation has more than one turn, @@ -66,6 +67,6 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[Dict] :return: The relevance score. - :rtype: dict + :rtype: Dict[str, float] """ return super().__call__(response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 6035e5bc67c9..98c3e44bbc18 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- from typing_extensions import override + from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py index fe4f9c7cfd84..519e351cf2f2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py @@ -1,15 +1,20 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- +from typing import cast + from promptflow._utils.async_utils import async_run_allowing_running_loop from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service +from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from azure.ai.evaluation._model_configurations import AzureAIProject +from azure.core.credentials import TokenCredential class _AsyncProtectedMaterialsEvaluator: - def __init__(self, azure_ai_project: dict, credential=None): + def __init__(self, azure_ai_project: AzureAIProject, credential: TokenCredential): self._azure_ai_project = azure_ai_project self._credential = credential @@ -85,7 +90,9 @@ class ProtectedMaterialsEvaluator: """ def __init__(self, credential, azure_ai_project: dict): - self._async_evaluator = _AsyncProtectedMaterialsEvaluator(azure_ai_project, credential) + self._async_evaluator = _AsyncProtectedMaterialsEvaluator( + validate_azure_ai_project(azure_ai_project), cast(TokenCredential, credential) + ) def __call__(self, *, query: str, response: str, **kwargs): """ diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index 59dcb6758ba1..e33df7ae20ea 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -3,6 +3,7 @@ # --------------------------------------------------------- from concurrent.futures import as_completed +from typing import Callable, Dict, List from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor @@ -53,7 +54,7 @@ class QAEvaluator: def __init__(self, model_config: dict, parallel: bool = True): self._parallel = parallel - self._evaluators = [ + self._evaluators: List[Callable[..., Dict[str, float]]] = [ GroundednessEvaluator(model_config), RelevanceEvaluator(model_config), CoherenceEvaluator(model_config), @@ -77,9 +78,9 @@ def __call__(self, *, query: str, response: str, context: str, ground_truth: str :keyword parallel: Whether to evaluate in parallel. Defaults to True. :paramtype parallel: bool :return: The scores for QA scenario. - :rtype: dict + :rtype: Dict[str, float] """ - results = {} + results: Dict[str, float] = {} if self._parallel: with ThreadPoolExecutor() as executor: futures = { diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index bf1a060cafaf..b0374f9ed777 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -4,6 +4,7 @@ import os from typing import Optional + from typing_extensions import override from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase @@ -55,7 +56,7 @@ def __call__( response: Optional[str] = None, context: Optional[str] = None, conversation: Optional[dict] = None, - **kwargs + **kwargs, ): """Evaluate relevance. Accepts either a response and context a single evaluation, or a conversation for a multi-turn evaluation. 
If the conversation has more than one turn, @@ -72,6 +73,6 @@ def __call__( to be dictionaries with keys "content", "role", and possibly "context". :paramtype conversation: Optional[Dict] :return: The relevance score. - :rtype: dict + :rtype: Dict[str, float] """ return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index cd8620d17874..5766f25a88ef 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -7,19 +7,22 @@ import math import os import re +from typing import Union from promptflow._utils.async_utils import async_run_allowing_running_loop from promptflow.core import AsyncPrompty +from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration + from ..._common.math import list_mean_nan_safe -from ..._common.utils import construct_prompty_model_config +from ..._common.utils import construct_prompty_model_config, validate_model_config logger = logging.getLogger(__name__) try: from .._user_agent import USER_AGENT except ImportError: - USER_AGENT = None + USER_AGENT = "None" class _AsyncRetrievalScoreEvaluator: @@ -28,7 +31,7 @@ class _AsyncRetrievalScoreEvaluator: LLM_CALL_TIMEOUT = 600 DEFAULT_OPEN_API_VERSION = "2024-02-15-preview" - def __init__(self, model_config: dict): + def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]): prompty_model_config = construct_prompty_model_config( model_config, self.DEFAULT_OPEN_API_VERSION, @@ -135,7 +138,7 @@ class RetrievalEvaluator: """ def __init__(self, model_config: dict): - self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config) + self._async_evaluator = _AsyncRetrievalScoreEvaluator(validate_model_config(model_config)) def __call__(self, *, conversation, **kwargs): """Evaluates retrieval score chat scenario. 
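
Both the retrieval and similarity evaluators now normalize their `model_config` through `validate_model_config` before building the prompty model config, so callers can still pass a plain dict. A hedged usage sketch follows; the module path is taken from the diff header above, `azure_endpoint` and `api_key` are assumed key names (only `azure_deployment` and `api_version` are visible in these hunks), and all values are placeholders:

.. code-block:: python

    # Usage sketch, assuming the private module path from the diff header is importable
    # and that azure_endpoint/api_key are the remaining AzureOpenAIModelConfiguration
    # keys (not shown in these hunks). Values are placeholders.
    from azure.ai.evaluation._evaluators._retrieval._retrieval import RetrievalEvaluator

    model_config = {
        "azure_deployment": "gpt-4o",                                  # deployment name (placeholder)
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # assumed key name
        "api_key": "<api-key>",                                        # assumed key name
        # "type": "azure_openai" may be omitted now that the field is NotRequired
    }

    retrieval = RetrievalEvaluator(model_config)  # validate_model_config narrows the dict
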
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py index 8d96ac33f1d4..2c764303fff3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py @@ -5,18 +5,20 @@ import math import os import re +from typing import Union from promptflow._utils.async_utils import async_run_allowing_running_loop from promptflow.core import AsyncPrompty from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException +from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration -from ..._common.utils import construct_prompty_model_config +from ..._common.utils import construct_prompty_model_config, validate_model_config try: from ..._user_agent import USER_AGENT except ImportError: - USER_AGENT = None + USER_AGENT = "None" class _AsyncSimilarityEvaluator: @@ -25,7 +27,7 @@ class _AsyncSimilarityEvaluator: LLM_CALL_TIMEOUT = 600 DEFAULT_OPEN_API_VERSION = "2024-02-15-preview" - def __init__(self, model_config: dict): + def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]): prompty_model_config = construct_prompty_model_config( model_config, self.DEFAULT_OPEN_API_VERSION, @@ -37,6 +39,18 @@ def __init__(self, model_config: dict): self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config) async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs): + """ + Evaluate similarity. + + :keyword query: The query to be evaluated. + :paramtype query: str + :keyword response: The response to be evaluated. + :paramtype response: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str + :return: The similarity score. + :rtype: Dict[str, float] + """ # Validate input parameters query = str(query or "") response = str(response or "") @@ -94,7 +108,7 @@ class SimilarityEvaluator: """ def __init__(self, model_config: dict): - self._async_evaluator = _AsyncSimilarityEvaluator(model_config) + self._async_evaluator = _AsyncSimilarityEvaluator(validate_model_config(model_config)) def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs): """ @@ -107,7 +121,7 @@ def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs): :keyword ground_truth: The ground truth to be evaluated. :paramtype ground_truth: str :return: The similarity score. 
- :rtype: dict + :rtype: Dict[str, float] """ return async_run_allowing_running_loop( self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index ea5a12868f04..eeaf6d3fb9b1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -4,10 +4,10 @@ import logging from typing_extensions import override + from azure.ai.evaluation._common.constants import EvaluationMetrics from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase - logger = logging.getLogger(__name__) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_http_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_http_utils.py index d0559e52dccf..197bca40078d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_http_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_http_utils.py @@ -3,10 +3,9 @@ # --------------------------------------------------------- -from functools import wraps -from typing import Any, Awaitable, Callable, Dict, MutableMapping, Optional +from typing import Any, Dict, MutableMapping, Optional, TypedDict, cast -from typing_extensions import Self +from typing_extensions import Self, Unpack from azure.ai.evaluation._user_agent import USER_AGENT from azure.core.configuration import Configuration @@ -33,78 +32,25 @@ from azure.core.rest._rest_py3 import ContentType, FilesType, ParamsType -def _request_fn(f: Callable[["HttpPipeline"], None]): - """Decorator to generate convenience methods for HTTP method. +class RequestKwargs(TypedDict, total=False): + """Keyword arguments for request-style http request functions - :param Callable[["HttpPipeline"],None] f: A HttpPipeline classmethod to wrap. - The f.__name__ is the HTTP method used - :return: A wrapped callable that sends a `f.__name__` request - :rtype: Callable - """ + .. note:: - @wraps(f) - def request_fn( - self: "HttpPipeline", - url: str, - *, - params: Optional[ParamsType] = None, - headers: Optional[MutableMapping[str, str]] = None, - json: Any = None, - content: Optional[ContentType] = None, - data: Optional[Dict[str, Any]] = None, - files: Optional[FilesType] = None, - **kwargs, - ) -> HttpResponse: - return self.request( - f.__name__.upper(), - url, - params=params, - headers=headers, - json=json, - content=content, - data=data, - files=files, - **kwargs, - ) + Ideally, we'd be able to express that these are the known subset of kwargs, but it's possible to provide + others. But that currently isn't possible; there's no way currently to express a TypedDict that expects + a known set of keys and an unknown set of keys. - return request_fn - - -def _async_request_fn(f: Callable[["AsyncHttpPipeline"], Awaitable[None]]): - """Decorator to generate convenience methods for HTTP method. - - :param Callable[["HttpPipeline"],None] f: A HttpPipeline classmethod to wrap. - The f.__name__ is the HTTP method used - :return: A wrapped callable that sends a `f.__name__` request - :rtype: Callable + PEP 728 - TypedDict with Typed Extra Items (https://peps.python.org/pep-0728/) would rectify this but it's + still in Draft status. 
""" - @wraps(f) - async def request_fn( - self: "AsyncHttpPipeline", - url: str, - *, - params: Optional[ParamsType] = None, - headers: Optional[MutableMapping[str, str]] = None, - json: Any = None, - content: Optional[ContentType] = None, - data: Optional[Dict[str, Any]] = None, - files: Optional[FilesType] = None, - **kwargs, - ) -> AsyncHttpResponse: - return await self.request( - f.__name__.upper(), - url, - params=params, - headers=headers, - json=json, - content=content, - data=data, - files=files, - **kwargs, - ) - - return request_fn + params: ParamsType + headers: MutableMapping[str, str] + json: Any + content: ContentType + data: Dict[str, Any] + files: FilesType class HttpPipeline(Pipeline): @@ -145,14 +91,32 @@ def __init__( :param RedirectPolicy redirect_policy: """ config = config or Configuration() - config.headers_policy = headers_policy or config.headers_policy or HeadersPolicy(**kwargs) - config.proxy_policy = proxy_policy or config.proxy_policy or ProxyPolicy(**kwargs) - config.redirect_policy = redirect_policy or config.redirect_policy or RedirectPolicy(**kwargs) - config.retry_policy = retry_policy or config.retry_policy or RetryPolicy(**kwargs) - config.custom_hook_policy = custom_hook_policy or config.custom_hook_policy or CustomHookPolicy(**kwargs) - config.logging_policy = logging_policy or config.logging_policy or NetworkTraceLoggingPolicy(**kwargs) - config.http_logging_policy = http_logging_policy or config.http_logging_policy or HttpLoggingPolicy(**kwargs) - config.user_agent_policy = user_agent_policy or config.user_agent_policy or UserAgentPolicy(**kwargs) + config.headers_policy = ( + headers_policy or cast(Optional[HeadersPolicy], config.headers_policy) or HeadersPolicy(**kwargs) + ) + config.proxy_policy = proxy_policy or cast(Optional[ProxyPolicy], config.proxy_policy) or ProxyPolicy(**kwargs) + config.redirect_policy = ( + redirect_policy or cast(Optional[RedirectPolicy], config.redirect_policy) or RedirectPolicy(**kwargs) + ) + config.retry_policy = retry_policy or cast(Optional[RetryPolicy], config.retry_policy) or RetryPolicy(**kwargs) + config.custom_hook_policy = ( + custom_hook_policy + or cast(Optional[CustomHookPolicy], config.custom_hook_policy) + or CustomHookPolicy(**kwargs) + ) + config.logging_policy = ( + logging_policy + or cast(Optional[NetworkTraceLoggingPolicy], config.logging_policy) + or NetworkTraceLoggingPolicy(**kwargs) + ) + config.http_logging_policy = ( + http_logging_policy + or cast(Optional[HttpLoggingPolicy], config.http_logging_policy) + or HttpLoggingPolicy(**kwargs) + ) + config.user_agent_policy = ( + user_agent_policy or cast(Optional[UserAgentPolicy], config.user_agent_policy) or UserAgentPolicy(**kwargs) + ) config.polling_interval = kwargs.get("polling_interval", 30) super().__init__( @@ -166,7 +130,6 @@ def __init__( config.proxy_policy, config.redirect_policy, config.retry_policy, - config.authentication_policy, config.custom_hook_policy, config.logging_policy, ], @@ -199,7 +162,6 @@ def request( files: Optional[FilesType] = None, **kwargs, ) -> HttpResponse: - request = HttpRequest( method, url, @@ -213,33 +175,78 @@ def request( return self.run(request, **kwargs).http_response - @_request_fn - def delete(self) -> None: - """Send a DELETE request.""" + def delete(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a DELETE request. 
- @_request_fn - def put(self) -> None: - """Send a PUT request.""" + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ - @_request_fn - def get(self) -> None: - """Send a GET request.""" + return self.request(self.delete.__name__.upper(), url, **kwargs) - @_request_fn - def post(self) -> None: - """Send a POST request.""" + def put(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a PUT request. - @_request_fn - def head(self) -> None: - """Send a HEAD request.""" + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ - @_request_fn - def options(self) -> None: - """Send a OPTIONS request.""" + return self.request(self.put.__name__.upper(), url, **kwargs) - @_request_fn - def patch(self) -> None: - """Send a PATCH request.""" + def get(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a GET request. + + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ + + return self.request(self.get.__name__.upper(), url, **kwargs) + + def post(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a POST request. + + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ + + return self.request(self.post.__name__.upper(), url, **kwargs) + + def head(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a HEAD request. + + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ + + return self.request(self.head.__name__.upper(), url, **kwargs) + + def options(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a OPTIONS request. + + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ + + return self.request(self.options.__name__.upper(), url, **kwargs) + + def patch(self: "HttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> HttpResponse: + """Send a PATCH request. 
+ + :param str url: The request url + :returns: The request response + :rtype: HttpResponse + """ + + return self.request(self.patch.__name__.upper(), url, **kwargs) + + def __enter__(self) -> Self: + return cast(Self, super().__enter__()) class AsyncHttpPipeline(AsyncPipeline): @@ -280,14 +287,36 @@ def __init__( :param AsyncRedirectPolicy redirect_policy: """ config = config or Configuration() - config.headers_policy = headers_policy or config.headers_policy or HeadersPolicy(**kwargs) - config.proxy_policy = proxy_policy or config.proxy_policy or ProxyPolicy(**kwargs) - config.redirect_policy = redirect_policy or config.redirect_policy or AsyncRedirectPolicy(**kwargs) - config.retry_policy = retry_policy or config.retry_policy or AsyncRetryPolicy(**kwargs) - config.custom_hook_policy = custom_hook_policy or config.custom_hook_policy or CustomHookPolicy(**kwargs) - config.logging_policy = logging_policy or config.logging_policy or NetworkTraceLoggingPolicy(**kwargs) - config.http_logging_policy = http_logging_policy or config.http_logging_policy or HttpLoggingPolicy(**kwargs) - config.user_agent_policy = user_agent_policy or config.user_agent_policy or UserAgentPolicy(**kwargs) + config.headers_policy = ( + headers_policy or cast(Optional[HeadersPolicy], config.headers_policy) or HeadersPolicy(**kwargs) + ) + config.proxy_policy = proxy_policy or cast(Optional[ProxyPolicy], config.proxy_policy) or ProxyPolicy(**kwargs) + config.redirect_policy = ( + redirect_policy + or cast(Optional[AsyncRedirectPolicy], config.redirect_policy) + or AsyncRedirectPolicy(**kwargs) + ) + config.retry_policy = ( + retry_policy or cast(Optional[AsyncRetryPolicy], config.retry_policy) or AsyncRetryPolicy(**kwargs) + ) + config.custom_hook_policy = ( + custom_hook_policy + or cast(Optional[CustomHookPolicy], config.custom_hook_policy) + or CustomHookPolicy(**kwargs) + ) + config.logging_policy = ( + logging_policy + or cast(Optional[NetworkTraceLoggingPolicy], config.logging_policy) + or NetworkTraceLoggingPolicy(**kwargs) + ) + config.http_logging_policy = ( + http_logging_policy + or cast(Optional[HttpLoggingPolicy], config.http_logging_policy) + or HttpLoggingPolicy(**kwargs) + ) + config.user_agent_policy = ( + user_agent_policy or cast(Optional[UserAgentPolicy], config.user_agent_policy) or UserAgentPolicy(**kwargs) + ) config.polling_interval = kwargs.get("polling_interval", 30) super().__init__( @@ -301,7 +330,6 @@ def __init__( config.proxy_policy, config.redirect_policy, config.retry_policy, - config.authentication_policy, config.custom_hook_policy, config.logging_policy, ], @@ -334,7 +362,6 @@ async def request( files: Optional[FilesType] = None, **kwargs, ) -> AsyncHttpResponse: - request = HttpRequest( method, url, @@ -348,33 +375,77 @@ async def request( return (await self.run(request, **kwargs)).http_response - @_async_request_fn - async def delete(self) -> None: - """Send a DELETE request.""" + async def delete(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a DELETE request. + + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ + return await self.request(self.delete.__name__.upper(), url, **kwargs) + + async def put(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a PUT request. 
+ + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ + + return await self.request(self.put.__name__.upper(), url, **kwargs) + + async def get(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a GET request. + + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ + + return await self.request(self.get.__name__.upper(), url, **kwargs) + + async def post(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a POST request. + + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ + + return await self.request(self.post.__name__.upper(), url, **kwargs) + + async def head(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a HEAD request. - @_async_request_fn - async def put(self) -> None: - """Send a PUT request.""" + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ + + return await self.request(self.head.__name__.upper(), url, **kwargs) + + async def options(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a OPTIONS request. - @_async_request_fn - async def get(self) -> None: - """Send a GET request.""" + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ + + return await self.request(self.options.__name__.upper(), url, **kwargs) - @_async_request_fn - async def post(self) -> None: - """Send a POST request.""" + async def patch(self: "AsyncHttpPipeline", url: str, **kwargs: Unpack[RequestKwargs]) -> AsyncHttpResponse: + """Send a PATCH request. - @_async_request_fn - async def head(self) -> None: - """Send a HEAD request.""" + :param str url: The request url + :returns: The request response + :rtype: AsyncHttpResponse + """ - @_async_request_fn - async def options(self) -> None: - """Send a OPTIONS request.""" + return await self.request(self.patch.__name__.upper(), url, **kwargs) - @_async_request_fn - async def patch(self) -> None: - """Send a PATCH request.""" + async def __aenter__(self) -> Self: + return cast(Self, await super().__aenter__()) def get_http_client() -> HttpPipeline: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py index e0885d4cbc19..43114d3605c3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py @@ -7,10 +7,10 @@ from typing_extensions import NotRequired -class AzureOpenAIModelConfiguration(TypedDict, total=False): +class AzureOpenAIModelConfiguration(TypedDict): """Model Configuration for Azure OpenAI Model""" - type: Literal["azure_openai"] + type: NotRequired[Literal["azure_openai"]] """The type of the model configuration. 
Should be 'azure_openai' for AzureOpenAIModelConfiguration""" azure_deployment: str """Name of Azure OpenAI deployment to make request to""" @@ -22,10 +22,10 @@ class AzureOpenAIModelConfiguration(TypedDict, total=False): """(Optional) API version to use in request to Azure OpenAI deployment""" -class OpenAIModelConfiguration(TypedDict, total=False): +class OpenAIModelConfiguration(TypedDict): """Model Configuration for OpenAI Model""" - type: Literal["openai"] + type: NotRequired[Literal["openai"]] """The type of the model configuration. Should be 'openai' for OpenAIModelConfiguration""" api_key: str "API key needed to make request to model" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py index 2cc511c3e35a..f4149bbe4945 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py @@ -6,21 +6,22 @@ import asyncio import logging import random -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast from tqdm import tqdm +from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._http_utils import get_async_http_client -from azure.ai.evaluation._model_configurations import AzureAIProject from azure.ai.evaluation.simulator import AdversarialScenario from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario +from azure.core.credentials import TokenCredential from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode -from azure.identity import DefaultAzureCredential from ._constants import SupportedLanguages -from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole +from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole, ConversationTurn from ._conversation._conversation import simulate_conversation +from ._helpers import experimental from ._model_tools import ( AdversarialTemplateHandler, ManagedIdentityAPITokenManager, @@ -28,8 +29,8 @@ RAIClient, TokenScope, ) +from ._model_tools._template_handler import AdversarialTemplate, TemplateParameters from ._utils import JsonLineList -from ._helpers import experimental logger = logging.getLogger(__name__) @@ -46,41 +47,28 @@ class AdversarialSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: AzureAIProject, credential=None): + def __init__(self, *, azure_ai_project: dict, credential): """Constructor.""" - # check if azure_ai_project has the keys: subscription_id, resource_group_name and project_name - if not all(key in azure_ai_project for key in ["subscription_id", "resource_group_name", "project_name"]): - msg = "azure_ai_project must contain keys: subscription_id, resource_group_name, project_name" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.ADVERSARIAL_SIMULATOR, - category=ErrorCategory.MISSING_FIELD, - blame=ErrorBlame.USER_ERROR, - ) - # check the value of the keys in azure_ai_project is not none - if not all(azure_ai_project[key] for key in ["subscription_id", "resource_group_name", "project_name"]): - msg = "subscription_id, 
resource_group_name and project_name cannot be None" + + try: + self.azure_ai_project = validate_azure_ai_project(azure_ai_project) + except EvaluationException as e: raise EvaluationException( - message=msg, - internal_message=msg, + message=e.message, + internal_message=e.internal_message, target=ErrorTarget.ADVERSARIAL_SIMULATOR, - category=ErrorCategory.MISSING_FIELD, - blame=ErrorBlame.USER_ERROR, - ) - if "credential" not in azure_ai_project and not credential: - credential = DefaultAzureCredential() - elif "credential" in azure_ai_project: - credential = azure_ai_project["credential"] - self.azure_ai_project = azure_ai_project + category=e.category, + blame=e.blame, + ) from e + self.token_manager = ManagedIdentityAPITokenManager( token_scope=TokenScope.DEFAULT_AZURE_MANAGEMENT, logger=logging.getLogger("AdversarialSimulator"), - credential=credential, + credential=cast(TokenCredential, credential), ) - self.rai_client = RAIClient(azure_ai_project=azure_ai_project, token_manager=self.token_manager) + self.rai_client = RAIClient(azure_ai_project=self.azure_ai_project, token_manager=self.token_manager) self.adversarial_template_handler = AdversarialTemplateHandler( - azure_ai_project=azure_ai_project, rai_client=self.rai_client + azure_ai_project=self.azure_ai_project, rai_client=self.rai_client ) def _ensure_service_dependencies(self): @@ -266,16 +254,21 @@ async def __call__( return JsonLineList(sim_results) - def _to_chat_protocol(self, *, conversation_history, template_parameters: Dict = None): + def _to_chat_protocol( + self, + *, + conversation_history: List[ConversationTurn], + template_parameters: Optional[Dict[str, Union[str, Dict[str, str]]]] = None, + ): if template_parameters is None: template_parameters = {} messages = [] for _, m in enumerate(conversation_history): message = {"content": m.message, "role": m.role.value} - if "context" in m.full_response: + if m.full_response is not None and "context" in m.full_response: message["context"] = m.full_response["context"] messages.append(message) - conversation_category = template_parameters.pop("metadata", {}).get("Category") + conversation_category = cast(Dict[str, str], template_parameters.pop("metadata", {})).get("Category") template_parameters["metadata"] = {} for key in ( "conversation_starter", @@ -297,14 +290,14 @@ async def _simulate_async( self, *, target: Callable, - template, - parameters, - max_conversation_turns, - api_call_retry_limit, - api_call_retry_sleep_sec, - api_call_delay_sec, - language, - semaphore, + template: AdversarialTemplate, + parameters: TemplateParameters, + max_conversation_turns: int, + api_call_retry_limit: int, + api_call_retry_sleep_sec: int, + api_call_delay_sec: int, + language: SupportedLanguages, + semaphore: asyncio.Semaphore, ) -> List[Dict]: user_bot = self._setup_bot(role=ConversationRole.USER, template=template, parameters=parameters) system_bot = self._setup_bot( @@ -327,9 +320,15 @@ async def _simulate_async( api_call_delay_sec=api_call_delay_sec, language=language, ) - return self._to_chat_protocol(conversation_history=conversation_history, template_parameters=parameters) - def _get_user_proxy_completion_model(self, template_key, template_parameters): + return self._to_chat_protocol( + conversation_history=conversation_history, + template_parameters=cast(Dict[str, Union[str, Dict[str, str]]], parameters), + ) + + def _get_user_proxy_completion_model( + self, template_key: str, template_parameters: TemplateParameters + ) -> ProxyChatCompletionsModel: return 
ProxyChatCompletionsModel( name="raisvc_proxy_model", template_key=template_key, @@ -341,8 +340,15 @@ def _get_user_proxy_completion_model(self, template_key, template_parameters): temperature=0.0, ) - def _setup_bot(self, *, role, template, parameters, target: Callable = None): - if role == ConversationRole.USER: + def _setup_bot( + self, + *, + role: ConversationRole, + template: AdversarialTemplate, + parameters: TemplateParameters, + target: Optional[Callable] = None, + ) -> ConversationBot: + if role is ConversationRole.USER: model = self._get_user_proxy_completion_model( template_key=template.template_name, template_parameters=parameters ) @@ -353,30 +359,46 @@ def _setup_bot(self, *, role, template, parameters, target: Callable = None): instantiation_parameters=parameters, ) - if role == ConversationRole.ASSISTANT: + if role is ConversationRole.ASSISTANT: + if target is None: + msg = "Cannot setup system bot. Target is None" - def dummy_model() -> None: - return None + raise EvaluationException( + message=msg, + internal_message=msg, + target=ErrorTarget.ADVERSARIAL_SIMULATOR, + error_category=ErrorCategory.INVALID_VALUE, + blame=ErrorBlame.SYSTEM_ERROR, + ) + + class DummyModel: + def __init__(self): + self.name = "dummy_model" + + def __call__(self) -> None: + pass - dummy_model.name = "dummy_model" return CallbackConversationBot( callback=target, role=role, - model=dummy_model, + model=DummyModel(), user_template=str(template), user_template_parameters=parameters, conversation_template="", instantiation_parameters={}, ) - return ConversationBot( - role=role, - model=model, - conversation_template=template, - instantiation_parameters=parameters, + + msg = "Invalid value for enum ConversationRole. This should never happen." + raise EvaluationException( + message=msg, + internal_message=msg, + target=ErrorTarget.ADVERSARIAL_SIMULATOR, + category=ErrorCategory.INVALID_VALUE, + blame=ErrorBlame.SYSTEM_ERROR, ) - def _join_conversation_starter(self, parameters, to_join): - key = "conversation_starter" + def _join_conversation_starter(self, parameters: TemplateParameters, to_join: str) -> TemplateParameters: + key: Literal["conversation_starter"] = "conversation_starter" if key in parameters.keys(): parameters[key] = f"{to_join} {parameters[key]}" else: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/__init__.py index 9f512d1001a5..6f044849484f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/__init__.py @@ -7,7 +7,7 @@ import logging import time from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast import jinja2 @@ -15,6 +15,7 @@ from azure.ai.evaluation._http_utils import AsyncHttpPipeline from .._model_tools import LLMBase, OpenAIChatCompletionsModel +from .._model_tools._template_handler import TemplateParameters from .constants import ConversationRole @@ -40,7 +41,7 @@ class ConversationTurn: role: "ConversationRole" name: Optional[str] = None message: str = "" - full_response: Optional[Any] = None + full_response: Optional[Dict[str, Any]] = None request: Optional[Any] = None def to_openai_chat_format(self, reverse: bool = False) -> Dict[str, str]: @@ -109,7 +110,7 @@ def __init__( 
role: ConversationRole, model: Union[LLMBase, OpenAIChatCompletionsModel], conversation_template: str, - instantiation_parameters: Dict[str, str], + instantiation_parameters: TemplateParameters, ) -> None: self.role = role self.conversation_template_orig = conversation_template @@ -118,13 +119,13 @@ def __init__( ) self.persona_template_args = instantiation_parameters if self.role == ConversationRole.USER: - self.name = self.persona_template_args.get("name", role.value) + self.name: str = cast(str, self.persona_template_args.get("name", role.value)) else: - self.name = self.persona_template_args.get("chatbot_name", role.value) or model.name + self.name = cast(str, self.persona_template_args.get("chatbot_name", role.value)) or model.name self.model = model self.logger = logging.getLogger(repr(self)) - self.conversation_starter = None # can either be a dictionary or jinja template + self.conversation_starter: Optional[Union[str, jinja2.Template, Dict]] = None if role == ConversationRole.USER: if "conversation_starter" in self.persona_template_args: conversation_starter_content = self.persona_template_args["conversation_starter"] @@ -148,7 +149,7 @@ async def generate_response( conversation_history: List[ConversationTurn], max_history: int, turn_number: int = 0, - ) -> Tuple[dict, dict, int, dict]: + ) -> Tuple[dict, dict, float, dict]: """ Prompt the ConversationBot for a response. @@ -161,7 +162,7 @@ async def generate_response( :param turn_number: Parameters used to query GPT-4 model. :type turn_number: int :return: The response from the ConversationBot. - :rtype: Tuple[dict, dict, int, dict] + :rtype: Tuple[dict, dict, float, dict] """ # check if this is the first turn and the conversation_starter is not None, @@ -169,11 +170,11 @@ async def generate_response( if turn_number == 0 and self.conversation_starter is not None: # if conversation_starter is a dictionary, pass it into samples as is if isinstance(self.conversation_starter, dict): - samples = [self.conversation_starter] + samples: List[Union[str, jinja2.Template, Dict]] = [self.conversation_starter] if isinstance(self.conversation_starter, jinja2.Template): samples = [self.conversation_starter.render(**self.persona_template_args)] else: - samples = [self.conversation_starter] # type: ignore[attr-defined] + samples = [self.conversation_starter] time_taken = 0 finish_reason = ["stop"] @@ -238,7 +239,7 @@ def __init__( self, callback: Callable, user_template: str, - user_template_parameters: Dict, + user_template_parameters: TemplateParameters, *args, **kwargs, ) -> None: @@ -254,7 +255,7 @@ async def generate_response( conversation_history: List[Any], max_history: int, turn_number: int = 0, - ) -> Tuple[dict, dict, int, dict]: + ) -> Tuple[dict, dict, float, dict]: chat_protocol_message = self._to_chat_protocol( self.user_template, conversation_history, self.user_template_parameters ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/_conversation.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/_conversation.py index 4baa7d467476..f40ca43b577f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/_conversation.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_conversation/_conversation.py @@ -4,7 +4,7 @@ import asyncio import logging -from typing import Callable, Dict, List, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union from azure.ai.evaluation._exceptions import 
ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation.simulator._constants import SupportedLanguages @@ -80,7 +80,7 @@ async def simulate_conversation( history_limit: int = 5, api_call_delay_sec: float = 0, logger: logging.Logger = logging.getLogger(__name__), -) -> Tuple: +) -> Tuple[Optional[str], List[ConversationTurn]]: """ Simulate a conversation between the given bots. @@ -99,7 +99,7 @@ async def simulate_conversation( :keyword logger: The logger to use for logging. Defaults to the logger named after the current module. :paramtype logger: logging.Logger :return: Simulation a conversation between the given bots. - :rtype: Tuple + :rtype: Tuple[Optional[str], List[ConversationTurn]] """ # Read the first prompt. @@ -110,7 +110,7 @@ async def simulate_conversation( turn_number=0, ) if "id" in first_response: - conversation_id = first_response["id"] + conversation_id: Optional[str] = first_response["id"] else: conversation_id = None first_prompt = first_response["samples"][0] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py index a7f6b5a715c2..0db506b56504 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py @@ -5,16 +5,16 @@ # noqa: E501 import logging from random import randint -from typing import Callable, Optional +from typing import Callable, Optional, cast +from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException -from azure.ai.evaluation._model_configurations import AzureAIProject from azure.ai.evaluation.simulator import AdversarialScenario -from azure.identity import DefaultAzureCredential +from azure.core.credentials import TokenCredential from ._adversarial_simulator import AdversarialSimulator -from ._model_tools import AdversarialTemplateHandler, ManagedIdentityAPITokenManager, RAIClient, TokenScope from ._helpers import experimental +from ._model_tools import AdversarialTemplateHandler, ManagedIdentityAPITokenManager, RAIClient, TokenScope logger = logging.getLogger(__name__) @@ -32,42 +32,28 @@ class DirectAttackSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: AzureAIProject, credential=None): + def __init__(self, *, azure_ai_project: dict, credential): """Constructor.""" - # check if azure_ai_project has the keys: subscription_id, resource_group_name, project_name, credential - if not all(key in azure_ai_project for key in ["subscription_id", "resource_group_name", "project_name"]): - msg = "azure_ai_project must contain keys: subscription_id, resource_group_name and project_name" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, - category=ErrorCategory.MISSING_FIELD, - blame=ErrorBlame.USER_ERROR, - ) - # check the value of the keys in azure_ai_project is not none - if not all(azure_ai_project[key] for key in ["subscription_id", "resource_group_name", "project_name"]): - msg = "subscription_id, resource_group_name and project_name keys cannot be None" + + try: + self.azure_ai_project = validate_azure_ai_project(azure_ai_project) + except EvaluationException as e: raise EvaluationException( - 
message=msg, - internal_message=msg, + message=e.message, + internal_message=e.internal_message, target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, - category=ErrorCategory.MISSING_FIELD, - blame=ErrorBlame.USER_ERROR, - ) - if "credential" not in azure_ai_project and not credential: - credential = DefaultAzureCredential() - elif "credential" in azure_ai_project: - credential = azure_ai_project["credential"] - self.credential = credential - self.azure_ai_project = azure_ai_project + category=e.category, + blame=e.blame, + ) from e + self.credential = cast(TokenCredential, credential) self.token_manager = ManagedIdentityAPITokenManager( token_scope=TokenScope.DEFAULT_AZURE_MANAGEMENT, logger=logging.getLogger("AdversarialSimulator"), - credential=credential, + credential=self.credential, ) - self.rai_client = RAIClient(azure_ai_project=azure_ai_project, token_manager=self.token_manager) + self.rai_client = RAIClient(azure_ai_project=self.azure_ai_project, token_manager=self.token_manager) self.adversarial_template_handler = AdversarialTemplateHandler( - azure_ai_project=azure_ai_project, rai_client=self.rai_client + azure_ai_project=self.azure_ai_project, rai_client=self.rai_client ) def _ensure_service_dependencies(self): @@ -192,7 +178,9 @@ async def __call__( if not randomization_seed: randomization_seed = randint(0, 1000000) - regular_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential) + regular_sim = AdversarialSimulator( + azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential + ) regular_sim_results = await regular_sim( scenario=scenario, target=target, @@ -205,7 +193,7 @@ async def __call__( randomize_order=True, randomization_seed=randomization_seed, ) - jb_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential) + jb_sim = AdversarialSimulator(azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential) jb_sim_results = await jb_sim( scenario=scenario, target=target, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_experimental.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_experimental.py index b08657f1fdc9..ca676c9bcdc9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_experimental.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_experimental.py @@ -6,9 +6,9 @@ import inspect import logging import sys -from typing import Callable, Type, TypeVar, Union +from typing import Callable, Type, TypeVar, Union, overload -from typing_extensions import ParamSpec +from typing_extensions import ParamSpec, TypeGuard DOCSTRING_TEMPLATE = ".. note:: {0} {1}\n\n" DOCSTRING_DEFAULT_INDENTATION = 8 @@ -22,20 +22,31 @@ _warning_cache = set() module_logger = logging.getLogger(__name__) -TExperimental = TypeVar("TExperimental", bound=Union[Type, Callable]) P = ParamSpec("P") T = TypeVar("T") -def experimental(wrapped: TExperimental) -> TExperimental: +@overload +def experimental(wrapped: Type[T]) -> Type[T]: ... + + +@overload +def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ... + + +def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]: """Add experimental tag to a class or a method. 
:param wrapped: Either a Class or Function to mark as experimental - :type wrapped: TExperimental + :type wrapped: Union[Type[T], Callable[P, T]] :return: The wrapped class or method - :rtype: TExperimental + :rtype: Union[Type[T], Callable[P, T]] """ - if inspect.isclass(wrapped): + + def is_class(t: Union[Type[T], Callable[P, T]]) -> TypeGuard[Type[T]]: + return isinstance(t, type) + + if is_class(wrapped): return _add_class_docstring(wrapped) if inspect.isfunction(wrapped): return _add_method_docstring(wrapped) @@ -74,11 +85,11 @@ def wrapped(*args, **kwargs): cls.__doc__ = _add_note_to_docstring(cls.__doc__, doc_string) else: cls.__doc__ = doc_string + ">" - cls.__init__ = _add_class_warning(cls.__init__) + cls.__init__ = _add_class_warning(cls.__init__) # type: ignore[method-assign] return cls -def _add_method_docstring(func: Callable[P, T] = None) -> Callable[P, T]: +def _add_method_docstring(func: Callable[P, T]) -> Callable[P, T]: """Add experimental tag to the method doc string. :param func: The function to update diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py index 5db56fd9b06c..109384bc2500 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py @@ -18,7 +18,7 @@ class Turn: role: Union[str, ConversationRole] content: str - context: str = None + context: Optional[str] = None def to_dict(self) -> Dict[str, Optional[str]]: """ @@ -42,13 +42,13 @@ class ConversationHistory: Conversation history class to keep track of the conversation turns in a conversation. """ - def __init__(self): + def __init__(self) -> None: """ Initializes the conversation history with an empty list of turns. """ self.history: List[Turn] = [] - def add_to_history(self, turn: Turn): + def add_to_history(self, turn: Turn) -> None: """ Adds a turn to the conversation history. @@ -57,7 +57,7 @@ def add_to_history(self, turn: Turn): """ self.history.append(turn) - def to_list(self) -> List[Dict[str, str]]: + def to_list(self) -> List[Dict[str, Optional[str]]]: """ Converts the conversation history to a list of dictionaries. 
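# A minimal illustrative sketch of the typing pattern applied in _experimental.py above:
# @overload declarations let one decorator accept either a class or a callable while
# preserving the wrapped type for callers, and a TypeGuard-based predicate narrows the
# union inside the implementation. The names below are hypothetical, not part of this SDK.
from typing import Callable, Type, TypeVar, Union, overload

from typing_extensions import ParamSpec, TypeGuard

P = ParamSpec("P")
T = TypeVar("T")


@overload
def tagged(wrapped: Type[T]) -> Type[T]: ...


@overload
def tagged(wrapped: Callable[P, T]) -> Callable[P, T]: ...


def tagged(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:
    """Append a note to the wrapped object's docstring while preserving its static type."""

    def is_class(t: Union[Type[T], Callable[P, T]]) -> TypeGuard[Type[T]]:
        # isinstance(t, type) is what lets the type checker narrow the union to Type[T].
        return isinstance(t, type)

    if is_class(wrapped):
        wrapped.__doc__ = (wrapped.__doc__ or "") + "\n\n.. note:: Experimental class."
        return wrapped

    wrapped.__doc__ = (wrapped.__doc__ or "") + "\n\n.. note:: Experimental callable."
    return wrapped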
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py index 5d64b692ac06..83f17254be3c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py @@ -4,15 +4,16 @@ # pylint: disable=C0301,C0114,R0913,R0903 # noqa: E501 import logging -from typing import Callable +from typing import Callable, cast + +from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException -from azure.ai.evaluation._model_configurations import AzureAIProject from azure.ai.evaluation.simulator import AdversarialScenario -from azure.identity import DefaultAzureCredential +from azure.core.credentials import TokenCredential from ._adversarial_simulator import AdversarialSimulator -from ._model_tools import AdversarialTemplateHandler, ManagedIdentityAPITokenManager, RAIClient, TokenScope from ._helpers import experimental +from ._model_tools import AdversarialTemplateHandler, ManagedIdentityAPITokenManager, RAIClient, TokenScope logger = logging.getLogger(__name__) @@ -29,41 +30,29 @@ class IndirectAttackSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: AzureAIProject, credential=None): + def __init__(self, *, azure_ai_project: dict, credential): """Constructor.""" - # check if azure_ai_project has the keys: subscription_id, resource_group_name, project_name, credential - if not all(key in azure_ai_project for key in ["subscription_id", "resource_group_name", "project_name"]): - msg = "azure_ai_project must contain keys: subscription_id, resource_group_name and project_name" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, - category=ErrorCategory.MISSING_FIELD, - blame=ErrorBlame.USER_ERROR, - ) - if not all(azure_ai_project[key] for key in ["subscription_id", "resource_group_name", "project_name"]): - msg = "subscription_id, resource_group_name and project_name keys cannot be None" + + try: + self.azure_ai_project = validate_azure_ai_project(azure_ai_project) + except EvaluationException as e: raise EvaluationException( - message=msg, - internal_message=msg, + message=e.message, + internal_message=e.internal_message, target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, - category=ErrorCategory.MISSING_FIELD, - blame=ErrorBlame.USER_ERROR, - ) - if "credential" not in azure_ai_project and not credential: - credential = DefaultAzureCredential() - elif "credential" in azure_ai_project: - credential = azure_ai_project["credential"] - self.credential = credential - self.azure_ai_project = azure_ai_project + category=e.category, + blame=e.blame, + ) from e + + self.credential = cast(TokenCredential, credential) self.token_manager = ManagedIdentityAPITokenManager( token_scope=TokenScope.DEFAULT_AZURE_MANAGEMENT, logger=logging.getLogger("AdversarialSimulator"), - credential=credential, + credential=self.credential, ) - self.rai_client = RAIClient(azure_ai_project=azure_ai_project, token_manager=self.token_manager) + self.rai_client = RAIClient(azure_ai_project=self.azure_ai_project, token_manager=self.token_manager) self.adversarial_template_handler = AdversarialTemplateHandler( - 
azure_ai_project=azure_ai_project, rai_client=self.rai_client + azure_ai_project=self.azure_ai_project, rai_client=self.rai_client ) def _ensure_service_dependencies(self): @@ -161,7 +150,7 @@ async def __call__( category=ErrorCategory.INVALID_VALUE, blame=ErrorBlame.USER_ERROR, ) - jb_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential) + jb_sim = AdversarialSimulator(azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential) jb_sim_results = await jb_sim( scenario=scenario, target=target, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py index 162a5e31e5c4..1ac5d8076400 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py @@ -8,8 +8,9 @@ import time from abc import ABC, abstractmethod from enum import Enum -from typing import Dict, Optional, Union +from typing import Optional, Union +from azure.core.credentials import TokenCredential from azure.identity import DefaultAzureCredential, ManagedIdentityCredential AZURE_TOKEN_REFRESH_INTERVAL = 600 # seconds @@ -29,24 +30,24 @@ class APITokenManager(ABC): :param auth_header: Authorization header prefix. Defaults to "Bearer" :type auth_header: str :param credential: Azure credential object - :type credential: Optional[Union[azure.identity.DefaultAzureCredential, azure.identity.ManagedIdentityCredential] + :type credential: Optional[TokenCredential] """ def __init__( self, logger: logging.Logger, auth_header: str = "Bearer", - credential: Optional[Union[DefaultAzureCredential, ManagedIdentityCredential]] = None, + credential: Optional[TokenCredential] = None, ) -> None: self.logger = logger self.auth_header = auth_header - self._lock = None + self._lock: Optional[asyncio.Lock] = None if credential is not None: self.credential = credential else: self.credential = self.get_aad_credential() - self.token = None - self.last_refresh_time = None + self.token: Optional[str] = None + self.last_refresh_time: Optional[float] = None @property def lock(self) -> asyncio.Lock: @@ -73,20 +74,18 @@ def get_aad_credential(self) -> Union[DefaultAzureCredential, ManagedIdentityCre identity_client_id = os.environ.get("DEFAULT_IDENTITY_CLIENT_ID", None) if identity_client_id is not None: self.logger.info(f"Using DEFAULT_IDENTITY_CLIENT_ID: {identity_client_id}") - credential = ManagedIdentityCredential(client_id=identity_client_id) - else: - self.logger.info("Environment variable DEFAULT_IDENTITY_CLIENT_ID is not set, using DefaultAzureCredential") - credential = DefaultAzureCredential() - return credential + return ManagedIdentityCredential(client_id=identity_client_id) + + self.logger.info("Environment variable DEFAULT_IDENTITY_CLIENT_ID is not set, using DefaultAzureCredential") + return DefaultAzureCredential() @abstractmethod - async def get_token(self) -> str: + def get_token(self) -> str: """Async method to get the API token. Subclasses should implement this method. 
:return: API token :rtype: str """ - pass # pylint: disable=unnecessary-pass class ManagedIdentityAPITokenManager(APITokenManager): @@ -100,12 +99,18 @@ class ManagedIdentityAPITokenManager(APITokenManager): :paramtype kwargs: Dict """ - def __init__(self, token_scope: TokenScope, logger: logging.Logger, **kwargs: Dict): - super().__init__(logger, **kwargs) + def __init__( + self, + token_scope: TokenScope, + logger: logging.Logger, + *, + auth_header: str = "Bearer", + credential: Optional[TokenCredential] = None, + ): + super().__init__(logger, auth_header=auth_header, credential=credential) self.token_scope = token_scope - # Bug 3353724: This get_token is sync method, but it is defined as async method in the base class - def get_token(self) -> str: # pylint: disable=invalid-overridden-method + def get_token(self) -> str: """Get the API token. If the token is not available or has expired, refresh the token. :return: API token @@ -134,11 +139,18 @@ class PlainTokenManager(APITokenManager): :paramtype kwargs: Dict """ - def __init__(self, openapi_key: str, logger: logging.Logger, **kwargs: Dict): - super().__init__(logger, **kwargs) - self.token = openapi_key + def __init__( + self, + openapi_key: str, + logger: logging.Logger, + *, + auth_header: str = "Bearer", + credential: Optional[TokenCredential] = None, + ) -> None: + super().__init__(logger, auth_header=auth_header, credential=credential) + self.token: str = openapi_key - async def get_token(self) -> str: + def get_token(self) -> str: """Get the API token :return: API token diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py index b1eaac7de22a..e802f398fc42 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py @@ -6,13 +6,14 @@ import json import time import uuid -from typing import Dict, List +from typing import Any, Dict, List, Optional, cast from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client from azure.ai.evaluation._user_agent import USER_AGENT from azure.core.exceptions import HttpResponseError from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode +from .._model_tools._template_handler import TemplateParameters from .models import OpenAIChatCompletionsModel @@ -33,7 +34,15 @@ class SimulationRequestDTO: :type template_parameters: Dict """ - def __init__(self, url, headers, payload, params, templatekey, template_parameters): + def __init__( + self, + url: str, + headers: Dict[str, str], + payload: Dict[str, Any], + params: Dict[str, str], + templatekey: str, + template_parameters: Optional[TemplateParameters], + ): self.url = url self.headers = headers self.json = json.dumps(payload) @@ -47,9 +56,12 @@ def to_dict(self) -> Dict: :return: The DTO as a dictionary. :rtype: Dict """ - if self.templateParameters is not None: - self.templateParameters = {str(k): str(v) for k, v in self.templateParameters.items()} - return self.__dict__ + toReturn = self.__dict__.copy() + + if toReturn["templateParameters"] is not None: + toReturn["templateParameters"] = {str(k): str(v) for k, v in toReturn["templateParameters"].items()} + + return toReturn def to_json(self): """Convert the DTO to a JSON string. 
@@ -73,12 +85,12 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
     :keyword kwargs: Additional keyword arguments to pass to the parent class.
     """
 
-    def __init__(self, name: str, template_key: str, template_parameters, *args, **kwargs) -> None:
+    def __init__(self, name: str, template_key: str, template_parameters: TemplateParameters, **kwargs) -> None:
         self.tkey = template_key
         self.tparam = template_parameters
-        self.result_url = None
+        self.result_url: Optional[str] = None
 
-        super().__init__(name=name, *args, **kwargs)
+        super().__init__(name=name, **kwargs)
 
     def format_request_data(self, messages: List[Dict], **request_params) -> Dict:  # type: ignore[override]
         """Format the request data to query the model with.
@@ -184,8 +196,8 @@ async def request_api(
                 message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
             )
 
-        response = response.json()
-        self.result_url = response["location"]
+        response_data = response.json()
+        self.result_url = cast(str, response_data["location"])
 
         retry_policy = AsyncRetryPolicy(  # set up retry configuration
             retry_on_status_codes=[202],  # on which statuses to retry
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_template_handler.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_template_handler.py
index 7a7f16b05c80..1ac9536e418e 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_template_handler.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_template_handler.py
@@ -2,25 +2,66 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import Optional
+from typing import Dict, List, Optional, TypedDict, cast
+
+from typing_extensions import NotRequired
 
 from azure.ai.evaluation._model_configurations import AzureAIProject
 
 from ._rai_client import RAIClient
 
-CONTENT_HARM_TEMPLATES_COLLECTION_KEY = set(
-    [
-        "adv_qa",
-        "adv_conversation",
-        "adv_summarization",
-        "adv_search",
-        "adv_rewrite",
-        "adv_content_gen_ungrounded",
-        "adv_content_gen_grounded",
-        "adv_content_protected_material",
-        "adv_politics",
-    ]
-)
+CONTENT_HARM_TEMPLATES_COLLECTION_KEY = {
+    "adv_qa",
+    "adv_conversation",
+    "adv_summarization",
+    "adv_search",
+    "adv_rewrite",
+    "adv_content_gen_ungrounded",
+    "adv_content_gen_grounded",
+    "adv_content_protected_material",
+    "adv_politics",
+}
+
+
+class TemplateParameters(TypedDict):
+    """Parameters used in Templates
+
+    .. note::
+
+        This type is good enough to type check, but is incorrect. It's meant to represent a dictionary with a known
+        `metadata` key (Dict[str, str]), a known `ch_template_placeholder` key (str), and an unknown number of keys
+        that map to `str` values.
+
+        In TypeScript, this type would be spelled:
+
+        .. code-block:: typescript
+
+            type AdversarialTemplateParameters = {
+                [key: string]: string
+                ch_template_placeholder: string
+                metadata: {[index: string]: string} // Doesn't typecheck but gets the point across
+            }
+
+        At time of writing, this isn't possible to express with a TypedDict. TypedDicts must be "closed" in that
+        they fully specify all the keys they can contain.
+
+        `PEP 728 – TypedDict with Typed Extra Items <https://peps.python.org/pep-0728/>`_ is a proposal to support
+        this, but would only be available in Python 3.13 at the earliest.
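+
+        For illustration, a hypothetical parameters dictionary that satisfies this type could look like:
+
+        .. code-block:: python
+
+            parameters: TemplateParameters = {
+                "conversation_starter": "Tell me about {{ topic }}",
+                "ch_template_placeholder": "{{ch_template_placeholder}}",
+                "metadata": {"Category": "violence"},
+                "topic": "example topic",  # one of the NotRequired extra keys
+            }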
+ """ + + metadata: Dict[str, str] + conversation_starter: str + ch_template_placeholder: str + group_of_people: NotRequired[str] + category: NotRequired[str] + target_population: NotRequired[str] + topic: NotRequired[str] + + +class _CategorizedParameter(TypedDict): + parameters: List[TemplateParameters] + category: str + parameters_key: str class ContentHarmTemplatesUtils: @@ -85,13 +126,19 @@ class AdversarialTemplate: :param template_parameters: The template parameters. """ - def __init__(self, template_name, text, context_key, template_parameters=None) -> None: + def __init__( + self, + template_name: str, + text: Optional[str], + context_key: List, + template_parameters: Optional[List[TemplateParameters]] = None, + ) -> None: self.text = text self.context_key = context_key self.template_name = template_name - self.template_parameters = template_parameters + self.template_parameters = template_parameters or [] - def __str__(self): + def __str__(self) -> str: return "{{ch_template_placeholder}}" @@ -106,16 +153,13 @@ class AdversarialTemplateHandler: """ def __init__(self, azure_ai_project: AzureAIProject, rai_client: RAIClient) -> None: - self.cached_templates_source = {} - # self.template_env = JinjaEnvironment(loader=JinjaFileSystemLoader(searchpath=template_dir)) self.azure_ai_project = azure_ai_project - self.categorized_ch_parameters = None + self.categorized_ch_parameters: Optional[Dict[str, _CategorizedParameter]] = None self.rai_client = rai_client - async def _get_content_harm_template_collections(self, collection_key): - + async def _get_content_harm_template_collections(self, collection_key: str) -> List[AdversarialTemplate]: if self.categorized_ch_parameters is None: - categorized_parameters = {} + categorized_parameters: Dict[str, _CategorizedParameter] = {} util = ContentHarmTemplatesUtils parameters = await self.rai_client.get_contentharm_parameters() @@ -123,7 +167,7 @@ async def _get_content_harm_template_collections(self, collection_key): for k in parameters.keys(): template_key = util.get_template_key(k) categorized_parameters[template_key] = { - "parameters": parameters[k], + "parameters": cast(List[TemplateParameters], parameters[k]), "category": util.get_template_category(k), "parameters_key": k, } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/models.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/models.py index 7dbb0f3d586f..9f4f135cbdfa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/models.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/models.py @@ -49,10 +49,10 @@ class LLMBase(ABC): Base class for all LLM models. 
""" - def __init__(self, endpoint_url: str, name: str = "unknown", additional_headers: Optional[dict] = {}): + def __init__(self, endpoint_url: str, name: str = "unknown", additional_headers: Optional[Dict[str, str]] = None): self.endpoint_url = endpoint_url self.name = name - self.additional_headers = additional_headers + self.additional_headers = additional_headers or {} self.logger = logging.getLogger(repr(self)) # Metric tracking @@ -208,7 +208,7 @@ def __init__( *, endpoint_url: str, name: str = "OpenAICompletionsModel", - additional_headers: Optional[dict] = {}, + additional_headers: Optional[Dict[str, str]] = None, api_version: Optional[str] = "2023-03-15-preview", token_manager: APITokenManager, azureml_model_deployment: Optional[str] = None, @@ -220,7 +220,7 @@ def __init__( frequency_penalty: Optional[float] = 0, presence_penalty: Optional[float] = 0, stop: Optional[Union[List[str], str]] = None, - image_captions: Dict[str, str] = {}, + image_captions: Optional[Dict[str, str]] = None, images_dir: Optional[str] = None, # Note: unused, kept for class compatibility ): super().__init__(endpoint_url=endpoint_url, name=name, additional_headers=additional_headers) @@ -234,7 +234,7 @@ def __init__( self.n = n self.frequency_penalty = frequency_penalty self.presence_penalty = presence_penalty - self.image_captions = image_captions + self.image_captions = image_captions or {} # Default stop to end token if not provided if not stop: @@ -263,7 +263,7 @@ def __init__( def get_model_params(self): return {param: getattr(self, param) for param in self.model_param_names if getattr(self, param) is not None} - def format_request_data(self, prompt: str, **request_params) -> Dict[str, str]: + def format_request_data(self, prompt: Dict[str, str], **request_params) -> Dict[str, str]: # type: ignore[override] """ Format the request data for the OpenAI API. """ @@ -328,7 +328,7 @@ async def get_all_completions( # type: ignore[override] # Format prompts and tag with index request_datas: List[Dict] = [] for idx, prompt in enumerate(prompts): - prompt: Dict[str, str] = self.format_request_data(prompt, **request_params) + prompt = self.format_request_data(prompt, **request_params) prompt[self.prompt_idx_key] = idx # type: ignore[assignment] request_datas.append(prompt) @@ -447,7 +447,7 @@ async def request_api( self._log_request(request_data) - token = await self.token_manager.get_token() + token = self.token_manager.get_token() headers = { "Content-Type": "application/json", @@ -522,8 +522,8 @@ class OpenAIChatCompletionsModel(OpenAICompletionsModel): formats the prompt for chat completion. 
""" - def __init__(self, name="OpenAIChatCompletionsModel", *args, **kwargs): - super().__init__(name=name, *args, **kwargs) + def __init__(self, name="OpenAIChatCompletionsModel", **kwargs): + super().__init__(name=name, **kwargs) def format_request_data(self, messages: List[dict], **request_params): # type: ignore[override] request_data = {"messages": messages, **self.get_model_params()} diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py index f2621966fab7..f20104bf5d5e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py @@ -15,11 +15,10 @@ from promptflow.core import AzureOpenAIModelConfiguration, Flow from tqdm import tqdm +from .._exceptions import ErrorBlame, ErrorCategory, EvaluationException from .._user_agent import USER_AGENT from ._conversation.constants import ConversationRole from ._helpers import ConversationHistory, Turn, experimental - -# from ._tracing import monitor_task_simulator from ._utils import JsonLineChatProtocol @@ -65,7 +64,7 @@ async def __call__( *, target: Callable, max_conversation_turns: int = 5, - tasks: List[Dict] = [], + tasks: List[str] = [], text: str = "", num_queries: int = 5, query_response_generating_prompty: Optional[str] = None, @@ -305,7 +304,7 @@ async def _extend_conversation_with_simulator( def _load_user_simulation_flow( self, *, - user_simulator_prompty: Union[str, os.PathLike], + user_simulator_prompty: Optional[Union[str, os.PathLike]], prompty_model_config: Dict[str, Any], user_simulator_prompty_kwargs: Dict[str, Any], ) -> Flow: @@ -313,7 +312,7 @@ def _load_user_simulation_flow( Loads the flow for simulating user interactions. :keyword user_simulator_prompty: Path to the user simulator prompty file. - :paramtype user_simulator_prompty: Union[str, os.PathLike] + :paramtype user_simulator_prompty: Optional[Union[str, os.PathLike]] :keyword prompty_model_config: The configuration for the prompty model. :paramtype prompty_model_config: Dict[str, Any] :keyword user_simulator_prompty_kwargs: Additional keyword arguments for the user simulator prompty. @@ -330,7 +329,13 @@ def _load_user_simulation_flow( with pkg_resources.path(package, resource_name) as prompty_path: return load_flow(source=str(prompty_path), model=prompty_model_config) except FileNotFoundError as e: - raise f"Flow path for {resource_name} does not exist in package {package}." from e + msg = f"Flow path for {resource_name} does not exist in package {package}." + raise EvaluationException( + message=msg, + internal_message=msg, + error_category=ErrorCategory.FILE_OR_FOLDER_NOT_FOUND, + blame=ErrorBlame.USER_ERROR, + ) from e return load_flow( source=user_simulator_prompty, model=prompty_model_config, @@ -420,7 +425,7 @@ async def _generate_query_responses( def _load_query_generation_flow( self, *, - query_response_generating_prompty: Union[str, os.PathLike], + query_response_generating_prompty: Optional[Union[str, os.PathLike]], prompty_model_config: Dict[str, Any], query_response_generating_prompty_kwargs: Dict[str, Any], ) -> Flow: @@ -428,7 +433,7 @@ def _load_query_generation_flow( Loads the flow for generating query responses. :keyword query_response_generating_prompty: Path to the query response generating prompty file. 
- :paramtype query_response_generating_prompty: Union[str, os.PathLike] + :paramtype query_response_generating_prompty: Optional[Union[str, os.PathLike]] :keyword prompty_model_config: The configuration for the prompty model. :paramtype prompty_model_config: Dict[str, Any] :keyword query_response_generating_prompty_kwargs: Additional keyword arguments for the flow. @@ -445,7 +450,13 @@ def _load_query_generation_flow( with pkg_resources.path(package, resource_name) as prompty_path: return load_flow(source=str(prompty_path), model=prompty_model_config) except FileNotFoundError as e: - raise f"Flow path for {resource_name} does not exist in package {package}." from e + msg = f"Flow path for {resource_name} does not exist in package {package}." + raise EvaluationException( + message=msg, + internal_message=msg, + error_category=ErrorCategory.FILE_OR_FOLDER_NOT_FOUND, + blame=ErrorBlame.USER_ERROR, + ) from e return load_flow( source=query_response_generating_prompty, model=prompty_model_config, @@ -457,7 +468,7 @@ async def _create_conversations_from_query_responses( *, query_responses: List[Dict[str, str]], max_conversation_turns: int, - tasks: List[Dict], + tasks: List[str], user_simulator_prompty: Optional[str], user_simulator_prompty_kwargs: Dict[str, Any], target: Callable, @@ -471,7 +482,7 @@ async def _create_conversations_from_query_responses( :keyword max_conversation_turns: The maximum number of conversation turns. :paramtype max_conversation_turns: int :keyword tasks: A list of tasks for the simulation. - :paramtype tasks: List[Dict] + :paramtype tasks: List[str] :keyword user_simulator_prompty: Path to the user simulator prompty file. :paramtype user_simulator_prompty: Optional[str] :keyword user_simulator_prompty_kwargs: Additional keyword arguments for the user simulator prompty. @@ -536,7 +547,7 @@ async def _complete_conversation( target: Callable, api_call_delay_sec: float, progress_bar: tqdm, - ) -> List[Dict[str, str]]: + ) -> List[Dict[str, Optional[str]]]: """ Completes a conversation with the target model based on the conversation starter. @@ -557,7 +568,7 @@ async def _complete_conversation( :keyword progress_bar: Progress bar for tracking simulation progress. :paramtype progress_bar: tqdm :return: A list representing the conversation history with each turn's content. 
- :rtype: List[Dict[str, str]] + :rtype: List[Dict[str, Optional[str]]] """ conversation_history = ConversationHistory() # user_turn = Turn(role=ConversationRole.USER, content=conversation_starter) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_tracing.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_tracing.py index 50b0f1531804..33b76d214eba 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_tracing.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_tracing.py @@ -69,16 +69,16 @@ def monitor_task_simulator(func: Callable[P, R]) -> Callable[P, R]: @functools.wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - text_length = len(kwargs.get("text", "")) - user_persona_length = len(kwargs.get("user_persona", [])) + text = kwargs.get("text") + user_persona = kwargs.get("user_persona") num_queries = kwargs.get("num_queries", 0) max_conversation_turns = kwargs.get("max_conversation_turns", 0) decorated_func = monitor_operation( activity_name="task.simulator.call", activity_type=ActivityType.PUBLICAPI, custom_dimensions={ - "text_length": text_length, - "user_persona_length": user_persona_length, + "text_length": len(text) if isinstance(text, str) else 0, + "user_persona_length": len(user_persona) if isinstance(user_persona, list) else 0, "number_of_queries": num_queries, "max_conversation_turns": max_conversation_turns, }, diff --git a/sdk/evaluation/azure-ai-evaluation/pyproject.toml b/sdk/evaluation/azure-ai-evaluation/pyproject.toml index c70ecde63e5e..ffac798c3b2d 100644 --- a/sdk/evaluation/azure-ai-evaluation/pyproject.toml +++ b/sdk/evaluation/azure-ai-evaluation/pyproject.toml @@ -1,5 +1,5 @@ [tool.azure-sdk-build] -mypy = false +mypy = true pyright = false pylint = true black = true diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_adv_simulator.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_adv_simulator.py index 108ed708c611..2af863cf46fb 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_adv_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_adv_simulator.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import pytest from devtools_testutils import is_live @@ -19,9 +19,8 @@ def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope): "subscription_id": project_scope["subscription_id"], "resource_group_name": project_scope["resource_group_name"], "project_name": project_scope["project_name"], - "credential": azure_cred, } - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) assert callable(simulator) def test_incorrect_scenario_raises_error(self, azure_cred, project_scope): @@ -32,13 +31,12 @@ def test_incorrect_scenario_raises_error(self, azure_cred, project_scope): "subscription_id": project_scope["subscription_id"], "resource_group_name": project_scope["resource_group_name"], "project_name": project_scope["project_name"], - "credential": azure_cred, } async def callback(x): return x - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) with pytest.raises(EvaluationException): asyncio.run( simulator( @@ -57,11 +55,13 @@ def test_adv_qa_sim_responds_with_one_response(self, azure_cred, 
project_scope): "subscription_id": project_scope["subscription_id"], "resource_group_name": project_scope["resource_group_name"], "project_name": project_scope["project_name"], - "credential": azure_cred, } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] response_from_acs, temperature = query, 0.0 @@ -80,7 +80,7 @@ async def callback( "context": context, } - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) outputs = asyncio.run( simulator( @@ -108,11 +108,13 @@ def test_adv_conversation_sim_responds_with_responses(self, azure_cred, project_ "subscription_id": project_scope["subscription_id"], "resource_group_name": project_scope["resource_group_name"], "project_name": project_scope["project_name"], - "credential": azure_cred, } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -125,7 +127,7 @@ async def callback( "context": context, } - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) outputs = asyncio.run( simulator( @@ -150,11 +152,13 @@ def test_adv_summarization_sim_responds_with_responses(self, azure_cred, project "subscription_id": project_scope["subscription_id"], "resource_group_name": project_scope["resource_group_name"], "project_name": project_scope["project_name"], - "credential": azure_cred, } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -167,7 +171,7 @@ async def callback( "context": context, } - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) outputs = asyncio.run( simulator( @@ -191,11 +195,13 @@ def test_adv_summarization_jailbreak_sim_responds_with_responses(self, azure_cre "subscription_id": project_scope["subscription_id"], "resource_group_name": project_scope["resource_group_name"], "project_name": project_scope["project_name"], - "credential": azure_cred, } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -208,7 +214,7 @@ async def callback( "context": context, } - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) outputs = asyncio.run( simulator( @@ -236,7 +242,10 @@ def test_adv_rewrite_sim_responds_with_responses(self, azure_cred, project_scope } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = 
None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -278,7 +287,10 @@ def test_adv_protected_matierial_sim_responds_with_responses(self, azure_cred, p } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -320,7 +332,10 @@ def test_adv_eci_sim_responds_with_responses(self, azure_cred, project_scope): } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -364,7 +379,10 @@ def test_adv_xpia_sim_responds_with_responses(self, azure_cred, project_scope): } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -403,7 +421,10 @@ def test_adv_sim_order_randomness_with_jailbreak(self, azure_cred, project_scope } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -480,7 +501,10 @@ def test_adv_sim_order_randomness(self, azure_cred, project_scope): } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] @@ -554,7 +578,10 @@ def test_jailbreak_sim_order_randomness(self, azure_cred, project_scope): } async def callback( - messages: List[Dict], stream: bool = False, session_state: Any = None, context: Dict[str, Any] = None + messages: List[Dict], + stream: bool = False, + session_state: Any = None, + context: Optional[Dict[str, Any]] = None, ) -> dict: query = messages["messages"][0]["content"] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py index fa88bb96d597..314688075ef6 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_telemetry.py @@ -58,7 +58,6 @@ def dummy_evaluate_function( df = pd.read_json(data, lines=True) nan_count = kwargs.get("number_of_nans", 1) for evaluation_name, evaluator in evaluators.items(): - df[f"outputs.{evaluation_name}.score"] = [random.choice(range(100)) for _ in range(df.shape[0])] _add_nans(df, nan_count, f"outputs.{evaluation_name}.score") @@ -142,7 +141,7 @@ def test_evaluator_start_telemetry( mock_trace_destination_to_cloud, mock_validate_trace_destination, ): - hate_unfairness = HateUnfairnessEvaluator(azure_cred, azure_ai_project=None) + hate_unfairness = 
HateUnfairnessEvaluator(azure_cred, azure_ai_project=mock_project_scope) data = _get_file("evaluate_test_data.jsonl") evaluators = { diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_jailbreak_simulator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_jailbreak_simulator.py index dbc49b520f6a..0a2595be40d8 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_jailbreak_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_jailbreak_simulator.py @@ -35,6 +35,7 @@ def test_initialization_with_all_valid_scenarios( mock_get_content_harm_template_collections, mock_simulate_async, mock_get_service_discovery_url, + azure_cred, ): mock_get_service_discovery_url.return_value = "http://some.url/discovery/" mock_simulate_async.return_value = MagicMock() @@ -55,7 +56,7 @@ def test_initialization_with_all_valid_scenarios( AdversarialScenario.ADVERSARIAL_CONTENT_GEN_GROUNDED, ] for scenario in available_scenarios: - simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project) + simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) assert callable(simulator) simulator(scenario=scenario, max_conversation_turns=1, max_simulation_results=3, target=async_callback) @@ -64,7 +65,7 @@ def test_initialization_with_all_valid_scenarios( "azure.ai.evaluation.simulator._model_tools.AdversarialTemplateHandler._get_content_harm_template_collections" ) def test_simulator_raises_validation_error_with_unsupported_scenario( - self, _get_content_harm_template_collections, _get_service_discovery_url + self, _get_content_harm_template_collections, _get_service_discovery_url, azure_cred ): _get_content_harm_template_collections.return_value = [] _get_service_discovery_url.return_value = "some-url" @@ -77,7 +78,7 @@ def test_simulator_raises_validation_error_with_unsupported_scenario( async def callback(x): return x - simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project) + simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) with pytest.raises(EvaluationException): outputs = asyncio.run( simulator( diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_simulator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_simulator.py index 7d835aec6eb7..4133bc1110f6 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_simulator.py @@ -35,6 +35,7 @@ def test_initialization_with_all_valid_scenarios( mock_get_content_harm_template_collections, mock_simulate_async, mock_get_service_discovery_url, + azure_cred, ): mock_get_service_discovery_url.return_value = "http://some.url/discovery/" mock_simulate_async.return_value = MagicMock() @@ -44,7 +45,6 @@ def test_initialization_with_all_valid_scenarios( "subscription_id": "test_subscription", "resource_group_name": "test_resource_group", "project_name": "test_workspace", - "credential": "test_credential", } available_scenarios = [ AdversarialScenario.ADVERSARIAL_CONVERSATION, @@ -56,7 +56,7 @@ def test_initialization_with_all_valid_scenarios( AdversarialScenario.ADVERSARIAL_CONTENT_GEN_GROUNDED, ] for scenario in available_scenarios: - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) assert callable(simulator) # simulator(scenario=scenario, max_conversation_turns=1, max_simulation_results=3, 
target=async_callback) @@ -65,7 +65,7 @@ def test_initialization_with_all_valid_scenarios( "azure.ai.evaluation.simulator._model_tools.AdversarialTemplateHandler._get_content_harm_template_collections" ) def test_simulator_raises_validation_error_with_unsupported_scenario( - self, _get_content_harm_template_collections, _get_service_discovery_url + self, _get_content_harm_template_collections, _get_service_discovery_url, azure_cred ): _get_content_harm_template_collections.return_value = [] _get_service_discovery_url.return_value = "some-url" @@ -73,13 +73,12 @@ def test_simulator_raises_validation_error_with_unsupported_scenario( "subscription_id": "test_subscription", "resource_group_name": "test_resource_group", "project_name": "test_workspace", - "credential": "test_credential", } async def callback(x): return x - simulator = AdversarialSimulator(azure_ai_project=azure_ai_project) + simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred) with pytest.raises(EvaluationException): outputs = asyncio.run( simulator(