Azure
diff --git a/‎.vscode/cspell.json‎
Lines changed: 5 additions & 1 deletion b/‎.vscode/cspell.json‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py‎
Lines changed: 20 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py‎
Lines changed: 2 additions & 2 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py‎
Lines changed: 22 additions & 2 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py‎
Lines changed: 22 additions & 2 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/__init__.py‎
Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,7 @@
     "sdk/digitaltwins/azure-digitaltwins-core/**",
     "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_vendor/**",
     "sdk/evaluation/azure-ai-evaluation/tests/**",
+    "sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/**",
     "sdk/eventhub/azure-eventhub-checkpointstoretable/**",
     "sdk/eventhub/azure-eventhub-checkpointstoreblob-aio/**",
     "sdk/eventhub/azure-eventhub/**",
@@ -1401,7 +1402,10 @@
         "upia",
         "xpia",
         "expirable",
-        "ralphe"
+        "ralphe",
+        "Inadherent",
+        "nbformat",
+        "nbconvert"
       ]
     },
     {
 
@@ -45,6 +45,12 @@
     - emotional_state
     - protected_class
     - groundedness
+- New Built-in evaluators for Agent Evaluation (Preview)
+  - IntentResolutionEvaluator - Evaluates the intent resolution of an agent's response to a user query.
+  - ResponseCompletenessEvaluator - Evaluates the response completeness of an agent's response to a user query.
+  - TaskAdherenceEvaluator - Evaluates the task adherence of an agent's response to a user query.
+  - ToolCallAccuracyEvaluator - Evaluates the accuracy of tool calls made by an agent in response to a user query.
+
 
 ### Breaking Changes
 
 
@@ -17,16 +17,20 @@
 from ._evaluators._gleu import GleuScoreEvaluator
 from ._evaluators._groundedness import GroundednessEvaluator
 from ._evaluators._service_groundedness import GroundednessProEvaluator
+from ._evaluators._intent_resolution import IntentResolutionEvaluator
 from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
+from ._evaluators._response_completeness import ResponseCompletenessEvaluator
+from ._evaluators._task_adherence import TaskAdherenceEvaluator
 from ._evaluators._relevance import RelevanceEvaluator
 from ._evaluators._retrieval import RetrievalEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
 from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
 from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
+from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -37,13 +41,26 @@
     OpenAIModelConfiguration,
 )
 
+# The converter from the AI service to the evaluator schema depends on azure-ai-projects, which we do
+# not want to force on every user of this package, so it is imported and exported only when installed.
+# NOTE(review): the fallback print() below writes to stdout at import time; consider logging/warnings.
+try:
+    from ._converters._ai_services import AIAgentConverter
+    _patch_all = ["AIAgentConverter"]
+except ImportError:
+    print("Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.")
+    _patch_all = []
+
 __all__ = [
     "evaluate",
     "CoherenceEvaluator",
     "F1ScoreEvaluator",
     "FluencyEvaluator",
     "GroundednessEvaluator",
     "GroundednessProEvaluator",
+    "ResponseCompletenessEvaluator",
+    "TaskAdherenceEvaluator",
+    "IntentResolutionEvaluator",
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
@@ -69,4 +86,7 @@
     "EvaluationResult",
     "CodeVulnerabilityEvaluator",
     "UngroundedAttributesEvaluator",
+    "ToolCallAccuracyEvaluator",
 ]
+
+__all__.extend([p for p in _patch_all if p not in __all__])
@@ -5,8 +5,8 @@
 
 from azure.core import CaseInsensitiveEnumMeta
 
-
-PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency", "intent_resolution",
+                                  "tool_call_accurate", "response_completeness", "task_adherence"]
 
 
 class CommonConstants:
 
@@ -274,8 +274,26 @@ def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool
 
     return cast(T_TypedDict, o)
 
+def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
+    """Check if the score is valid, i.e. is convertible to a number and lies in the range [min_score, max_score].
+
+    :param score: The score to check; anything ``float()`` rejects (e.g. ``None``, non-numeric text) is invalid.
+    :type score: Union[str, float]
+    :param min_score: The minimum score (inclusive). Default is 1.
+    :type min_score: int
+    :param max_score: The maximum score (inclusive). Default is 5.
+    :type max_score: int
+    :return: True if the score is valid, False otherwise (including NaN, which fails the range comparison).
+    :rtype: bool
+    """
+    try:
+        numeric_score = float(score)
+    except (ValueError, TypeError):
+        return False
+
+    return min_score <= numeric_score <= max_score
 
-def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
+def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
     """Parse the output of prompt-based quality evaluators that return a score and reason.
 
     Current supported evaluators:
@@ -284,6 +302,8 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
         - Retrieval
         - Groundedness
         - Coherence
+        - ResponseCompleteness
+        - TaskAdherence
 
     :param llm_output: The output of the prompt-based quality evaluator.
     :type llm_output: str
@@ -294,7 +314,7 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     reason = ""
     if llm_output:
         try:
-            score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+            score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
             reason_pattern = r"<S1>(.*?)</S1>"
             score_match = re.findall(score_pattern, llm_output, re.DOTALL)
             reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
 
@@ -0,0 +1,3 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# ---------------------------------------------------------`
	`2`	`+# Copyright (c) Microsoft Corporation. All rights reserved.`
	`3`	`+# ---------------------------------------------------------`