feat: GenAI SDK client(evals) - Send agent eval requests to EvaluateInstances

vertex-sdk-bot · copybara-github · commit 87ea59432458 · 2025-10-17T16:47:46.000-07:00
PiperOrigin-RevId: 820873097
diff --git a/tests/unit/vertexai/genai/replays/test_evaluate_instances.py b/tests/unit/vertexai/genai/replays/test_evaluate_instances.py
@@ -99,6 +99,84 @@ def test_pointwise_metric(client):
     assert response.pointwise_metric_result.score is not None
 
 
+def test_pointwise_metric_with_agent_data(client):
+    """Tests the _evaluate_instances method with PointwiseMetricInput and agent_data."""
+    instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
+    json_instance = json.dumps(instance_dict)
+    agent_data = types.AgentData(
+        agent_config=types.AgentConfig(
+            tools=types.Tools(
+                tool=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(name="search")
+                        ]
+                    )
+                ]
+            ),
+            developer_instruction=types.InstanceData(text="instruction"),
+        ),
+        events=types.Events(
+            event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
+        ),
+    )
+    instance = types.EvaluationInstance(
+        prompt=types.InstanceData(text="What is the capital of France?"),
+        response=types.InstanceData(text="Paris"),
+        agent_data=agent_data,
+    )
+
+    test_input = types.PointwiseMetricInput(
+        instance=types.PointwiseMetricInstance(json_instance=json_instance),
+        metric_spec=genai_types.PointwiseMetricSpec(
+            metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
+        ),
+    )
+    response = client.evals.evaluate_instances(
+        metric_config=types._EvaluateInstancesRequestParameters(
+            pointwise_metric_input=test_input,
+            instance=instance,
+        )
+    )
+    assert response.pointwise_metric_result is not None
+    assert response.pointwise_metric_result.score is not None
+
+
+def test_predefined_metric_with_agent_data(client):
+    """Tests the _evaluate_instances method with predefined metric and agent_data."""
+    agent_data = types.AgentData(
+        agent_config=types.AgentConfig(
+            tools=types.Tools(
+                tool=[
+                    genai_types.Tool(
+                        function_declarations=[
+                            genai_types.FunctionDeclaration(name="search")
+                        ]
+                    )
+                ]
+            ),
+            developer_instruction=types.InstanceData(text="instruction"),
+        ),
+        events=types.Events(
+            event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
+        ),
+    )
+    instance = types.EvaluationInstance(
+        prompt=types.InstanceData(text="What is the capital of France?"),
+        response=types.InstanceData(text="Paris"),
+        reference=types.InstanceData(text="Paris"),
+        agent_data=agent_data,
+    )
+
+    response = client.evals.evaluate_instances(
+        metric_config=types._EvaluateInstancesRequestParameters(
+            metrics=[types.Metric(name="general_quality_v1")],
+            instance=instance,
+        )
+    )
+    assert response.metric_results[0].score is not None
+
+
 def test_pairwise_metric_with_autorater(client):
     """Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig."""
 
diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
@@ -3288,6 +3288,115 @@ def test_merge_with_invalid_eval_case_type(self):
             )
 
 
+@pytest.mark.usefixtures("google_auth_mock")
+class TestPredefinedMetricHandler:
+    """Unit tests for the PredefinedMetricHandler class."""
+
+    def test_eval_case_to_agent_data(self):
+        tool = genai_types.Tool(
+            function_declarations=[
+                genai_types.FunctionDeclaration(
+                    name="get_weather",
+                    description="Get weather in a location",
+                    parameters={
+                        "type": "object",
+                        "properties": {"location": {"type": "string"}},
+                    },
+                )
+            ]
+        )
+        agent_info = vertexai_genai_types.AgentInfo(
+            name="agent1",
+            instruction="instruction1",
+            tool_declarations=[tool],
+        )
+        intermediate_events = [
+            vertexai_genai_types.Event(
+                event_id="event1",
+                content=genai_types.Content(
+                    parts=[genai_types.Part(text="intermediate event")]
+                ),
+            )
+        ]
+        eval_case = vertexai_genai_types.EvalCase(
+            prompt=genai_types.Content(parts=[genai_types.Part(text="Hello")]),
+            responses=[
+                vertexai_genai_types.ResponseCandidate(
+                    response=genai_types.Content(parts=[genai_types.Part(text="Hi")])
+                )
+            ],
+            agent_info=agent_info,
+            intermediate_events=intermediate_events,
+        )
+
+        agent_data = (
+            _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
+                eval_case
+            )
+        )
+
+        assert agent_data.agent_config.developer_instruction.text == "instruction1"
+        assert agent_data.agent_config.tools.tool == [tool]
+        assert agent_data.events.event[0].parts[0].text == "intermediate event"
+
+    def test_eval_case_to_agent_data_events_only(self):
+        intermediate_events = [
+            vertexai_genai_types.Event(
+                event_id="event1",
+                content=genai_types.Content(
+                    parts=[genai_types.Part(text="intermediate event")]
+                ),
+            )
+        ]
+        eval_case = vertexai_genai_types.EvalCase(
+            prompt=genai_types.Content(parts=[genai_types.Part(text="Hello")]),
+            responses=[
+                vertexai_genai_types.ResponseCandidate(
+                    response=genai_types.Content(parts=[genai_types.Part(text="Hi")])
+                )
+            ],
+            agent_info=None,
+            intermediate_events=intermediate_events,
+        )
+
+        agent_data = (
+            _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
+                eval_case
+            )
+        )
+
+        assert agent_data.agent_config is None
+        assert agent_data.events.event[0].parts[0].text == "intermediate event"
+
+    def test_eval_case_to_agent_data_empty_events(self):
+        intermediate_events = [
+            vertexai_genai_types.Event(
+                event_id="event1",
+                content=None,
+            )
+        ]
+        eval_case = vertexai_genai_types.EvalCase(
+            prompt=genai_types.Content(parts=[genai_types.Part(text="Hello")]),
+            responses=[
+                vertexai_genai_types.ResponseCandidate(
+                    response=genai_types.Content(parts=[genai_types.Part(text="Hi")])
+                )
+            ],
+            agent_info=None,
+            intermediate_events=intermediate_events,
+        )
+
+        agent_data = (
+            _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
+                eval_case
+            )
+        )
+
+        assert agent_data.agent_config is None
+        assert agent_data.events is None
+        assert not agent_data.events_text
+
+
 @pytest.mark.usefixtures("google_auth_mock")
 class TestLLMMetricHandlerPayload:
     def setup_method(self):
@@ -3648,7 +3757,9 @@ def test_execute_evaluation_with_agent_info(
         input_dataset = vertexai_genai_types.EvaluationDataset(
             eval_dataset_df=dataset_df
         )
-        computation_metric = vertexai_genai_types.Metric(name="exact_match")
+        predefined_metric = vertexai_genai_types.PredefinedMetricSpec(
+            metric_spec_name="tool_search_validity"
+        )
         tool = {
             "function_declarations": [
                 {
@@ -3671,7 +3782,7 @@ def test_execute_evaluation_with_agent_info(
         result = _evals_common._execute_evaluation(
             api_client=mock_api_client_fixture,
             dataset=input_dataset,
-            metrics=[computation_metric],
+            metrics=[predefined_metric],
             agent_info=agent_info,
         )
 
diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py
@@ -25,6 +25,7 @@
         "multi_turn_text_quality_v1",
         "final_response_match_v2",
         "final_response_reference_free_v1",
+        "final_response_quality_v1",
     }
 )
 
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
@@ -845,6 +845,51 @@ def _content_to_instance_data(
             contents=types.InstanceDataContents(contents=[content])
         )
 
+    @staticmethod
+    def _eval_case_to_agent_data(
+        eval_case: types.EvalCase,
+    ) -> Optional[types.AgentData]:
+        """Converts an EvalCase object to an AgentData object."""
+        if not eval_case.agent_info and not eval_case.intermediate_events:
+            return None
+        tools = None
+        developer_instruction = None
+        events = None
+        agent_config = None
+
+        if eval_case.agent_info:
+            agent_info = eval_case.agent_info
+            if agent_info.instruction:
+                developer_instruction = types.InstanceData(text=agent_info.instruction)
+            if agent_info.tool_declarations:
+                tool_declarations = agent_info.tool_declarations
+                tools = types.Tools(tool=tool_declarations)
+            if tools or developer_instruction:
+                agent_config = types.AgentConfig(
+                    tools=tools,
+                    developer_instruction=developer_instruction,
+                )
+
+        if eval_case.intermediate_events:
+            event_contents = [
+                event.content
+                for event in eval_case.intermediate_events
+                if event.content
+            ]
+            if event_contents:
+                events = types.Events(event=event_contents)
+
+        if events:
+            return types.AgentData(
+                agent_config=agent_config,
+                events=events,
+            )
+        else:
+            return types.AgentData(
+                agent_config=agent_config,
+                events_text="",
+            )
+
     def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
@@ -893,7 +938,6 @@ def _build_request_payload(
                 logger.warning(
                     f"Unsupported type for context: {type(eval_case.context)}"
                 )
-
         instance_payload = types.EvaluationInstance(
             prompt=prompt_instance_data,
             response=PredefinedMetricHandler._content_to_instance_data(
@@ -906,6 +950,7 @@ def _build_request_payload(
                 if other_data_map
                 else None
             ),
+            agent_data=PredefinedMetricHandler._eval_case_to_agent_data(eval_case),
         )
 
         return {
@@ -921,8 +966,7 @@ def get_metric_result(
         try:
             payload = self._build_request_payload(eval_case, response_index)
             api_response = self.module._evaluate_instances(
-                metrics=[self.metric],
-                instance=payload.get("instance"),
+                metrics=[self.metric], instance=payload.get("instance")
             )
 
             if (
diff --git a/vertexai/_genai/_evals_utils.py b/vertexai/_genai/_evals_utils.py
@@ -591,6 +591,10 @@ def MULTI_TURN_CHAT_QUALITY(self) -> LazyLoadedPrebuiltMetric:
     def MULTI_TURN_SAFETY(self) -> LazyLoadedPrebuiltMetric:
         return self.__getattr__("MULTI_TURN_SAFETY")
 
+    @property
+    def FINAL_RESPONSE_QUALITY(self) -> LazyLoadedPrebuiltMetric:
+        return self.__getattr__("FINAL_RESPONSE_QUALITY")
+
 
 PrebuiltMetric = PrebuiltMetricLoader()
 RubricMetric = PrebuiltMetric
diff --git a/vertexai/_genai/types.py b/vertexai/_genai/types.py

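Rendered table view of the `@@ -25,6 +25,7 @@` hunk. It repeats the `vertexai/_genai/_evals_constant.py` hunk shown earlier in this diff; the `vertexai/_genai/types.py` changes themselves are not shown in this excerpt.

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| `25` | `25` | `"multi_turn_text_quality_v1",` |
| `26` | `26` | `"final_response_match_v2",` |
| `27` | `27` | `"final_response_reference_free_v1",` |
|  | `28` | `+ "final_response_quality_v1",` |
| `28` | `29` | `}` |
| `29` | `30` | `)` |
| `30` | `31` |  |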