Skip to content

Commit 87ea594

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI SDK client(evals) Send agent eval request to EvaluateInstance
PiperOrigin-RevId: 820873097
1 parent dc5dcc7 commit 87ea594

File tree

6 files changed

+381
-74
lines changed

6 files changed

+381
-74
lines changed

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,84 @@ def test_pointwise_metric(client):
9999
assert response.pointwise_metric_result.score is not None
100100

101101

102+
def test_pointwise_metric_with_agent_data(client):
    """Tests the _evaluate_instances method with PointwiseMetricInput and agent_data.

    The pointwise input carries the prompt/response pair as a JSON instance,
    while a separate EvaluationInstance carries the same pair together with
    agent data (one tool declaration, a developer instruction and one event).
    Both are sent in a single request.
    """
    instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
    json_instance = json.dumps(instance_dict)
    agent_data = types.AgentData(
        agent_config=types.AgentConfig(
            tools=types.Tools(
                tool=[
                    genai_types.Tool(
                        function_declarations=[
                            genai_types.FunctionDeclaration(name="search")
                        ]
                    )
                ]
            ),
            developer_instruction=types.InstanceData(text="instruction"),
        ),
        events=types.Events(
            event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
        ),
    )
    instance = types.EvaluationInstance(
        prompt=types.InstanceData(text="What is the capital of France?"),
        response=types.InstanceData(text="Paris"),
        agent_data=agent_data,
    )

    test_input = types.PointwiseMetricInput(
        instance=types.PointwiseMetricInstance(json_instance=json_instance),
        metric_spec=genai_types.PointwiseMetricSpec(
            metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
        ),
    )
    response = client.evals.evaluate_instances(
        metric_config=types._EvaluateInstancesRequestParameters(
            pointwise_metric_input=test_input,
            instance=instance,
        )
    )
    # Only the presence of a result and a score is checked, not the value.
    assert response.pointwise_metric_result is not None
    assert response.pointwise_metric_result.score is not None
143+
144+
145+
def test_predefined_metric_with_agent_data(client):
    """Tests the _evaluate_instances method with predefined metric and agent_data.

    Sends an EvaluationInstance holding a prompt, response, reference and agent
    data (one tool declaration, a developer instruction and one event) to the
    "general_quality_v1" metric.
    """
    agent_data = types.AgentData(
        agent_config=types.AgentConfig(
            tools=types.Tools(
                tool=[
                    genai_types.Tool(
                        function_declarations=[
                            genai_types.FunctionDeclaration(name="search")
                        ]
                    )
                ]
            ),
            developer_instruction=types.InstanceData(text="instruction"),
        ),
        events=types.Events(
            event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
        ),
    )
    instance = types.EvaluationInstance(
        prompt=types.InstanceData(text="What is the capital of France?"),
        response=types.InstanceData(text="Paris"),
        reference=types.InstanceData(text="Paris"),
        agent_data=agent_data,
    )

    response = client.evals.evaluate_instances(
        metric_config=types._EvaluateInstancesRequestParameters(
            metrics=[types.Metric(name="general_quality_v1")],
            instance=instance,
        )
    )
    # Only the presence of a score on the first metric result is checked.
    assert response.metric_results[0].score is not None
178+
179+
102180
def test_pairwise_metric_with_autorater(client):
103181
"""Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig."""
104182

tests/unit/vertexai/genai/test_evals.py

Lines changed: 113 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3288,6 +3288,115 @@ def test_merge_with_invalid_eval_case_type(self):
32883288
)
32893289

32903290

3291+
@pytest.mark.usefixtures("google_auth_mock")
class TestPredefinedMetricHandler:
    """Unit tests for the PredefinedMetricHandler class."""

    @staticmethod
    def _make_eval_case(intermediate_events, agent_info=None):
        """Builds an EvalCase with a fixed "Hello" prompt and "Hi" response.

        Args:
            intermediate_events: Events to attach to the eval case.
            agent_info: Optional agent info to attach; defaults to None.

        Returns:
            The constructed EvalCase.
        """
        return vertexai_genai_types.EvalCase(
            prompt=genai_types.Content(parts=[genai_types.Part(text="Hello")]),
            responses=[
                vertexai_genai_types.ResponseCandidate(
                    response=genai_types.Content(parts=[genai_types.Part(text="Hi")])
                )
            ],
            agent_info=agent_info,
            intermediate_events=intermediate_events,
        )

    def test_eval_case_to_agent_data(self):
        """Agent info and events with content both appear in the AgentData."""
        tool = genai_types.Tool(
            function_declarations=[
                genai_types.FunctionDeclaration(
                    name="get_weather",
                    description="Get weather in a location",
                    parameters={
                        "type": "object",
                        "properties": {"location": {"type": "string"}},
                    },
                )
            ]
        )
        agent_info = vertexai_genai_types.AgentInfo(
            name="agent1",
            instruction="instruction1",
            tool_declarations=[tool],
        )
        intermediate_events = [
            vertexai_genai_types.Event(
                event_id="event1",
                content=genai_types.Content(
                    parts=[genai_types.Part(text="intermediate event")]
                ),
            )
        ]
        eval_case = self._make_eval_case(intermediate_events, agent_info=agent_info)

        agent_data = (
            _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
                eval_case
            )
        )

        assert agent_data.agent_config.developer_instruction.text == "instruction1"
        assert agent_data.agent_config.tools.tool == [tool]
        assert agent_data.events.event[0].parts[0].text == "intermediate event"

    def test_eval_case_to_agent_data_events_only(self):
        """Without agent info, agent_config is None but events are kept."""
        intermediate_events = [
            vertexai_genai_types.Event(
                event_id="event1",
                content=genai_types.Content(
                    parts=[genai_types.Part(text="intermediate event")]
                ),
            )
        ]
        eval_case = self._make_eval_case(intermediate_events)

        agent_data = (
            _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
                eval_case
            )
        )

        assert agent_data.agent_config is None
        assert agent_data.events.event[0].parts[0].text == "intermediate event"

    def test_eval_case_to_agent_data_empty_events(self):
        """An event whose content is None yields no events and empty events_text."""
        intermediate_events = [
            vertexai_genai_types.Event(
                event_id="event1",
                content=None,
            )
        ]
        eval_case = self._make_eval_case(intermediate_events)

        agent_data = (
            _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
                eval_case
            )
        )

        assert agent_data.agent_config is None
        assert agent_data.events is None
        assert not agent_data.events_text
3398+
3399+
32913400
@pytest.mark.usefixtures("google_auth_mock")
32923401
class TestLLMMetricHandlerPayload:
32933402
def setup_method(self):
@@ -3648,7 +3757,9 @@ def test_execute_evaluation_with_agent_info(
36483757
input_dataset = vertexai_genai_types.EvaluationDataset(
36493758
eval_dataset_df=dataset_df
36503759
)
3651-
computation_metric = vertexai_genai_types.Metric(name="exact_match")
3760+
predefined_metric = vertexai_genai_types.PredefinedMetricSpec(
3761+
metric_spec_name="tool_search_validity"
3762+
)
36523763
tool = {
36533764
"function_declarations": [
36543765
{
@@ -3671,7 +3782,7 @@ def test_execute_evaluation_with_agent_info(
36713782
result = _evals_common._execute_evaluation(
36723783
api_client=mock_api_client_fixture,
36733784
dataset=input_dataset,
3674-
metrics=[computation_metric],
3785+
metrics=[predefined_metric],
36753786
agent_info=agent_info,
36763787
)
36773788

vertexai/_genai/_evals_constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"multi_turn_text_quality_v1",
2626
"final_response_match_v2",
2727
"final_response_reference_free_v1",
28+
"final_response_quality_v1",
2829
}
2930
)
3031

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,51 @@ def _content_to_instance_data(
845845
contents=types.InstanceDataContents(contents=[content])
846846
)
847847

848+
@staticmethod
def _eval_case_to_agent_data(
    eval_case: types.EvalCase,
) -> Optional[types.AgentData]:
    """Converts an EvalCase object to an AgentData object.

    Args:
        eval_case: The eval case whose `agent_info` and `intermediate_events`
            are read.

    Returns:
        None when the eval case has neither agent info nor intermediate
        events. Otherwise an AgentData whose `agent_config` is populated only
        if the agent info supplies an instruction or tool declarations, and
        whose `events` holds the content of every intermediate event that has
        content. When no event has content, `events` is left unset and
        `events_text` is set to an empty string instead.
    """
    if not eval_case.agent_info and not eval_case.intermediate_events:
        return None

    agent_config = None
    agent_info = eval_case.agent_info
    if agent_info:
        developer_instruction = (
            types.InstanceData(text=agent_info.instruction)
            if agent_info.instruction
            else None
        )
        tools = (
            types.Tools(tool=agent_info.tool_declarations)
            if agent_info.tool_declarations
            else None
        )
        # Agent info that carries neither field produces no agent config.
        if tools or developer_instruction:
            agent_config = types.AgentConfig(
                tools=tools,
                developer_instruction=developer_instruction,
            )

    # Events without content are dropped rather than forwarded.
    event_contents = [
        event.content
        for event in (eval_case.intermediate_events or [])
        if event.content
    ]
    if event_contents:
        return types.AgentData(
            agent_config=agent_config,
            events=types.Events(event=event_contents),
        )
    return types.AgentData(
        agent_config=agent_config,
        events_text="",
    )
848893
def _build_request_payload(
849894
self, eval_case: types.EvalCase, response_index: int
850895
) -> dict[str, Any]:
@@ -893,7 +938,6 @@ def _build_request_payload(
893938
logger.warning(
894939
f"Unsupported type for context: {type(eval_case.context)}"
895940
)
896-
897941
instance_payload = types.EvaluationInstance(
898942
prompt=prompt_instance_data,
899943
response=PredefinedMetricHandler._content_to_instance_data(
@@ -906,6 +950,7 @@ def _build_request_payload(
906950
if other_data_map
907951
else None
908952
),
953+
agent_data=PredefinedMetricHandler._eval_case_to_agent_data(eval_case),
909954
)
910955

911956
return {
@@ -921,8 +966,7 @@ def get_metric_result(
921966
try:
922967
payload = self._build_request_payload(eval_case, response_index)
923968
api_response = self.module._evaluate_instances(
924-
metrics=[self.metric],
925-
instance=payload.get("instance"),
969+
metrics=[self.metric], instance=payload.get("instance")
926970
)
927971

928972
if (

vertexai/_genai/_evals_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,10 @@ def MULTI_TURN_CHAT_QUALITY(self) -> LazyLoadedPrebuiltMetric:
591591
def MULTI_TURN_SAFETY(self) -> LazyLoadedPrebuiltMetric:
592592
return self.__getattr__("MULTI_TURN_SAFETY")
593593

594+
@property
def FINAL_RESPONSE_QUALITY(self) -> LazyLoadedPrebuiltMetric:
    """Returns the prebuilt metric looked up under "FINAL_RESPONSE_QUALITY"."""
    return self.__getattr__("FINAL_RESPONSE_QUALITY")
597+
594598

595599
PrebuiltMetric = PrebuiltMetricLoader()
596600
RubricMetric = PrebuiltMetric

0 commit comments

Comments
 (0)