Skip to content

Commit d62afc3

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add agent data to EvaluationRun show in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 823667266
1 parent db364ab commit d62afc3

File tree

6 files changed

+313
-186
lines changed

6 files changed

+313
-186
lines changed

tests/unit/vertexai/genai/replays/conftest.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -133,10 +133,14 @@ def _get_replay_id(use_vertex: bool, replays_prefix: str) -> str:
133133
)
134134
EVAL_ITEM_REQUEST_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/request_"
135135
EVAL_ITEM_RESULT_GCS_URI = "gs://lakeyk-limited-bucket/agora_eval_080525/result_"
136+
EVAL_ITEM_REQUEST_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/request_"
137+
EVAL_ITEM_RESULT_GCS_URI_2 = "gs://lakeyk-limited-bucket/eval-data/result_"
136138
EVAL_GCS_URI_ITEMS = {
137139
EVAL_CONFIG_GCS_URI: "test_resources/mock_eval_config.yaml",
138140
EVAL_ITEM_REQUEST_GCS_URI: "test_resources/request_4813679498589372416.json",
139141
EVAL_ITEM_RESULT_GCS_URI: "test_resources/result_1486082323915997184.json",
142+
EVAL_ITEM_REQUEST_GCS_URI_2: "test_resources/request_4813679498589372416.json",
143+
EVAL_ITEM_RESULT_GCS_URI_2: "test_resources/result_1486082323915997184.json",
140144
}
141145

142146

@@ -148,11 +152,15 @@ def _mock_read_file_contents_side_effect(uri: str):
148152
current_dir = os.path.dirname(__file__)
149153
if uri in EVAL_GCS_URI_ITEMS:
150154
local_mock_file_path = os.path.join(current_dir, EVAL_GCS_URI_ITEMS[uri])
151-
elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI):
155+
elif uri.startswith(EVAL_ITEM_REQUEST_GCS_URI) or uri.startswith(
156+
EVAL_ITEM_REQUEST_GCS_URI_2
157+
):
152158
local_mock_file_path = os.path.join(
153159
current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_REQUEST_GCS_URI]
154160
)
155-
elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI):
161+
elif uri.startswith(EVAL_ITEM_RESULT_GCS_URI) or uri.startswith(
162+
EVAL_ITEM_RESULT_GCS_URI_2
163+
):
156164
local_mock_file_path = os.path.join(
157165
current_dir, EVAL_GCS_URI_ITEMS[EVAL_ITEM_RESULT_GCS_URI]
158166
)

tests/unit/vertexai/genai/replays/test_get_evaluation_run.py

Lines changed: 113 additions & 128 deletions
Original file line number | Diff line number | Diff line change
@@ -16,31 +16,34 @@
1616

1717
from tests.unit.vertexai.genai.replays import pytest_helper
1818
from vertexai import types
19+
from google.genai import types as genai_types
1920
import datetime
2021
import pytest
2122

2223

2324
def test_get_eval_run(client):
2425
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
26+
client._api_client._http_options.api_version = "v1beta1"
2527
evaluation_run_name = (
26-
"projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
28+
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
2729
)
2830
evaluation_run = client.evals.get_evaluation_run(
2931
name=evaluation_run_name, include_evaluation_items=True
3032
)
31-
check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
32-
check_run_1957799200510967808_evaluation_item_results(
33+
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
34+
check_run_5133048044039700480_evaluation_item_results(
3335
client, evaluation_run, evaluation_run_name
3436
)
3537

3638

3739
def test_get_eval_run_include_evaluation_items_false(client):
3840
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
41+
client._api_client._http_options.api_version = "v1beta1"
3942
evaluation_run_name = (
40-
"projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
43+
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
4144
)
4245
evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
43-
check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
46+
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
4447
assert evaluation_run.evaluation_item_results is None
4548

4649

@@ -99,158 +102,142 @@ def test_get_eval_run_eval_set_source(client):
99102
@pytest.mark.asyncio
100103
async def test_get_eval_run_async(client):
101104
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
102-
eval_run_id = "1957799200510967808"
105+
client._api_client._http_options.api_version = "v1beta1"
106+
eval_run_id = "5133048044039700480"
103107
evaluation_run_name = (
104108
f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
105109
)
106110
evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
107-
check_run_1957799200510967808(client, evaluation_run, evaluation_run_name)
111+
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
108112
assert evaluation_run.evaluation_item_results is None
109113

110114

111-
def check_run_1957799200510967808(
115+
def check_run_5133048044039700480(
112116
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
113117
):
114118
assert isinstance(evaluation_run, types.EvaluationRun)
115119
assert evaluation_run.name == evaluation_run_name
116-
assert evaluation_run.display_name == "test2"
117-
assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"}
120+
assert evaluation_run.display_name == "sdk-test-1"
121+
assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"}
118122
assert evaluation_run.create_time == datetime.datetime(
119-
2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc
123+
2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc
120124
)
121125
assert evaluation_run.completion_time == datetime.datetime(
122-
2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc
126+
2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc
123127
)
124128
assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
125129
assert evaluation_run.evaluation_set_snapshot == (
126-
"projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
130+
"projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
127131
)
128-
assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet(
129-
uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
130-
prompt_column="request",
131-
candidate_response_columns={
132-
"baseline_model_response": "baseline_model_response",
133-
"checkpoint_1": "checkpoint_1",
134-
"checkpoint_2": "checkpoint_2",
135-
},
132+
assert (
133+
evaluation_run.data_source.evaluation_set
134+
== "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
136135
)
137136
assert evaluation_run.evaluation_run_results.evaluation_set == (
138-
"projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
137+
"projects/503583131166/locations/us-central1/evaluationSets/129513673658990592"
139138
)
140139
assert evaluation_run.inference_configs == {
141-
"checkpoint_1": types.EvaluationRunInferenceConfig(
142-
model="projects/503583131166/locations/us-central1/endpoints/9030177948249882624"
143-
),
144-
"checkpoint_2": types.EvaluationRunInferenceConfig(
145-
model="projects/503583131166/locations/us-central1/endpoints/7751155654076661760"
140+
"gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig(
141+
agent_config=types.EvaluationRunAgentConfig(
142+
developer_instruction={
143+
"parts": [{"text": "example agent developer instruction"}]
144+
},
145+
tools=[
146+
genai_types.Tool(
147+
function_declarations=[
148+
genai_types.FunctionDeclaration(
149+
name="check_chime",
150+
description="Check chime.",
151+
parameters={
152+
"type": "OBJECT",
153+
"properties": {
154+
"nums": {
155+
"type": "STRING",
156+
"description": "List of numbers to be verified.",
157+
}
158+
},
159+
"required": ["nums"],
160+
},
161+
),
162+
],
163+
)
164+
],
165+
)
146166
),
147167
}
148168
assert evaluation_run.evaluation_run_results.summary_metrics == (
149169
types.SummaryMetric(
150170
metrics={
151-
"checkpoint_1/user_defined/MODE": 5,
152-
"checkpoint_2/universal/P90": 1,
153-
"gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249,
154-
"gemini-2.0-flash-001@default/user_defined/P90": 5,
155-
"gemini-2.0-flash-001@default/universal/VARIANCE": 0.03146487552180889,
156-
"gemini-2.0-flash-001@default/user_defined/P95": 5,
157-
"checkpoint_1/universal/MINIMUM": 0.8571428656578064,
158-
"checkpoint_1/universal/VARIANCE": 0.0015452162403157982,
159-
"gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855,
160-
"checkpoint_2/user_defined/P95": 5,
161-
"checkpoint_2/universal/MODE": 1,
162-
"checkpoint_2/user_defined/P90": 5,
163-
"checkpoint_2/universal/P99": 1,
171+
"gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
172+
"gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
173+
"gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
174+
"gemini-2.0-flash-001@default/universal/P90": 1,
175+
"gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
176+
"gemini-2.0-flash-001@default/universal/P95": 1,
177+
"gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
178+
"gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
179+
"gemini-2.0-flash-001@default/universal/MEDIAN": 1,
180+
"gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
181+
"gemini-2.0-flash-001@default/universal/MODE": 1,
182+
"gemini-2.0-flash-001@default/safety_v1/MODE": 1,
183+
"gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
184+
"gemini-2.0-flash-001@default/safety_v1/P90": 1,
185+
"gemini-2.0-flash-001@default/safety_v1/P95": 1,
186+
"gemini-2.0-flash-001@default/universal/P99": 1,
187+
"gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
188+
"gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
164189
"gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
165-
"checkpoint_2/universal/P95": 1,
166-
"checkpoint_2/user_defined/P99": 5,
167-
"checkpoint_2/universal/MINIMUM": 0.7777777910232544,
168-
"gemini-2.0-flash-001@default/universal/P90": 0.8777777791023255,
169-
"checkpoint_1/universal/AVERAGE": 0.986633250587865,
170-
"checkpoint_1/universal/MAXIMUM": 1,
171-
"checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714,
172-
"gemini-2.0-flash-001@default/universal/P95": 0.9000000059604645,
173-
"gemini-2.0-flash-001@default/user_defined/MAXIMUM": 5,
174-
"gemini-2.0-flash-001@default/user_defined/MINIMUM": 3,
175-
"gemini-2.0-flash-001@default/user_defined/VARIANCE": 0.4044321329639886,
176-
"checkpoint_2/user_defined/MAXIMUM": 5,
177-
"checkpoint_1/universal/MEDIAN": 1,
178-
"gemini-2.0-flash-001@default/universal/MEDIAN": 0.7142857313156128,
179-
"gemini-2.0-flash-001@default/user_defined/AVERAGE": 4.736842105263158,
180-
"gemini-2.0-flash-001@default/user_defined/MEDIAN": 5,
181-
"checkpoint_2/user_defined/AVERAGE": 5,
182-
"checkpoint_2/user_defined/MEDIAN": 5,
183-
"checkpoint_2/user_defined/STANDARD_DEVIATION": 0,
184-
"checkpoint_2/universal/MAXIMUM": 1,
185-
"checkpoint_1/universal/MODE": 1,
186-
"checkpoint_2/user_defined/MINIMUM": 5,
187-
"checkpoint_1/user_defined/VARIANCE": 0,
188-
"checkpoint_2/universal/VARIANCE": 0.005771725970062436,
189-
"checkpoint_2/universal/AVERAGE": 0.9438178790243048,
190-
"checkpoint_1/user_defined/MINIMUM": 5,
191-
"gemini-2.0-flash-001@default/universal/P99": 0.9800000011920929,
192-
"gemini-2.0-flash-001@default/universal/MINIMUM": 0.2857142984867096,
193-
"checkpoint_2/user_defined/VARIANCE": 0,
194-
"checkpoint_1/user_defined/MEDIAN": 5,
195-
"checkpoint_2/universal/STANDARD_DEVIATION": 0.07597187617837561,
196-
"checkpoint_1/user_defined/AVERAGE": 5,
197-
"checkpoint_1/user_defined/MAXIMUM": 5,
198-
"gemini-2.0-flash-001@default/user_defined/MODE": 5,
199-
"checkpoint_1/user_defined/P95": 5,
200-
"checkpoint_1/universal/P99": 1,
201-
"checkpoint_1/user_defined/P90": 5,
202-
"checkpoint_2/universal/MEDIAN": 1,
203-
"checkpoint_1/universal/P95": 1,
204-
"checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
205-
"gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION": 0.6359497880839245,
206-
"checkpoint_1/user_defined/P99": 5,
207-
"gemini-2.0-flash-001@default/universal/MODE": [
208-
0.75,
209-
0.8571428656578064,
210-
],
211-
"checkpoint_2/user_defined/MODE": 5,
212-
"checkpoint_1/universal/P90": 1,
213-
"gemini-2.0-flash-001@default/user_defined/P99": 5,
190+
"gemini-2.0-flash-001@default/safety_v1/P99": 1,
214191
},
215-
total_items=19,
192+
total_items=3,
216193
)
217194
)
218195
assert evaluation_run.error is None
219196

220197

221-
def check_run_1957799200510967808_evaluation_item_results(
198+
def check_run_5133048044039700480_evaluation_item_results(
222199
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
223200
):
224201
eval_result = evaluation_run.evaluation_item_results
225202
assert isinstance(eval_result, types.EvaluationResult)
226203
assert eval_result.summary_metrics == [
227204
types.AggregatedMetricResult(
228-
metric_name="checkpoint_1/universal",
229-
mean_score=0.986633250587865,
230-
stdev_score=0.0393092386127714,
205+
metric_name="safety_v1",
206+
mean_score=0.7888888915379842,
207+
stdev_score=0.2991758188061675,
231208
),
232209
types.AggregatedMetricResult(
233-
metric_name="checkpoint_2/universal",
234-
mean_score=0.9438178790243048,
235-
stdev_score=0.07597187617837561,
236-
),
237-
types.AggregatedMetricResult(
238-
metric_name="gemini-2.0-flash-001@default/universal",
239-
mean_score=0.6943817985685249,
240-
stdev_score=0.17738341388587855,
241-
),
242-
types.AggregatedMetricResult(
243-
metric_name="checkpoint_1/user_defined", mean_score=5, stdev_score=0
244-
),
245-
types.AggregatedMetricResult(
246-
metric_name="checkpoint_2/user_defined", mean_score=5, stdev_score=0
247-
),
248-
types.AggregatedMetricResult(
249-
metric_name="gemini-2.0-flash-001@default/user_defined",
250-
mean_score=4.736842105263158,
251-
stdev_score=0.6359497880839245,
210+
metric_name="universal",
211+
mean_score=0.7888888915379842,
212+
stdev_score=0.2991758188061675,
252213
),
253214
]
215+
# Check the agent info.
216+
assert eval_result.agent_info == types.evals.AgentInfo(
217+
name="gemini-2.0-flash-001@default",
218+
instruction="example agent developer instruction",
219+
description=None,
220+
tool_declarations=[
221+
genai_types.Tool(
222+
function_declarations=[
223+
genai_types.FunctionDeclaration(
224+
name="check_chime",
225+
description="Check chime.",
226+
parameters={
227+
"type": "OBJECT",
228+
"properties": {
229+
"nums": {
230+
"type": "STRING",
231+
"description": "List of numbers to be verified.",
232+
}
233+
},
234+
"required": ["nums"],
235+
},
236+
),
237+
],
238+
)
239+
],
240+
)
254241
# Check the first eval case result.
255242
eval_case_result = eval_result.eval_case_results[0]
256243
assert isinstance(eval_case_result, types.EvalCaseResult)
@@ -264,26 +251,24 @@ def check_run_1957799200510967808_evaluation_item_results(
264251
assert universal_metric_result.explanation is None
265252
# Check the first rubric verdict.
266253
rubric_verdict_0 = universal_metric_result.rubric_verdicts[0]
267-
assert rubric_verdict_0 == (
268-
types.RubricVerdict(
269-
evaluated_rubric=types.Rubric(
270-
content=types.RubricContent(
271-
property=types.RubricContentProperty(
272-
description="The response is in English."
273-
)
274-
),
275-
importance="HIGH",
276-
type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
277-
),
278-
reasoning=("The entire response is written in the English language."),
279-
verdict=True,
280-
)
254+
assert isinstance(rubric_verdict_0, types.RubricVerdict)
255+
assert rubric_verdict_0.evaluated_rubric == types.Rubric(
256+
content=types.RubricContent(
257+
property=types.RubricContentProperty(
258+
description="The response is in English."
259+
)
260+
),
261+
importance="HIGH",
262+
type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
281263
)
264+
assert rubric_verdict_0.reasoning is not None
265+
assert rubric_verdict_0.verdict is True
282266
# Check the first evaluation dataset.
283267
eval_dataset = eval_result.evaluation_dataset[0]
284268
assert isinstance(eval_dataset, types.EvaluationDataset)
285269
assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default"
286-
assert eval_dataset.eval_dataset_df.shape == (19, 3)
270+
assert eval_dataset.eval_dataset_df.shape[0] == 3
271+
assert eval_dataset.eval_dataset_df.shape[1] > 3
287272

288273

289274
pytestmark = pytest_helper.setup(

0 commit comments

Comments
 (0)