1616
1717from tests .unit .vertexai .genai .replays import pytest_helper
1818from vertexai import types
19+ from google .genai import types as genai_types
1920import datetime
2021import pytest
2122
2223
2324def test_get_eval_run (client ):
2425 """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
26+ client ._api_client ._http_options .api_version = "v1beta1"
2527 evaluation_run_name = (
26- "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808 "
28+ "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480 "
2729 )
2830 evaluation_run = client .evals .get_evaluation_run (
2931 name = evaluation_run_name , include_evaluation_items = True
3032 )
31- check_run_1957799200510967808 (client , evaluation_run , evaluation_run_name )
32- check_run_1957799200510967808_evaluation_item_results (
33+ check_run_5133048044039700480 (client , evaluation_run , evaluation_run_name )
34+ check_run_5133048044039700480_evaluation_item_results (
3335 client , evaluation_run , evaluation_run_name
3436 )
3537
3638
3739def test_get_eval_run_include_evaluation_items_false (client ):
3840 """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
41+ client ._api_client ._http_options .api_version = "v1beta1"
3942 evaluation_run_name = (
40- "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808 "
43+ "projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480 "
4144 )
4245 evaluation_run = client .evals .get_evaluation_run (name = evaluation_run_name )
43- check_run_1957799200510967808 (client , evaluation_run , evaluation_run_name )
46+ check_run_5133048044039700480 (client , evaluation_run , evaluation_run_name )
4447 assert evaluation_run .evaluation_item_results is None
4548
4649
@@ -99,158 +102,142 @@ def test_get_eval_run_eval_set_source(client):
99102@pytest .mark .asyncio
100103async def test_get_eval_run_async (client ):
101104 """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
102- eval_run_id = "1957799200510967808"
105+ client ._api_client ._http_options .api_version = "v1beta1"
106+ eval_run_id = "5133048044039700480"
103107 evaluation_run_name = (
104108 f"projects/503583131166/locations/us-central1/evaluationRuns/{ eval_run_id } "
105109 )
106110 evaluation_run = await client .aio .evals .get_evaluation_run (name = eval_run_id )
107- check_run_1957799200510967808 (client , evaluation_run , evaluation_run_name )
111+ check_run_5133048044039700480 (client , evaluation_run , evaluation_run_name )
108112 assert evaluation_run .evaluation_item_results is None
109113
110114
111- def check_run_1957799200510967808 (
115+ def check_run_5133048044039700480 (
112116 client , evaluation_run : types .EvaluationRun , evaluation_run_name : str
113117):
114118 assert isinstance (evaluation_run , types .EvaluationRun )
115119 assert evaluation_run .name == evaluation_run_name
116- assert evaluation_run .display_name == "test2 "
117- assert evaluation_run .metadata == {"pipeline_id" : "4460531348888616960 " }
120+ assert evaluation_run .display_name == "sdk-test-1 "
121+ assert evaluation_run .metadata == {"pipeline_id" : "4868043098678099968 " }
118122 assert evaluation_run .create_time == datetime .datetime (
119- 2025 , 9 , 8 , 20 , 55 , 41 , 833176 , tzinfo = datetime .timezone .utc
123+ 2025 , 10 , 21 , 19 , 25 , 58 , 669441 , tzinfo = datetime .timezone .utc
120124 )
121125 assert evaluation_run .completion_time == datetime .datetime (
122- 2025 , 9 , 8 , 20 , 56 , 13 , 492971 , tzinfo = datetime .timezone .utc
126+ 2025 , 10 , 21 , 19 , 26 , 15 , 855568 , tzinfo = datetime .timezone .utc
123127 )
124128 assert evaluation_run .state == types .EvaluationRunState .SUCCEEDED
125129 assert evaluation_run .evaluation_set_snapshot == (
126- "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200 "
130+ "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184 "
127131 )
128- assert evaluation_run .data_source .bigquery_request_set == types .BigQueryRequestSet (
129- uri = "bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b" ,
130- prompt_column = "request" ,
131- candidate_response_columns = {
132- "baseline_model_response" : "baseline_model_response" ,
133- "checkpoint_1" : "checkpoint_1" ,
134- "checkpoint_2" : "checkpoint_2" ,
135- },
132+ assert (
133+ evaluation_run .data_source .evaluation_set
134+ == "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
136135 )
137136 assert evaluation_run .evaluation_run_results .evaluation_set == (
138- "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120 "
137+ "projects/503583131166/locations/us-central1/evaluationSets/129513673658990592 "
139138 )
140139 assert evaluation_run .inference_configs == {
141- "checkpoint_1" : types .EvaluationRunInferenceConfig (
142- model = "projects/503583131166/locations/us-central1/endpoints/9030177948249882624"
143- ),
144- "checkpoint_2" : types .EvaluationRunInferenceConfig (
145- model = "projects/503583131166/locations/us-central1/endpoints/7751155654076661760"
140+ "gemini-2.0-flash-001@default" : types .EvaluationRunInferenceConfig (
141+ agent_config = types .EvaluationRunAgentConfig (
142+ developer_instruction = {
143+ "parts" : [{"text" : "example agent developer instruction" }]
144+ },
145+ tools = [
146+ genai_types .Tool (
147+ function_declarations = [
148+ genai_types .FunctionDeclaration (
149+ name = "check_chime" ,
150+ description = "Check chime." ,
151+ parameters = {
152+ "type" : "OBJECT" ,
153+ "properties" : {
154+ "nums" : {
155+ "type" : "STRING" ,
156+ "description" : "List of numbers to be verified." ,
157+ }
158+ },
159+ "required" : ["nums" ],
160+ },
161+ ),
162+ ],
163+ )
164+ ],
165+ )
146166 ),
147167 }
148168 assert evaluation_run .evaluation_run_results .summary_metrics == (
149169 types .SummaryMetric (
150170 metrics = {
151- "checkpoint_1/user_defined/MODE" : 5 ,
152- "checkpoint_2/universal/P90" : 1 ,
153- "gemini-2.0-flash-001@default/universal/AVERAGE" : 0.6943817985685249 ,
154- "gemini-2.0-flash-001@default/user_defined/P90" : 5 ,
155- "gemini-2.0-flash-001@default/universal/VARIANCE" : 0.03146487552180889 ,
156- "gemini-2.0-flash-001@default/user_defined/P95" : 5 ,
157- "checkpoint_1/universal/MINIMUM" : 0.8571428656578064 ,
158- "checkpoint_1/universal/VARIANCE" : 0.0015452162403157982 ,
159- "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION" : 0.17738341388587855 ,
160- "checkpoint_2/user_defined/P95" : 5 ,
161- "checkpoint_2/universal/MODE" : 1 ,
162- "checkpoint_2/user_defined/P90" : 5 ,
163- "checkpoint_2/universal/P99" : 1 ,
171+ "gemini-2.0-flash-001@default/safety_v1/VARIANCE" : 0.08950617055834077 ,
172+ "gemini-2.0-flash-001@default/safety_v1/MAXIMUM" : 1 ,
173+ "gemini-2.0-flash-001@default/universal/AVERAGE" : 0.7888888915379842 ,
174+ "gemini-2.0-flash-001@default/universal/P90" : 1 ,
175+ "gemini-2.0-flash-001@default/safety_v1/MEDIAN" : 1 ,
176+ "gemini-2.0-flash-001@default/universal/P95" : 1 ,
177+ "gemini-2.0-flash-001@default/universal/VARIANCE" : 0.08950617055834077 ,
178+ "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION" : 0.2991758188061675 ,
179+ "gemini-2.0-flash-001@default/universal/MEDIAN" : 1 ,
180+ "gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION" : 0.2991758188061675 ,
181+ "gemini-2.0-flash-001@default/universal/MODE" : 1 ,
182+ "gemini-2.0-flash-001@default/safety_v1/MODE" : 1 ,
183+ "gemini-2.0-flash-001@default/safety_v1/MINIMUM" : 0.3333333432674408 ,
184+ "gemini-2.0-flash-001@default/safety_v1/P90" : 1 ,
185+ "gemini-2.0-flash-001@default/safety_v1/P95" : 1 ,
186+ "gemini-2.0-flash-001@default/universal/P99" : 1 ,
187+ "gemini-2.0-flash-001@default/safety_v1/AVERAGE" : 0.7888888915379842 ,
188+ "gemini-2.0-flash-001@default/universal/MINIMUM" : 0.3333333432674408 ,
164189 "gemini-2.0-flash-001@default/universal/MAXIMUM" : 1 ,
165- "checkpoint_2/universal/P95" : 1 ,
166- "checkpoint_2/user_defined/P99" : 5 ,
167- "checkpoint_2/universal/MINIMUM" : 0.7777777910232544 ,
168- "gemini-2.0-flash-001@default/universal/P90" : 0.8777777791023255 ,
169- "checkpoint_1/universal/AVERAGE" : 0.986633250587865 ,
170- "checkpoint_1/universal/MAXIMUM" : 1 ,
171- "checkpoint_1/universal/STANDARD_DEVIATION" : 0.0393092386127714 ,
172- "gemini-2.0-flash-001@default/universal/P95" : 0.9000000059604645 ,
173- "gemini-2.0-flash-001@default/user_defined/MAXIMUM" : 5 ,
174- "gemini-2.0-flash-001@default/user_defined/MINIMUM" : 3 ,
175- "gemini-2.0-flash-001@default/user_defined/VARIANCE" : 0.4044321329639886 ,
176- "checkpoint_2/user_defined/MAXIMUM" : 5 ,
177- "checkpoint_1/universal/MEDIAN" : 1 ,
178- "gemini-2.0-flash-001@default/universal/MEDIAN" : 0.7142857313156128 ,
179- "gemini-2.0-flash-001@default/user_defined/AVERAGE" : 4.736842105263158 ,
180- "gemini-2.0-flash-001@default/user_defined/MEDIAN" : 5 ,
181- "checkpoint_2/user_defined/AVERAGE" : 5 ,
182- "checkpoint_2/user_defined/MEDIAN" : 5 ,
183- "checkpoint_2/user_defined/STANDARD_DEVIATION" : 0 ,
184- "checkpoint_2/universal/MAXIMUM" : 1 ,
185- "checkpoint_1/universal/MODE" : 1 ,
186- "checkpoint_2/user_defined/MINIMUM" : 5 ,
187- "checkpoint_1/user_defined/VARIANCE" : 0 ,
188- "checkpoint_2/universal/VARIANCE" : 0.005771725970062436 ,
189- "checkpoint_2/universal/AVERAGE" : 0.9438178790243048 ,
190- "checkpoint_1/user_defined/MINIMUM" : 5 ,
191- "gemini-2.0-flash-001@default/universal/P99" : 0.9800000011920929 ,
192- "gemini-2.0-flash-001@default/universal/MINIMUM" : 0.2857142984867096 ,
193- "checkpoint_2/user_defined/VARIANCE" : 0 ,
194- "checkpoint_1/user_defined/MEDIAN" : 5 ,
195- "checkpoint_2/universal/STANDARD_DEVIATION" : 0.07597187617837561 ,
196- "checkpoint_1/user_defined/AVERAGE" : 5 ,
197- "checkpoint_1/user_defined/MAXIMUM" : 5 ,
198- "gemini-2.0-flash-001@default/user_defined/MODE" : 5 ,
199- "checkpoint_1/user_defined/P95" : 5 ,
200- "checkpoint_1/universal/P99" : 1 ,
201- "checkpoint_1/user_defined/P90" : 5 ,
202- "checkpoint_2/universal/MEDIAN" : 1 ,
203- "checkpoint_1/universal/P95" : 1 ,
204- "checkpoint_1/user_defined/STANDARD_DEVIATION" : 0 ,
205- "gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION" : 0.6359497880839245 ,
206- "checkpoint_1/user_defined/P99" : 5 ,
207- "gemini-2.0-flash-001@default/universal/MODE" : [
208- 0.75 ,
209- 0.8571428656578064 ,
210- ],
211- "checkpoint_2/user_defined/MODE" : 5 ,
212- "checkpoint_1/universal/P90" : 1 ,
213- "gemini-2.0-flash-001@default/user_defined/P99" : 5 ,
190+ "gemini-2.0-flash-001@default/safety_v1/P99" : 1 ,
214191 },
215- total_items = 19 ,
192+ total_items = 3 ,
216193 )
217194 )
218195 assert evaluation_run .error is None
219196
220197
221- def check_run_1957799200510967808_evaluation_item_results (
198+ def check_run_5133048044039700480_evaluation_item_results (
222199 client , evaluation_run : types .EvaluationRun , evaluation_run_name : str
223200):
224201 eval_result = evaluation_run .evaluation_item_results
225202 assert isinstance (eval_result , types .EvaluationResult )
226203 assert eval_result .summary_metrics == [
227204 types .AggregatedMetricResult (
228- metric_name = "checkpoint_1/universal " ,
229- mean_score = 0.986633250587865 ,
230- stdev_score = 0.0393092386127714 ,
205+ metric_name = "safety_v1 " ,
206+ mean_score = 0.7888888915379842 ,
207+ stdev_score = 0.2991758188061675 ,
231208 ),
232209 types .AggregatedMetricResult (
233- metric_name = "checkpoint_2/universal" ,
234- mean_score = 0.9438178790243048 ,
235- stdev_score = 0.07597187617837561 ,
236- ),
237- types .AggregatedMetricResult (
238- metric_name = "gemini-2.0-flash-001@default/universal" ,
239- mean_score = 0.6943817985685249 ,
240- stdev_score = 0.17738341388587855 ,
241- ),
242- types .AggregatedMetricResult (
243- metric_name = "checkpoint_1/user_defined" , mean_score = 5 , stdev_score = 0
244- ),
245- types .AggregatedMetricResult (
246- metric_name = "checkpoint_2/user_defined" , mean_score = 5 , stdev_score = 0
247- ),
248- types .AggregatedMetricResult (
249- metric_name = "gemini-2.0-flash-001@default/user_defined" ,
250- mean_score = 4.736842105263158 ,
251- stdev_score = 0.6359497880839245 ,
210+ metric_name = "universal" ,
211+ mean_score = 0.7888888915379842 ,
212+ stdev_score = 0.2991758188061675 ,
252213 ),
253214 ]
215+ # Check the agent info.
216+ assert eval_result .agent_info == types .evals .AgentInfo (
217+ name = "gemini-2.0-flash-001@default" ,
218+ instruction = "example agent developer instruction" ,
219+ description = None ,
220+ tool_declarations = [
221+ genai_types .Tool (
222+ function_declarations = [
223+ genai_types .FunctionDeclaration (
224+ name = "check_chime" ,
225+ description = "Check chime." ,
226+ parameters = {
227+ "type" : "OBJECT" ,
228+ "properties" : {
229+ "nums" : {
230+ "type" : "STRING" ,
231+ "description" : "List of numbers to be verified." ,
232+ }
233+ },
234+ "required" : ["nums" ],
235+ },
236+ ),
237+ ],
238+ )
239+ ],
240+ )
254241 # Check the first eval case result.
255242 eval_case_result = eval_result .eval_case_results [0 ]
256243 assert isinstance (eval_case_result , types .EvalCaseResult )
@@ -264,26 +251,24 @@ def check_run_1957799200510967808_evaluation_item_results(
264251 assert universal_metric_result .explanation is None
265252 # Check the first rubric verdict.
266253 rubric_verdict_0 = universal_metric_result .rubric_verdicts [0 ]
267- assert rubric_verdict_0 == (
268- types .RubricVerdict (
269- evaluated_rubric = types .Rubric (
270- content = types .RubricContent (
271- property = types .RubricContentProperty (
272- description = "The response is in English."
273- )
274- ),
275- importance = "HIGH" ,
276- type = "LANGUAGE:PRIMARY_RESPONSE_LANGUAGE" ,
277- ),
278- reasoning = ("The entire response is written in the English language." ),
279- verdict = True ,
280- )
254+ assert isinstance (rubric_verdict_0 , types .RubricVerdict )
255+ assert rubric_verdict_0 .evaluated_rubric == types .Rubric (
256+ content = types .RubricContent (
257+ property = types .RubricContentProperty (
258+ description = "The response is in English."
259+ )
260+ ),
261+ importance = "HIGH" ,
262+ type = "LANGUAGE:PRIMARY_RESPONSE_LANGUAGE" ,
281263 )
264+ assert rubric_verdict_0 .reasoning is not None
265+ assert rubric_verdict_0 .verdict is True
282266 # Check the first evaluation dataset.
283267 eval_dataset = eval_result .evaluation_dataset [0 ]
284268 assert isinstance (eval_dataset , types .EvaluationDataset )
285269 assert eval_dataset .candidate_name == "gemini-2.0-flash-001@default"
286- assert eval_dataset .eval_dataset_df .shape == (19 , 3 )
270+ assert eval_dataset .eval_dataset_df .shape [0 ] == 3
271+ assert eval_dataset .eval_dataset_df .shape [1 ] > 3
287272
288273
289274pytestmark = pytest_helper .setup (
0 commit comments