@@ -26,30 +26,7 @@ def test_get_eval_run(client):
2626 "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
2727 )
2828 evaluation_run = client .evals .get_evaluation_run (name = evaluation_run_name )
29- assert isinstance (evaluation_run , types .EvaluationRun )
30- assert evaluation_run .name == evaluation_run_name
31- assert evaluation_run .display_name == "test2"
32- assert evaluation_run .metadata == {"pipeline_id" : "4460531348888616960" }
33- assert evaluation_run .create_time == datetime .datetime (
34- 2025 , 9 , 8 , 20 , 55 , 41 , 833176 , tzinfo = datetime .timezone .utc
35- )
36- assert evaluation_run .completion_time == datetime .datetime (
37- 2025 , 9 , 8 , 20 , 56 , 13 , 492971 , tzinfo = datetime .timezone .utc
38- )
39- assert evaluation_run .state == types .EvaluationRunState .SUCCEEDED
40- assert evaluation_run .evaluation_set_snapshot == (
41- "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
42- )
43- assert evaluation_run .data_source .bigquery_request_set == types .BigQueryRequestSet (
44- uri = "bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b" ,
45- prompt_column = "request" ,
46- candidate_response_columns = {
47- "baseline_model_response" : "baseline_model_response" ,
48- "checkpoint_1" : "checkpoint_1" ,
49- "checkpoint_2" : "checkpoint_2" ,
50- },
51- )
52- assert evaluation_run .error is None
29+ check_run_1957799200510967808 (evaluation_run , evaluation_run_name )
5330
5431
5532def test_get_eval_run_bq_source (client ):
@@ -104,13 +81,30 @@ def test_get_eval_run_eval_set_source(client):
10481async def test_get_eval_run_async (client ):
10582 """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
10683 eval_run_id = "1957799200510967808"
107- eval_run_name = (
84+ evaluation_run_name = (
10885 f"projects/503583131166/locations/us-central1/evaluationRuns/{ eval_run_id } "
10986 )
11087 evaluation_run = await client .aio .evals .get_evaluation_run (name = eval_run_id )
88+ check_run_1957799200510967808 (evaluation_run , evaluation_run_name )
89+
90+
91+ def check_run_1957799200510967808 (
92+ evaluation_run : types .EvaluationRun , evaluation_run_name : str
93+ ):
11194 assert isinstance (evaluation_run , types .EvaluationRun )
112- assert evaluation_run .name == eval_run_name
95+ assert evaluation_run .name == evaluation_run_name
11396 assert evaluation_run .display_name == "test2"
97+ assert evaluation_run .metadata == {"pipeline_id" : "4460531348888616960" }
98+ assert evaluation_run .create_time == datetime .datetime (
99+ 2025 , 9 , 8 , 20 , 55 , 41 , 833176 , tzinfo = datetime .timezone .utc
100+ )
101+ assert evaluation_run .completion_time == datetime .datetime (
102+ 2025 , 9 , 8 , 20 , 56 , 13 , 492971 , tzinfo = datetime .timezone .utc
103+ )
104+ assert evaluation_run .state == types .EvaluationRunState .SUCCEEDED
105+ assert evaluation_run .evaluation_set_snapshot == (
106+ "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
107+ )
114108 assert evaluation_run .data_source .bigquery_request_set == types .BigQueryRequestSet (
115109 uri = "bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b" ,
116110 prompt_column = "request" ,
@@ -120,6 +114,80 @@ async def test_get_eval_run_async(client):
120114 "checkpoint_2" : "checkpoint_2" ,
121115 },
122116 )
117+ assert evaluation_run .evaluation_results .evaluation_set == (
118+ "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
119+ )
120+ assert evaluation_run .evaluation_results .summary_metrics == (
121+ types .SummaryMetric (
122+ metrics = {
123+ "checkpoint_1/user_defined/MODE" : 5 ,
124+ "checkpoint_2/universal/P90" : 1 ,
125+ "gemini-2.0-flash-001@default/universal/AVERAGE" : 0.6943817985685249 ,
126+ "gemini-2.0-flash-001@default/user_defined/P90" : 5 ,
127+ "gemini-2.0-flash-001@default/universal/VARIANCE" : 0.03146487552180889 ,
128+ "gemini-2.0-flash-001@default/user_defined/P95" : 5 ,
129+ "checkpoint_1/universal/MINIMUM" : 0.8571428656578064 ,
130+ "checkpoint_1/universal/VARIANCE" : 0.0015452162403157982 ,
131+ "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION" : 0.17738341388587855 ,
132+ "checkpoint_2/user_defined/P95" : 5 ,
133+ "checkpoint_2/universal/MODE" : 1 ,
134+ "checkpoint_2/user_defined/P90" : 5 ,
135+ "checkpoint_2/universal/P99" : 1 ,
136+ "gemini-2.0-flash-001@default/universal/MAXIMUM" : 1 ,
137+ "checkpoint_2/universal/P95" : 1 ,
138+ "checkpoint_2/user_defined/P99" : 5 ,
139+ "checkpoint_2/universal/MINIMUM" : 0.7777777910232544 ,
140+ "gemini-2.0-flash-001@default/universal/P90" : 0.8777777791023255 ,
141+ "checkpoint_1/universal/AVERAGE" : 0.986633250587865 ,
142+ "checkpoint_1/universal/MAXIMUM" : 1 ,
143+ "checkpoint_1/universal/STANDARD_DEVIATION" : 0.0393092386127714 ,
144+ "gemini-2.0-flash-001@default/universal/P95" : 0.9000000059604645 ,
145+ "gemini-2.0-flash-001@default/user_defined/MAXIMUM" : 5 ,
146+ "gemini-2.0-flash-001@default/user_defined/MINIMUM" : 3 ,
147+ "gemini-2.0-flash-001@default/user_defined/VARIANCE" : 0.4044321329639886 ,
148+ "checkpoint_2/user_defined/MAXIMUM" : 5 ,
149+ "checkpoint_1/universal/MEDIAN" : 1 ,
150+ "gemini-2.0-flash-001@default/universal/MEDIAN" : 0.7142857313156128 ,
151+ "gemini-2.0-flash-001@default/user_defined/AVERAGE" : 4.736842105263158 ,
152+ "gemini-2.0-flash-001@default/user_defined/MEDIAN" : 5 ,
153+ "checkpoint_2/user_defined/AVERAGE" : 5 ,
154+ "checkpoint_2/user_defined/MEDIAN" : 5 ,
155+ "checkpoint_2/user_defined/STANDARD_DEVIATION" : 0 ,
156+ "checkpoint_2/universal/MAXIMUM" : 1 ,
157+ "checkpoint_1/universal/MODE" : 1 ,
158+ "checkpoint_2/user_defined/MINIMUM" : 5 ,
159+ "checkpoint_1/user_defined/VARIANCE" : 0 ,
160+ "checkpoint_2/universal/VARIANCE" : 0.005771725970062436 ,
161+ "checkpoint_2/universal/AVERAGE" : 0.9438178790243048 ,
162+ "checkpoint_1/user_defined/MINIMUM" : 5 ,
163+ "gemini-2.0-flash-001@default/universal/P99" : 0.9800000011920929 ,
164+ "gemini-2.0-flash-001@default/universal/MINIMUM" : 0.2857142984867096 ,
165+ "checkpoint_2/user_defined/VARIANCE" : 0 ,
166+ "checkpoint_1/user_defined/MEDIAN" : 5 ,
167+ "checkpoint_2/universal/STANDARD_DEVIATION" : 0.07597187617837561 ,
168+ "checkpoint_1/user_defined/AVERAGE" : 5 ,
169+ "checkpoint_1/user_defined/MAXIMUM" : 5 ,
170+ "gemini-2.0-flash-001@default/user_defined/MODE" : 5 ,
171+ "checkpoint_1/user_defined/P95" : 5 ,
172+ "checkpoint_1/universal/P99" : 1 ,
173+ "checkpoint_1/user_defined/P90" : 5 ,
174+ "checkpoint_2/universal/MEDIAN" : 1 ,
175+ "checkpoint_1/universal/P95" : 1 ,
176+ "checkpoint_1/user_defined/STANDARD_DEVIATION" : 0 ,
177+ "gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION" : 0.6359497880839245 ,
178+ "checkpoint_1/user_defined/P99" : 5 ,
179+ "gemini-2.0-flash-001@default/universal/MODE" : [
180+ 0.75 ,
181+ 0.8571428656578064 ,
182+ ],
183+ "checkpoint_2/user_defined/MODE" : 5 ,
184+ "checkpoint_1/universal/P90" : 1 ,
185+ "gemini-2.0-flash-001@default/user_defined/P99" : 5 ,
186+ },
187+ total_items = 19 ,
188+ )
189+ )
190+ assert evaluation_run .error is None
123191
124192
125193pytestmark = pytest_helper .setup (
0 commit comments