
Commit f07ecc3

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add EvaluationResults to get_evaluation_run method response in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 813450051
1 parent 9e6a87a commit f07ecc3
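
In effect, the change lets callers read evaluation results directly off the run returned by get_evaluation_run. A minimal sketch of the new surface (client construction is assumed and not part of this diff; the resource name is illustrative):

evaluation_run = client.evals.get_evaluation_run(
    name="projects/PROJECT/locations/us-central1/evaluationRuns/RUN_ID"
)
# New in this commit: results are attached to the run when available.
if evaluation_run.evaluation_results is not None:
    results = evaluation_run.evaluation_results
    print(results.evaluation_set)               # EvaluationSet holding item-level results
    print(results.summary_metrics.total_items)  # e.g. 19 in the replay test below
    print(results.summary_metrics.metrics)      # flat map of metric name -> value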

File tree: 3 files changed (+201, −26 lines)

tests/unit/vertexai/genai/replays/test_get_evaluation_run.py

Lines changed: 94 additions & 26 deletions
@@ -26,30 +26,7 @@ def test_get_eval_run(client):
         "projects/503583131166/locations/us-central1/evaluationRuns/1957799200510967808"
     )
     evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
-    assert isinstance(evaluation_run, types.EvaluationRun)
-    assert evaluation_run.name == evaluation_run_name
-    assert evaluation_run.display_name == "test2"
-    assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"}
-    assert evaluation_run.create_time == datetime.datetime(
-        2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc
-    )
-    assert evaluation_run.completion_time == datetime.datetime(
-        2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc
-    )
-    assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
-    assert evaluation_run.evaluation_set_snapshot == (
-        "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
-    )
-    assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet(
-        uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
-        prompt_column="request",
-        candidate_response_columns={
-            "baseline_model_response": "baseline_model_response",
-            "checkpoint_1": "checkpoint_1",
-            "checkpoint_2": "checkpoint_2",
-        },
-    )
-    assert evaluation_run.error is None
+    check_run_1957799200510967808(evaluation_run, evaluation_run_name)


 def test_get_eval_run_bq_source(client):
@@ -104,13 +81,30 @@ def test_get_eval_run_eval_set_source(client):
 async def test_get_eval_run_async(client):
     """Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
     eval_run_id = "1957799200510967808"
-    eval_run_name = (
+    evaluation_run_name = (
         f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
     )
     evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
+    check_run_1957799200510967808(evaluation_run, evaluation_run_name)
+
+
+def check_run_1957799200510967808(
+    evaluation_run: types.EvaluationRun, evaluation_run_name: str
+):
     assert isinstance(evaluation_run, types.EvaluationRun)
-    assert evaluation_run.name == eval_run_name
+    assert evaluation_run.name == evaluation_run_name
     assert evaluation_run.display_name == "test2"
+    assert evaluation_run.metadata == {"pipeline_id": "4460531348888616960"}
+    assert evaluation_run.create_time == datetime.datetime(
+        2025, 9, 8, 20, 55, 41, 833176, tzinfo=datetime.timezone.utc
+    )
+    assert evaluation_run.completion_time == datetime.datetime(
+        2025, 9, 8, 20, 56, 13, 492971, tzinfo=datetime.timezone.utc
+    )
+    assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
+    assert evaluation_run.evaluation_set_snapshot == (
+        "projects/503583131166/locations/us-central1/evaluationSets/8069535738573619200"
+    )
     assert evaluation_run.data_source.bigquery_request_set == types.BigQueryRequestSet(
         uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
         prompt_column="request",
@@ -120,6 +114,80 @@ async def test_get_eval_run_async(client):
             "checkpoint_2": "checkpoint_2",
         },
     )
+    assert evaluation_run.evaluation_results.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/102386522778501120"
+    )
+    assert evaluation_run.evaluation_results.summary_metrics == (
+        types.SummaryMetric(
+            metrics={
+                "checkpoint_1/user_defined/MODE": 5,
+                "checkpoint_2/universal/P90": 1,
+                "gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249,
+                "gemini-2.0-flash-001@default/user_defined/P90": 5,
+                "gemini-2.0-flash-001@default/universal/VARIANCE": 0.03146487552180889,
+                "gemini-2.0-flash-001@default/user_defined/P95": 5,
+                "checkpoint_1/universal/MINIMUM": 0.8571428656578064,
+                "checkpoint_1/universal/VARIANCE": 0.0015452162403157982,
+                "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855,
+                "checkpoint_2/user_defined/P95": 5,
+                "checkpoint_2/universal/MODE": 1,
+                "checkpoint_2/user_defined/P90": 5,
+                "checkpoint_2/universal/P99": 1,
+                "gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
+                "checkpoint_2/universal/P95": 1,
+                "checkpoint_2/user_defined/P99": 5,
+                "checkpoint_2/universal/MINIMUM": 0.7777777910232544,
+                "gemini-2.0-flash-001@default/universal/P90": 0.8777777791023255,
+                "checkpoint_1/universal/AVERAGE": 0.986633250587865,
+                "checkpoint_1/universal/MAXIMUM": 1,
+                "checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714,
+                "gemini-2.0-flash-001@default/universal/P95": 0.9000000059604645,
+                "gemini-2.0-flash-001@default/user_defined/MAXIMUM": 5,
+                "gemini-2.0-flash-001@default/user_defined/MINIMUM": 3,
+                "gemini-2.0-flash-001@default/user_defined/VARIANCE": 0.4044321329639886,
+                "checkpoint_2/user_defined/MAXIMUM": 5,
+                "checkpoint_1/universal/MEDIAN": 1,
+                "gemini-2.0-flash-001@default/universal/MEDIAN": 0.7142857313156128,
+                "gemini-2.0-flash-001@default/user_defined/AVERAGE": 4.736842105263158,
+                "gemini-2.0-flash-001@default/user_defined/MEDIAN": 5,
+                "checkpoint_2/user_defined/AVERAGE": 5,
+                "checkpoint_2/user_defined/MEDIAN": 5,
+                "checkpoint_2/user_defined/STANDARD_DEVIATION": 0,
+                "checkpoint_2/universal/MAXIMUM": 1,
+                "checkpoint_1/universal/MODE": 1,
+                "checkpoint_2/user_defined/MINIMUM": 5,
+                "checkpoint_1/user_defined/VARIANCE": 0,
+                "checkpoint_2/universal/VARIANCE": 0.005771725970062436,
+                "checkpoint_2/universal/AVERAGE": 0.9438178790243048,
+                "checkpoint_1/user_defined/MINIMUM": 5,
+                "gemini-2.0-flash-001@default/universal/P99": 0.9800000011920929,
+                "gemini-2.0-flash-001@default/universal/MINIMUM": 0.2857142984867096,
+                "checkpoint_2/user_defined/VARIANCE": 0,
+                "checkpoint_1/user_defined/MEDIAN": 5,
+                "checkpoint_2/universal/STANDARD_DEVIATION": 0.07597187617837561,
+                "checkpoint_1/user_defined/AVERAGE": 5,
+                "checkpoint_1/user_defined/MAXIMUM": 5,
+                "gemini-2.0-flash-001@default/user_defined/MODE": 5,
+                "checkpoint_1/user_defined/P95": 5,
+                "checkpoint_1/universal/P99": 1,
+                "checkpoint_1/user_defined/P90": 5,
+                "checkpoint_2/universal/MEDIAN": 1,
+                "checkpoint_1/universal/P95": 1,
+                "checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
+                "gemini-2.0-flash-001@default/user_defined/STANDARD_DEVIATION": 0.6359497880839245,
+                "checkpoint_1/user_defined/P99": 5,
+                "gemini-2.0-flash-001@default/universal/MODE": [
+                    0.75,
+                    0.8571428656578064,
+                ],
+                "checkpoint_2/user_defined/MODE": 5,
+                "checkpoint_1/universal/P90": 1,
+                "gemini-2.0-flash-001@default/user_defined/P99": 5,
+            },
+            total_items=19,
+        )
+    )
+    assert evaluation_run.error is None


 pytestmark = pytest_helper.setup(
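
The summary-metric keys in the replay data above follow a "<candidate>/<metric group>/<statistic>" naming pattern (e.g. "checkpoint_1/user_defined/MODE", "gemini-2.0-flash-001@default/universal/AVERAGE"). As a hypothetical convenience, not part of the SDK, such a flat metrics map can be regrouped per candidate:

from collections import defaultdict
from typing import Any


def group_by_candidate(metrics: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Group flat '<candidate>/<metric group>/<statistic>' keys by candidate."""
    grouped: dict[str, dict[str, Any]] = defaultdict(dict)
    for key, value in metrics.items():
        candidate, group, statistic = key.split("/", 2)
        grouped[candidate][f"{group}/{statistic}"] = value
    return dict(grouped)


# group_by_candidate({"checkpoint_1/universal/P90": 1})
# -> {"checkpoint_1": {"universal/P90": 1}}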

vertexai/_genai/evals.py

Lines changed: 46 additions & 0 deletions
@@ -438,6 +438,26 @@ def _EvaluationRunDataSource_to_vertex(
     return to_object


+def _EvaluationRunResults_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["evaluationSet"]) is not None:
+        setv(to_object, ["evaluation_set"], getv(from_object, ["evaluationSet"]))
+
+    if getv(from_object, ["summaryMetrics"]) is not None:
+        setv(
+            to_object,
+            ["summary_metrics"],
+            _SummaryMetric_from_vertex(
+                getv(from_object, ["summaryMetrics"]), to_object
+            ),
+        )
+
+    return to_object
+
+
 def _EvaluationRun_from_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
@@ -480,6 +500,15 @@ def _EvaluationRun_from_vertex(
         ),
     )

+    if getv(from_object, ["evaluationResults"]) is not None:
+        setv(
+            to_object,
+            ["evaluation_results"],
+            _EvaluationRunResults_from_vertex(
+                getv(from_object, ["evaluationResults"]), to_object
+            ),
+        )
+
     return to_object


@@ -890,6 +919,23 @@ def _SamplingConfig_to_vertex(
     return to_object


+def _SummaryMetric_from_vertex(
+    from_object: Union[dict[str, Any], object],
+    parent_object: Optional[dict[str, Any]] = None,
+) -> dict[str, Any]:
+    to_object: dict[str, Any] = {}
+    if getv(from_object, ["metrics"]) is not None:
+        setv(to_object, ["metrics"], getv(from_object, ["metrics"]))
+
+    if getv(from_object, ["totalItems"]) is not None:
+        setv(to_object, ["total_items"], getv(from_object, ["totalItems"]))
+
+    if getv(from_object, ["failedItems"]) is not None:
+        setv(to_object, ["failed_items"], getv(from_object, ["failedItems"]))
+
+    return to_object
+
+
 def _ToolCallValidInput_to_vertex(
     from_object: Union[dict[str, Any], object],
     parent_object: Optional[dict[str, Any]] = None,
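
To make the new converters concrete, here is a hedged sketch of the mapping they perform on a hand-written REST payload (values invented; getv/setv are the SDK's internal path accessors, which for these flat paths behave like plain dict reads and writes):

# Illustrative input in the Vertex REST (camelCase) shape:
vertex_payload = {
    "evaluationSet": "projects/p/locations/us-central1/evaluationSets/1",
    "summaryMetrics": {
        "metrics": {"model_a/universal/AVERAGE": 0.9},
        "totalItems": 10,
    },
}

# Expected output of _EvaluationRunResults_from_vertex: snake_case keys
# matching the EvaluationRunResults/SummaryMetric types added below.
expected = {
    "evaluation_set": "projects/p/locations/us-central1/evaluationSets/1",
    "summary_metrics": {
        "metrics": {"model_a/universal/AVERAGE": 0.9},
        "total_items": 10,
    },
}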

vertexai/_genai/types.py

Lines changed: 61 additions & 0 deletions
@@ -488,6 +488,61 @@ class _CreateEvaluationRunParametersDict(TypedDict, total=False):
     ]


+class SummaryMetric(_common.BaseModel):
+    """Represents a summary metric for an evaluation run."""
+
+    metrics: Optional[dict[str, Any]] = Field(
+        default=None, description="""Map of metric name to metric value."""
+    )
+    total_items: Optional[int] = Field(
+        default=None, description="""The total number of items that were evaluated."""
+    )
+    failed_items: Optional[int] = Field(
+        default=None, description="""The number of items that failed to be evaluated."""
+    )
+
+
+class SummaryMetricDict(TypedDict, total=False):
+    """Represents a summary metric for an evaluation run."""
+
+    metrics: Optional[dict[str, Any]]
+    """Map of metric name to metric value."""
+
+    total_items: Optional[int]
+    """The total number of items that were evaluated."""
+
+    failed_items: Optional[int]
+    """The number of items that failed to be evaluated."""
+
+
+SummaryMetricOrDict = Union[SummaryMetric, SummaryMetricDict]
+
+
+class EvaluationRunResults(_common.BaseModel):
+    """Represents the results of an evaluation run."""
+
+    evaluation_set: Optional[str] = Field(
+        default=None,
+        description="""The evaluation set where item level results are stored.""",
+    )
+    summary_metrics: Optional[SummaryMetric] = Field(
+        default=None, description="""The summary metrics for the evaluation run."""
+    )
+
+
+class EvaluationRunResultsDict(TypedDict, total=False):
+    """Represents the results of an evaluation run."""
+
+    evaluation_set: Optional[str]
+    """The evaluation set where item level results are stored."""
+
+    summary_metrics: Optional[SummaryMetricDict]
+    """The summary metrics for the evaluation run."""
+
+
+EvaluationRunResultsOrDict = Union[EvaluationRunResults, EvaluationRunResultsDict]
+
+
 class EvaluationRun(_common.BaseModel):
     """Represents an evaluation run."""

@@ -506,6 +561,9 @@ class EvaluationRun(_common.BaseModel):
     data_source: Optional[EvaluationRunDataSource] = Field(
         default=None, description=""""""
     )
+    evaluation_results: Optional[EvaluationRunResults] = Field(
+        default=None, description="""The results for the evaluation run."""
+    )


 class EvaluationRunDict(TypedDict, total=False):
@@ -538,6 +596,9 @@ class EvaluationRunDict(TypedDict, total=False):
     data_source: Optional[EvaluationRunDataSourceDict]
     """"""

+    evaluation_results: Optional[EvaluationRunResultsDict]
+    """The results for the evaluation run."""
+

 EvaluationRunOrDict = Union[EvaluationRun, EvaluationRunDict]

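As a quick sanity sketch (not part of the diff; the import path is assumed from the file location and the values are invented), the new models can be built directly or supplied as plain dicts via the *OrDict unions:

from vertexai._genai import types

results = types.EvaluationRunResults(
    evaluation_set="projects/p/locations/us-central1/evaluationSets/1",
    summary_metrics=types.SummaryMetric(
        metrics={"checkpoint_1/universal/AVERAGE": 0.98},
        total_items=19,
    ),
)
assert results.summary_metrics.total_items == 19

# TypedDict counterpart, accepted wherever EvaluationRunResultsOrDict is used:
results_dict: types.EvaluationRunResultsDict = {
    "evaluation_set": "projects/p/locations/us-central1/evaluationSets/1",
    "summary_metrics": {"metrics": {"checkpoint_1/universal/AVERAGE": 0.98}},
}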