Skip to content

Commit cd61c86

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add agent eval data converter evals
PiperOrigin-RevId: 819989463
1 parent cf0948f commit cd61c86

File tree

3 files changed

+143
-6
lines changed

3 files changed

+143
-6
lines changed

tests/unit/vertexai/genai/test_evals.py

Lines changed: 102 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,6 +1885,56 @@ def test_convert_with_additional_columns(self):
18851885
eval_case = result_dataset.eval_cases[0]
18861886
assert eval_case.custom_column == "custom_value"
18871887

1888+
def test_convert_with_agent_eval_fields(self):
1889+
"""Tests that agent eval data is converted correctly from a flattened format."""
1890+
raw_data_df = pd.DataFrame(
1891+
{
1892+
"prompt": ["Hello"],
1893+
"response": ["Hi"],
1894+
"intermediate_events": [
1895+
[
1896+
{
1897+
"event_id": "event1",
1898+
"content": {"parts": [{"text": "intermediate event"}]},
1899+
}
1900+
]
1901+
],
1902+
}
1903+
)
1904+
raw_data = raw_data_df.to_dict(orient="records")
1905+
result_dataset = self.converter.convert(raw_data)
1906+
assert len(result_dataset.eval_cases) == 1
1907+
eval_case = result_dataset.eval_cases[0]
1908+
assert eval_case.intermediate_events[0].event_id == "event1"
1909+
1910+
def test_convert_with_intermediate_events_as_event_objects(self):
1911+
"""Tests that agent eval data is converted correctly when intermediate_events are Event objects."""
1912+
raw_data_df = pd.DataFrame(
1913+
{
1914+
"prompt": ["Hello"],
1915+
"response": ["Hi"],
1916+
"intermediate_events": [
1917+
[
1918+
vertexai_genai_types.Event(
1919+
event_id="event1",
1920+
content=genai_types.Content(
1921+
parts=[genai_types.Part(text="intermediate event")]
1922+
),
1923+
)
1924+
]
1925+
],
1926+
}
1927+
)
1928+
raw_data = raw_data_df.to_dict(orient="records")
1929+
result_dataset = self.converter.convert(raw_data)
1930+
assert len(result_dataset.eval_cases) == 1
1931+
eval_case = result_dataset.eval_cases[0]
1932+
assert eval_case.intermediate_events[0].event_id == "event1"
1933+
assert (
1934+
eval_case.intermediate_events[0].content.parts[0].text
1935+
== "intermediate event"
1936+
)
1937+
18881938

18891939
class TestOpenAIDataConverter:
18901940
"""Unit tests for the _OpenAIDataConverter class."""
@@ -2765,7 +2815,10 @@ def test_merge_flatten_and_gemini_datasets(self):
27652815
)
27662816

27672817
def test_merge_empty_input_list(self):
2768-
with pytest.raises(ValueError, match="Input 'raw_datasets' cannot be empty."):
2818+
with pytest.raises(
2819+
ValueError,
2820+
match="Input 'raw_datasets' cannot be empty and must be a list of lists.",
2821+
):
27692822
_evals_data_converters.merge_response_datasets_into_canonical_format(
27702823
raw_datasets=[], schemas=[]
27712824
)
@@ -2810,7 +2863,10 @@ def test_merge_mismatched_schemas_list_length(self):
28102863
]
28112864
with pytest.raises(
28122865
ValueError,
2813-
match="A list of schemas must be provided, one for each raw dataset.",
2866+
match=(
2867+
"A list of schemas must be provided, one for each raw dataset. Got 2"
2868+
" schemas for 3 datasets."
2869+
),
28142870
):
28152871
_evals_data_converters.merge_response_datasets_into_canonical_format(
28162872
[raw_dataset_1, raw_dataset_2, raw_dataset_3],
@@ -2824,7 +2880,10 @@ def test_merge_empty_schemas_list(self):
28242880
]
28252881
with pytest.raises(
28262882
ValueError,
2827-
match="A list of schemas must be provided, one for each raw dataset.",
2883+
match=(
2884+
"A list of schemas must be provided, one for each raw dataset. Got 0"
2885+
" schemas for 1 datasets."
2886+
),
28282887
):
28292888
_evals_data_converters.merge_response_datasets_into_canonical_format(
28302889
[raw_dataset_1], schemas=[]
@@ -2918,6 +2977,46 @@ def test_merge_with_different_custom_columns(self):
29182977
assert merged_dataset.eval_cases[1].custom_col_2 == "value_2_2"
29192978
assert merged_dataset.eval_cases[1].custom_col_3 == "value_2_3"
29202979

2980+
def test_merge_with_intermediate_events(self):
2981+
raw_dataset_1 = [
2982+
{
2983+
"prompt": "Prompt 1",
2984+
"response": "Response 1a",
2985+
"intermediate_events": [
2986+
{
2987+
"event_id": "event1",
2988+
"content": {"parts": [{"text": "intermediate event"}]},
2989+
}
2990+
],
2991+
}
2992+
]
2993+
raw_dataset_2 = [
2994+
{
2995+
"prompt": "Prompt 1",
2996+
"response": "Response 1b",
2997+
"intermediate_events": [
2998+
{
2999+
"event_id": "event2",
3000+
"content": {"parts": [{"text": "intermediate event 2"}]},
3001+
}
3002+
],
3003+
}
3004+
]
3005+
schemas = [
3006+
_evals_data_converters.EvalDatasetSchema.FLATTEN,
3007+
_evals_data_converters.EvalDatasetSchema.FLATTEN,
3008+
]
3009+
3010+
merged_dataset = (
3011+
_evals_data_converters.merge_response_datasets_into_canonical_format(
3012+
[raw_dataset_1, raw_dataset_2], schemas=schemas
3013+
)
3014+
)
3015+
3016+
assert len(merged_dataset.eval_cases) == 1
3017+
assert len(merged_dataset.eval_cases[0].intermediate_events) == 1
3018+
assert merged_dataset.eval_cases[0].intermediate_events[0].event_id == "event1"
3019+
29213020
def test_merge_with_metadata(self):
29223021
raw_dataset_1 = [
29233022
{

vertexai/_genai/_evals_data_converters.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#
1515
"""Dataset converters for evals."""
1616

17+
import copy
1718
import json
1819
import logging
1920
from typing import Any, Optional, Union
@@ -189,7 +190,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
189190
f"Expected a dictionary for item at index {i}, but got"
190191
f" {type(item_dict).__name__}: {item_dict}"
191192
)
192-
item = item_dict.copy()
193+
item = copy.deepcopy(item_dict)
193194
eval_case_id = f"eval_case_{i}"
194195
prompt_data = item.pop("prompt", None)
195196
if not prompt_data:
@@ -200,6 +201,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
200201
reference_data = item.pop("reference", None)
201202
system_instruction_data = item.pop("instruction", None)
202203
rubric_groups_data = item.pop("rubric_groups", None)
204+
intermediate_events_data = item.pop("intermediate_events", None)
203205

204206
if not response_data:
205207
raise ValueError(
@@ -362,6 +364,38 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
362364
f"Invalid type for rubric_groups in case {i}. Expected dict."
363365
)
364366

367+
intermediate_events: Optional[list[types.Event]] = None
368+
if intermediate_events_data:
369+
logger.warning(
370+
"intermediate_events attribute is experimental and may change in "
371+
"future versions."
372+
)
373+
if isinstance(intermediate_events_data, list):
374+
intermediate_events = []
375+
for event in intermediate_events_data:
376+
if isinstance(event, dict):
377+
try:
378+
validated_event = types.Event.model_validate(event)
379+
intermediate_events.append(validated_event)
380+
except Exception as e:
381+
logger.warning(
382+
"Failed to validate intermediate event dict for"
383+
f" case {i}: {e}"
384+
)
385+
elif isinstance(event, types.Event):
386+
intermediate_events.append(event)
387+
else:
388+
logger.warning(
389+
"Invalid type for intermediate_event in case"
390+
f" {i}. Expected list of dicts or list of"
391+
" types.Event objects."
392+
)
393+
else:
394+
logger.warning(
395+
f"Invalid type for intermediate_events in case {i}. Expected"
396+
" list of types.Event objects."
397+
)
398+
365399
eval_case = types.EvalCase(
366400
eval_case_id=eval_case_id,
367401
prompt=prompt,
@@ -370,6 +404,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
370404
conversation_history=conversation_history,
371405
system_instruction=system_instruction,
372406
rubric_groups=rubric_groups,
407+
intermediate_events=intermediate_events,
373408
**item, # Pass remaining columns as extra fields to EvalCase.
374409
# They can be used for custom metric prompt templates.
375410
)
@@ -726,6 +761,7 @@ def merge_response_datasets_into_canonical_format(
726761
"reference",
727762
"system_instruction",
728763
"conversation_history",
764+
"intermediate_events",
729765
},
730766
exclude_none=True,
731767
)
@@ -750,6 +786,7 @@ def merge_response_datasets_into_canonical_format(
750786
"reference",
751787
"system_instruction",
752788
"conversation_history",
789+
"intermediate_events",
753790
},
754791
exclude_none=True,
755792
)
@@ -777,6 +814,7 @@ def merge_response_datasets_into_canonical_format(
777814
reference=base_eval_case.reference,
778815
system_instruction=base_eval_case.system_instruction,
779816
conversation_history=base_eval_case.conversation_history,
817+
intermediate_events=base_eval_case.intermediate_events,
780818
**eval_case_custom_columns,
781819
)
782820
merged_eval_cases.append(merged_case)

vertexai/_genai/_evals_visualization.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,15 @@ def _preprocess_df_for_json(df: Optional[pd.DataFrame]) -> Optional[pd.DataFrame
5656
):
5757

5858
def stringify_cell(cell: Any) -> Optional[str]:
59-
if pd.isna(cell):
60-
return None
6159
if isinstance(cell, (dict, list)):
6260
try:
6361
return json.dumps(
6462
cell, ensure_ascii=False, default=_pydantic_serializer
6563
)
6664
except TypeError:
6765
return str(cell)
66+
elif pd.isna(cell):
67+
return None
6868
elif not isinstance(cell, (str, int, float, bool)):
6969
if hasattr(cell, "model_dump"):
7070
return json.dumps(

0 commit comments

Comments (0)