
Commit 7a1262b

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add evaluation_df input support to create_evaluation_run method in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 821845677
1 parent 55b7c23 commit 7a1262b

File tree

3 files changed: +215 -10 lines
tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
vertexai/_genai/_evals_common.py
vertexai/_genai/evals.py

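For orientation, a minimal usage sketch of the DataFrame input path this commit adds, modeled on the replay tests below. It assumes an initialized vertexai.Client and an existing GCS bucket; the project, location, bucket, and DataFrame values are illustrative and not taken from the commit.

# Sketch only: client setup and import path are assumptions, not shown in this diff.
import pandas as pd
import vertexai
from vertexai._genai import types  # assumed import path for the evals types

client = vertexai.Client(project="my-project", location="us-central1")  # assumed setup

input_df = pd.DataFrame(
    {
        "prompt": ["prompt1", "prompt2"],
        "reference": ["reference1", "reference2"],
        "response": ["response1", "response2"],
    }
)

# New in this commit: create_evaluation_run() also accepts an EvaluationDataset
# wrapping a DataFrame; the SDK converts it to an EvaluationSet behind the scenes.
evaluation_run = client.evals.create_evaluation_run(
    name="my-eval-run",            # illustrative name
    display_name="my-eval-run",
    dataset=types.EvaluationDataset(
        candidate_name="candidate_1",
        eval_dataset_df=input_df,
    ),
    dest="gs://my-bucket/eval_run_output",  # GCS prefix for run output and uploaded items
)
print(evaluation_run.state)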

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 101 additions & 6 deletions
@@ -38,15 +38,15 @@ def test_create_eval_run_data_source_evaluation_set(client):
     evaluation_run = client.evals.create_evaluation_run(
         name="test4",
         display_name="test4",
-        data_source=types.EvaluationRunDataSource(
+        dataset=types.EvaluationRunDataSource(
             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
         ),
         agent_info=types.AgentInfo(
             name="agent-1",
             instruction="agent-1 instruction",
             tool_declarations=[tool],
         ),
-        dest="gs://lakeyk-test-limited/eval_run_output",
+        dest="gs://lakeyk-limited-bucket/eval_run_output",
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test4"
@@ -73,7 +73,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     evaluation_run = client.evals.create_evaluation_run(
         name="test5",
         display_name="test5",
-        data_source=types.EvaluationRunDataSource(
+        dataset=types.EvaluationRunDataSource(
             bigquery_request_set=types.BigQueryRequestSet(
                 uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
                 prompt_column="request",
@@ -84,7 +84,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
                 },
             )
         ),
-        dest="gs://lakeyk-test-limited/eval_run_output",
+        dest="gs://lakeyk-limited-bucket/eval_run_output",
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -105,6 +105,101 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


+# Test fails in replay mode because of the timestamp issue
+# def test_create_eval_run_data_source_evaluation_dataset(client):
+#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
+#     input_df = pd.DataFrame(
+#         {
+#             "prompt": ["prompt1", "prompt2"],
+#             "reference": ["reference1", "reference2"],
+#             "response": ["response1", "response2"],
+#             "intermediate_events": [
+#                 [
+#                     {
+#                         "content": {
+#                             "parts": [
+#                                 {"text": "first user input"},
+#                             ],
+#                             "role": "user",
+#                         },
+#                     },
+#                     {
+#                         "content": {
+#                             "parts": [
+#                                 {"text": "first model response"},
+#                             ],
+#                             "role": "model",
+#                         },
+#                     },
+#                 ],
+#                 [
+#                     {
+#                         "content": {
+#                             "parts": [
+#                                 {"text": "second user input"},
+#                             ],
+#                             "role": "user",
+#                         },
+#                     },
+#                     {
+#                         "content": {
+#                             "parts": [
+#                                 {"text": "second model response"},
+#                             ],
+#                             "role": "model",
+#                         },
+#                     },
+#                 ],
+#             ],
+#         }
+#     )
+#     evaluation_run = client.evals.create_evaluation_run(
+#         name="test6",
+#         display_name="test6",
+#         dataset=types.EvaluationDataset(
+#             candidate_name="candidate_1",
+#             eval_dataset_df=input_df,
+#         ),
+#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.display_name == "test6"
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+#     # Check evaluation set
+#     assert evaluation_run.data_source.evaluation_set
+#     eval_set = client.evals.get_evaluation_set(
+#         name=evaluation_run.data_source.evaluation_set
+#     )
+#     assert len(eval_set.evaluation_items) == 2
+#     # Check evaluation items
+#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
+#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+#         assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"]
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].text
+#             == input_df.iloc[i]["response"]
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
+#             == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].events[0].role
+#             == input_df.iloc[i]["intermediate_events"][0]["content"]["role"]
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
+#             == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].events[1].role
+#             == input_df.iloc[i]["intermediate_events"][1]["content"]["role"]
+#         )
+#     assert evaluation_run.error is None
+
+
 pytest_plugins = ("pytest_asyncio",)


@@ -114,7 +209,7 @@ async def test_create_eval_run_async(client):
     evaluation_run = await client.aio.evals.create_evaluation_run(
         name="test8",
         display_name="test8",
-        data_source=types.EvaluationRunDataSource(
+        dataset=types.EvaluationRunDataSource(
            bigquery_request_set=types.BigQueryRequestSet(
                uri="bq://lakeyk-test-limited.inference_batch_prediction_input.1317387725199900672_1b",
                prompt_column="request",
@@ -125,7 +220,7 @@ async def test_create_eval_run_async(client):
                 },
             )
         ),
-        dest="gs://lakeyk-test-limited/eval_run_output",
+        dest="gs://lakeyk-limited-bucket/eval_run_output",
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
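The disabled replay test above also shows the richer DataFrame shape the new path understands, with an intermediate_events column of agent events. A trimmed sketch of that shape, one row only, with values copied from the test for illustration:

import pandas as pd

# Each row may carry a list of agent events; every event wraps a Content-like
# dict with "parts" and "role", mirroring the commented-out test above.
input_df = pd.DataFrame(
    {
        "prompt": ["prompt1"],
        "reference": ["reference1"],
        "response": ["response1"],
        "intermediate_events": [
            [
                {"content": {"parts": [{"text": "first user input"}], "role": "user"}},
                {"content": {"parts": [{"text": "first model response"}], "role": "model"}},
            ]
        ],
    }
)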

vertexai/_genai/_evals_common.py

Lines changed: 86 additions & 0 deletions
@@ -1505,3 +1505,89 @@ async def _convert_evaluation_run_results_async(
     ]
     eval_items = await asyncio.gather(*tasks)
     return _get_eval_result_from_eval_items(evaluation_run_results, eval_items)
+
+
+def _object_to_dict(obj) -> dict[str, Any]:
+    """Converts an object to a dictionary."""
+    if not hasattr(obj, "__dict__"):
+        return obj  # Not an object with attributes, return as is (e.g., int, str)
+
+    result: dict[str, Any] = {}
+    for key, value in obj.__dict__.items():
+        if value is None:
+            continue
+        if isinstance(value, (int, float, str, bool)):
+            result[key] = value
+        elif isinstance(value, (list, tuple)):
+            result[key] = [_object_to_dict(item) for item in value]
+        elif hasattr(value, "__dict__"):  # Nested object
+            result[key] = _object_to_dict(value)
+        else:
+            result[key] = value  # Handle other types like sets, etc.
+    return result
+
+
+def _create_evaluation_set_from_dataframe(
+    api_client: BaseApiClient,
+    gcs_dest_prefix: str,
+    eval_df: pd.DataFrame,
+    candidate_name: Optional[str] = None,
+) -> types.EvaluationSet:
+    """Converts a dataframe to an EvaluationSet."""
+    eval_item_requests = []
+    for _, row in eval_df.iterrows():
+        intermediate_events = []
+        if "intermediate_events" in row:
+            for event in row["intermediate_events"]:
+                intermediate_events.append(
+                    genai_types.Content(
+                        parts=event["content"]["parts"], role=event["content"]["role"]
+                    )
+                )
+        eval_item_requests.append(
+            types.EvaluationItemRequest(
+                prompt=(
+                    types.EvaluationPrompt(text=row["prompt"])
+                    if "prompt" in row
+                    else None
+                ),
+                golden_response=(
+                    types.CandidateResponse(text=row["reference"])
+                    if "reference" in row
+                    else None
+                ),
+                candidate_responses=[
+                    types.CandidateResponse(
+                        candidate=candidate_name or "Candidate 1",
+                        text=row.get("response", None),
+                        events=(
+                            intermediate_events
+                            if len(intermediate_events) > 0
+                            else None
+                        ),
+                    )
+                ],
+            )
+        )
+    logger.info("Writing evaluation item requests to GCS.")
+    gcs_utils = _evals_utils.GcsUtils(api_client=api_client)
+    evals_module = evals.Evals(api_client_=api_client)
+    eval_items = []
+    for eval_item_request in eval_item_requests:
+        gcs_uri = gcs_utils.upload_json_to_prefix(
+            data=_object_to_dict(eval_item_request),
+            gcs_dest_prefix=gcs_dest_prefix,
+            filename_prefix="request",
+        )
+        eval_item = evals_module.create_evaluation_item(
+            evaluation_item_type=types.EvaluationItemType.REQUEST,
+            gcs_uri=gcs_uri,
+            display_name="sdk-generated-eval-item",
+        )
+        eval_items.append(eval_item.name)
+    logger.info("Creating evaluation set from GCS URIs")
+    evaluation_set = evals_module.create_evaluation_set(
+        evaluation_items=eval_items,
+    )
+
+    return evaluation_set
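To show roughly what the new _object_to_dict helper produces before upload_json_to_prefix writes each request to GCS, here is a small sketch. It assumes the helper above is in scope; the Part and Prompt classes are made up purely for illustration and are not part of the SDK.

# Toy classes, only to illustrate the recursive object-to-dict conversion.
class Part:
    def __init__(self, text: str):
        self.text = text

class Prompt:
    def __init__(self, parts, role=None):
        self.parts = parts
        self.role = role  # None values are skipped by _object_to_dict

prompt = Prompt(parts=[Part("hello"), Part("world")])
print(_object_to_dict(prompt))
# {'parts': [{'text': 'hello'}, {'text': 'world'}]}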

vertexai/_genai/evals.py

Lines changed: 28 additions & 4 deletions
@@ -1307,12 +1307,24 @@ def create_evaluation_run(
         *,
         name: str,
         display_name: Optional[str] = None,
-        data_source: types.EvaluationRunDataSource,
+        dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
         agent_info: Optional[types.AgentInfo] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
         """Creates an EvaluationRun."""
+        if type(dataset).__name__ == "EvaluationDataset":
+            logger.warning(
+                "EvaluationDataset input is experimental and may change in future versions."
+            )
+            if dataset.eval_dataset_df is None:
+                raise ValueError(
+                    "EvaluationDataset must have eval_dataset_df populated."
+                )
+            eval_set = _evals_common._create_evaluation_set_from_dataframe(
+                self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
+            )
+            dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name)
         output_config = genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest)
         )
@@ -1334,7 +1346,7 @@ def create_evaluation_run(
         return self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
             display_name=display_name,
-            data_source=data_source,
+            data_source=dataset,
             evaluation_config=evaluation_config,
             inference_configs=inference_configs,
             config=config,
@@ -2092,12 +2104,24 @@ async def create_evaluation_run(
         *,
         name: str,
         display_name: Optional[str] = None,
-        data_source: types.EvaluationRunDataSource,
+        dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
         agent_info: Optional[types.AgentInfo] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
         """Creates an EvaluationRun."""
+        if type(dataset).__name__ == "EvaluationDataset":
+            logger.warning(
+                "EvaluationDataset input is experimental and may change in future versions."
+            )
+            if dataset.eval_dataset_df is None:
+                raise ValueError(
+                    "EvaluationDataset must have eval_dataset_df populated."
+                )
+            eval_set = _evals_common._create_evaluation_set_from_dataframe(
+                self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
+            )
+            dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name)
         output_config = genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=dest)
         )
@@ -2119,7 +2143,7 @@ async def create_evaluation_run(
         result = await self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
             display_name=display_name,
-            data_source=data_source,
+            data_source=dataset,
             evaluation_config=evaluation_config,
             inference_configs=inference_configs,
             config=config,
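Since the keyword-only parameter is renamed from data_source to dataset in both the sync and async methods, existing callers need a one-word update. A before/after sketch, reusing the illustrative client and types names from the earlier sketch (eval_set_name stands in for the resource name of an existing EvaluationSet):

# Before this commit:
# run = client.evals.create_evaluation_run(
#     name="run-1",
#     data_source=types.EvaluationRunDataSource(evaluation_set=eval_set_name),
#     dest="gs://my-bucket/eval_run_output",
# )

# After this commit, the same argument is passed as `dataset`; an
# EvaluationRunDataSource is still accepted alongside the new EvaluationDataset.
run = client.evals.create_evaluation_run(
    name="run-1",
    dataset=types.EvaluationRunDataSource(evaluation_set=eval_set_name),
    dest="gs://my-bucket/eval_run_output",
)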
