
Commit 67cf80b

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add labels to EvaluationRun in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 822313437
1 parent db286c4 · commit 67cf80b

3 files changed (+66, -4 lines)

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 13 additions & 4 deletions
@@ -19,7 +19,7 @@
 from google.genai import types as genai_types
 import pytest
 
-GCS_DEST = "gs://lakeyk-test-limited/eval_run_output"
+GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
 UNIVERSAL_AR_METRIC = types.EvaluationRunMetric(
     metric="universal_ar_v1",
     metric_config=types.UnifiedMetric(
@@ -51,9 +51,6 @@
 # TODO(b/431231205): Re-enable once Unified Metrics are in prod.
 # def test_create_eval_run_data_source_evaluation_set(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.base_url = (
-#         "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
-#     )
 #     client._api_client._http_options.api_version = "v1beta1"
 #     tool = genai_types.Tool(
 #         function_declarations=[
@@ -80,10 +77,12 @@
 #             LLM_METRIC
 #         ],
 #         agent_info=types.AgentInfo(
+#             agent="project/123/locations/us-central1/reasoningEngines/456",
 #             name="agent-1",
 #             instruction="agent-1 instruction",
 #             tool_declarations=[tool],
 #         ),
+#         labels={"label1": "value1"},
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test4"
@@ -108,6 +107,10 @@
 #             tools=[tool],
 #         )
 #     )
+#     assert evaluation_run.labels == {
+#         "vertex-ai-evaluation-agent-engine-id": "456",
+#         "label1": "value1",
+#     }
 #     assert evaluation_run.error is None
 
 
@@ -127,6 +130,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
                 },
             )
         ),
+        labels={"label1": "value1"},
         dest=GCS_DEST,
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
@@ -150,6 +154,9 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
     )
     assert evaluation_run.inference_configs is None
+    assert evaluation_run.labels == {
+        "label1": "value1",
+    }
     assert evaluation_run.error is None
 
 
@@ -289,6 +296,8 @@ async def test_create_eval_run_async(client):
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None
     assert evaluation_run.error is None
+    assert evaluation_run.labels is None
+    assert evaluation_run.error is None
 
 
 pytestmark = pytest_helper.setup(
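
For context, the assertions above pin down the intended behavior of the new labels field: user-supplied labels come back verbatim on the created run, an agent run additionally carries the reserved vertex-ai-evaluation-agent-engine-id label parsed from the reasoning engine resource name, and a run created without labels reports labels as None. The following self-contained sketch only mirrors those expectations; the helper is hypothetical and not part of the SDK.

from typing import Optional


def expected_run_labels(
    user_labels: Optional[dict[str, str]],
    agent_engine: Optional[str] = None,
) -> Optional[dict[str, str]]:
    """Hypothetical helper mirroring what the tests expect EvaluationRun.labels to hold."""
    labels = dict(user_labels) if user_labels else {}
    if agent_engine:
        # The reserved label carries the reasoning engine ID taken from the resource name.
        labels["vertex-ai-evaluation-agent-engine-id"] = agent_engine.split(
            "reasoningEngines/"
        )[-1]
    return labels or None


assert expected_run_labels({"label1": "value1"}) == {"label1": "value1"}
assert expected_run_labels(
    {"label1": "value1"},
    "projects/123/locations/us-central1/reasoningEngines/456",
) == {
    "vertex-ai-evaluation-agent-engine-id": "456",
    "label1": "value1",
}
assert expected_run_labels(None) is None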

vertexai/_genai/evals.py

Lines changed: 36 additions & 0 deletions
@@ -77,6 +77,9 @@ def _CreateEvaluationRunParameters_to_vertex(
     if getv(from_object, ["evaluation_config"]) is not None:
         setv(to_object, ["evaluationConfig"], getv(from_object, ["evaluation_config"]))
 
+    if getv(from_object, ["labels"]) is not None:
+        setv(to_object, ["labels"], getv(from_object, ["labels"]))
+
     if getv(from_object, ["config"]) is not None:
         setv(to_object, ["config"], getv(from_object, ["config"]))
 
@@ -236,6 +239,9 @@ def _EvaluationRun_from_vertex(
     if getv(from_object, ["inferenceConfigs"]) is not None:
         setv(to_object, ["inference_configs"], getv(from_object, ["inferenceConfigs"]))
 
+    if getv(from_object, ["labels"]) is not None:
+        setv(to_object, ["labels"], getv(from_object, ["labels"]))
+
     return to_object
 
 
@@ -464,6 +470,7 @@ def _create_evaluation_run(
         display_name: Optional[str] = None,
         data_source: types.EvaluationRunDataSourceOrDict,
         evaluation_config: types.EvaluationRunConfigOrDict,
+        labels: Optional[dict[str, str]] = None,
        config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
        inference_configs: Optional[
            dict[str, types.EvaluationRunInferenceConfigOrDict]
@@ -478,6 +485,7 @@ def _create_evaluation_run(
             display_name=display_name,
             data_source=data_source,
             evaluation_config=evaluation_config,
+            labels=labels,
             config=config,
             inference_configs=inference_configs,
         )
@@ -1316,6 +1324,7 @@ def create_evaluation_run(
             list[types.EvaluationRunMetricOrDict]
         ] = None,  # TODO: Make required unified metrics available in prod.
         agent_info: Optional[types.AgentInfo] = None,
+        labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
         """Creates an EvaluationRun."""
@@ -1353,13 +1362,25 @@ def create_evaluation_run(
                     tools=agent_info.tool_declarations,
                 )
             )
+            if (
+                not agent_info.agent
+                or len(agent_info.agent.split("reasoningEngines/")) != 2
+            ):
+                raise ValueError(
+                    "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}."
+                )
+            labels = labels or {}
+            labels["vertex-ai-evaluation-agent-engine-id"] = agent_info.agent.split(
+                "reasoningEngines/"
+            )[-1]
 
         return self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
             display_name=display_name,
             data_source=dataset,
             evaluation_config=evaluation_config,
             inference_configs=inference_configs,
+            labels=labels,
             config=config,
         )
 
@@ -1566,6 +1587,7 @@ async def _create_evaluation_run(
         display_name: Optional[str] = None,
         data_source: types.EvaluationRunDataSourceOrDict,
         evaluation_config: types.EvaluationRunConfigOrDict,
+        labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
         inference_configs: Optional[
             dict[str, types.EvaluationRunInferenceConfigOrDict]
@@ -1580,6 +1602,7 @@ async def _create_evaluation_run(
             display_name=display_name,
             data_source=data_source,
             evaluation_config=evaluation_config,
+            labels=labels,
             config=config,
             inference_configs=inference_configs,
         )
@@ -2121,6 +2144,7 @@ async def create_evaluation_run(
             list[types.EvaluationRunMetricOrDict]
         ] = None,  # TODO: Make required unified metrics available in prod.
         agent_info: Optional[types.AgentInfo] = None,
+        labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
     ) -> types.EvaluationRun:
         """Creates an EvaluationRun."""
@@ -2158,13 +2182,25 @@ async def create_evaluation_run(
                     tools=agent_info.tool_declarations,
                 )
             )
+            if (
+                not agent_info.agent
+                or len(agent_info.agent.split("reasoningEngines/")) != 2
+            ):
+                raise ValueError(
+                    "agent_info.agent cannot be empty. Please provide a valid reasoning engine resource name in the format of projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine}."
+                )
+            labels = labels or {}
+            labels["vertex-ai-evaluation-agent-engine-id"] = agent_info.agent.split(
+                "reasoningEngines/"
+            )[-1]
 
         result = await self._create_evaluation_run(  # type: ignore[no-any-return]
             name=name,
             display_name=display_name,
             data_source=dataset,
             evaluation_config=evaluation_config,
             inference_configs=inference_configs,
+            labels=labels,
             config=config,
         )
 
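
On the wire, the converters treat labels as an opaque string map: _CreateEvaluationRunParameters_to_vertex copies it into the request body and _EvaluationRun_from_vertex copies it back onto the run, with no key or value transformation. A rough sketch of that round trip with plain dicts (getv/setv reduced to ordinary dict access; the label values are placeholders):

# Request side: SDK parameters -> Vertex request body.
sdk_params = {"labels": {"label1": "value1"}}
request_body = {}
if sdk_params.get("labels") is not None:
    request_body["labels"] = sdk_params["labels"]  # copied through unchanged

# Response side: Vertex response -> fields of the returned EvaluationRun.
vertex_response = {
    "labels": {"label1": "value1", "vertex-ai-evaluation-agent-engine-id": "456"}
}
run_fields = {}
if vertex_response.get("labels") is not None:
    run_fields["labels"] = vertex_response["labels"]

assert request_body["labels"] == {"label1": "value1"}
assert run_fields["labels"]["vertex-ai-evaluation-agent-engine-id"] == "456"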

vertexai/_genai/types.py

Lines changed: 17 additions & 0 deletions
@@ -1220,6 +1220,7 @@ class _CreateEvaluationRunParameters(_common.BaseModel):
     evaluation_config: Optional[EvaluationRunConfig] = Field(
         default=None, description=""""""
     )
+    labels: Optional[dict[str, str]] = Field(default=None, description="""""")
     config: Optional[CreateEvaluationRunConfig] = Field(
         default=None, description=""""""
     )
@@ -1243,6 +1244,9 @@ class _CreateEvaluationRunParametersDict(TypedDict, total=False):
     evaluation_config: Optional[EvaluationRunConfigDict]
     """"""
 
+    labels: Optional[dict[str, str]]
+    """"""
+
     config: Optional[CreateEvaluationRunConfigDict]
     """"""
 
@@ -1482,6 +1486,11 @@ class EventDict(TypedDict, total=False):
 class AgentInfo(_common.BaseModel):
     """The agent info of an agent, used for agent eval."""
 
+    agent: Optional[str] = Field(
+        default=None,
+        description="""The agent engine used to run agent. Agent engine resource name in str type, with format
+      `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""",
+    )
     name: Optional[str] = Field(
         default=None, description="""Agent name, used as an identifier."""
     )
@@ -1499,6 +1508,10 @@ class AgentInfo(_common.BaseModel):
 class AgentInfoDict(TypedDict, total=False):
     """The agent info of an agent, used for agent eval."""
 
+    agent: Optional[str]
+    """The agent engine used to run agent. Agent engine resource name in str type, with format
+    `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`."""
+
     name: Optional[str]
     """Agent name, used as an identifier."""
 
@@ -1919,6 +1932,7 @@ class EvaluationRun(_common.BaseModel):
         default=None,
         description="""This field is experimental and may change in future versions. The inference configs for the evaluation run.""",
     )
+    labels: Optional[dict[str, str]] = Field(default=None, description="""""")
 
     # TODO(b/448806531): Remove all the overridden _from_response methods once the
     # ticket is resolved and published.
@@ -2003,6 +2017,9 @@ class EvaluationRunDict(TypedDict, total=False):
     inference_configs: Optional[dict[str, "EvaluationRunInferenceConfigDict"]]
     """This field is experimental and may change in future versions. The inference configs for the evaluation run."""
 
+    labels: Optional[dict[str, str]]
+    """"""
+
 
 EvaluationRunOrDict = Union[EvaluationRun, EvaluationRunDict]
 

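Since the Dict variants added here are TypedDicts with total=False, the new fields can be exercised as partial plain dicts. A minimal sketch using the new agent and labels keys, assuming vertexai._genai.types is importable; the resource name and label values are placeholders:

from vertexai._genai import types

# New AgentInfo.agent field: the reasoning engine resource name from which the
# reserved evaluation label is derived.
agent_info: types.AgentInfoDict = {
    "agent": "projects/my-project/locations/us-central1/reasoningEngines/456",
    "name": "agent-1",
}

# New EvaluationRun.labels field, surfaced on the Dict variant as well.
run: types.EvaluationRunDict = {
    "labels": {
        "label1": "value1",
        "vertex-ai-evaluation-agent-engine-id": "456",
    },
}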