diff --git a/components/google-cloud/RELEASE.md b/components/google-cloud/RELEASE.md
index 88b1876cc96..8027c394856 100644
--- a/components/google-cloud/RELEASE.md
+++ b/components/google-cloud/RELEASE.md
@@ -6,6 +6,7 @@
 * Use `eval_dataset` for train-time evalutation when training a reward model. Requires `eval_dataset` to contain the same fields as the [preference dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-rlhf#human-preference-dataset).
 * Update the documentation of `GetModel`.
 * Add CMEK support to `preview.model_evaluation.autosxs_pipeline`.
+* Update component and pipeline inputs/outputs to support creating ModelEvaluations for ModelRegistry models in the AutoSxS pipeline.

 ## Release 2.10.0
 * Fix the missing output of pipeline remote runner. `AutoMLImageTrainingJobRunOp` now passes the model artifacts correctly to downstream components.
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py
index a12ecad8850..43935e144e4 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py
@@ -17,4 +17,4 @@
 DO NOT EDIT - This file is generated, manual changes will be overridden.
 """

-IMAGE_TAG = '20240310_1707'
+IMAGE_TAG = '20240313_1707'
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/model_evaluation_text_generation_pairwise.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/model_evaluation_text_generation_pairwise.py
index 88fed3bc3c4..433fe0a6ad9 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/model_evaluation_text_generation_pairwise.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/model_evaluation_text_generation_pairwise.py
@@ -33,15 +33,24 @@ def model_evaluation_text_generation_pairwise(
     judgments_dir: str,
     autosxs_metrics: dsl.Output[dsl.Metrics],  # pylint: disable=unused-argument # pytype: disable=unsupported-operands
     gcp_resources: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    model_a_evaluation_path: dsl.OutputPath(str),  # pylint: disable=unused-argument # pytype: disable=unsupported-operands
+    model_b_evaluation_path: dsl.OutputPath(str),  # pylint: disable=unused-argument # pytype: disable=unsupported-operands
+    evaluation_count_path: dsl.OutputPath(int),  # pylint: disable=unused-argument # pytype: disable=unsupported-operands
+    evaluation_dataset_path: dsl.OutputPath(str),  # pylint: disable=unused-argument # pytype: disable=unsupported-operands
     human_preference_column: str = '',
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
     location: str = _placeholders.LOCATION_PLACEHOLDER,
     encryption_spec_key_name: str = '',
+    model_a: str = '',
+    model_b: str = '',
+    evaluation_dataset: str = '',
+    evaluation_dataset_metadata: str = '',  # pylint: disable=unused-argument
+    task: str = '',
 ) -> dsl.ContainerSpec:  # pylint: disable=g-doc-args
   """Compute AutoSXS metrics using judgments outputs from Arbiter.

   Args:
-    judgments_dir: Path where store the Judgments.
+    judgments_dir: Path to store the Judgments.
     human_preference_column: The column containing ground truths.
       The default value is an empty string if not be provided by users.
     project: Project to upload evaluation metrics to.
@@ -49,10 +58,22 @@ def model_evaluation_text_generation_pairwise(
     encryption_spec_key_name: Customer-managed encryption key options. If this
       is set, then all resources created by the component will be encrypted
       with the provided encryption key.
+    model_a: Resource path for Model A.
+    model_b: Resource path for Model B.
+    evaluation_dataset: Path to the evaluation dataset.
+    evaluation_dataset_metadata: AutoSxS metrics metadata JSON string.
+    task: Task that was used for this AutoSxS run.

   Returns:
     autosxs_metrics: Autosxs win rate metrics and human alignment metrics.
     gcp_resources: Tracker for GCP resources created by this component.
+    model_a_evaluation_path: Path to write the ModelEvaluation for Model A to
+      if it is a ModelRegistry model.
+    model_b_evaluation_path: Path to write the ModelEvaluation for Model B to
+      if it is a ModelRegistry model.
+    evaluation_count_path: Path to write the evaluation count to.
+    evaluation_dataset_path: Path to write the path to the evaluation dataset.
+      This is needed because pipeline outputs must be component outputs.
   """
   return gcpc_utils.build_serverless_customjob_container_spec(
       project=project,
@@ -69,6 +90,15 @@ def model_evaluation_text_generation_pairwise(
           f'--project={project}',
           f'--location={location}',
           '--executor_input={{$.json_escape[1]}}',
+          f'--model_a={model_a}',
+          f'--model_b={model_b}',
+          f'--model_a_evaluation_path={model_a_evaluation_path}',
+          f'--model_b_evaluation_path={model_b_evaluation_path}',
+          f'--evaluation_count_path={evaluation_count_path}',
+          f'--evaluation_dataset_path={evaluation_dataset_path}',
+          f'--evaluation_dataset={evaluation_dataset}',
+          "--evaluation_dataset_metadata={{$.inputs.parameters['evaluation_dataset_metadata'].json_escape[0]}}",
+          f'--task={task}',
       ],
       encryption_spec_key_name=encryption_spec_key_name,
   ),
diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/model_based_llm_evaluation/autosxs/autosxs_pipeline.py b/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/model_based_llm_evaluation/autosxs/autosxs_pipeline.py
index 2db94da7dd9..683ed6be285 100644
--- a/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/model_based_llm_evaluation/autosxs/autosxs_pipeline.py
+++ b/components/google-cloud/google_cloud_pipeline_components/preview/model_evaluation/model_based_llm_evaluation/autosxs/autosxs_pipeline.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """Optimization AI Inference and AutoSxS pipeline function."""

-from typing import Any, Dict, List
+from typing import Any, Dict, List, NamedTuple

 from google_cloud_pipeline_components import _placeholders
 from google_cloud_pipeline_components._implementation.llm import batch_prediction_pairwise
@@ -21,6 +21,14 @@
 from google_cloud_pipeline_components._implementation.llm import online_evaluation_pairwise
 from kfp import dsl

+PipelineOutput = NamedTuple(
+    'Outputs',
+    model_a_evaluation_resource_name=str,
+    model_b_evaluation_resource_name=str,
+    evaluation_count=int,
+    evaluation_dataset_path=str,
+)
+

 # pylint: disable=dangerous-default-value,g-bare-generic,unused-argument
 @dsl.pipeline(
@@ -47,7 +55,7 @@ def autosxs_pipeline(
     bigquery_destination_prefix: str = '',
     experimental_args: Dict[str, Any] = {},
     encryption_spec_key_name: str = '',
-):
+) -> PipelineOutput:
   # fmt: off
   """Evaluates two models side-by-side using an arbiter model.

@@ -71,6 +79,12 @@ def autosxs_pipeline(
     bigquery_destination_prefix: BigQuery table to write judgments to if the specified format is 'bigquery'.
     experimental_args: Experimentally released arguments. Subject to change.
     encryption_spec_key_name: Customer-managed encryption key options. If this is set, then all resources created by the pipeline will be encrypted with the provided encryption key.
+
+  Returns:
+    model_a_evaluation_resource_name: The resource name of the ModelEvaluation for Model A if Model A is a ModelRegistry model.
+    model_b_evaluation_resource_name: The resource name of the ModelEvaluation for Model B if Model B is a ModelRegistry model.
+    evaluation_count: The number of evaluations included in this AutoSxS run.
+    evaluation_dataset_path: The path to the overall evaluation dataset, including judgments.
   """
   # fmt: on
   responses = batch_prediction_pairwise.batch_prediction_pairwise(
@@ -109,12 +123,29 @@ def autosxs_pipeline(
       encryption_spec_key_name=encryption_spec_key_name,
   ).set_display_name('AutoSxS Autorater')

-  model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
+  metrics = model_evaluation_text_generation_pairwise.model_evaluation_text_generation_pairwise(
       judgments_dir=winners.outputs['judgments_uri'],
       human_preference_column=human_preference_column,
       project=project,
       location=location,
       encryption_spec_key_name=encryption_spec_key_name,
+      model_a=model_a,
+      model_b=model_b,
+      evaluation_dataset=evaluation_dataset,
+      evaluation_dataset_metadata=winners.outputs['metadata'],
+      task=task,
   ).set_display_name(
       'AutoSxS Metrics'
   )
+
+  return PipelineOutput(
+      model_a_evaluation_resource_name=metrics.outputs[
+          'model_a_evaluation_path'
+      ],
+      model_b_evaluation_resource_name=metrics.outputs[
+          'model_b_evaluation_path'
+      ],
+      evaluation_count=metrics.outputs['evaluation_count_path'],
+      # The dataset path must be a component output for the pipeline to return it.
+      evaluation_dataset_path=metrics.outputs['evaluation_dataset_path'],
+  )
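
Usage sketch (illustrative only, not part of the patch above): the snippet below shows how the updated pipeline and its new outputs might be exercised, assuming it is compiled with the `kfp` compiler and launched through `google.cloud.aiplatform`. All project, location, bucket, model, and dataset values are hypothetical placeholders, and the prompt-parameter shapes follow the public AutoSxS documentation rather than anything introduced in this change.

# Sketch: compile the updated AutoSxS pipeline, run it on Vertex AI Pipelines,
# and note where the new pipeline-level outputs surface. Placeholder values only.
from google.cloud import aiplatform
from google_cloud_pipeline_components.preview.model_evaluation import autosxs_pipeline
from kfp import compiler

# Compile the pipeline function into a reusable pipeline spec.
compiler.Compiler().compile(autosxs_pipeline, 'autosxs_pipeline.yaml')

aiplatform.init(
    project='my-project',                     # placeholder project
    location='us-central1',
    staging_bucket='gs://my-bucket/staging',  # placeholder bucket
)

job = aiplatform.PipelineJob(
    display_name='autosxs-model-eval',
    template_path='autosxs_pipeline.yaml',
    parameter_values={
        'evaluation_dataset': 'gs://my-bucket/eval_dataset.jsonl',  # placeholder
        'task': 'summarization',
        'id_columns': ['id'],
        # Model A is a Model Registry resource, so the metrics component can
        # create a ModelEvaluation for it and surface its resource name.
        'model_a': 'projects/my-project/locations/us-central1/models/123',
        'model_a_prompt_parameters': {'prompt': {'column': 'document'}},
        # Model B responses are pre-generated and read from a dataset column.
        'response_column_b': 'baseline_response',
        'autorater_prompt_parameters': {
            'inference_context': {'column': 'document'},
            'inference_instruction': {'template': 'Summarize: {{ document }}'},
        },
    },
)
job.run()

# After the run completes, the new outputs (model_a_evaluation_resource_name,
# model_b_evaluation_resource_name, evaluation_count, evaluation_dataset_path)
# appear as pipeline-level outputs produced by the 'AutoSxS Metrics' task.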