
Commit

feat(components): Implement the train time evaluation in reward model training. With the train time eval dataset available, the pipeline outputs the accuracy and cross entropy metrics to the log.

PiperOrigin-RevId: 613057150
Googler committed Mar 6, 2024
1 parent 547a8ae commit 731cb81
Showing 6 changed files with 63 additions and 24 deletions.
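For context on the metrics named in the commit message, the sketch below shows one common way pairwise accuracy and cross entropy are computed for a reward model on preference data. The function name, inputs, and Bradley-Terry style link are illustrative assumptions, not the pipeline's internal trainer API.

```python
import math
from typing import Dict, Sequence


def preference_metrics(
    rewards_chosen: Sequence[float],
    rewards_rejected: Sequence[float],
) -> Dict[str, float]:
  """Pairwise accuracy and mean cross entropy for (chosen, rejected) reward pairs."""
  correct = 0
  total_ce = 0.0
  for r_chosen, r_rejected in zip(rewards_chosen, rewards_rejected):
    # Bradley-Terry style probability that the preferred response wins.
    p_chosen = 1.0 / (1.0 + math.exp(r_rejected - r_chosen))
    correct += int(r_chosen > r_rejected)
    # Cross entropy against the label "the chosen response is preferred".
    total_ce += -math.log(max(p_chosen, 1e-12))
  n = len(rewards_chosen)
  return {'accuracy': correct / n, 'cross_entropy': total_ce / n}


print(preference_metrics([2.1, 0.3], [1.4, 0.9]))
# -> {'accuracy': 0.5, 'cross_entropy': ~0.72}
```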
1 change: 1 addition & 0 deletions components/google-cloud/RELEASE.md
@@ -3,6 +3,7 @@
 * Fix bug in `preview.llm.rlhf_pipeline` that caused wrong output artifact to be used for inference after training.
 * Fix issue where AutoSxS was not propagating location to all sub-components.
 * Add CMEK support to `preview.llm.infer_pipeline`.
+* Use `eval_dataset` for train-time evaluation when training a reward model. Requires `eval_dataset` to contain the same fields as the [preference dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-rlhf#human-preference-dataset).

 ## Release 2.10.0
 * Fix the missing output of pipeline remote runner. `AutoMLImageTrainingJobRunOp` now passes the model artifacts correctly to downstream components.
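The new release note requires `eval_dataset` to carry the same fields as the preference dataset. The snippet below writes one example JSONL record with the four fields the new validator checks for (`input_text`, `candidate_0`, `candidate_1`, `choice`); the values, the `choice` encoding, and the local file name are illustrative assumptions.

```python
import json

# One JSONL record with the fields the validator checks for; values are invented.
record = {
    'input_text': 'Summarize: The quick brown fox jumps over the lazy dog.',
    'candidate_0': 'A fox jumps over a dog.',
    'candidate_1': 'Dogs are lazy animals.',
    'choice': 0,  # preferred candidate; see the linked preference dataset docs for the exact schema
}

with open('eval_dataset.jsonl', 'w') as f:  # local stand-in for a gs:// JSONL file
  f.write(json.dumps(record) + '\n')
```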
@@ -573,25 +573,32 @@ def get_empty_string() -> str:
 def validate_rlhf_inputs(
     large_model_reference: str,
     eval_dataset: Optional[str] = None,
-) -> None:
+) -> str:
   """Checks user-provided arguments are valid for the RLHF pipeline."""
-  models_that_support_bulk_inference = {
-      't5-small',
-      't5-large',
-      't5-xl',
-      't5-xxl',
-      'llama-2-7b',
-      'llama-2-7b-chat',
-      'llama-2-13b',
-      'llama-2-13b-chat',
-  }
-  if (
-      eval_dataset
-      and large_model_reference not in models_that_support_bulk_inference
-  ):
-    raise ValueError(
-        f'eval_dataset not supported for {large_model_reference}. '
-        'Please set this value to None when tuning this model. '
-        'This model can be evaluated after tuning using Batch or Online '
-        'Prediction.'
-    )
+  import json
+  import re
+  import glob
+
+  eval_dataset = eval_dataset or ''
+  gcs_eval_dataset_uri = re.sub('^gs://', '/gcs/', eval_dataset)
+  files_in_the_folder = glob.glob(gcs_eval_dataset_uri)
+  if not files_in_the_folder:
+    return ''
+  one_file = files_in_the_folder[0]
+  required_fields = ('input_text', 'candidate_0', 'candidate_1', 'choice')
+  is_valid_preference_data = True
+  remaining_lines_to_check = 100
+  empty_eval_dataset_for_reward_model = ''
+  with open(one_file, 'r') as inputs:
+    for line in inputs:
+      json_data = json.loads(line)
+      remaining_lines_to_check -= 1
+      is_valid_preference_data = is_valid_preference_data & all(
+          field in json_data for field in required_fields
+      )
+      if not is_valid_preference_data:
+        return empty_eval_dataset_for_reward_model
+      if remaining_lines_to_check == 0:
+        break
+
+  return eval_dataset
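The added `validate_rlhf_inputs` code above only inspects up to 100 lines of the first file matched by the `eval_dataset` glob and returns an empty string when a required field is missing. The standalone sketch below mirrors that check so a dataset can be verified locally before launching the pipeline; the helper name and file path are assumptions.

```python
import json

REQUIRED_FIELDS = ('input_text', 'candidate_0', 'candidate_1', 'choice')


def looks_like_preference_data(path: str, max_lines: int = 100) -> bool:
  """True if the first `max_lines` JSONL records all contain the required fields."""
  with open(path, 'r') as f:
    for line_number, line in enumerate(f):
      if line_number >= max_lines:
        break
      if not all(field in json.loads(line) for field in REQUIRED_FIELDS):
        return False
  return True


# The component above returns '' (an empty eval dataset) instead of failing when this is False.
print(looks_like_preference_data('eval_dataset.jsonl'))  # file from the earlier example
```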
@@ -17,4 +17,4 @@
 DO NOT EDIT - This file is generated, manual changes will be overridden.
 """

-IMAGE_TAG = '20240303_0507_RC00'
+IMAGE_TAG = '20240305_0507'
@@ -45,6 +45,7 @@ def pipeline(
     lora_dim: int = 4,
     reward_model_learning_rate_multiplier: float = 1.0,
     reward_model_train_steps: int = 1000,
+    eval_dataset: Optional[str] = None,
     instruction: Optional[str] = None,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
     location: str = _placeholders.LOCATION_PLACEHOLDER,
@@ -119,6 +120,25 @@ def pipeline(
       .set_caching_options(False)
   )

+  preference_eval_dataset_importer = (
+      private_text_comparison_importer.private_text_comparison_importer(
+          project=project,
+          location=location,
+          input_text=eval_dataset,
+          inputs_field_name=prompt_column,
+          comma_separated_candidates_field_names=comma_separated_candidates_field_names.output,
+          choice_field_name=choice_column,
+          split=env.TRAIN_SPLIT,
+          large_model_reference=reference_model_metadata.outputs[
+              'reward_model_reference'
+          ],
+          instruction=instruction,
+          encryption_spec_key_name=encryption_spec_key_name,
+      )
+      .set_display_name('Import Preference Eval Dataset')
+      .set_caching_options(False)
+  )
+
   reward_model_image_uri = function_based.resolve_private_refined_image_uri(
       accelerator_type=machine_spec.outputs['accelerator_type'],
   ).set_display_name('Resolve Reward Model Image URI')
@@ -137,6 +157,9 @@ def pipeline(
           input_dataset_path=preference_dataset_importer.outputs[
               'output_dataset_path'
           ],
+          eval_dataset_path=preference_eval_dataset_importer.outputs[
+              'output_dataset_path'
+          ],
           train_steps=reward_model_train_steps,
           accelerator_type=machine_spec.outputs['accelerator_type'],
           accelerator_count=machine_spec.outputs['accelerator_count'],
@@ -35,6 +35,7 @@ def reward_model_trainer(
     output_adapter_path: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     tensorboard_metrics: kfp.dsl.Output[kfp.dsl.Artifact],  # pytype: disable=unsupported-operands
     gcp_resources: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    eval_dataset_path: str = '',
     train_split: str = 'train',
     batch_size: int = 64,
     learning_rate_multiplier: float = 1.0,
@@ -49,6 +50,8 @@ def reward_model_trainer(
     location: Location used to run the job.
     input_model_path: Path to the base model to fine tune.
     input_dataset_path: Path to dataset to use to train a reward model.
+    eval_dataset_path: Path to eval dataset to use during the reward model
+      training.
     train_steps: Number of training steps. These are the number of steps on top
       of any steps used to train the base model.
     accelerator_type: Type of TPU accelerator. Can be either TPU_V2 or TPU_V3.
@@ -94,6 +97,7 @@ def reward_model_trainer(
           f'--train_steps={train_steps}',
           f'--input_model_path={input_model_path}',
           f'--input_dataset_path={input_dataset_path}',
+          f'--eval_dataset_path={eval_dataset_path}',
           f'--output_adapter_path={output_adapter_path}',
           f'--tensorboard_metrics_path={tensorboard_metrics.path}',
           f'--large_model_reference={large_model_reference}',
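The trainer image that consumes these flags is not part of this repository, so the following is only an illustration of how an entry point might treat the new `--eval_dataset_path` flag as optional, skipping train-time evaluation when it is empty; all argparse names here are hypothetical.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_dataset_path', required=True)
parser.add_argument('--eval_dataset_path', default='')  # empty string means "no eval dataset"
parser.add_argument('--train_steps', type=int, default=1000)
args, _ = parser.parse_known_args()

if args.eval_dataset_path:
  # Hypothetical: evaluate on this dataset during training and log accuracy / cross entropy.
  print(f'Train-time evaluation enabled on {args.eval_dataset_path}')
else:
  print('No eval dataset provided; skipping train-time evaluation.')
```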
@@ -71,7 +71,7 @@ def rlhf_pipeline(
     kl_coeff: Coefficient for KL penalty. This regularizes the policy model and penalizes if it diverges from its initial distribution. If set to 0, the reference language model is not loaded into memory. Default value is 0.1.
     instruction: This field lets the model know what task it needs to perform. Base models have been trained over a large set of varied instructions. You can give a simple and intuitive description of the task and the model will follow it, e.g. "Classify this movie review as positive or negative" or "Translate this sentence to Danish". Do not specify this if your dataset already prepends the instruction to the inputs field.
     deploy_model: Whether to deploy the model to an endpoint in `us-central1`. Default is True.
-    eval_dataset: Optional Cloud storage path to an evaluation dataset. Note, eval dataset can only be provided for third-party models. If provided, inference will be performed on this dataset after training. The dataset format is jsonl. Each example in the dataset must contain a field `input_text` that contains the prompt.
+    eval_dataset: Optional Cloud storage path to an evaluation dataset. The dataset format is jsonl. The evaluation dataset can be used to compute train-time metrics (when training a reward model) or perform bulk inference for third-party models. To compute train-time metrics this dataset must contain the same fields as the preference dataset. For bulk inference with third-party models only `input_text` is needed. Note, train-time metrics are only computed for the first 5000 samples in the dataset for efficient evaluation during training.
     project: Project used to run custom jobs. If not specified the project used to run the pipeline will be used.
     location: Location used to run custom jobs. If not specified the location used to run the pipeline will be used.
     encryption_spec_key_name: Customer-managed encryption key. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key. Note that this is not supported for TPU at the moment.
@@ -82,6 +82,10 @@ def rlhf_pipeline(
     endpoint_resource_name: Path the Online Prediction Endpoint. This will be an empty string if the model was not deployed.
   """
   # fmt: on
+  reward_model_eval_dataset = function_based.validate_rlhf_inputs(
+      large_model_reference=large_model_reference,
+      eval_dataset=eval_dataset,
+  ).set_display_name('Validate Inputs')

   # LoRA dim for reward model
   reward_lora_dim = 4
@@ -105,6 +109,7 @@ def rlhf_pipeline(
           large_model_reference=large_model_reference,
           prompt_sequence_length=prompt_sequence_length,
           target_sequence_length=target_sequence_length,
+          eval_dataset=reward_model_eval_dataset.output,
           instruction=instruction,
           reward_model_learning_rate_multiplier=reward_model_learning_rate_multiplier,
           reward_model_train_steps=reward_model_train_steps,
@@ -118,7 +123,6 @@ def rlhf_pipeline(
       .set_display_name('Train Reward Model')
       .after(validate_pipeline_task)
   )
-
   rl_model_pipeline = reinforcement_learning_graph.pipeline(
       prompt_dataset=prompt_dataset,
       input_reward_model_path=reward_model_pipeline.outputs[
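For callers, a hedged sketch of compiling and launching `preview.llm.rlhf_pipeline` with an eval dataset follows. The import path matches how the release notes refer to the pipeline; parameter names not visible in this diff (such as `preference_dataset`) and all `gs://` paths, project, and location values are placeholders or assumptions.

```python
from kfp import compiler
from google.cloud import aiplatform
from google_cloud_pipeline_components.preview.llm import rlhf_pipeline

# Compile the preview RLHF pipeline to a local package file.
compiler.Compiler().compile(rlhf_pipeline, 'rlhf_pipeline.yaml')

aiplatform.init(project='my-project', location='us-central1')  # placeholders

job = aiplatform.PipelineJob(
    display_name='rlhf-with-train-time-eval',
    template_path='rlhf_pipeline.yaml',
    pipeline_root='gs://my-bucket/pipeline_root',  # placeholder bucket
    parameter_values={
        'prompt_dataset': 'gs://my-bucket/prompt/*.jsonl',          # placeholder
        'preference_dataset': 'gs://my-bucket/preference/*.jsonl',  # assumed parameter name
        'eval_dataset': 'gs://my-bucket/eval/*.jsonl',  # same schema as the preference dataset
        'large_model_reference': 'llama-2-7b',
        'reward_model_train_steps': 1000,
    },
)
job.submit()
```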
