
Commit

feat(components): Implement the train time evaluation in reward model training. With the train time eval dataset available, the pipeline outputs the accuracy and cross entropy metrics to the log.

PiperOrigin-RevId: 613057150
Googler committed Mar 6, 2024
1 parent 547a8ae commit 731cb81
Showing 6 changed files with 63 additions and 24 deletions.
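For context on the metrics named in the commit message, the sketch below shows one common way pairwise accuracy and cross entropy are computed for a reward model on preference data. The function name, inputs, and Bradley-Terry style link are illustrative assumptions, not the pipeline's internal trainer API.

```python
import math
from typing import Dict, Sequence


def preference_metrics(
    rewards_chosen: Sequence[float],
    rewards_rejected: Sequence[float],
) -> Dict[str, float]:
  """Pairwise accuracy and mean cross entropy for (chosen, rejected) reward pairs."""
  correct = 0
  total_ce = 0.0
  for r_chosen, r_rejected in zip(rewards_chosen, rewards_rejected):
    # Bradley-Terry style probability that the preferred response wins.
    p_chosen = 1.0 / (1.0 + math.exp(r_rejected - r_chosen))
    correct += int(r_chosen > r_rejected)
    # Cross entropy against the label "the chosen response is preferred".
    total_ce += -math.log(max(p_chosen, 1e-12))
  n = len(rewards_chosen)
  return {'accuracy': correct / n, 'cross_entropy': total_ce / n}


print(preference_metrics([2.1, 0.3], [1.4, 0.9]))
# -> {'accuracy': 0.5, 'cross_entropy': ~0.72}
```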
1 change: 1 addition & 0 deletions components/google-cloud/RELEASE.md
@@ -3,6 +3,7 @@
 * Fix bug in `preview.llm.rlhf_pipeline` that caused wrong output artifact to be used for inference after training.
 * Fix issue where AutoSxS was not propagating location to all sub-components.
 * Add CMEK support to `preview.llm.infer_pipeline`.
+* Use `eval_dataset` for train-time evaluation when training a reward model. Requires `eval_dataset` to contain the same fields as the [preference dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-rlhf#human-preference-dataset).

 ## Release 2.10.0
 * Fix the missing output of pipeline remote runner. `AutoMLImageTrainingJobRunOp` now passes the model artifacts correctly to downstream components.
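The new release note requires `eval_dataset` to carry the same fields as the preference dataset. The snippet below writes one example JSONL record with the four fields the new validator checks for (`input_text`, `candidate_0`, `candidate_1`, `choice`); the values, the `choice` encoding, and the local file name are illustrative assumptions.

```python
import json

# One JSONL record with the fields the validator checks for; values are invented.
record = {
    'input_text': 'Summarize: The quick brown fox jumps over the lazy dog.',
    'candidate_0': 'A fox jumps over a dog.',
    'candidate_1': 'Dogs are lazy animals.',
    'choice': 0,  # preferred candidate; see the linked preference dataset docs for the exact schema
}

with open('eval_dataset.jsonl', 'w') as f:  # local stand-in for a gs:// JSONL file
  f.write(json.dumps(record) + '\n')
```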
@@ -573,25 +573,32 @@ def get_empty_string() -> str:
 def validate_rlhf_inputs(
     large_model_reference: str,
     eval_dataset: Optional[str] = None,
-) -> None:
+) -> str:
   """Checks user-provided arguments are valid for the RLHF pipeline."""
-  models_that_support_bulk_inference = {
-      't5-small',
-      't5-large',
-      't5-xl',
-      't5-xxl',
-      'llama-2-7b',
-      'llama-2-7b-chat',
-      'llama-2-13b',
-      'llama-2-13b-chat',
-  }
-  if (
-      eval_dataset
-      and large_model_reference not in models_that_support_bulk_inference
-  ):
-    raise ValueError(
-        f'eval_dataset not supported for {large_model_reference}. '
-        'Please set this value to None when tuning this model. '
-        'This model can be evaluated after tuning using Batch or Online '
-        'Prediction.'
-    )
+  import json
+  import re
+  import glob
+
+  eval_dataset = eval_dataset or ''
+  gcs_eval_dataset_uri = re.sub('^gs://', '/gcs/', eval_dataset)
+  files_in_the_folder = glob.glob(gcs_eval_dataset_uri)
+  if not files_in_the_folder:
+    return ''
+  one_file = files_in_the_folder[0]
+  required_fields = ('input_text', 'candidate_0', 'candidate_1', 'choice')
+  is_valid_preference_data = True
+  remaining_lines_to_check = 100
+  empty_eval_dataset_for_reward_model = ''
+  with open(one_file, 'r') as inputs:
+    for line in inputs:
+      json_data = json.loads(line)
+      remaining_lines_to_check -= 1
+      is_valid_preference_data = is_valid_preference_data & all(
+          field in json_data for field in required_fields
+      )
+      if not is_valid_preference_data:
+        return empty_eval_dataset_for_reward_model
+      if remaining_lines_to_check == 0:
+        break
+
+  return eval_dataset
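The added `validate_rlhf_inputs` code above only inspects up to 100 lines of the first file matched by the `eval_dataset` glob and returns an empty string when a required field is missing. The standalone sketch below mirrors that check so a dataset can be verified locally before launching the pipeline; the helper name and file path are assumptions.

```python
import json

REQUIRED_FIELDS = ('input_text', 'candidate_0', 'candidate_1', 'choice')


def looks_like_preference_data(path: str, max_lines: int = 100) -> bool:
  """True if the first `max_lines` JSONL records all contain the required fields."""
  with open(path, 'r') as f:
    for line_number, line in enumerate(f):
      if line_number >= max_lines:
        break
      if not all(field in json.loads(line) for field in REQUIRED_FIELDS):
        return False
  return True


# The component above returns '' (an empty eval dataset) instead of failing when this is False.
print(looks_like_preference_data('eval_dataset.jsonl'))  # file from the earlier example
```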
@@ -17,4 +17,4 @@
 DO NOT EDIT - This file is generated, manual changes will be overridden.
 """

-IMAGE_TAG = '20240303_0507_RC00'
+IMAGE_TAG = '20240305_0507'
@@ -45,6 +45,7 @@ def pipeline(
     lora_dim: int = 4,
     reward_model_learning_rate_multiplier: float = 1.0,
     reward_model_train_steps: int = 1000,
+    eval_dataset: Optional[str] = None,
     instruction: Optional[str] = None,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
     location: str = _placeholders.LOCATION_PLACEHOLDER,
@@ -119,6 +120,25 @@ def pipeline(
       .set_caching_options(False)
   )

+  preference_eval_dataset_importer = (
+      private_text_comparison_importer.private_text_comparison_importer(
+          project=project,
+          location=location,
+          input_text=eval_dataset,
+          inputs_field_name=prompt_column,
+          comma_separated_candidates_field_names=comma_separated_candidates_field_names.output,
+          choice_field_name=choice_column,
+          split=env.TRAIN_SPLIT,
+          large_model_reference=reference_model_metadata.outputs[
+              'reward_model_reference'
+          ],
+          instruction=instruction,
+          encryption_spec_key_name=encryption_spec_key_name,
+      )
+      .set_display_name('Import Preference Eval Dataset')
+      .set_caching_options(False)
+  )
+
   reward_model_image_uri = function_based.resolve_private_refined_image_uri(
       accelerator_type=machine_spec.outputs['accelerator_type'],
   ).set_display_name('Resolve Reward Model Image URI')
@@ -137,6 +157,9 @@ def pipeline(
           input_dataset_path=preference_dataset_importer.outputs[
               'output_dataset_path'
           ],
+          eval_dataset_path=preference_eval_dataset_importer.outputs[
+              'output_dataset_path'
+          ],
           train_steps=reward_model_train_steps,
           accelerator_type=machine_spec.outputs['accelerator_type'],
           accelerator_count=machine_spec.outputs['accelerator_count'],
@@ -35,6 +35,7 @@ def reward_model_trainer(
     output_adapter_path: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     tensorboard_metrics: kfp.dsl.Output[kfp.dsl.Artifact],  # pytype: disable=unsupported-operands
     gcp_resources: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    eval_dataset_path: str = '',
     train_split: str = 'train',
     batch_size: int = 64,
     learning_rate_multiplier: float = 1.0,
@@ -49,6 +50,8 @@ def reward_model_trainer(
     location: Location used to run the job.
     input_model_path: Path to the base model to fine tune.
     input_dataset_path: Path to dataset to use to train a reward model.
+    eval_dataset_path: Path to eval dataset to use during the reward model
+      training.
     train_steps: Number of training steps. These are the number of steps on top
       of any steps used to train the base model.
     accelerator_type: Type of TPU accelerator. Can be either TPU_V2 or TPU_V3.
@@ -94,6 +97,7 @@ def reward_model_trainer(
           f'--train_steps={train_steps}',
           f'--input_model_path={input_model_path}',
           f'--input_dataset_path={input_dataset_path}',
+          f'--eval_dataset_path={eval_dataset_path}',
           f'--output_adapter_path={output_adapter_path}',
           f'--tensorboard_metrics_path={tensorboard_metrics.path}',
           f'--large_model_reference={large_model_reference}',
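The trainer image that consumes these flags is not part of this repository, so the following is only an illustration of how an entry point might treat the new `--eval_dataset_path` flag as optional, skipping train-time evaluation when it is empty; all argparse names here are hypothetical.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_dataset_path', required=True)
parser.add_argument('--eval_dataset_path', default='')  # empty string means "no eval dataset"
parser.add_argument('--train_steps', type=int, default=1000)
args, _ = parser.parse_known_args()

if args.eval_dataset_path:
  # Hypothetical: evaluate on this dataset during training and log accuracy / cross entropy.
  print(f'Train-time evaluation enabled on {args.eval_dataset_path}')
else:
  print('No eval dataset provided; skipping train-time evaluation.')
```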
@@ -71,7 +71,7 @@ def rlhf_pipeline(
     kl_coeff: Coefficient for KL penalty. This regularizes the policy model and penalizes if it diverges from its initial distribution. If set to 0, the reference language model is not loaded into memory. Default value is 0.1.
     instruction: This field lets the model know what task it needs to perform. Base models have been trained over a large set of varied instructions. You can give a simple and intuitive description of the task and the model will follow it, e.g. "Classify this movie review as positive or negative" or "Translate this sentence to Danish". Do not specify this if your dataset already prepends the instruction to the inputs field.
     deploy_model: Whether to deploy the model to an endpoint in `us-central1`. Default is True.
-    eval_dataset: Optional Cloud storage path to an evaluation dataset. Note, eval dataset can only be provided for third-party models. If provided, inference will be performed on this dataset after training. The dataset format is jsonl. Each example in the dataset must contain a field `input_text` that contains the prompt.
+    eval_dataset: Optional Cloud storage path to an evaluation dataset. The dataset format is jsonl. The evaluation dataset can be used to compute train-time metrics (when training a reward model) or perform bulk inference for third-party models. To compute train-time metrics this dataset must contain the same fields as the preference dataset. For bulk inference with third-party models only `input_text` is needed. Note, train-time metrics are only computed for the first 5000 samples in the dataset for efficient evaluation during training.
     project: Project used to run custom jobs. If not specified the project used to run the pipeline will be used.
     location: Location used to run custom jobs. If not specified the location used to run the pipeline will be used.
     encryption_spec_key_name: Customer-managed encryption key. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key. Note that this is not supported for TPU at the moment.
@@ -82,6 +82,10 @@ def rlhf_pipeline(
     endpoint_resource_name: Path the Online Prediction Endpoint. This will be an empty string if the model was not deployed.
   """
   # fmt: on
+  reward_model_eval_dataset = function_based.validate_rlhf_inputs(
+      large_model_reference=large_model_reference,
+      eval_dataset=eval_dataset,
+  ).set_display_name('Validate Inputs')

   # LoRA dim for reward model
   reward_lora_dim = 4
@@ -105,6 +109,7 @@ def rlhf_pipeline(
           large_model_reference=large_model_reference,
           prompt_sequence_length=prompt_sequence_length,
           target_sequence_length=target_sequence_length,
+          eval_dataset=reward_model_eval_dataset.output,
           instruction=instruction,
           reward_model_learning_rate_multiplier=reward_model_learning_rate_multiplier,
           reward_model_train_steps=reward_model_train_steps,
@@ -118,7 +123,6 @@ def rlhf_pipeline(
       .set_display_name('Train Reward Model')
       .after(validate_pipeline_task)
   )
-
   rl_model_pipeline = reinforcement_learning_graph.pipeline(
       prompt_dataset=prompt_dataset,
       input_reward_model_path=reward_model_pipeline.outputs[
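For callers, a hedged sketch of compiling and launching `preview.llm.rlhf_pipeline` with an eval dataset follows. The import path matches how the release notes refer to the pipeline; parameter names not visible in this diff (such as `preference_dataset`) and all `gs://` paths, project, and location values are placeholders or assumptions.

```python
from kfp import compiler
from google.cloud import aiplatform
from google_cloud_pipeline_components.preview.llm import rlhf_pipeline

# Compile the preview RLHF pipeline to a local package file.
compiler.Compiler().compile(rlhf_pipeline, 'rlhf_pipeline.yaml')

aiplatform.init(project='my-project', location='us-central1')  # placeholders

job = aiplatform.PipelineJob(
    display_name='rlhf-with-train-time-eval',
    template_path='rlhf_pipeline.yaml',
    pipeline_root='gs://my-bucket/pipeline_root',  # placeholder bucket
    parameter_values={
        'prompt_dataset': 'gs://my-bucket/prompt/*.jsonl',          # placeholder
        'preference_dataset': 'gs://my-bucket/preference/*.jsonl',  # assumed parameter name
        'eval_dataset': 'gs://my-bucket/eval/*.jsonl',  # same schema as the preference dataset
        'large_model_reference': 'llama-2-7b',
        'reward_model_train_steps': 1000,
    },
)
job.submit()
```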
