From 6180127466cdd44ce1b7730d6989ff33233d57c7 Mon Sep 17 00:00:00 2001 From: iusztinpaul Date: Sat, 19 Oct 2024 17:15:18 +0300 Subject: [PATCH] feat: Add SageMaker evaluation code --- README.md | 11 + configs/evaluating.yaml | 9 + configs/training.yaml | 4 +- llm_engineering/model/evaluation/__init__.py | 0 llm_engineering/model/evaluation/evaluate.py | 225 ++++++++++++++++++ .../model/evaluation/requirements.txt | 5 + llm_engineering/model/evaluation/sagemaker.py | 57 +++++ llm_engineering/model/finetuning/finetune.py | 9 +- .../model/finetuning/requirements.txt | 1 - pipelines/__init__.py | 2 + pipelines/evaluating.py | 12 + pyproject.toml | 1 + steps/__init__.py | 4 +- steps/evaluating/__init__.py | 3 + steps/evaluating/evaluate.py | 12 + tools/run.py | 15 ++ 16 files changed, 363 insertions(+), 7 deletions(-) create mode 100644 configs/evaluating.yaml create mode 100644 llm_engineering/model/evaluation/__init__.py create mode 100644 llm_engineering/model/evaluation/evaluate.py create mode 100644 llm_engineering/model/evaluation/requirements.txt create mode 100644 llm_engineering/model/evaluation/sagemaker.py create mode 100644 pipelines/evaluating.py create mode 100644 steps/evaluating/__init__.py create mode 100644 steps/evaluating/evaluate.py diff --git a/README.md b/README.md index 1b78c6c..6b19152 100644 --- a/README.md +++ b/README.md @@ -355,6 +355,12 @@ poetry poe run-training-pipeline ``` This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard. +We start the evaluation pipeline through ZenML by running the following: +```shell +poetry poe run-evaluation-pipeline +``` +This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can visualize the results in `*-results` datasets saved to your HuggingFace profile. + ### Inference For creating an AWS SageMaker Inference Endpoint, run: @@ -471,6 +477,11 @@ Run the training pipeline: poetry poe run-training-pipeline ``` +Run the evaluation pipeline: +```shell +poetry poe run-evaluation-pipeline +``` + > [!WARNING] > For this to work, make sure you properly configured AWS SageMaker as described in [Set up cloud infrastructure (for production)](#set-up-cloud-infrastructure-for-production). diff --git a/configs/evaluating.yaml b/configs/evaluating.yaml new file mode 100644 index 0000000..ec91ef9 --- /dev/null +++ b/configs/evaluating.yaml @@ -0,0 +1,9 @@ +settings: + docker: + parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest + skip_build: True + orchestrator.sagemaker: + synchronous: false + +parameters: + is_dummy: true # Change this to 'false' to run the evaluation on the full dataset. diff --git a/configs/training.yaml b/configs/training.yaml index 7ba8f15..38bf83c 100644 --- a/configs/training.yaml +++ b/configs/training.yaml @@ -7,8 +7,8 @@ settings: parameters: finetuning_type: sft - num_train_epochs: 1 # 3 + num_train_epochs: 3 per_device_train_batch_size: 2 learning_rate: 3e-4 dataset_huggingface_workspace: mlabonne - is_dummy: true + is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs. 
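As a rough sketch of what the new `run-evaluation-pipeline` poe task ends up doing, the `configs/evaluating.yaml` file above is handed to ZenML together with a timestamped run name, mirroring the `tools/run.py` changes at the end of this patch. The snippet below is illustrative only and not part of the patch:

```python
from datetime import datetime as dt
from pathlib import Path

from pipelines import evaluating  # pipeline added later in this patch

pipeline_args = {
    "config_path": Path("configs") / "evaluating.yaml",  # Docker image, SageMaker orchestrator, is_dummy
    "run_name": f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}",
}

# ZenML reads the orchestrator settings and the `is_dummy` parameter from the
# YAML and submits the run asynchronously (synchronous: false).
evaluating.with_options(**pipeline_args)()
```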
diff --git a/llm_engineering/model/evaluation/__init__.py b/llm_engineering/model/evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llm_engineering/model/evaluation/evaluate.py b/llm_engineering/model/evaluation/evaluate.py new file mode 100644 index 0000000..b83efe5 --- /dev/null +++ b/llm_engineering/model/evaluation/evaluate.py @@ -0,0 +1,225 @@ +import concurrent.futures +import gc +import json +import os + +from datasets import Dataset, load_dataset +from huggingface_hub import HfApi +from huggingface_hub.utils import RepositoryNotFoundError +from openai import OpenAI +from tqdm.auto import tqdm +from vllm import LLM, SamplingParams + +OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] +DATASET_HUGGINGFACE_WORKSPACE = os.environ["DATASET_HUGGINGFACE_WORKSPACE"] +MODEL_HUGGINGFACE_WORKSPACE = os.environ["MODEL_HUGGINGFACE_WORKSPACE"] +IS_DUMMY = os.environ.get("IS_DUMMY", False) + +print("====== EVAL PARAMETERS ======") # noqa +print(f"{DATASET_HUGGINGFACE_WORKSPACE=}") # noqa +print(f"{MODEL_HUGGINGFACE_WORKSPACE=}") # noqa +print(f"{IS_DUMMY=}") # noqa +print("=============================") # noqa + + +def generate_answers(model_id: str, dataset_name: str): + def format(sample): + return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format( + sample["instruction"] + ) + + dataset = load_dataset(dataset_name, split="test") + if IS_DUMMY: + dataset = dataset.select(range(10)) + print(f"Dataset size: {len(dataset)}") # noqa + dataset = dataset.map(lambda sample: {"prompt": format(sample)}) + + print(f"Generating answers for {model_id}") # noqa + llm = LLM(model=model_id, max_model_len=2048) + sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=2048) + outputs = llm.generate(dataset["prompt"], sampling_params) + + answers = [output.outputs[0].text for output in outputs] + dataset = dataset.add_column("answers", answers) + + print(f"Uploading results for {model_id}") # noqa + dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results") + gc.collect() + + return dataset + + +def evaluate_answer(instruction: str, answer: str, client: OpenAI) -> dict: + prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria: +1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic. +2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language. + +Accuracy scale: +1 (Poor): Contains factual errors or misleading information +2 (Good): Mostly accurate with minor errors or omissions +3 (Excellent): Highly accurate and comprehensive + +Style scale: +1 (Poor): Too formal, uses some overly complex words +2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions +3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary + +Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture. +Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks. 
+ +Instruction: {instruction} + +Answer: {answer} + +Provide your evaluation in JSON format with the following structure: +{{ + "accuracy": {{ + "analysis": "...", + "score": 0 + }}, + "style": {{ + "analysis": "...", + "score": 0 + }} +}} +""" + + completion = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.", + }, + {"role": "user", "content": prompt}, + ], + response_format={"type": "json_object"}, + max_tokens=1000, + temperature=0.9, + ) + + # Parse the structured output + return json.loads(completion.choices[0].message.content) + + +def evaluate_batch(batch, start_index): + client = OpenAI(api_key=OPENAI_API_KEY) + return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)] + + +def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset: + # Load the dataset + dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all") + + # Create batches of instruction-answer pairs with their original indices + batches = [ + (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False))) + for i in range(0, len(dataset), batch_size) + ] + + evaluations = [None] * len(dataset) + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: + futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches] + + for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + for index, evaluation in future.result(): + evaluations[index] = evaluation + + # Replace the 'evaluation' column if it exists, otherwise add it + if "evaluation" in dataset.column_names: + dataset = dataset.remove_columns(["evaluation"]) + dataset = dataset.add_column("evaluation", evaluations) + + # Post-process evaluations + accuracy_scores = [] + style_scores = [] + + for evaluation in dataset["evaluation"]: + try: + eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation + accuracy_score = eval_dict["accuracy"]["score"] + style_score = eval_dict["style"]["score"] + + accuracy_scores.append(accuracy_score) + style_scores.append(style_score) + + except (json.JSONDecodeError, KeyError, TypeError): + # If there's an error, append None to maintain alignment + accuracy_scores.append(None) + style_scores.append(None) + + # Add new columns to the dataset + if "accuracy" in dataset.column_names: + dataset = dataset.remove_columns(["accuracy"]) + dataset = dataset.add_column("accuracy", accuracy_scores) + if "style" in dataset.column_names: + dataset = dataset.remove_columns(["style"]) + dataset = dataset.add_column("style", style_scores) + + dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results") + + return dataset + + +def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str: + api = HfApi() + + try: + api.model_info(model_id) + print(f"Found model on HF: '{model_id}'.") # noqa + except RepositoryNotFoundError: + print(f"Model '{model_id}' does not exist.") # noqa + model_id = default_value + print(f"Defaulting to '{model_id}'") # noqa + print("Train your own model to avoid this behavior.") # noqa + + return model_id + + +def check_if_huggingface_dataset_exists(dataset_id: str, 
default_value: str) -> str: + api = HfApi() + + try: + api.dataset_info(dataset_id) + print(f"Found dataset on HF: '{dataset_id}'.") # noqa + except RepositoryNotFoundError: + print(f"Dataset '{dataset_id}' does not exist.") # noqa + dataset_id = default_value + print(f"Defaulting to '{dataset_id}'") # noqa + print("Use a valid dataset or create your own to avoid this behavior.") # noqa + + return dataset_id + + +model_ids = [ + check_if_huggingface_model_exists( + f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B", default_value="mlabonne/TwinLlama-3.1-8B" + ), + check_if_huggingface_model_exists( + f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-DPO", default_value="mlabonne/TwinLlama-3.1-8B-DPO" + ), + "meta-llama/Meta-Llama-3.1-8B-Instruct", +] + +if __name__ == "__main__": + # Run generation + for model_id in model_ids: + dataset_name = check_if_huggingface_dataset_exists( + f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin", default_value="mlabonne/llmtwin" + ) + generate_answers(model_id, dataset_name=dataset_name) + + # Run evaluation + for model_id in model_ids: + evaluate_answers(model_id) + + # Analyze results + for model_id in model_ids: + dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all") + + score = sum(dataset["accuracy"]) / len(dataset["accuracy"]) + print(f"{model_id.split('/')[-1]} - Accuracy: {score:.2f}") # noqa + + score = sum(dataset["style"]) / len(dataset["style"]) + print(f"{model_id.split('/')[-1]} - Style: {score:.2f}") # noqa diff --git a/llm_engineering/model/evaluation/requirements.txt b/llm_engineering/model/evaluation/requirements.txt new file mode 100644 index 0000000..b62ab24 --- /dev/null +++ b/llm_engineering/model/evaluation/requirements.txt @@ -0,0 +1,5 @@ +transformers==4.43.3 +datasets==2.20.0 +vllm==0.6.1.post2 +tqdm==4.66.4 +openai==1.52.0 \ No newline at end of file diff --git a/llm_engineering/model/evaluation/sagemaker.py b/llm_engineering/model/evaluation/sagemaker.py new file mode 100644 index 0000000..a3c11f2 --- /dev/null +++ b/llm_engineering/model/evaluation/sagemaker.py @@ -0,0 +1,57 @@ +from pathlib import Path + +from huggingface_hub import HfApi +from loguru import logger +from sagemaker.huggingface import HuggingFaceProcessor + +from llm_engineering import settings + +evaluation_dir = Path(__file__).resolve().parent +evaluation_requirements_path = evaluation_dir / "requirements.txt" + + +def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None: + assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required." + assert settings.OPENAI_API_KEY, "OpenAI API key is required." + assert settings.AWS_ARN_ROLE, "AWS ARN role is required." 
+ + if not evaluation_dir.exists(): + raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.") + if not evaluation_requirements_path.exists(): + raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.") + + api = HfApi() + user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN) + huggingface_user = user_info["name"] + logger.info(f"Current Hugging Face user: {huggingface_user}") + + env = { + "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, + "OPENAI_API_KEY": settings.OPENAI_API_KEY, + "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user, + "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user, + } + if is_dummy: + env["IS_DUMMY"] = "True" + + # Initialize the HuggingFaceProcessor + hfp = HuggingFaceProcessor( + role=settings.AWS_ARN_ROLE, + instance_count=1, + instance_type="ml.g5.2xlarge", + transformers_version="4.36", + pytorch_version="2.1", + py_version="py310", + base_job_name="evaluate-llm-twin", + env=env, + ) + + # Run the processing job + hfp.run( + code="evaluate.py", + source_dir=str(evaluation_dir), + ) + + +if __name__ == "__main__": + run_evaluation_on_sagemaker() diff --git a/llm_engineering/model/finetuning/finetune.py b/llm_engineering/model/finetuning/finetune.py index 9c90d4e..81ed509 100644 --- a/llm_engineering/model/finetuning/finetune.py +++ b/llm_engineering/model/finetuning/finetune.py @@ -83,6 +83,11 @@ def finetune( EOS_TOKEN = tokenizer.eos_token print(f"Setting EOS_TOKEN to {EOS_TOKEN}") # noqa + if is_dummy is True: + num_train_epochs = 1 + print(f"Training in dummy mode. Setting num_train_epochs to '{num_train_epochs}'") # noqa + print(f"Training in dummy mode. Reducing dataset size to '400'.") # noqa + if finetuning_type == "sft": def format_samples_sft(examples): @@ -218,9 +223,9 @@ def check_if_huggingface_model_exists(model_id: str, default_value: str = "mlabo try: api.model_info(model_id) except RepositoryNotFoundError: - print(f"Model '{sft_base_model_repo_id}' does not exist.") # noqa + print(f"Model '{model_id}' does not exist.") # noqa model_id = default_value - print(f"Defaulting to '{sft_base_model_repo_id}'") # noqa + print(f"Defaulting to '{model_id}'") # noqa print("Train your own 'TwinLlama-3.1-8B' to avoid this behavior.") # noqa return model_id diff --git a/llm_engineering/model/finetuning/requirements.txt b/llm_engineering/model/finetuning/requirements.txt index 3994175..ca2b28a 100644 --- a/llm_engineering/model/finetuning/requirements.txt +++ b/llm_engineering/model/finetuning/requirements.txt @@ -6,5 +6,4 @@ trl==0.9.6 bitsandbytes==0.43.3 comet-ml==3.44.3 flash-attn==2.3.6 -# unsloth==2024.10.2 unsloth==2024.9.post2 \ No newline at end of file diff --git a/pipelines/__init__.py b/pipelines/__init__.py index 8e93cc6..f1472a3 100644 --- a/pipelines/__init__.py +++ b/pipelines/__init__.py @@ -1,5 +1,6 @@ from .digital_data_etl import digital_data_etl from .end_to_end_data import end_to_end_data +from .evaluating import evaluating from .export_artifact_to_json import export_artifact_to_json from .feature_engineering import feature_engineering from .generate_datasets import generate_datasets @@ -8,6 +9,7 @@ __all__ = [ "generate_datasets", "end_to_end_data", + "evaluating", "export_artifact_to_json", "digital_data_etl", "feature_engineering", diff --git a/pipelines/evaluating.py b/pipelines/evaluating.py new file mode 100644 index 0000000..f76e378 --- /dev/null +++ b/pipelines/evaluating.py @@ -0,0 +1,12 @@ +from zenml import pipeline + +from steps import evaluating as evaluating_steps 
+ + +@pipeline +def evaluating( + is_dummy: bool = False, +) -> None: + evaluating_steps.evaluate( + is_dummy=is_dummy, + ) diff --git a/pyproject.toml b/pyproject.toml index c52c349..cd11af2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse # Training pipelines run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training" +run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation" # Inference call-rag-retrieval-module = "poetry run python -m tools.rag" diff --git a/steps/__init__.py b/steps/__init__.py index 9e159b2..03d6ee8 100644 --- a/steps/__init__.py +++ b/steps/__init__.py @@ -1,3 +1,3 @@ -from . import etl, export, feature_engineering, generate_datasets +from . import etl, evaluating, export, feature_engineering, generate_datasets, training -__all__ = ["generate_datasets", "export", "etl", "feature_engineering"] +__all__ = ["generate_datasets", "export", "etl", "feature_engineering", "training", "evaluating"] diff --git a/steps/evaluating/__init__.py b/steps/evaluating/__init__.py new file mode 100644 index 0000000..542eb78 --- /dev/null +++ b/steps/evaluating/__init__.py @@ -0,0 +1,3 @@ +from .evaluate import evaluate + +__all__ = ["evaluate"] diff --git a/steps/evaluating/evaluate.py b/steps/evaluating/evaluate.py new file mode 100644 index 0000000..0451fe3 --- /dev/null +++ b/steps/evaluating/evaluate.py @@ -0,0 +1,12 @@ +from zenml import step + +from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker + + +@step +def evaluate( + is_dummy: bool = False, +) -> None: + run_evaluation_on_sagemaker( + is_dummy=is_dummy, + ) diff --git a/tools/run.py b/tools/run.py index 30c0ab5..c07faea 100644 --- a/tools/run.py +++ b/tools/run.py @@ -8,6 +8,7 @@ from pipelines import ( digital_data_etl, end_to_end_data, + evaluating, export_artifact_to_json, feature_engineering, generate_datasets, @@ -97,6 +98,12 @@ default=False, help="Whether to run the training pipeline.", ) +@click.option( + "--run-evaluation", + is_flag=True, + default=False, + help="Whether to run the evaluation pipeline.", +) @click.option( "--export-settings", is_flag=True, @@ -113,6 +120,7 @@ def main( run_generate_instruct_datasets: bool = False, run_generate_preference_datasets: bool = False, run_training: bool = False, + run_evaluation: bool = False, export_settings: bool = False, ) -> None: assert ( @@ -123,6 +131,7 @@ def main( or run_generate_instruct_datasets or run_generate_preference_datasets or run_training + or run_evaluation or export_settings ), "Please specify an action to run." @@ -180,6 +189,12 @@ def main( pipeline_args["run_name"] = f"training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" training.with_options(**pipeline_args)(**run_args_cd) + if run_evaluation: + run_args_cd = {} + pipeline_args["config_path"] = root_dir / "configs" / "evaluating.yaml" + pipeline_args["run_name"] = f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" + evaluating.with_options(**pipeline_args)(**run_args_cd) + if __name__ == "__main__": main()
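Once the SageMaker processing job triggered by `--run-evaluation` (or `poetry poe run-evaluation-pipeline`) finishes, the per-model `*-results` datasets can be inspected locally. A minimal sketch, assuming the datasets were pushed under your Hugging Face workspace and already carry the `accuracy` and `style` columns added by `evaluate.py` (the workspace name below is a placeholder):

```python
from datasets import load_dataset

workspace = "your-hf-username"  # placeholder for DATASET_HUGGINGFACE_WORKSPACE
dataset = load_dataset(f"{workspace}/TwinLlama-3.1-8B-results", split="all")

# Average the LLM-as-a-judge scores, skipping rows where JSON parsing failed (None).
accuracy = [s for s in dataset["accuracy"] if s is not None]
style = [s for s in dataset["style"] if s is not None]
print(f"Accuracy: {sum(accuracy) / len(accuracy):.2f} | Style: {sum(style) / len(style):.2f}")
```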