From 6180127466cdd44ce1b7730d6989ff33233d57c7 Mon Sep 17 00:00:00 2001
From: iusztinpaul
Date: Sat, 19 Oct 2024 17:15:18 +0300
Subject: [PATCH] feat: Add SageMaker evaluation code
---
README.md | 11 +
configs/evaluating.yaml | 9 +
configs/training.yaml | 4 +-
llm_engineering/model/evaluation/__init__.py | 0
llm_engineering/model/evaluation/evaluate.py | 225 ++++++++++++++++++
.../model/evaluation/requirements.txt | 5 +
llm_engineering/model/evaluation/sagemaker.py | 57 +++++
llm_engineering/model/finetuning/finetune.py | 9 +-
.../model/finetuning/requirements.txt | 1 -
pipelines/__init__.py | 2 +
pipelines/evaluating.py | 12 +
pyproject.toml | 1 +
steps/__init__.py | 4 +-
steps/evaluating/__init__.py | 3 +
steps/evaluating/evaluate.py | 12 +
tools/run.py | 15 ++
16 files changed, 363 insertions(+), 7 deletions(-)
create mode 100644 configs/evaluating.yaml
create mode 100644 llm_engineering/model/evaluation/__init__.py
create mode 100644 llm_engineering/model/evaluation/evaluate.py
create mode 100644 llm_engineering/model/evaluation/requirements.txt
create mode 100644 llm_engineering/model/evaluation/sagemaker.py
create mode 100644 pipelines/evaluating.py
create mode 100644 steps/evaluating/__init__.py
create mode 100644 steps/evaluating/evaluate.py
diff --git a/README.md b/README.md
index 1b78c6c..6b19152 100644
--- a/README.md
+++ b/README.md
@@ -355,6 +355,12 @@ poetry poe run-training-pipeline
```
This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard.
+We start the evaluation pipeline through ZenML by running the following:
+```shell
+poetry poe run-evaluation-pipeline
+```
+This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can inspect the results in the `*-results` datasets pushed to your Hugging Face profile.
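+
+For example, once the evaluation job finishes, you can inspect one of the result datasets locally (a quick sketch; replace `<your-username>` with your Hugging Face username):
+```python
+from datasets import load_dataset
+
+results = load_dataset("<your-username>/TwinLlama-3.1-8B-results", split="all")
+print(results[0]["answers"], results[0]["accuracy"], results[0]["style"])
+```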
+
### Inference
For creating an AWS SageMaker Inference Endpoint, run:
@@ -471,6 +477,11 @@ Run the training pipeline:
poetry poe run-training-pipeline
```
+Run the evaluation pipeline:
+```shell
+poetry poe run-evaluation-pipeline
+```
+
> [!WARNING]
> For this to work, make sure you properly configured AWS SageMaker as described in [Set up cloud infrastructure (for production)](#set-up-cloud-infrastructure-for-production).
diff --git a/configs/evaluating.yaml b/configs/evaluating.yaml
new file mode 100644
index 0000000..ec91ef9
--- /dev/null
+++ b/configs/evaluating.yaml
@@ -0,0 +1,9 @@
+settings:
+ docker:
+ parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+ skip_build: True
+ orchestrator.sagemaker:
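+    # With 'synchronous: false', the run returns as soon as the SageMaker job is submitted instead of waiting for it to finish.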
+ synchronous: false
+
+parameters:
+ is_dummy: true # Change this to 'false' to run the evaluation on the full dataset.
diff --git a/configs/training.yaml b/configs/training.yaml
index 7ba8f15..38bf83c 100644
--- a/configs/training.yaml
+++ b/configs/training.yaml
@@ -7,8 +7,8 @@ settings:
parameters:
finetuning_type: sft
- num_train_epochs: 1 # 3
+ num_train_epochs: 3
per_device_train_batch_size: 2
learning_rate: 3e-4
dataset_huggingface_workspace: mlabonne
- is_dummy: true
+  is_dummy: true # Change this to 'false' to run the training on the full dataset for the full number of epochs.
diff --git a/llm_engineering/model/evaluation/__init__.py b/llm_engineering/model/evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/llm_engineering/model/evaluation/evaluate.py b/llm_engineering/model/evaluation/evaluate.py
new file mode 100644
index 0000000..b83efe5
--- /dev/null
+++ b/llm_engineering/model/evaluation/evaluate.py
@@ -0,0 +1,225 @@
+import concurrent.futures
+import gc
+import json
+import os
+
+from datasets import Dataset, load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.utils import RepositoryNotFoundError
+from openai import OpenAI
+from tqdm.auto import tqdm
+from vllm import LLM, SamplingParams
+
+OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+DATASET_HUGGINGFACE_WORKSPACE = os.environ["DATASET_HUGGINGFACE_WORKSPACE"]
+MODEL_HUGGINGFACE_WORKSPACE = os.environ["MODEL_HUGGINGFACE_WORKSPACE"]
+IS_DUMMY = os.environ.get("IS_DUMMY", "false").lower() in ("true", "1")  # Env vars are strings, so parse the flag explicitly.
+
+print("====== EVAL PARAMETERS ======") # noqa
+print(f"{DATASET_HUGGINGFACE_WORKSPACE=}") # noqa
+print(f"{MODEL_HUGGINGFACE_WORKSPACE=}") # noqa
+print(f"{IS_DUMMY=}") # noqa
+print("=============================") # noqa
+
+
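+# Generate an answer for every instruction in the dataset's test split with vLLM,
+# using an Alpaca-style prompt template, then push a '<model>-results' dataset to the Hub.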
+def generate_answers(model_id: str, dataset_name: str):
+ def format(sample):
+ return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format(
+ sample["instruction"]
+ )
+
+ dataset = load_dataset(dataset_name, split="test")
+ if IS_DUMMY:
+ dataset = dataset.select(range(10))
+ print(f"Dataset size: {len(dataset)}") # noqa
+ dataset = dataset.map(lambda sample: {"prompt": format(sample)})
+
+ print(f"Generating answers for {model_id}") # noqa
+ llm = LLM(model=model_id, max_model_len=2048)
+ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=2048)
+ outputs = llm.generate(dataset["prompt"], sampling_params)
+
+ answers = [output.outputs[0].text for output in outputs]
+ dataset = dataset.add_column("answers", answers)
+
+ print(f"Uploading results for {model_id}") # noqa
+ dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")
+ gc.collect()
+
+ return dataset
+
+
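+# LLM-as-a-judge: ask gpt-4o-mini to grade a single answer for accuracy and style
+# on a 1-3 scale and return the verdict as a JSON object.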
+def evaluate_answer(instruction: str, answer: str, client: OpenAI) -> dict:
+ prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria:
+1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic.
+2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language.
+
+Accuracy scale:
+1 (Poor): Contains factual errors or misleading information
+2 (Good): Mostly accurate with minor errors or omissions
+3 (Excellent): Highly accurate and comprehensive
+
+Style scale:
+1 (Poor): Too formal, uses some overly complex words
+2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions
+3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary
+
+Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture.
+Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks.
+
+Instruction: {instruction}
+
+Answer: {answer}
+
+Provide your evaluation in JSON format with the following structure:
+{{
+ "accuracy": {{
+ "analysis": "...",
+ "score": 0
+ }},
+ "style": {{
+ "analysis": "...",
+ "score": 0
+ }}
+}}
+"""
+
+ completion = client.chat.completions.create(
+ model="gpt-4o-mini",
+ messages=[
+ {
+ "role": "system",
+ "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.",
+ },
+ {"role": "user", "content": prompt},
+ ],
+ response_format={"type": "json_object"},
+ max_tokens=1000,
+ temperature=0.9,
+ )
+
+ # Parse the structured output
+ return json.loads(completion.choices[0].message.content)
+
+
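+# Evaluate one batch of (instruction, answer) pairs on a worker thread. Each batch gets
+# its own OpenAI client, and the global indices are returned so results can be written back in order.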
+def evaluate_batch(batch, start_index):
+ client = OpenAI(api_key=OPENAI_API_KEY)
+ return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)]
+
+
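+# Judge every generated answer concurrently, then attach the raw 'evaluation' dicts and
+# the numeric 'accuracy'/'style' columns to the results dataset and push it back to the Hub.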
+def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset:
+ # Load the dataset
+ dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")
+
+ # Create batches of instruction-answer pairs with their original indices
+ batches = [
+ (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False)))
+ for i in range(0, len(dataset), batch_size)
+ ]
+
+ evaluations = [None] * len(dataset)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
+ futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches]
+
+ for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+ for index, evaluation in future.result():
+ evaluations[index] = evaluation
+
+ # Replace the 'evaluation' column if it exists, otherwise add it
+ if "evaluation" in dataset.column_names:
+ dataset = dataset.remove_columns(["evaluation"])
+ dataset = dataset.add_column("evaluation", evaluations)
+
+ # Post-process evaluations
+ accuracy_scores = []
+ style_scores = []
+
+ for evaluation in dataset["evaluation"]:
+ try:
+ eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
+ accuracy_score = eval_dict["accuracy"]["score"]
+ style_score = eval_dict["style"]["score"]
+
+ accuracy_scores.append(accuracy_score)
+ style_scores.append(style_score)
+
+ except (json.JSONDecodeError, KeyError, TypeError):
+ # If there's an error, append None to maintain alignment
+ accuracy_scores.append(None)
+ style_scores.append(None)
+
+ # Add new columns to the dataset
+ if "accuracy" in dataset.column_names:
+ dataset = dataset.remove_columns(["accuracy"])
+ dataset = dataset.add_column("accuracy", accuracy_scores)
+ if "style" in dataset.column_names:
+ dataset = dataset.remove_columns(["style"])
+ dataset = dataset.add_column("style", style_scores)
+
+ dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")
+
+ return dataset
+
+
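+# Fall back to the provided default (the reference 'mlabonne' artifacts) when the user
+# has not pushed their own model or dataset to the Hugging Face Hub.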
+def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str:
+ api = HfApi()
+
+ try:
+ api.model_info(model_id)
+ print(f"Found model on HF: '{model_id}'.") # noqa
+ except RepositoryNotFoundError:
+ print(f"Model '{model_id}' does not exist.") # noqa
+ model_id = default_value
+ print(f"Defaulting to '{model_id}'") # noqa
+ print("Train your own model to avoid this behavior.") # noqa
+
+ return model_id
+
+
+def check_if_huggingface_dataset_exists(dataset_id: str, default_value: str) -> str:
+ api = HfApi()
+
+ try:
+ api.dataset_info(dataset_id)
+ print(f"Found dataset on HF: '{dataset_id}'.") # noqa
+ except RepositoryNotFoundError:
+ print(f"Dataset '{dataset_id}' does not exist.") # noqa
+ dataset_id = default_value
+ print(f"Defaulting to '{dataset_id}'") # noqa
+ print("Use a valid dataset or create your own to avoid this behavior.") # noqa
+
+ return dataset_id
+
+
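+# Models to evaluate: the SFT and DPO fine-tunes of TwinLlama-3.1-8B, plus
+# Meta-Llama-3.1-8B-Instruct as an off-the-shelf baseline.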
+model_ids = [
+ check_if_huggingface_model_exists(
+ f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B", default_value="mlabonne/TwinLlama-3.1-8B"
+ ),
+ check_if_huggingface_model_exists(
+ f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-DPO", default_value="mlabonne/TwinLlama-3.1-8B-DPO"
+ ),
+ "meta-llama/Meta-Llama-3.1-8B-Instruct",
+]
+
+if __name__ == "__main__":
+ # Run generation
+ for model_id in model_ids:
+ dataset_name = check_if_huggingface_dataset_exists(
+ f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin", default_value="mlabonne/llmtwin"
+ )
+ generate_answers(model_id, dataset_name=dataset_name)
+
+ # Run evaluation
+ for model_id in model_ids:
+ evaluate_answers(model_id)
+
+    # Analyze results
+    for model_id in model_ids:
+        dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")
+
+        # Skip samples whose evaluation could not be parsed (stored as None).
+        accuracy_scores = [score for score in dataset["accuracy"] if score is not None]
+        style_scores = [score for score in dataset["style"] if score is not None]
+
+        print(f"{model_id.split('/')[-1]} - Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}")  # noqa
+        print(f"{model_id.split('/')[-1]} - Style: {sum(style_scores) / len(style_scores):.2f}")  # noqa
diff --git a/llm_engineering/model/evaluation/requirements.txt b/llm_engineering/model/evaluation/requirements.txt
new file mode 100644
index 0000000..b62ab24
--- /dev/null
+++ b/llm_engineering/model/evaluation/requirements.txt
@@ -0,0 +1,5 @@
+transformers==4.43.3
+datasets==2.20.0
+vllm==0.6.1.post2
+tqdm==4.66.4
+openai==1.52.0
\ No newline at end of file
diff --git a/llm_engineering/model/evaluation/sagemaker.py b/llm_engineering/model/evaluation/sagemaker.py
new file mode 100644
index 0000000..a3c11f2
--- /dev/null
+++ b/llm_engineering/model/evaluation/sagemaker.py
@@ -0,0 +1,57 @@
+from pathlib import Path
+
+from huggingface_hub import HfApi
+from loguru import logger
+from sagemaker.huggingface import HuggingFaceProcessor
+
+from llm_engineering import settings
+
+evaluation_dir = Path(__file__).resolve().parent
+evaluation_requirements_path = evaluation_dir / "requirements.txt"
+
+
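+# Launch evaluate.py as a SageMaker Hugging Face processing job on a single ml.g5.2xlarge
+# instance; credentials and workspace names are passed to the job through environment variables.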
+def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None:
+ assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required."
+ assert settings.OPENAI_API_KEY, "OpenAI API key is required."
+ assert settings.AWS_ARN_ROLE, "AWS ARN role is required."
+
+ if not evaluation_dir.exists():
+ raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.")
+ if not evaluation_requirements_path.exists():
+ raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.")
+
+ api = HfApi()
+ user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN)
+ huggingface_user = user_info["name"]
+ logger.info(f"Current Hugging Face user: {huggingface_user}")
+
+ env = {
+ "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN,
+ "OPENAI_API_KEY": settings.OPENAI_API_KEY,
+ "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user,
+ "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user,
+ }
+ if is_dummy:
+ env["IS_DUMMY"] = "True"
+
+ # Initialize the HuggingFaceProcessor
+ hfp = HuggingFaceProcessor(
+ role=settings.AWS_ARN_ROLE,
+ instance_count=1,
+ instance_type="ml.g5.2xlarge",
+ transformers_version="4.36",
+ pytorch_version="2.1",
+ py_version="py310",
+ base_job_name="evaluate-llm-twin",
+ env=env,
+ )
+
+ # Run the processing job
+ hfp.run(
+ code="evaluate.py",
+ source_dir=str(evaluation_dir),
+ )
+
+
+if __name__ == "__main__":
+ run_evaluation_on_sagemaker()
diff --git a/llm_engineering/model/finetuning/finetune.py b/llm_engineering/model/finetuning/finetune.py
index 9c90d4e..81ed509 100644
--- a/llm_engineering/model/finetuning/finetune.py
+++ b/llm_engineering/model/finetuning/finetune.py
@@ -83,6 +83,11 @@ def finetune(
EOS_TOKEN = tokenizer.eos_token
print(f"Setting EOS_TOKEN to {EOS_TOKEN}") # noqa
+ if is_dummy is True:
+ num_train_epochs = 1
+ print(f"Training in dummy mode. Setting num_train_epochs to '{num_train_epochs}'") # noqa
+ print(f"Training in dummy mode. Reducing dataset size to '400'.") # noqa
+
if finetuning_type == "sft":
def format_samples_sft(examples):
@@ -218,9 +223,9 @@ def check_if_huggingface_model_exists(model_id: str, default_value: str = "mlabo
try:
api.model_info(model_id)
except RepositoryNotFoundError:
- print(f"Model '{sft_base_model_repo_id}' does not exist.") # noqa
+ print(f"Model '{model_id}' does not exist.") # noqa
model_id = default_value
- print(f"Defaulting to '{sft_base_model_repo_id}'") # noqa
+ print(f"Defaulting to '{model_id}'") # noqa
print("Train your own 'TwinLlama-3.1-8B' to avoid this behavior.") # noqa
return model_id
diff --git a/llm_engineering/model/finetuning/requirements.txt b/llm_engineering/model/finetuning/requirements.txt
index 3994175..ca2b28a 100644
--- a/llm_engineering/model/finetuning/requirements.txt
+++ b/llm_engineering/model/finetuning/requirements.txt
@@ -6,5 +6,4 @@ trl==0.9.6
bitsandbytes==0.43.3
comet-ml==3.44.3
flash-attn==2.3.6
-# unsloth==2024.10.2
unsloth==2024.9.post2
\ No newline at end of file
diff --git a/pipelines/__init__.py b/pipelines/__init__.py
index 8e93cc6..f1472a3 100644
--- a/pipelines/__init__.py
+++ b/pipelines/__init__.py
@@ -1,5 +1,6 @@
from .digital_data_etl import digital_data_etl
from .end_to_end_data import end_to_end_data
+from .evaluating import evaluating
from .export_artifact_to_json import export_artifact_to_json
from .feature_engineering import feature_engineering
from .generate_datasets import generate_datasets
@@ -8,6 +9,7 @@
__all__ = [
"generate_datasets",
"end_to_end_data",
+ "evaluating",
"export_artifact_to_json",
"digital_data_etl",
"feature_engineering",
diff --git a/pipelines/evaluating.py b/pipelines/evaluating.py
new file mode 100644
index 0000000..f76e378
--- /dev/null
+++ b/pipelines/evaluating.py
@@ -0,0 +1,12 @@
+from zenml import pipeline
+
+from steps import evaluating as evaluating_steps
+
+
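+# A thin ZenML pipeline: a single step that offloads the evaluation job to SageMaker.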
+@pipeline
+def evaluating(
+ is_dummy: bool = False,
+) -> None:
+ evaluating_steps.evaluate(
+ is_dummy=is_dummy,
+ )
diff --git a/pyproject.toml b/pyproject.toml
index c52c349..cd11af2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,6 +86,7 @@ run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse
# Training pipelines
run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training"
+run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation"
# Inference
call-rag-retrieval-module = "poetry run python -m tools.rag"
diff --git a/steps/__init__.py b/steps/__init__.py
index 9e159b2..03d6ee8 100644
--- a/steps/__init__.py
+++ b/steps/__init__.py
@@ -1,3 +1,3 @@
-from . import etl, export, feature_engineering, generate_datasets
+from . import etl, evaluating, export, feature_engineering, generate_datasets, training
-__all__ = ["generate_datasets", "export", "etl", "feature_engineering"]
+__all__ = ["generate_datasets", "export", "etl", "feature_engineering", "training", "evaluating"]
diff --git a/steps/evaluating/__init__.py b/steps/evaluating/__init__.py
new file mode 100644
index 0000000..542eb78
--- /dev/null
+++ b/steps/evaluating/__init__.py
@@ -0,0 +1,3 @@
+from .evaluate import evaluate
+
+__all__ = ["evaluate"]
diff --git a/steps/evaluating/evaluate.py b/steps/evaluating/evaluate.py
new file mode 100644
index 0000000..0451fe3
--- /dev/null
+++ b/steps/evaluating/evaluate.py
@@ -0,0 +1,12 @@
+from zenml import step
+
+from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker
+
+
+@step
+def evaluate(
+ is_dummy: bool = False,
+) -> None:
+ run_evaluation_on_sagemaker(
+ is_dummy=is_dummy,
+ )
diff --git a/tools/run.py b/tools/run.py
index 30c0ab5..c07faea 100644
--- a/tools/run.py
+++ b/tools/run.py
@@ -8,6 +8,7 @@
from pipelines import (
digital_data_etl,
end_to_end_data,
+ evaluating,
export_artifact_to_json,
feature_engineering,
generate_datasets,
@@ -97,6 +98,12 @@
default=False,
help="Whether to run the training pipeline.",
)
+@click.option(
+ "--run-evaluation",
+ is_flag=True,
+ default=False,
+ help="Whether to run the evaluation pipeline.",
+)
@click.option(
"--export-settings",
is_flag=True,
@@ -113,6 +120,7 @@ def main(
run_generate_instruct_datasets: bool = False,
run_generate_preference_datasets: bool = False,
run_training: bool = False,
+ run_evaluation: bool = False,
export_settings: bool = False,
) -> None:
assert (
@@ -123,6 +131,7 @@ def main(
or run_generate_instruct_datasets
or run_generate_preference_datasets
or run_training
+ or run_evaluation
or export_settings
), "Please specify an action to run."
@@ -180,6 +189,12 @@ def main(
pipeline_args["run_name"] = f"training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}"
training.with_options(**pipeline_args)(**run_args_cd)
+ if run_evaluation:
+ run_args_cd = {}
+ pipeline_args["config_path"] = root_dir / "configs" / "evaluating.yaml"
+ pipeline_args["run_name"] = f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}"
+ evaluating.with_options(**pipeline_args)(**run_args_cd)
+
if __name__ == "__main__":
main()