
Commit

feat: Add SageMaker evaluation code
iusztinpaul committed Oct 19, 2024
1 parent 27afd25 commit 6180127
Showing 16 changed files with 363 additions and 7 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -355,6 +355,12 @@ poetry poe run-training-pipeline
```
This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard.

We start the evaluation pipeline through ZenML by running the following:
```shell
poetry poe run-evaluation-pipeline
```
This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can inspect the results in the `*-results` datasets saved to your Hugging Face profile.
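
As a minimal sketch, here is one way to load and inspect such a results dataset locally. The repository id is a placeholder (substitute your own Hugging Face workspace), and the `accuracy`/`style` columns only exist once the evaluation step has run:
```python
from datasets import load_dataset

# Placeholder repo id: replace <hf-username> with your Hugging Face workspace.
results = load_dataset("<hf-username>/TwinLlama-3.1-8B-results", split="all")

print(results.column_names)                        # e.g. instruction, answers, evaluation, accuracy, style, ...
print(results[0]["answers"][:200])                 # preview a generated answer
print(results[0]["accuracy"], results[0]["style"]) # judge scores for the first sample
```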

### Inference

For creating an AWS SageMaker Inference Endpoint, run:
@@ -471,6 +477,11 @@ Run the training pipeline:
poetry poe run-training-pipeline
```

Run the evaluation pipeline:
```shell
poetry poe run-evaluation-pipeline
```

> [!WARNING]
> For this to work, make sure you properly configured AWS SageMaker as described in [Set up cloud infrastructure (for production)](#set-up-cloud-infrastructure-for-production).
9 changes: 9 additions & 0 deletions configs/evaluating.yaml
@@ -0,0 +1,9 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  is_dummy: true # Change this to 'false' to run the evaluation on the full dataset.
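
(As a rough, hedged sketch of how this config is typically consumed — the run tooling behind `poetry poe run-evaluation-pipeline` is not part of this diff — a ZenML pipeline can be launched with such a YAML via `with_options`; the config path below is assumed relative to the repository root:)
```python
from pipelines import evaluating

# Sketch only: attach the Docker/SageMaker settings and the `is_dummy` parameter
# from the YAML file, then trigger a pipeline run.
evaluating.with_options(config_path="configs/evaluating.yaml")()
```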
4 changes: 2 additions & 2 deletions configs/training.yaml
@@ -7,8 +7,8 @@ settings:

parameters:
  finetuning_type: sft
-  num_train_epochs: 1 # 3
+  num_train_epochs: 3
  per_device_train_batch_size: 2
  learning_rate: 3e-4
  dataset_huggingface_workspace: mlabonne
-  is_dummy: true
+  is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs.
Empty file.
225 changes: 225 additions & 0 deletions llm_engineering/model/evaluation/evaluate.py
@@ -0,0 +1,225 @@
import concurrent.futures
import gc
import json
import os

from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError
from openai import OpenAI
from tqdm.auto import tqdm
from vllm import LLM, SamplingParams

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
DATASET_HUGGINGFACE_WORKSPACE = os.environ["DATASET_HUGGINGFACE_WORKSPACE"]
MODEL_HUGGINGFACE_WORKSPACE = os.environ["MODEL_HUGGINGFACE_WORKSPACE"]
IS_DUMMY = os.environ.get("IS_DUMMY", False)

print("====== EVAL PARAMETERS ======") # noqa
print(f"{DATASET_HUGGINGFACE_WORKSPACE=}") # noqa
print(f"{MODEL_HUGGINGFACE_WORKSPACE=}") # noqa
print(f"{IS_DUMMY=}") # noqa
print("=============================") # noqa


def generate_answers(model_id: str, dataset_name: str):
    def format(sample):
        return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format(
            sample["instruction"]
        )

    dataset = load_dataset(dataset_name, split="test")
    if IS_DUMMY:
        dataset = dataset.select(range(10))
    print(f"Dataset size: {len(dataset)}")  # noqa
    dataset = dataset.map(lambda sample: {"prompt": format(sample)})

    print(f"Generating answers for {model_id}")  # noqa
    llm = LLM(model=model_id, max_model_len=2048)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=2048)
    outputs = llm.generate(dataset["prompt"], sampling_params)

    answers = [output.outputs[0].text for output in outputs]
    dataset = dataset.add_column("answers", answers)

    print(f"Uploading results for {model_id}")  # noqa
    dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")
    gc.collect()

    return dataset


def evaluate_answer(instruction: str, answer: str, client: OpenAI) -> dict:
    prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria:
1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic.
2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language.
Accuracy scale:
1 (Poor): Contains factual errors or misleading information
2 (Good): Mostly accurate with minor errors or omissions
3 (Excellent): Highly accurate and comprehensive
Style scale:
1 (Poor): Too formal, uses some overly complex words
2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions
3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary
Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture.
Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks.
Instruction: {instruction}
Answer: {answer}
Provide your evaluation in JSON format with the following structure:
{{
    "accuracy": {{
        "analysis": "...",
        "score": 0
    }},
    "style": {{
        "analysis": "...",
        "score": 0
    }}
}}
"""

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        max_tokens=1000,
        temperature=0.9,
    )

    # Parse the structured output
    return json.loads(completion.choices[0].message.content)


def evaluate_batch(batch, start_index):
    client = OpenAI(api_key=OPENAI_API_KEY)
    return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)]


def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset:
    # Load the dataset
    dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")

    # Create batches of instruction-answer pairs with their original indices
    batches = [
        (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False)))
        for i in range(0, len(dataset), batch_size)
    ]

    evaluations = [None] * len(dataset)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for index, evaluation in future.result():
                evaluations[index] = evaluation

    # Replace the 'evaluation' column if it exists, otherwise add it
    if "evaluation" in dataset.column_names:
        dataset = dataset.remove_columns(["evaluation"])
    dataset = dataset.add_column("evaluation", evaluations)

    # Post-process evaluations
    accuracy_scores = []
    style_scores = []

    for evaluation in dataset["evaluation"]:
        try:
            eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
            accuracy_score = eval_dict["accuracy"]["score"]
            style_score = eval_dict["style"]["score"]

            accuracy_scores.append(accuracy_score)
            style_scores.append(style_score)

        except (json.JSONDecodeError, KeyError, TypeError):
            # If there's an error, append None to maintain alignment
            accuracy_scores.append(None)
            style_scores.append(None)

    # Add new columns to the dataset
    if "accuracy" in dataset.column_names:
        dataset = dataset.remove_columns(["accuracy"])
    dataset = dataset.add_column("accuracy", accuracy_scores)
    if "style" in dataset.column_names:
        dataset = dataset.remove_columns(["style"])
    dataset = dataset.add_column("style", style_scores)

    dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")

    return dataset


def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str:
    api = HfApi()

    try:
        api.model_info(model_id)
        print(f"Found model on HF: '{model_id}'.")  # noqa
    except RepositoryNotFoundError:
        print(f"Model '{model_id}' does not exist.")  # noqa
        model_id = default_value
        print(f"Defaulting to '{model_id}'")  # noqa
        print("Train your own model to avoid this behavior.")  # noqa

    return model_id


def check_if_huggingface_dataset_exists(dataset_id: str, default_value: str) -> str:
    api = HfApi()

    try:
        api.dataset_info(dataset_id)
        print(f"Found dataset on HF: '{dataset_id}'.")  # noqa
    except RepositoryNotFoundError:
        print(f"Dataset '{dataset_id}' does not exist.")  # noqa
        dataset_id = default_value
        print(f"Defaulting to '{dataset_id}'")  # noqa
        print("Use a valid dataset or create your own to avoid this behavior.")  # noqa

    return dataset_id


model_ids = [
    check_if_huggingface_model_exists(
        f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B", default_value="mlabonne/TwinLlama-3.1-8B"
    ),
    check_if_huggingface_model_exists(
        f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-DPO", default_value="mlabonne/TwinLlama-3.1-8B-DPO"
    ),
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
]

if __name__ == "__main__":
    # Run generation
    for model_id in model_ids:
        dataset_name = check_if_huggingface_dataset_exists(
            f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin", default_value="mlabonne/llmtwin"
        )
        generate_answers(model_id, dataset_name=dataset_name)

    # Run evaluation
    for model_id in model_ids:
        evaluate_answers(model_id)

    # Analyze results
    for model_id in model_ids:
        dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")

        score = sum(dataset["accuracy"]) / len(dataset["accuracy"])
        print(f"{model_id.split('/')[-1]} - Accuracy: {score:.2f}")  # noqa

        score = sum(dataset["style"]) / len(dataset["style"])
        print(f"{model_id.split('/')[-1]} - Style: {score:.2f}")  # noqa
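
(A small caveat on the analysis block above: `evaluate_answers` stores `None` for rows whose judge output failed to parse, and `sum(...) / len(...)` would then raise a `TypeError`. A None-tolerant average — a sketch, not part of the commit — could look like this:)
```python
def mean_score(scores: list) -> float:
    """Average judge scores, skipping rows whose evaluation could not be parsed."""
    valid = [s for s in scores if s is not None]
    return sum(valid) / len(valid) if valid else float("nan")


# e.g. mean_score(dataset["accuracy"]) instead of sum(dataset["accuracy"]) / len(dataset["accuracy"])
```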
5 changes: 5 additions & 0 deletions llm_engineering/model/evaluation/requirements.txt
@@ -0,0 +1,5 @@
transformers==4.43.3
datasets==2.20.0
vllm==0.6.1.post2
tqdm==4.66.4
openai==1.52.0
57 changes: 57 additions & 0 deletions llm_engineering/model/evaluation/sagemaker.py
@@ -0,0 +1,57 @@
from pathlib import Path

from huggingface_hub import HfApi
from loguru import logger
from sagemaker.huggingface import HuggingFaceProcessor

from llm_engineering import settings

evaluation_dir = Path(__file__).resolve().parent
evaluation_requirements_path = evaluation_dir / "requirements.txt"


def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None:
    assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required."
    assert settings.OPENAI_API_KEY, "OpenAI API key is required."
    assert settings.AWS_ARN_ROLE, "AWS ARN role is required."

    if not evaluation_dir.exists():
        raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.")
    if not evaluation_requirements_path.exists():
        raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.")

    api = HfApi()
    user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN)
    huggingface_user = user_info["name"]
    logger.info(f"Current Hugging Face user: {huggingface_user}")

    env = {
        "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN,
        "OPENAI_API_KEY": settings.OPENAI_API_KEY,
        "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user,
        "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user,
    }
    if is_dummy:
        env["IS_DUMMY"] = "True"

    # Initialize the HuggingFaceProcessor
    hfp = HuggingFaceProcessor(
        role=settings.AWS_ARN_ROLE,
        instance_count=1,
        instance_type="ml.g5.2xlarge",
        transformers_version="4.36",
        pytorch_version="2.1",
        py_version="py310",
        base_job_name="evaluate-llm-twin",
        env=env,
    )

    # Run the processing job
    hfp.run(
        code="evaluate.py",
        source_dir=str(evaluation_dir),
    )


if __name__ == "__main__":
    run_evaluation_on_sagemaker()
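
(Usage note, not part of the diff: the `__main__` guard above launches the dummy evaluation by default; to evaluate on the full test split you would call the helper with `is_dummy=False`, e.g.:)
```python
from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker

# Launches the SageMaker processing job without the IS_DUMMY flag,
# so evaluate.py runs over the entire test split.
run_evaluation_on_sagemaker(is_dummy=False)
```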
9 changes: 7 additions & 2 deletions llm_engineering/model/finetuning/finetune.py
@@ -83,6 +83,11 @@ def finetune(
    EOS_TOKEN = tokenizer.eos_token
    print(f"Setting EOS_TOKEN to {EOS_TOKEN}")  # noqa

    if is_dummy is True:
        num_train_epochs = 1
        print(f"Training in dummy mode. Setting num_train_epochs to '{num_train_epochs}'")  # noqa
        print(f"Training in dummy mode. Reducing dataset size to '400'.")  # noqa

    if finetuning_type == "sft":

        def format_samples_sft(examples):
@@ -218,9 +223,9 @@ def check_if_huggingface_model_exists(model_id: str, default_value: str = "mlabo
    try:
        api.model_info(model_id)
    except RepositoryNotFoundError:
-        print(f"Model '{sft_base_model_repo_id}' does not exist.")  # noqa
+        print(f"Model '{model_id}' does not exist.")  # noqa
        model_id = default_value
-        print(f"Defaulting to '{sft_base_model_repo_id}'")  # noqa
+        print(f"Defaulting to '{model_id}'")  # noqa
        print("Train your own 'TwinLlama-3.1-8B' to avoid this behavior.")  # noqa

    return model_id
1 change: 0 additions & 1 deletion llm_engineering/model/finetuning/requirements.txt
@@ -6,5 +6,4 @@ trl==0.9.6
bitsandbytes==0.43.3
comet-ml==3.44.3
flash-attn==2.3.6
-# unsloth==2024.10.2
unsloth==2024.9.post2
2 changes: 2 additions & 0 deletions pipelines/__init__.py
@@ -1,5 +1,6 @@
from .digital_data_etl import digital_data_etl
from .end_to_end_data import end_to_end_data
from .evaluating import evaluating
from .export_artifact_to_json import export_artifact_to_json
from .feature_engineering import feature_engineering
from .generate_datasets import generate_datasets
@@ -8,6 +9,7 @@
__all__ = [
"generate_datasets",
"end_to_end_data",
"evaluating",
"export_artifact_to_json",
"digital_data_etl",
"feature_engineering",
12 changes: 12 additions & 0 deletions pipelines/evaluating.py
@@ -0,0 +1,12 @@
from zenml import pipeline

from steps import evaluating as evaluating_steps


@pipeline
def evaluating(
    is_dummy: bool = False,
) -> None:
    evaluating_steps.evaluate(
        is_dummy=is_dummy,
    )
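
(The `steps.evaluating.evaluate` step imported by this pipeline is not shown in the rendered diff. Based on the SageMaker helper added in this commit, it plausibly wraps `run_evaluation_on_sagemaker` in a ZenML step — the sketch below is an assumption, including the exact module path and signature:)
```python
from zenml import step

from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker


@step
def evaluate(is_dummy: bool = False) -> None:
    # Hypothetical step body: delegate the work to the SageMaker processing job.
    run_evaluation_on_sagemaker(is_dummy=is_dummy)
```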
