From 6180127466cdd44ce1b7730d6989ff33233d57c7 Mon Sep 17 00:00:00 2001 From: iusztinpaul Date: Sat, 19 Oct 2024 17:15:18 +0300 Subject: [PATCH] feat: Add SageMaker evaluation code --- README.md | 11 + configs/evaluating.yaml | 9 + configs/training.yaml | 4 +- llm_engineering/model/evaluation/__init__.py | 0 llm_engineering/model/evaluation/evaluate.py | 225 ++++++++++++++++++ .../model/evaluation/requirements.txt | 5 + llm_engineering/model/evaluation/sagemaker.py | 57 +++++ llm_engineering/model/finetuning/finetune.py | 9 +- .../model/finetuning/requirements.txt | 1 - pipelines/__init__.py | 2 + pipelines/evaluating.py | 12 + pyproject.toml | 1 + steps/__init__.py | 4 +- steps/evaluating/__init__.py | 3 + steps/evaluating/evaluate.py | 12 + tools/run.py | 15 ++ 16 files changed, 363 insertions(+), 7 deletions(-) create mode 100644 configs/evaluating.yaml create mode 100644 llm_engineering/model/evaluation/__init__.py create mode 100644 llm_engineering/model/evaluation/evaluate.py create mode 100644 llm_engineering/model/evaluation/requirements.txt create mode 100644 llm_engineering/model/evaluation/sagemaker.py create mode 100644 pipelines/evaluating.py create mode 100644 steps/evaluating/__init__.py create mode 100644 steps/evaluating/evaluate.py diff --git a/README.md b/README.md index 1b78c6c..6b19152 100644 --- a/README.md +++ b/README.md @@ -355,6 +355,12 @@ poetry poe run-training-pipeline ``` This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard. +We start the evaluation pipeline through ZenML by running the following: +```shell +poetry poe run-evaluation-pipeline +``` +This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can visualize the results in `*-results` datasets saved to your HuggingFace profile. + ### Inference For creating an AWS SageMaker Inference Endpoint, run: @@ -471,6 +477,11 @@ Run the training pipeline: poetry poe run-training-pipeline ``` +Run the evaluation pipeline: +```shell +poetry poe run-evaluation-pipeline +``` + > [!WARNING] > For this to work, make sure you properly configured AWS SageMaker as described in [Set up cloud infrastructure (for production)](#set-up-cloud-infrastructure-for-production). diff --git a/configs/evaluating.yaml b/configs/evaluating.yaml new file mode 100644 index 0000000..ec91ef9 --- /dev/null +++ b/configs/evaluating.yaml @@ -0,0 +1,9 @@ +settings: + docker: + parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest + skip_build: True + orchestrator.sagemaker: + synchronous: false + +parameters: + is_dummy: true # Change this to 'false' to run the evaluation on the full dataset. diff --git a/configs/training.yaml b/configs/training.yaml index 7ba8f15..38bf83c 100644 --- a/configs/training.yaml +++ b/configs/training.yaml @@ -7,8 +7,8 @@ settings: parameters: finetuning_type: sft - num_train_epochs: 1 # 3 + num_train_epochs: 3 per_device_train_batch_size: 2 learning_rate: 3e-4 dataset_huggingface_workspace: mlabonne - is_dummy: true + is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs. 
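As a rough sketch of what the new `run-evaluation-pipeline` poe task ends up doing, the `configs/evaluating.yaml` file above is handed to ZenML together with a timestamped run name, mirroring the `tools/run.py` changes at the end of this patch. The snippet below is illustrative only and not part of the patch:

```python
from datetime import datetime as dt
from pathlib import Path

from pipelines import evaluating  # pipeline added later in this patch

pipeline_args = {
    "config_path": Path("configs") / "evaluating.yaml",  # Docker image, SageMaker orchestrator, is_dummy
    "run_name": f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}",
}

# ZenML reads the orchestrator settings and the `is_dummy` parameter from the
# YAML and submits the run asynchronously (synchronous: false).
evaluating.with_options(**pipeline_args)()
```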
diff --git a/llm_engineering/model/evaluation/__init__.py b/llm_engineering/model/evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llm_engineering/model/evaluation/evaluate.py b/llm_engineering/model/evaluation/evaluate.py new file mode 100644 index 0000000..b83efe5 --- /dev/null +++ b/llm_engineering/model/evaluation/evaluate.py @@ -0,0 +1,225 @@ +import concurrent.futures +import gc +import json +import os + +from datasets import Dataset, load_dataset +from huggingface_hub import HfApi +from huggingface_hub.utils import RepositoryNotFoundError +from openai import OpenAI +from tqdm.auto import tqdm +from vllm import LLM, SamplingParams + +OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] +DATASET_HUGGINGFACE_WORKSPACE = os.environ["DATASET_HUGGINGFACE_WORKSPACE"] +MODEL_HUGGINGFACE_WORKSPACE = os.environ["MODEL_HUGGINGFACE_WORKSPACE"] +IS_DUMMY = os.environ.get("IS_DUMMY", False) + +print("====== EVAL PARAMETERS ======") # noqa +print(f"{DATASET_HUGGINGFACE_WORKSPACE=}") # noqa +print(f"{MODEL_HUGGINGFACE_WORKSPACE=}") # noqa +print(f"{IS_DUMMY=}") # noqa +print("=============================") # noqa + + +def generate_answers(model_id: str, dataset_name: str): + def format(sample): + return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format( + sample["instruction"] + ) + + dataset = load_dataset(dataset_name, split="test") + if IS_DUMMY: + dataset = dataset.select(range(10)) + print(f"Dataset size: {len(dataset)}") # noqa + dataset = dataset.map(lambda sample: {"prompt": format(sample)}) + + print(f"Generating answers for {model_id}") # noqa + llm = LLM(model=model_id, max_model_len=2048) + sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=2048) + outputs = llm.generate(dataset["prompt"], sampling_params) + + answers = [output.outputs[0].text for output in outputs] + dataset = dataset.add_column("answers", answers) + + print(f"Uploading results for {model_id}") # noqa + dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results") + gc.collect() + + return dataset + + +def evaluate_answer(instruction: str, answer: str, client: OpenAI) -> dict: + prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria: +1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic. +2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language. + +Accuracy scale: +1 (Poor): Contains factual errors or misleading information +2 (Good): Mostly accurate with minor errors or omissions +3 (Excellent): Highly accurate and comprehensive + +Style scale: +1 (Poor): Too formal, uses some overly complex words +2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions +3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary + +Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture. +Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks. 
+ +Instruction: {instruction} + +Answer: {answer} + +Provide your evaluation in JSON format with the following structure: +{{ + "accuracy": {{ + "analysis": "...", + "score": 0 + }}, + "style": {{ + "analysis": "...", + "score": 0 + }} +}} +""" + + completion = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.", + }, + {"role": "user", "content": prompt}, + ], + response_format={"type": "json_object"}, + max_tokens=1000, + temperature=0.9, + ) + + # Parse the structured output + return json.loads(completion.choices[0].message.content) + + +def evaluate_batch(batch, start_index): + client = OpenAI(api_key=OPENAI_API_KEY) + return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)] + + +def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset: + # Load the dataset + dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all") + + # Create batches of instruction-answer pairs with their original indices + batches = [ + (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False))) + for i in range(0, len(dataset), batch_size) + ] + + evaluations = [None] * len(dataset) + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: + futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches] + + for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + for index, evaluation in future.result(): + evaluations[index] = evaluation + + # Replace the 'evaluation' column if it exists, otherwise add it + if "evaluation" in dataset.column_names: + dataset = dataset.remove_columns(["evaluation"]) + dataset = dataset.add_column("evaluation", evaluations) + + # Post-process evaluations + accuracy_scores = [] + style_scores = [] + + for evaluation in dataset["evaluation"]: + try: + eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation + accuracy_score = eval_dict["accuracy"]["score"] + style_score = eval_dict["style"]["score"] + + accuracy_scores.append(accuracy_score) + style_scores.append(style_score) + + except (json.JSONDecodeError, KeyError, TypeError): + # If there's an error, append None to maintain alignment + accuracy_scores.append(None) + style_scores.append(None) + + # Add new columns to the dataset + if "accuracy" in dataset.column_names: + dataset = dataset.remove_columns(["accuracy"]) + dataset = dataset.add_column("accuracy", accuracy_scores) + if "style" in dataset.column_names: + dataset = dataset.remove_columns(["style"]) + dataset = dataset.add_column("style", style_scores) + + dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results") + + return dataset + + +def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str: + api = HfApi() + + try: + api.model_info(model_id) + print(f"Found model on HF: '{model_id}'.") # noqa + except RepositoryNotFoundError: + print(f"Model '{model_id}' does not exist.") # noqa + model_id = default_value + print(f"Defaulting to '{model_id}'") # noqa + print("Train your own model to avoid this behavior.") # noqa + + return model_id + + +def check_if_huggingface_dataset_exists(dataset_id: str, 
default_value: str) -> str: + api = HfApi() + + try: + api.dataset_info(dataset_id) + print(f"Found dataset on HF: '{dataset_id}'.") # noqa + except RepositoryNotFoundError: + print(f"Dataset '{dataset_id}' does not exist.") # noqa + dataset_id = default_value + print(f"Defaulting to '{dataset_id}'") # noqa + print("Use a valid dataset or create your own to avoid this behavior.") # noqa + + return dataset_id + + +model_ids = [ + check_if_huggingface_model_exists( + f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B", default_value="mlabonne/TwinLlama-3.1-8B" + ), + check_if_huggingface_model_exists( + f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-DPO", default_value="mlabonne/TwinLlama-3.1-8B-DPO" + ), + "meta-llama/Meta-Llama-3.1-8B-Instruct", +] + +if __name__ == "__main__": + # Run generation + for model_id in model_ids: + dataset_name = check_if_huggingface_dataset_exists( + f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin", default_value="mlabonne/llmtwin" + ) + generate_answers(model_id, dataset_name=dataset_name) + + # Run evaluation + for model_id in model_ids: + evaluate_answers(model_id) + + # Analyze results + for model_id in model_ids: + dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all") + + score = sum(dataset["accuracy"]) / len(dataset["accuracy"]) + print(f"{model_id.split('/')[-1]} - Accuracy: {score:.2f}") # noqa + + score = sum(dataset["style"]) / len(dataset["style"]) + print(f"{model_id.split('/')[-1]} - Style: {score:.2f}") # noqa diff --git a/llm_engineering/model/evaluation/requirements.txt b/llm_engineering/model/evaluation/requirements.txt new file mode 100644 index 0000000..b62ab24 --- /dev/null +++ b/llm_engineering/model/evaluation/requirements.txt @@ -0,0 +1,5 @@ +transformers==4.43.3 +datasets==2.20.0 +vllm==0.6.1.post2 +tqdm==4.66.4 +openai==1.52.0 \ No newline at end of file diff --git a/llm_engineering/model/evaluation/sagemaker.py b/llm_engineering/model/evaluation/sagemaker.py new file mode 100644 index 0000000..a3c11f2 --- /dev/null +++ b/llm_engineering/model/evaluation/sagemaker.py @@ -0,0 +1,57 @@ +from pathlib import Path + +from huggingface_hub import HfApi +from loguru import logger +from sagemaker.huggingface import HuggingFaceProcessor + +from llm_engineering import settings + +evaluation_dir = Path(__file__).resolve().parent +evaluation_requirements_path = evaluation_dir / "requirements.txt" + + +def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None: + assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required." + assert settings.OPENAI_API_KEY, "OpenAI API key is required." + assert settings.AWS_ARN_ROLE, "AWS ARN role is required." 
+ + if not evaluation_dir.exists(): + raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.") + if not evaluation_requirements_path.exists(): + raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.") + + api = HfApi() + user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN) + huggingface_user = user_info["name"] + logger.info(f"Current Hugging Face user: {huggingface_user}") + + env = { + "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, + "OPENAI_API_KEY": settings.OPENAI_API_KEY, + "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user, + "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user, + } + if is_dummy: + env["IS_DUMMY"] = "True" + + # Initialize the HuggingFaceProcessor + hfp = HuggingFaceProcessor( + role=settings.AWS_ARN_ROLE, + instance_count=1, + instance_type="ml.g5.2xlarge", + transformers_version="4.36", + pytorch_version="2.1", + py_version="py310", + base_job_name="evaluate-llm-twin", + env=env, + ) + + # Run the processing job + hfp.run( + code="evaluate.py", + source_dir=str(evaluation_dir), + ) + + +if __name__ == "__main__": + run_evaluation_on_sagemaker() diff --git a/llm_engineering/model/finetuning/finetune.py b/llm_engineering/model/finetuning/finetune.py index 9c90d4e..81ed509 100644 --- a/llm_engineering/model/finetuning/finetune.py +++ b/llm_engineering/model/finetuning/finetune.py @@ -83,6 +83,11 @@ def finetune( EOS_TOKEN = tokenizer.eos_token print(f"Setting EOS_TOKEN to {EOS_TOKEN}") # noqa + if is_dummy is True: + num_train_epochs = 1 + print(f"Training in dummy mode. Setting num_train_epochs to '{num_train_epochs}'") # noqa + print(f"Training in dummy mode. Reducing dataset size to '400'.") # noqa + if finetuning_type == "sft": def format_samples_sft(examples): @@ -218,9 +223,9 @@ def check_if_huggingface_model_exists(model_id: str, default_value: str = "mlabo try: api.model_info(model_id) except RepositoryNotFoundError: - print(f"Model '{sft_base_model_repo_id}' does not exist.") # noqa + print(f"Model '{model_id}' does not exist.") # noqa model_id = default_value - print(f"Defaulting to '{sft_base_model_repo_id}'") # noqa + print(f"Defaulting to '{model_id}'") # noqa print("Train your own 'TwinLlama-3.1-8B' to avoid this behavior.") # noqa return model_id diff --git a/llm_engineering/model/finetuning/requirements.txt b/llm_engineering/model/finetuning/requirements.txt index 3994175..ca2b28a 100644 --- a/llm_engineering/model/finetuning/requirements.txt +++ b/llm_engineering/model/finetuning/requirements.txt @@ -6,5 +6,4 @@ trl==0.9.6 bitsandbytes==0.43.3 comet-ml==3.44.3 flash-attn==2.3.6 -# unsloth==2024.10.2 unsloth==2024.9.post2 \ No newline at end of file diff --git a/pipelines/__init__.py b/pipelines/__init__.py index 8e93cc6..f1472a3 100644 --- a/pipelines/__init__.py +++ b/pipelines/__init__.py @@ -1,5 +1,6 @@ from .digital_data_etl import digital_data_etl from .end_to_end_data import end_to_end_data +from .evaluating import evaluating from .export_artifact_to_json import export_artifact_to_json from .feature_engineering import feature_engineering from .generate_datasets import generate_datasets @@ -8,6 +9,7 @@ __all__ = [ "generate_datasets", "end_to_end_data", + "evaluating", "export_artifact_to_json", "digital_data_etl", "feature_engineering", diff --git a/pipelines/evaluating.py b/pipelines/evaluating.py new file mode 100644 index 0000000..f76e378 --- /dev/null +++ b/pipelines/evaluating.py @@ -0,0 +1,12 @@ +from zenml import pipeline + +from steps import evaluating as evaluating_steps 
+ + +@pipeline +def evaluating( + is_dummy: bool = False, +) -> None: + evaluating_steps.evaluate( + is_dummy=is_dummy, + ) diff --git a/pyproject.toml b/pyproject.toml index c52c349..cd11af2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse # Training pipelines run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training" +run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation" # Inference call-rag-retrieval-module = "poetry run python -m tools.rag" diff --git a/steps/__init__.py b/steps/__init__.py index 9e159b2..03d6ee8 100644 --- a/steps/__init__.py +++ b/steps/__init__.py @@ -1,3 +1,3 @@ -from . import etl, export, feature_engineering, generate_datasets +from . import etl, evaluating, export, feature_engineering, generate_datasets, training -__all__ = ["generate_datasets", "export", "etl", "feature_engineering"] +__all__ = ["generate_datasets", "export", "etl", "feature_engineering", "training", "evaluating"] diff --git a/steps/evaluating/__init__.py b/steps/evaluating/__init__.py new file mode 100644 index 0000000..542eb78 --- /dev/null +++ b/steps/evaluating/__init__.py @@ -0,0 +1,3 @@ +from .evaluate import evaluate + +__all__ = ["evaluate"] diff --git a/steps/evaluating/evaluate.py b/steps/evaluating/evaluate.py new file mode 100644 index 0000000..0451fe3 --- /dev/null +++ b/steps/evaluating/evaluate.py @@ -0,0 +1,12 @@ +from zenml import step + +from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker + + +@step +def evaluate( + is_dummy: bool = False, +) -> None: + run_evaluation_on_sagemaker( + is_dummy=is_dummy, + ) diff --git a/tools/run.py b/tools/run.py index 30c0ab5..c07faea 100644 --- a/tools/run.py +++ b/tools/run.py @@ -8,6 +8,7 @@ from pipelines import ( digital_data_etl, end_to_end_data, + evaluating, export_artifact_to_json, feature_engineering, generate_datasets, @@ -97,6 +98,12 @@ default=False, help="Whether to run the training pipeline.", ) +@click.option( + "--run-evaluation", + is_flag=True, + default=False, + help="Whether to run the evaluation pipeline.", +) @click.option( "--export-settings", is_flag=True, @@ -113,6 +120,7 @@ def main( run_generate_instruct_datasets: bool = False, run_generate_preference_datasets: bool = False, run_training: bool = False, + run_evaluation: bool = False, export_settings: bool = False, ) -> None: assert ( @@ -123,6 +131,7 @@ def main( or run_generate_instruct_datasets or run_generate_preference_datasets or run_training + or run_evaluation or export_settings ), "Please specify an action to run." @@ -180,6 +189,12 @@ def main( pipeline_args["run_name"] = f"training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" training.with_options(**pipeline_args)(**run_args_cd) + if run_evaluation: + run_args_cd = {} + pipeline_args["config_path"] = root_dir / "configs" / "evaluating.yaml" + pipeline_args["run_name"] = f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" + evaluating.with_options(**pipeline_args)(**run_args_cd) + if __name__ == "__main__": main()
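Once the SageMaker processing job triggered by `--run-evaluation` (or `poetry poe run-evaluation-pipeline`) finishes, the per-model `*-results` datasets can be inspected locally. A minimal sketch, assuming the datasets were pushed under your Hugging Face workspace and already carry the `accuracy` and `style` columns added by `evaluate.py` (the workspace name below is a placeholder):

```python
from datasets import load_dataset

workspace = "your-hf-username"  # placeholder for DATASET_HUGGINGFACE_WORKSPACE
dataset = load_dataset(f"{workspace}/TwinLlama-3.1-8B-results", split="all")

# Average the LLM-as-a-judge scores, skipping rows where JSON parsing failed (None).
accuracy = [s for s in dataset["accuracy"] if s is not None]
style = [s for s in dataset["style"] if s is not None]
print(f"Accuracy: {sum(accuracy) / len(accuracy):.2f} | Style: {sum(style) / len(style):.2f}")
```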