Commit 6180127 (1 parent: 27afd25)
Showing 16 changed files with 363 additions and 7 deletions.
@@ -0,0 +1,9 @@
settings:
  docker:
    parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
    skip_build: True
  orchestrator.sagemaker:
    synchronous: false

parameters:
  is_dummy: true # Change this to 'false' to run the evaluation on the full dataset.
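This run configuration tells ZenML to reuse a prebuilt Docker parent image (skipping a local build) and to launch the SageMaker orchestrator asynchronously. A minimal sketch of attaching it to the evaluating pipeline added later in this commit; the config path and the pipeline's import path are assumptions, not shown in this diff:

from pipelines import evaluating  # assumed import path

# with_options(config_path=...) applies the YAML's settings and parameters,
# including is_dummy, before the run is submitted.
evaluating.with_options(config_path="configs/evaluating.yaml")()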
Empty file.
@@ -0,0 +1,225 @@
import concurrent.futures
import gc
import json
import os

from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError
from openai import OpenAI
from tqdm.auto import tqdm
from vllm import LLM, SamplingParams

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
DATASET_HUGGINGFACE_WORKSPACE = os.environ["DATASET_HUGGINGFACE_WORKSPACE"]
MODEL_HUGGINGFACE_WORKSPACE = os.environ["MODEL_HUGGINGFACE_WORKSPACE"]
# Env vars are strings, so any non-empty value (e.g. "True") enables dummy mode.
IS_DUMMY = os.environ.get("IS_DUMMY", False)

print("====== EVAL PARAMETERS ======")  # noqa
print(f"{DATASET_HUGGINGFACE_WORKSPACE=}")  # noqa
print(f"{MODEL_HUGGINGFACE_WORKSPACE=}")  # noqa
print(f"{IS_DUMMY=}")  # noqa
print("=============================")  # noqa


def generate_answers(model_id: str, dataset_name: str):
    def format(sample):
        # Alpaca-style prompt template.
        return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format(
            sample["instruction"]
        )

    dataset = load_dataset(dataset_name, split="test")
    if IS_DUMMY:
        dataset = dataset.select(range(10))
    print(f"Dataset size: {len(dataset)}")  # noqa
    dataset = dataset.map(lambda sample: {"prompt": format(sample)})

    print(f"Generating answers for {model_id}")  # noqa
    llm = LLM(model=model_id, max_model_len=2048)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=2048)
    outputs = llm.generate(dataset["prompt"], sampling_params)

    answers = [output.outputs[0].text for output in outputs]
    dataset = dataset.add_column("answers", answers)

    print(f"Uploading results for {model_id}")  # noqa
    dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")
    gc.collect()

    return dataset


def evaluate_answer(instruction: str, answer: str, client: OpenAI) -> dict:
    prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria:
1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic.
2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language.
Accuracy scale:
1 (Poor): Contains factual errors or misleading information
2 (Good): Mostly accurate with minor errors or omissions
3 (Excellent): Highly accurate and comprehensive
Style scale:
1 (Poor): Too formal, uses some overly complex words
2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions
3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary
Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture.
Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks.
Instruction: {instruction}
Answer: {answer}
Provide your evaluation in JSON format with the following structure:
{{
    "accuracy": {{
        "analysis": "...",
        "score": 0
    }},
    "style": {{
        "analysis": "...",
        "score": 0
    }}
}}
"""

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        max_tokens=1000,
        temperature=0.9,
    )

    # Parse the structured output
    return json.loads(completion.choices[0].message.content)


def evaluate_batch(batch, start_index):
    client = OpenAI(api_key=OPENAI_API_KEY)
    return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)]


def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset:
    # Load the dataset
    dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")

    # Create batches of instruction-answer pairs with their original indices
    batches = [
        (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False)))
        for i in range(0, len(dataset), batch_size)
    ]

    evaluations = [None] * len(dataset)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for index, evaluation in future.result():
                evaluations[index] = evaluation

    # Replace the 'evaluation' column if it exists, otherwise add it
    if "evaluation" in dataset.column_names:
        dataset = dataset.remove_columns(["evaluation"])
    dataset = dataset.add_column("evaluation", evaluations)

    # Post-process evaluations
    accuracy_scores = []
    style_scores = []

    for evaluation in dataset["evaluation"]:
        try:
            eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
            accuracy_score = eval_dict["accuracy"]["score"]
            style_score = eval_dict["style"]["score"]

            accuracy_scores.append(accuracy_score)
            style_scores.append(style_score)

        except (json.JSONDecodeError, KeyError, TypeError):
            # If there's an error, append None to maintain alignment
            accuracy_scores.append(None)
            style_scores.append(None)

    # Add new columns to the dataset
    if "accuracy" in dataset.column_names:
        dataset = dataset.remove_columns(["accuracy"])
    dataset = dataset.add_column("accuracy", accuracy_scores)
    if "style" in dataset.column_names:
        dataset = dataset.remove_columns(["style"])
    dataset = dataset.add_column("style", style_scores)

    dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")

    return dataset


def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str:
    api = HfApi()

    try:
        api.model_info(model_id)
        print(f"Found model on HF: '{model_id}'.")  # noqa
    except RepositoryNotFoundError:
        print(f"Model '{model_id}' does not exist.")  # noqa
        model_id = default_value
        print(f"Defaulting to '{model_id}'")  # noqa
        print("Train your own model to avoid this behavior.")  # noqa

    return model_id


def check_if_huggingface_dataset_exists(dataset_id: str, default_value: str) -> str:
    api = HfApi()

    try:
        api.dataset_info(dataset_id)
        print(f"Found dataset on HF: '{dataset_id}'.")  # noqa
    except RepositoryNotFoundError:
        print(f"Dataset '{dataset_id}' does not exist.")  # noqa
        dataset_id = default_value
        print(f"Defaulting to '{dataset_id}'")  # noqa
        print("Use a valid dataset or create your own to avoid this behavior.")  # noqa

    return dataset_id


model_ids = [
    check_if_huggingface_model_exists(
        f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B", default_value="mlabonne/TwinLlama-3.1-8B"
    ),
    check_if_huggingface_model_exists(
        f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-DPO", default_value="mlabonne/TwinLlama-3.1-8B-DPO"
    ),
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
]

if __name__ == "__main__":
    # Run generation
    for model_id in model_ids:
        dataset_name = check_if_huggingface_dataset_exists(
            f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin", default_value="mlabonne/llmtwin"
        )
        generate_answers(model_id, dataset_name=dataset_name)

    # Run evaluation
    for model_id in model_ids:
        evaluate_answers(model_id)

    # Analyze results
    for model_id in model_ids:
        dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")

        # Skip None entries (failed judge calls) so the averages don't crash.
        accuracy = [s for s in dataset["accuracy"] if s is not None]
        print(f"{model_id.split('/')[-1]} - Accuracy: {sum(accuracy) / len(accuracy):.2f}")  # noqa

        style = [s for s in dataset["style"] if s is not None]
        print(f"{model_id.split('/')[-1]} - Style: {sum(style) / len(style):.2f}")  # noqa
@@ -0,0 +1,5 @@
transformers==4.43.3
datasets==2.20.0
vllm==0.6.1.post2
tqdm==4.66.4
openai==1.52.0
@@ -0,0 +1,57 @@
from pathlib import Path

from huggingface_hub import HfApi
from loguru import logger
from sagemaker.huggingface import HuggingFaceProcessor

from llm_engineering import settings

evaluation_dir = Path(__file__).resolve().parent
evaluation_requirements_path = evaluation_dir / "requirements.txt"


def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None:
    assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required."
    assert settings.OPENAI_API_KEY, "OpenAI API key is required."
    assert settings.AWS_ARN_ROLE, "AWS ARN role is required."

    if not evaluation_dir.exists():
        raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.")
    if not evaluation_requirements_path.exists():
        raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.")

    api = HfApi()
    user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN)
    huggingface_user = user_info["name"]
    logger.info(f"Current Hugging Face user: {huggingface_user}")

    env = {
        "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN,
        "OPENAI_API_KEY": settings.OPENAI_API_KEY,
        "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user,
        "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user,
    }
    if is_dummy:
        env["IS_DUMMY"] = "True"

    # Initialize the HuggingFaceProcessor
    hfp = HuggingFaceProcessor(
        role=settings.AWS_ARN_ROLE,
        instance_count=1,
        instance_type="ml.g5.2xlarge",
        transformers_version="4.36",
        pytorch_version="2.1",
        py_version="py310",
        base_job_name="evaluate-llm-twin",
        env=env,
    )

    # Run the processing job
    hfp.run(
        code="evaluate.py",
        source_dir=str(evaluation_dir),
    )


if __name__ == "__main__":
    run_evaluation_on_sagemaker()
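Running this as a SageMaker processing job keeps the GPU dependency out of the local environment: the ml.g5.2xlarge instance hosts vLLM only for the duration of the run, and the requirements.txt bundled in source_dir is installed on top of the Hugging Face container before evaluate.py starts. A minimal invocation, assuming AWS credentials and the settings above are configured:

run_evaluation_on_sagemaker(is_dummy=False)  # full dataset; the default is a dummy run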
@@ -6,5 +6,4 @@ trl==0.9.6
bitsandbytes==0.43.3
comet-ml==3.44.3
flash-attn==2.3.6
# unsloth==2024.10.2
unsloth==2024.9.post2
@@ -0,0 +1,12 @@
from zenml import pipeline

from steps import evaluating as evaluating_steps


@pipeline
def evaluating(
    is_dummy: bool = False,
) -> None:
    evaluating_steps.evaluate(
        is_dummy=is_dummy,
    )
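The evaluate step imported here is part of the same commit but not included in this extract. A plausible sketch of what it wraps, with the import path assumed rather than taken from the diff:

from zenml import step

from llm_engineering.model.evaluation import run_evaluation_on_sagemaker  # assumed path


@step
def evaluate(is_dummy: bool = False) -> None:
    # Hypothetical: delegates to the SageMaker launcher defined above.
    run_evaluation_on_sagemaker(is_dummy=is_dummy)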