Unitxt evaluator #156

Open
wants to merge 14 commits into base: main
24 changes: 24 additions & 0 deletions scripts/test_unitxt.py
@@ -0,0 +1,24 @@
# First Party
from instructlab.eval.unitxt import UnitxtEvaluator


def test_unitxt():
    print("===> Executing 'test_unitxt'...")
    try:
        model_path = "instructlab/granite-7b-lab"
        unitxt_recipe = "card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10"
        unitxt = UnitxtEvaluator(model_path=model_path, unitxt_recipe=unitxt_recipe)
        overall_score, single_scores = unitxt.run()
        print(f"Overall scores: {overall_score}")
        sample_score = "f1_micro,none"
        assert sample_score in overall_score
        assert overall_score[sample_score] > 0
    except Exception as exc:
        print(f"'test_unitxt' failed: {exc}")
        return False

    return True


if __name__ == "__main__":
    assert test_unitxt() == True
5 changes: 2 additions & 3 deletions src/instructlab/eval/mmlu.py
@@ -142,7 +142,7 @@ def run(self, server_url: str | None = None) -> tuple:
         agg_score: float = 0.0

         results = self._run_mmlu(server_url)
-        for task, result in results.items():
+        for task, result in results["results"].items():
             agg_score += float(result["acc,none"])
             individual_scores[task] = {
                 "score": float(result["acc,none"]),
@@ -177,8 +177,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
             device=self.device,
             task_manager=tm,
         )
-        results = mmlu_output["results"]
-        return results
+        return mmlu_output

     # This method converts general errors from simple_evaluate
     # into a more user-understandable error
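
This change means _run_mmlu now returns the full lm-eval output dictionary rather than only its "results" sub-dict, so run() indexes into ["results"] explicitly and subclasses (such as the Unitxt evaluator added below) can also read the per-sample records. A rough sketch of the shape involved, with an invented task name and scores (the top-level "results"/"samples" keys follow lm-eval's simple_evaluate output; samples are only present when sample logging is enabled):

    # Illustrative only -- task name and score values are invented.
    mmlu_output = {
        "results": {
            "mmlu_abstract_algebra": {"alias": "abstract_algebra", "acc,none": 0.31},
        },
        "samples": {
            "mmlu_abstract_algebra": [],  # one record per evaluated instance
        },
    }
    # run() iterates the per-task scores...
    for task_name, result in mmlu_output["results"].items():
        print(task_name, result["acc,none"])
    # ...while UnitxtEvaluator.run() additionally reads mmlu_output["samples"].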
133 changes: 133 additions & 0 deletions src/instructlab/eval/unitxt.py
@@ -0,0 +1,133 @@
"""
Unitxt - Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI
https://github.com/IBM/unitxt
https://arxiv.org/abs/2401.14019
"""

# Standard
from uuid import uuid4
import os
import shutil

# Third Party
from lm_eval.tasks.unitxt import task
import yaml

# First Party
from instructlab.eval.mmlu import MMLUBranchEvaluator

# Local
from .logger_config import setup_logger

logger = setup_logger(__name__)

TEMP_DIR_PREFIX = "unitxt_temp"


class UnitxtEvaluator(MMLUBranchEvaluator):
"""
An evaluator class, running Unitxt evaluation

Attributes:
model_path Absolute path to or name of a huggingface model
unitxt_recipe unitxt recipe (see unitxt.ai for more information)
A Recipe holds a complete specification of a unitxt pipeline
Example: card=cards.wnli,template=templates.classification.multi_class.relation.default,max_train_instances=5,loader_limit=20,num_demos=3,demos_pool_size=10

"""

name = "unitxt"

def __init__(
self,
model_path,
unitxt_recipe: str,
):
unitxt_task = self.assign_task_name()
tasks_dir = self.assign_tasks_dir(unitxt_task)

Review thread on the tasks_dir handling above:

Member: Is this using a local directory? If so, it needs to be built off a param like output_dir with mt_bench.

Author: This is a temporary directory, deleted at the end of the evaluation process. Would you prefer the user specified an output dir? It does not contain anything of use for the user, just the files required for lm-eval to run unitxt.

Member: I think it would make sense to use the output_dir so it doesn't confuse the user with a local directory. Also, since you do want to remove it at the end, the create/remove logic should probably be in a try/finally block.

Author: So a user would specify an output dir but will find it doesn't exist at the end of the run? If the user specifies it, I guess I will not delete it, right?

Member:
"so a user would specify an output dir but will find it doesn't exist at the end of the run?"
The current output dir is a working dir for mt_bench.
"If the user specifies it, I guess I will not delete it, right?"
I was expecting you would create your directory inside the output_dir and then delete it when you are done. Or you could delete the directory before you start if there is some value in leaving it around.

jwm4: Would it make sense to put all this into a memory filesystem? In general, it is best to avoid unnecessary disk writes, especially for something that's likely to run in a cloud service where it may or may not have write permissions on some sort of disk.

Roni-Friedman (Author, Nov 7, 2024): @jwm4 A memory filesystem does not seem to work well, as the directory is later accessed by lm-eval inside the MMLU class, and I don't want to start passing this filesystem around (unless the owners support such an overall change).

Roni-Friedman (Author, Nov 7, 2024):
"I think it would make sense to use the output_dir so it doesn't confuse the user with a local directory."
@danmcp So I'm not entirely sure what you mean here. I see mt_bench has output_dir: str = "eval_output", but this is created only if one calls mt_bench and does not enter a different output dir. I could do the following, although I'm not sure it makes a lot of sense:

    def assign_tasks_dir(self, task_name):
        return os.path.join("eval_output", f"{TEMP_DIR_PREFIX}_{task_name}")

danmcp (Member, Nov 7, 2024): Apologies if my request wasn't clear; my suggestion was like mt_bench:

  • Accept the root dir to use for output as a var
  • Default it to the same root dir as mt_bench
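
A minimal sketch of what that suggestion could look like, assuming an output_dir keyword that defaults to mt_bench's "eval_output" and cleanup wrapped in try/finally; this is hypothetical and not the code in this PR:

    # Hypothetical sketch only. The parameter name and default are modeled on
    # mt_bench's output_dir. Only the pieces that would change are shown; the
    # other methods (prepare_unitxt_files, remove_unitxt_files, _run_mmlu) are
    # assumed unchanged from the file in this diff.
    class UnitxtEvaluator(MMLUBranchEvaluator):
        def __init__(self, model_path, unitxt_recipe: str, output_dir: str = "eval_output"):
            unitxt_task = str(uuid4())
            # the temporary task dir lives under the user-visible output_dir
            tasks_dir = os.path.join(output_dir, f"{TEMP_DIR_PREFIX}_{unitxt_task}")
            super().__init__(
                model_path=model_path, tasks_dir=tasks_dir, tasks=[unitxt_task], few_shots=0
            )
            self.unitxt_recipe = unitxt_recipe

        def run(self, server_url: str | None = None):
            self.prepare_unitxt_files()
            try:
                results = self._run_mmlu(server_url=server_url)
                # ... extract global and per-instance scores as in run() below ...
            finally:
                self.remove_unitxt_files()  # cleanup runs even if evaluation fails
            return results

(The diff of src/instructlab/eval/unitxt.py continues below.)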

        super().__init__(
            model_path=model_path, tasks_dir=tasks_dir, tasks=[unitxt_task], few_shots=0
        )
        self.unitxt_recipe = unitxt_recipe

    def assign_tasks_dir(self, task_name):
        return os.path.join("eval_output", f"{TEMP_DIR_PREFIX}_{task_name}")

    def assign_task_name(self):
        return str(uuid4())

    def prepare_unitxt_files(self) -> None:
        taskname = self.tasks[0]
        yaml_file = os.path.join(str(self.tasks_dir), f"{taskname}.yaml")
        create_unitxt_pointer(self.tasks_dir)
        create_unitxt_yaml(
            yaml_file=yaml_file, unitxt_recipe=self.unitxt_recipe, task_name=taskname
        )

    def remove_unitxt_files(self):
        # check the directory basename, since tasks_dir now lives under "eval_output"
        if os.path.basename(self.tasks_dir).startswith(
            TEMP_DIR_PREFIX
        ):  # to avoid unintended deletion if this class is inherited
            shutil.rmtree(self.tasks_dir)
        else:
            logger.warning(
                "unitxt tasks dir '%s' did not have the '%s' prefix and therefore was not deleted",
                self.tasks_dir,
                TEMP_DIR_PREFIX,
            )

    def run(self, server_url: str | None = None) -> tuple:
"""
Runs evaluation

Attributes:
server_url(str|None) Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated

Returns:
overall_scores Average scores for the task group
individual_scores Individual scores for each task in the task group
"""
self.prepare_unitxt_files()
logger.debug(locals())
os.environ["TOKENIZERS_PARALLELISM"] = "true"
try:
results = self._run_mmlu(server_url=server_url)
taskname = self.tasks[0]
global_scores = results["results"][taskname]
global_scores.pop("alias")
try:
instances = results["samples"][taskname]
instance_scores = {}
metrics = [
metric.replace("metrics.", "")
for metric in instances[0]["doc"]["metrics"]
]
for i, instance in enumerate(instances):
scores = {}
for metric in metrics:
scores[metric] = instance[metric][0]
instance_scores[i] = scores
except KeyError as e:
logger.error("Error in extracting single instance scores")
logger.error(e)
logger.error(e.__traceback__)
instance_scores = None
finally:
self.remove_unitxt_files()
return global_scores, instance_scores


def create_unitxt_yaml(yaml_file: str, unitxt_recipe: str, task_name: str) -> None:
    data = {"task": task_name, "include": "unitxt", "recipe": unitxt_recipe}
    with open(yaml_file, "w", encoding="utf-8") as file:
        yaml.dump(data, file, default_flow_style=False)
    logger.debug("task %s unitxt recipe written to %s", task_name, yaml_file)


def create_unitxt_pointer(tasks_dir):
    class_line = "class: !function " + task.__file__.replace("task.py", "task.Unitxt")
    output_file = os.path.join(tasks_dir, "unitxt")
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(class_line)
    logger.debug("Unitxt task pointer written to %s", output_file)