
Revert "moving custom tasks to code"
This reverts commit cb163be.
thomwolf committed Feb 7, 2024
1 parent cb163be · commit 37db422
Showing 8 changed files with 191 additions and 65 deletions.
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -37,5 +37,4 @@ repos:
rev: 'v0.1.6'
hooks:
- id: ruff
args: ['--fix']
- id: ruff-format
Empty file removed src/lighteval/logging/__init__.py
4 changes: 3 additions & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -556,7 +556,7 @@ def push_results_to_tensorboard( # noqa: C901

tb_context.close() # flushes the unfinished write operations
time.sleep(5)
files = os.listdir(str(output_dir_tb))
files = os.listdir(output_dir_tb)
for file in files:
os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))

@@ -566,3 +566,5 @@ def push_results_to_tensorboard( # noqa: C901
f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
f" at {output_dir_tb} and global_step {global_step}"
)
# except Exception as e:
# logger.warning(f"Could not push to tensorboard\n{e}")
Empty file removed src/lighteval/models/__init__.py
Empty file removed src/lighteval/tasks/__init__.py
37 changes: 0 additions & 37 deletions src/lighteval/tasks/lighteval_task.py
@@ -1,6 +1,5 @@
import collections
import random
from dataclasses import dataclass
from multiprocessing import Pool
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Tuple
@@ -40,42 +39,6 @@
from lighteval.logging.evaluation_tracker import EvaluationTracker


@dataclass
class CustomEvaluationTaskConfig:
name: str
prompt_function: str
hf_repo: str
hf_subset: str
metric: Tuple[Metrics]
hf_avail_splits: Optional[Tuple[str]] = None
evaluation_splits: Optional[Tuple[str]] = None
few_shots_split: Optional[str] = None
few_shots_select: Optional[str] = None
generation_size: int = -1
stop_sequence: Optional[Tuple[str]] = None
output_regex: Optional[str] = None

frozen: bool = False
suite: Optional[Tuple[str]] = None # we use this to know if we should use a custom lighteval or bigcode task

def __post_init__(self):
if self.suite is None:
self.suite = ["custom"]
if self.hf_avail_splits is None:
self.hf_avail_splits = ["train", "validation", "test"]
if self.evaluation_splits is None:
self.evaluation_splits = ["validation"]
if self.stop_sequence is None:
self.stop_sequence = ["\n"]

# Convert list to tuple for hashing
self.metric = tuple(self.metric)
self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
self.suite = tuple(self.suite) if self.suite else None
self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None


class LightevalTask:
def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
"""
55 changes: 29 additions & 26 deletions tasks_examples/custom_tasks/custom_evaluation_tasks.py
@@ -6,56 +6,59 @@
"""
import re
from dataclasses import asdict
from typing import Dict, List, Tuple
from typing import Dict, List

from lighteval.metrics import MetricCategory, Metrics
from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

from .custom_evaluation_utils import *

_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
_TASKS: List[CustomEvaluationTaskConfig] = []

# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on

_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
_TASKS: List[CustomEvaluationTask] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="hellaswag",
prompt_function="hellaswag_prompt",
hf_repo="hellaswag",
hf_subset="default",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="winogrande",
prompt_function="winogrande",
hf_repo="winogrande",
hf_subset="winogrande_xl",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="piqa",
prompt_function="piqa_harness",
hf_repo="piqa",
hf_subset="plain_text",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="siqa",
prompt_function="siqa_prompt",
hf_repo="lighteval/siqa",
hf_subset="default",
hf_avail_splits=["train", "validation"],
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="openbookqa",
prompt_function="openbookqa",
hf_repo="openbookqa",
hf_subset="main",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="arc:easy",
prompt_function="arc",
hf_repo="ai2_arc",
@@ -64,7 +67,7 @@
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="arc:challenge",
prompt_function="arc",
hf_repo="ai2_arc",
@@ -73,7 +76,7 @@
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="commonsense_qa",
prompt_function="commonsense_qa_prompt",
hf_repo="commonsense_qa",
@@ -131,7 +134,7 @@ def preprocess(text):
## WORLD_KNOWLEDGE_TASKS ##

WORLD_KNOWLEDGE_TASKS = [
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="trivia_qa",
prompt_function="triviaqa",
hf_repo="trivia_qa",
@@ -140,7 +143,7 @@ def preprocess(text):
generation_size=20,
stop_sequence=["\n", ".", ","],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="natural_questions",
prompt_function="natural_questions_prompt",
hf_repo="lighteval/natural_questions_clean",
@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):
## Reading comprehension ##

READING_COMP_TASKS = [
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="super_glue:boolq",
prompt_function="boolq_prompt",
hf_repo="super_glue",
hf_subset="boolq",
metric=["target_perplexity"],
),
CustomEvaluationTaskConfig(
CustomEvaluationTask(
name="quac",
prompt_function="quac",
hf_repo="lighteval/quac_helm",
@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):


## MATH ##
class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
class CustomMathEvaluationTask(CustomEvaluationTask):
"""Custom class for math tasks with all the defaults set"""

def __init__(
@@ -251,7 +254,7 @@ def __init__(
CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
]
GSM8K = CustomEvaluationTaskConfig(
GSM8K = CustomEvaluationTask(
name="gsm8k",
prompt_function="gsm8k",
hf_repo="gsm8k",
@@ -272,7 +275,7 @@ def __init__(


## MMLU ##
class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
class CustomMMLUEvaluationTask(CustomEvaluationTask):
def __init__(
self,
name,
@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
## BBH ##


class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
class CustomBBHEvaluationTask(CustomEvaluationTask):
def __init__(
self,
name,
@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):


## AGI eval ##
class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
def __init__(
self,
name,
@@ -617,17 +620,17 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):


## HUMAN EVAL ##
# human_eval = CustomEvaluationTaskConfig(
# human_eval = CustomEvaluationTask(
# name="human_eval",
# prompt_function="human_eval",
# hf_repo="lighteval/human_eval",
# metric=["human_eval_pass_at_1"],
# ),


def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
def has_generative_metrics(task: CustomEvaluationTask) -> bool:
for metric in task.metric:
if metric.category == MetricCategory.GENERATIVE:
if metric in NEEDS_GENERATION_ONLY:
return True
return False
