
Commit 37db422

Revert "moving custom tasks to code"

This reverts commit cb163be.

1 parent: cb163be

8 files changed: +191 -65 lines changed

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion

@@ -37,5 +37,4 @@ repos:
     rev: 'v0.1.6'
     hooks:
       - id: ruff
-        args: ['--fix']
       - id: ruff-format

src/lighteval/logging/__init__.py

Whitespace-only changes.

src/lighteval/logging/evaluation_tracker.py

Lines changed: 3 additions & 1 deletion

@@ -556,7 +556,7 @@ def push_results_to_tensorboard( # noqa: C901
 
         tb_context.close()  # flushes the unfinished write operations
         time.sleep(5)
-        files = os.listdir(str(output_dir_tb))
+        files = os.listdir(output_dir_tb)
         for file in files:
             os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))
 

@@ -566,3 +566,5 @@ def push_results_to_tensorboard( # noqa: C901
             f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
             f" at {output_dir_tb} and global_step {global_step}"
         )
+        # except Exception as e:
+        #     logger.warning(f"Could not push to tensorboard\n{e}")
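
The only functional change in the first hunk is dropping the str(...) wrapper around output_dir_tb. A minimal sketch, assuming output_dir_tb is a pathlib.Path (which the old str() call suggests): os.listdir accepts any path-like object on Python 3.6+, so both spellings return the same listing. This sketch is illustrative and not part of the commit.

import os
from pathlib import Path

output_dir_tb = Path("/tmp/tb_logs")  # hypothetical directory standing in for the real output_dir_tb
output_dir_tb.mkdir(parents=True, exist_ok=True)

# str() is redundant here: os.listdir takes str and os.PathLike alike
assert os.listdir(str(output_dir_tb)) == os.listdir(output_dir_tb)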

src/lighteval/models/__init__.py

Whitespace-only changes.

src/lighteval/tasks/__init__.py

Whitespace-only changes.

src/lighteval/tasks/lighteval_task.py

Lines changed: 0 additions & 37 deletions

@@ -1,6 +1,5 @@
 import collections
 import random
-from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Tuple

@@ -40,42 +39,6 @@
     from lighteval.logging.evaluation_tracker import EvaluationTracker
 
 
-@dataclass
-class CustomEvaluationTaskConfig:
-    name: str
-    prompt_function: str
-    hf_repo: str
-    hf_subset: str
-    metric: Tuple[Metrics]
-    hf_avail_splits: Optional[Tuple[str]] = None
-    evaluation_splits: Optional[Tuple[str]] = None
-    few_shots_split: Optional[str] = None
-    few_shots_select: Optional[str] = None
-    generation_size: int = -1
-    stop_sequence: Optional[Tuple[str]] = None
-    output_regex: Optional[str] = None
-
-    frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
-
-    def __post_init__(self):
-        if self.suite is None:
-            self.suite = ["custom"]
-        if self.hf_avail_splits is None:
-            self.hf_avail_splits = ["train", "validation", "test"]
-        if self.evaluation_splits is None:
-            self.evaluation_splits = ["validation"]
-        if self.stop_sequence is None:
-            self.stop_sequence = ["\n"]
-
-        # Convert list to tuple for hashing
-        self.metric = tuple(self.metric)
-        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
-        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
-        self.suite = tuple(self.suite) if self.suite else None
-        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
-
-
 class LightevalTask:
     def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
         """

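For reference, the CustomEvaluationTaskConfig dataclass deleted above was instantiated roughly as follows. This is a minimal sketch, not part of the commit: it assumes the pre-revert tree (where the class still exists), and the field values are copied from the hellaswag entry in the task file below.

from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig  # import path as it existed before this revert

task = CustomEvaluationTaskConfig(
    name="hellaswag",
    prompt_function="hellaswag_prompt",
    hf_repo="hellaswag",
    hf_subset="default",
    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
)

# __post_init__ fills the unset optional fields and converts the list fields to tuples
# (the removed code notes this is done for hashing).
assert task.metric == ("loglikelihood_acc", "loglikelihood_acc_norm_nospace")
assert task.suite == ("custom",)
assert task.hf_avail_splits == ("train", "validation", "test")
assert task.evaluation_splits == ("validation",)
assert task.stop_sequence == ("\n",)
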
tasks_examples/custom_tasks/custom_evaluation_tasks.py

Lines changed: 29 additions & 26 deletions

@@ -6,56 +6,59 @@
 """
 import re
 from dataclasses import asdict
-from typing import Dict, List, Tuple
+from typing import Dict, List
 
-from lighteval.metrics import MetricCategory, Metrics
-from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
 
+from .custom_evaluation_utils import *
 
-_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
-_TASKS: List[CustomEvaluationTaskConfig] = []
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
+_TASKS: List[CustomEvaluationTask] = []
 
 ## COMMON_SENSE_REASONING_TASKS ##
 COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="hellaswag",
         prompt_function="hellaswag_prompt",
         hf_repo="hellaswag",
         hf_subset="default",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="winogrande",
         prompt_function="winogrande",
         hf_repo="winogrande",
         hf_subset="winogrande_xl",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="piqa",
         prompt_function="piqa_harness",
         hf_repo="piqa",
         hf_subset="plain_text",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="siqa",
         prompt_function="siqa_prompt",
         hf_repo="lighteval/siqa",
         hf_subset="default",
         hf_avail_splits=["train", "validation"],
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="openbookqa",
         prompt_function="openbookqa",
         hf_repo="openbookqa",
         hf_subset="main",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:easy",
         prompt_function="arc",
         hf_repo="ai2_arc",

@@ -64,7 +67,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:challenge",
         prompt_function="arc",
         hf_repo="ai2_arc",

@@ -73,7 +76,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="commonsense_qa",
         prompt_function="commonsense_qa_prompt",
         hf_repo="commonsense_qa",

@@ -131,7 +134,7 @@ def preprocess(text):
 ## WORLD_KNOWLEDGE_TASKS ##
 
 WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="trivia_qa",
         prompt_function="triviaqa",
         hf_repo="trivia_qa",

@@ -140,7 +143,7 @@ def preprocess(text):
         generation_size=20,
         stop_sequence=["\n", ".", ","],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="natural_questions",
         prompt_function="natural_questions_prompt",
         hf_repo="lighteval/natural_questions_clean",

@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):
 ## Reading comprehension ##
 
 READING_COMP_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="super_glue:boolq",
         prompt_function="boolq_prompt",
         hf_repo="super_glue",
         hf_subset="boolq",
         metric=["target_perplexity"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="quac",
         prompt_function="quac",
         hf_repo="lighteval/quac_helm",

@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):
 
 
 ## MATH ##
-class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMathEvaluationTask(CustomEvaluationTask):
     """Custom class for math tasks with all the defaults set"""
 
     def __init__(

@@ -251,7 +254,7 @@ def __init__(
     CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
     CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
 ]
-GSM8K = CustomEvaluationTaskConfig(
+GSM8K = CustomEvaluationTask(
     name="gsm8k",
     prompt_function="gsm8k",
     hf_repo="gsm8k",

@@ -272,7 +275,7 @@ def __init__(
 
 
 ## MMLU ##
-class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMMLUEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,

@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
 ## BBH ##
 
 
-class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
+class CustomBBHEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,

@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):
 
 
 ## AGI eval ##
-class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,

@@ -617,17 +620,17 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 
 
 ## HUMAN EVAL ##
-# human_eval = CustomEvaluationTaskConfig(
+# human_eval = CustomEvaluationTask(
 #     name="human_eval",
 #     prompt_function="human_eval",
 #     hf_repo="lighteval/human_eval",
 #     metric=["human_eval_pass_at_1"],
 # ),
 
 
-def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
+def has_generative_metrics(task: CustomEvaluationTask) -> bool:
     for metric in task.metric:
-        if metric.category == MetricCategory.GENERATIVE:
+        if metric in NEEDS_GENERATION_ONLY:
             return True
     return False
 
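The reverted has_generative_metrics checks each metric name against a NEEDS_GENERATION_ONLY collection (expected to be provided by the custom_evaluation_utils star-import) instead of inspecting metric.category. Below is a minimal sketch of that membership-based check; the contents of NEEDS_GENERATION_ONLY are illustrative assumptions, not values taken from the repository.

from typing import Iterable, List

# Hypothetical stand-in for the collection exported by custom_evaluation_utils
NEEDS_GENERATION_ONLY: List[str] = ["exact_match", "quasi_exact_match", "f1_score"]

def has_generative_metrics(metrics: Iterable[str]) -> bool:
    # True as soon as one metric requires free-form generation from the model
    return any(metric in NEEDS_GENERATION_ONLY for metric in metrics)

# Usage with metric lists like the ones defined above
assert not has_generative_metrics(["loglikelihood_acc", "loglikelihood_acc_norm_nospace"])
assert has_generative_metrics(["exact_match", "loglikelihood_acc"])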