"""
import re
from dataclasses import asdict
-from typing import Dict, List, Tuple
+from typing import Dict, List

-from lighteval.metrics import MetricCategory, Metrics
-from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

+from .custom_evaluation_utils import *

-_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
-_TASKS: List[CustomEvaluationTaskConfig] = []
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
+_TASKS: List[CustomEvaluationTask] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="hellaswag",
        prompt_function="hellaswag_prompt",
        hf_repo="hellaswag",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="winogrande",
        prompt_function="winogrande",
        hf_repo="winogrande",
        hf_subset="winogrande_xl",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="piqa",
        prompt_function="piqa_harness",
        hf_repo="piqa",
        hf_subset="plain_text",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="siqa",
        prompt_function="siqa_prompt",
        hf_repo="lighteval/siqa",
        hf_subset="default",
        hf_avail_splits=["train", "validation"],
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="openbookqa",
        prompt_function="openbookqa",
        hf_repo="openbookqa",
        hf_subset="main",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="arc:easy",
        prompt_function="arc",
        hf_repo="ai2_arc",
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="commonsense_qa",
        prompt_function="commonsense_qa_prompt",
        hf_repo="commonsense_qa",
@@ -131,7 +134,7 @@ def preprocess(text):
## WORLD_KNOWLEDGE_TASKS ##

WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="trivia_qa",
        prompt_function="triviaqa",
        hf_repo="trivia_qa",
@@ -140,7 +143,7 @@ def preprocess(text):
        generation_size=20,
        stop_sequence=["\n", ".", ","],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="natural_questions",
        prompt_function="natural_questions_prompt",
        hf_repo="lighteval/natural_questions_clean",
@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):
## Reading comprehension ##

READING_COMP_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="super_glue:boolq",
        prompt_function="boolq_prompt",
        hf_repo="super_glue",
        hf_subset="boolq",
        metric=["target_perplexity"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="quac",
        prompt_function="quac",
        hf_repo="lighteval/quac_helm",
@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):


## MATH ##
-class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMathEvaluationTask(CustomEvaluationTask):
    """Custom class for math tasks with all the defaults set"""

    def __init__(
@@ -251,7 +254,7 @@ def __init__(
    CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
    CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
]
-GSM8K = CustomEvaluationTaskConfig(
+GSM8K = CustomEvaluationTask(
    name="gsm8k",
    prompt_function="gsm8k",
    hf_repo="gsm8k",
@@ -272,7 +275,7 @@ def __init__(


## MMLU ##
-class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMMLUEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
## BBH ##


-class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
+class CustomBBHEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):


## AGI eval ##
-class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
@@ -617,17 +620,17 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):


## HUMAN EVAL ##
-# human_eval = CustomEvaluationTaskConfig(
+# human_eval = CustomEvaluationTask(
#     name="human_eval",
#     prompt_function="human_eval",
#     hf_repo="lighteval/human_eval",
#     metric=["human_eval_pass_at_1"],
# ),


-def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
+def has_generative_metrics(task: CustomEvaluationTask) -> bool:
    for metric in task.metric:
-        if metric.category == MetricCategory.GENERATIVE:
+        if metric in NEEDS_GENERATION_ONLY:
            return True
    return False
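
For context, a minimal usage sketch of the updated helper. It assumes that NEEDS_GENERATION_ONLY is a collection of metric-name strings provided by the new custom_evaluation_utils module (not shown in this diff); the two list names below are purely illustrative and not part of the commit.

    # Hypothetical sketch, not part of this commit: split the registered tasks by
    # whether any of their metrics requires generation, per has_generative_metrics.
    generative_tasks = [task for task in _TASKS if has_generative_metrics(task)]
    loglikelihood_tasks = [task for task in _TASKS if not has_generative_metrics(task)]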