"""
import re
from dataclasses import asdict
-from typing import Dict, List, Tuple
+from typing import Dict, List

-from lighteval.metrics import MetricCategory, Metrics
-from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

+from .custom_evaluation_utils import *

-_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
-_TASKS: List[CustomEvaluationTaskConfig] = []
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
+_TASKS: List[CustomEvaluationTask] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="hellaswag",
        prompt_function="hellaswag_prompt",
        hf_repo="hellaswag",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="winogrande",
        prompt_function="winogrande",
        hf_repo="winogrande",
        hf_subset="winogrande_xl",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="piqa",
        prompt_function="piqa_harness",
        hf_repo="piqa",
        hf_subset="plain_text",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="siqa",
        prompt_function="siqa_prompt",
        hf_repo="lighteval/siqa",
        hf_subset="default",
        hf_avail_splits=["train", "validation"],
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="openbookqa",
        prompt_function="openbookqa",
        hf_repo="openbookqa",
        hf_subset="main",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="arc:easy",
        prompt_function="arc",
        hf_repo="ai2_arc",
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="commonsense_qa",
        prompt_function="commonsense_qa_prompt",
        hf_repo="commonsense_qa",
@@ -131,7 +134,7 @@ def preprocess(text):
## WORLD_KNOWLEDGE_TASKS ##

WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="trivia_qa",
        prompt_function="triviaqa",
        hf_repo="trivia_qa",
@@ -140,7 +143,7 @@ def preprocess(text):
        generation_size=20,
        stop_sequence=["\n", ".", ","],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="natural_questions",
        prompt_function="natural_questions_prompt",
        hf_repo="lighteval/natural_questions_clean",
@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):
## Reading comprehension ##

READING_COMP_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="super_glue:boolq",
        prompt_function="boolq_prompt",
        hf_repo="super_glue",
        hf_subset="boolq",
        metric=["target_perplexity"],
    ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
        name="quac",
        prompt_function="quac",
        hf_repo="lighteval/quac_helm",
@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):


## MATH ##
-class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMathEvaluationTask(CustomEvaluationTask):
    """Custom class for math tasks with all the defaults set"""

    def __init__(
@@ -251,7 +254,7 @@ def __init__(
    CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
    CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
]
-GSM8K = CustomEvaluationTaskConfig(
+GSM8K = CustomEvaluationTask(
    name="gsm8k",
    prompt_function="gsm8k",
    hf_repo="gsm8k",
@@ -272,7 +275,7 @@ def __init__(


## MMLU ##
-class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMMLUEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
## BBH ##


-class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
+class CustomBBHEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):


## AGI eval ##
-class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
@@ -617,17 +620,17 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):


## HUMAN EVAL ##
-# human_eval = CustomEvaluationTaskConfig(
+# human_eval = CustomEvaluationTask(
#     name="human_eval",
#     prompt_function="human_eval",
#     hf_repo="lighteval/human_eval",
#     metric=["human_eval_pass_at_1"],
# ),


-def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
+def has_generative_metrics(task: CustomEvaluationTask) -> bool:
    for metric in task.metric:
-        if metric.category == MetricCategory.GENERATIVE:
+        if metric in NEEDS_GENERATION_ONLY:
            return True
    return False
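
For context, a minimal usage sketch of the updated helper. It assumes that NEEDS_GENERATION_ONLY is a collection of metric-name strings provided by the new custom_evaluation_utils module (not shown in this diff); the two list names below are purely illustrative and not part of the commit.

    # Hypothetical sketch, not part of this commit: split the registered tasks by
    # whether any of their metrics requires generation, per has_generative_metrics.
    generative_tasks = [task for task in _TASKS if has_generative_metrics(task)]
    loglikelihood_tasks = [task for task in _TASKS if not has_generative_metrics(task)]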