Describe the bug
When the Probability metric is used alongside the Accuracy metric, 2 x N_SAMPLES loglikelihood requests are evaluated. This is unnecessary, since both metrics score each sample from the same log-probabilities.
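As an illustration of the expected collapse, here is a minimal sketch that deduplicates loglikelihood requests on their (context, continuation) pair; the tuples and helper below are illustrative assumptions, not lighteval's internal request classes:

# Illustrative only: a request is assumed to be identified by (context, continuation).
# An accuracy-style metric and a probability-style metric both need the same
# loglikelihood per choice, so keying on that pair collapses 2 x N_SAMPLES
# requests down to N_SAMPLES.
from collections import OrderedDict

def dedup_loglikelihood_requests(requests):
    # requests: iterable of (context, continuation) pairs, one per metric per choice
    unique = OrderedDict()
    for ctx, cont in requests:
        unique.setdefault((ctx, cont), None)
    return list(unique.keys())

# Two metrics over the same sample produce duplicate requests...
reqs = [("ctx", "ending A"), ("ctx", "ending B")] * 2  # accuracy + probability
assert len(dedup_loglikelihood_requests(reqs)) == 2    # ...but only one set is needed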
To Reproduce
Use this as a custom tasks file:
from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric, probability_metric
from lighteval.metrics.normalizations import LogProbTokenNorm
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
from lighteval.tasks.templates.utils.formulation import CFFormulation
from lighteval.utils.language import Language
hellaswag_task = [
    LightevalTaskConfig(
        name="hellaswag_cf",
        suite=["lighteval"],
        prompt_function=get_hellaswag_prompt_function(
            language=Language.ENGLISH,
            adapter=lambda line: {
                # We don't use activity_label as it is not available
                "ctx_a": line["ctx_a"],
                "ctx_b": line["ctx_b"],
                "continuations": line["endings"],
                "gold_idx": int(line["label"]),
            },
            formulation=CFFormulation(),
        ),
        hf_repo="Rowan/hellaswag",
        hf_subset="default",
        evaluation_splits=["validation"],
        hf_avail_splits=["train", "validation"],
        metric=get_metrics_for_formulation(
            CFFormulation(),
            [
                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
                probability_metric(normalization=None),
            ],
        ),
    )
]
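If this file is passed to lighteval via --custom-tasks, the module is expected to expose the task list as TASKS_TABLE (an assumption based on lighteval's custom-task convention), so the repro module would also contain:

# lighteval looks for a module-level TASKS_TABLE in custom task files (assumed convention)
TASKS_TABLE = hellaswag_task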
Expected behavior
Only N_SAMPLES loglikelihood requests are issued, since both metrics can reuse the same log-probabilities.
Version info
commit: 1607dc1