Add MCQ support to Yourbench evaluation #734

Merged
merged 9 commits on May 20, 2025
102 changes: 102 additions & 0 deletions examples/custom_tasks_templates/custom_yourbench_task_mcq.py
@@ -0,0 +1,102 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import logging

from aenum import extend_enum

from lighteval.metrics.dynamic_metrics import multilingual_extractive_match_metric
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.utils.extractive_match_utils import IndicesExtractionConfig
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


logger = logging.getLogger(__name__)


ZEROSHOT_QA_INSTRUCTION = """
Answer the following multiple-choice question by selecting only one letter: A, B, C, or D. Do not explain your answer.
"""

ZEROSHOT_QA_USER_PROMPT = (
ZEROSHOT_QA_INSTRUCTION
+ """
Question: {question}

Choices:
{options}

Answer:
"""
)


def yourbench_prompt(line, task_name: str = ""):
options = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(line["choices"]))

gold_raw = line["gold"][0]

if isinstance(gold_raw, str) and gold_raw.strip().isalpha():
gold_index = ord(gold_raw.strip().upper()) - ord("A")
elif isinstance(gold_raw, int):
gold_index = gold_raw
else:
raise ValueError(f"Unexpected gold label format: {gold_raw!r}")

return Doc(
instruction=ZEROSHOT_QA_INSTRUCTION,
task_name=task_name,
query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"], options=options),
choices=line["choices"],
gold_index=gold_index,
)


yourbench_metrics = multilingual_extractive_match_metric(
language=Language.ENGLISH,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
)

extend_enum(Metrics, "yourbench_metrics", yourbench_metrics)

yourbench_mcq = LightevalTaskConfig(
name="HF_TASK_NAME", # noqa: F821
suite=["custom"],
prompt_function=yourbench_prompt,
hf_repo="HF_DATASET_NAME", # noqa: F821
hf_subset="lighteval",
hf_avail_splits=["train"],
evaluation_splits=["train"],
few_shots_split=None,
few_shots_select=None,
generation_size=8192,
metric=[Metrics.yourbench_metrics],
trust_dataset=True,
version=0,
)

TASKS_TABLE = [yourbench_mcq]
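
For context on how the two gold-label formats are handled, here is a minimal sketch of calling the prompt function on hypothetical rows (the `question`/`choices`/`gold` field names follow the template above; the example data is made up):

```python
# Hypothetical rows -- the schema (question/choices/gold) mirrors the template above.
row_with_letter = {"question": "What is 2 + 2?", "choices": ["3", "4", "5", "22"], "gold": ["B"]}
row_with_index = {"question": "What is 2 + 2?", "choices": ["3", "4", "5", "22"], "gold": [1]}

doc_a = yourbench_prompt(row_with_letter, task_name="yourbench_mcq")
doc_b = yourbench_prompt(row_with_index, task_name="yourbench_mcq")

# "B" and 1 both resolve to the second choice.
assert doc_a.gold_index == doc_b.gold_index == 1
print(doc_a.query)  # instruction, the question, lettered options, then "Answer:"
```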
23 changes: 19 additions & 4 deletions src/lighteval/metrics/utils/extractive_match_utils.py
@@ -286,18 +286,22 @@ def lazy_indices_regex(
translation_literal = TRANSLATION_LITERALS[language]
# First get indices to predict
indices = get_prefix(indices_config.prefix_for_extraction, translation_literal)[:len_choices]
indice_str_re = f"(?P<indices>{'|'.join([re.escape(i) for i in indices])})"
indices_escaped = [re.escape(i) for i in indices]
# We allow both (A) and A
indices_wrapped = [rf"(?:{i}|\({i}\))" for i in indices_escaped]
indice_str_re = f"(?P<indices>{'|'.join(indices_wrapped)})"

# The answer keys are either surrounded with <space>**answer**., or '<space>answer.' or the same without the dot
full_stop_re = rf"[{re.escape(translation_literal.full_stop)}\.]"
comma_re = rf"[{re.escape(translation_literal.comma)}\,]"
colon_re = rf"[{re.escape(translation_literal.colon)}\:]"
space_re = re.escape(translation_literal.sentence_space)

answer_prefix_re = rf"(^|{space_re})(?:\*\*)?"
answer_prefix_re = rf"(?:^|{space_re})(?:\*\*)?"
answer_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|$)"
answer_re = f"{answer_prefix_re}{indice_str_re}{answer_suffix_re}"
answer_re_start = rf"^(?:\*\*)?{indice_str_re}{answer_suffix_re}"
answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}{answer_suffix_re}"

answer_word = f"(?i:{translation_literal.answer})"

@@ -320,8 +324,10 @@
(f"{answer_word}{colon_re}.{{0,50}}?{answer_re}", 100),
# Answer word patterns
(f"{answer_word}.{{0,50}}?{answer_re}", 150),
# Start of line patterns
# Start of the string
(answer_re_start, 200),
# Start of the line
(answer_re_line_start, 210),
]
)

@@ -490,6 +496,15 @@ def extract_latex(
return latex_exprs[0], latex_strs[0]


def extract_indices(
match: re.Match, target_type: IndicesExtractionConfig, timeout_seconds: int
) -> tuple[str | None, str]:
def normalize_index(index: str) -> str:
return index.replace("(", "").replace(")", "").strip()

return normalize_index(match.group("indices")), normalize_index(match.group("indices"))


def extract_match(
match: re.Match, target_type: ExtractionTarget, timeout_seconds: int
) -> tuple[Basic | MatrixBase | str | None, str]:
@@ -510,7 +525,7 @@ def extract_match(
elif isinstance(target_type, ExprExtractionConfig):
return extract_expr(match, timeout_seconds=timeout_seconds)
elif isinstance(target_type, IndicesExtractionConfig):
return match.group("indices"), match.group("indices")
return extract_indices(match, target_type, timeout_seconds=timeout_seconds)


def extract_target_from_pred(
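
Not part of the diff: a standalone sketch with plain `re`, simplified from `lazy_indices_regex` above (the pattern names and the sample string are made up), showing what the `(A)` wrapping and the new line-start pattern allow, together with the parenthesis stripping done by `extract_indices`:

```python
import re

choices = ["A", "B", "C", "D"]
# Accept both the bare letter and the parenthesized letter, as in the change above.
wrapped = [rf"(?:{re.escape(c)}|\({re.escape(c)}\))" for c in choices]
indice_str_re = f"(?P<indices>{'|'.join(wrapped)})"
# Simplified suffix: punctuation, whitespace, or end of string after the letter.
answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}(?:[.,:\s]|$)"


def normalize_index(index: str) -> str:
    # Mirrors extract_indices: strip parentheses so "(C)" and "C" compare equal.
    return index.replace("(", "").replace(")", "").strip()


match = re.search(answer_re_line_start, "Alina and the answer is\n(C) Cecile")
assert match is not None
assert normalize_index(match.group("indices")) == "C"
```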
8 changes: 7 additions & 1 deletion tests/metrics/test_extractive_match.py
@@ -80,9 +80,15 @@ def compare_strings(
# Test answer with reasoning
("B", "Let's think step by step. It's not A because it doesn't make sense, therefore I think it's B", 1),
("D", "The answer is for sure D, it can't be A or B", 1),
("D", "The answer: D, doesn't makese nsense for answer to be A or B", 1),
("D", "The answer: D, it doesn't make sense for it to be A or B", 1),
# Test minimal answer format
("D", "D. it can't be A or B", 1),
("(D) Alina", "D", 1),
("(A) Cecile", "C", 0),
("C Cecile", "C", 1),
("Alina and the answer is\n(C) Cecile", "C", 1),
("Alina and the answer is\nC Cecile", "C", 1),
("A Peter\nCelina bum", "A", 1),
],
)
def test_extraction_abc(gold, pred, expected):
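To exercise just these lettered-choice cases locally (assuming a development install of lighteval with its test dependencies), running `pytest tests/metrics/test_extractive_match.py -k test_extraction_abc` should pick up the new parametrized cases.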