Commit 4c3d414

alozowski and hynky1999 authored
Add MCQ support to Yourbench evaluation (#734)
Add MCQ support to Yourbench evaluation

Co-authored-by: Hynek Kydlíček <kydlicek.hynek@gmail.com>
1 parent d4f5d09 commit 4c3d414

File tree

3 files changed: 128 additions, 5 deletions
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import logging

from aenum import extend_enum

from lighteval.metrics.dynamic_metrics import multilingual_extractive_match_metric
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.utils.extractive_match_utils import IndicesExtractionConfig
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


logger = logging.getLogger(__name__)


ZEROSHOT_QA_INSTRUCTION = """
Answer the following multiple-choice question by selecting only one letter: A, B, C, or D. Do not explain your answer.
"""

ZEROSHOT_QA_USER_PROMPT = (
    ZEROSHOT_QA_INSTRUCTION
    + """
Question: {question}

Choices:
{options}

Answer:
"""
)


def yourbench_prompt(line, task_name: str = ""):
    options = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(line["choices"]))

    gold_raw = line["gold"][0]

    if isinstance(gold_raw, str) and gold_raw.strip().isalpha():
        gold_index = ord(gold_raw.strip().upper()) - ord("A")
    elif isinstance(gold_raw, int):
        gold_index = gold_raw
    else:
        raise ValueError(f"Unexpected gold label format: {gold_raw!r}")

    return Doc(
        instruction=ZEROSHOT_QA_INSTRUCTION,
        task_name=task_name,
        query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"], options=options),
        choices=line["choices"],
        gold_index=gold_index,
    )


yourbench_metrics = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    precision=6,
)

extend_enum(Metrics, "yourbench_metrics", yourbench_metrics)

yourbench_mcq = LightevalTaskConfig(
    name="HF_TASK_NAME",  # noqa: F821
    suite=["custom"],
    prompt_function=yourbench_prompt,
    hf_repo="HF_DATASET_NAME",  # noqa: F821
    hf_subset="lighteval",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=8192,
    metric=[Metrics.yourbench_metrics],
    trust_dataset=True,
    version=0,
)

TASKS_TABLE = [yourbench_mcq]
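
As a quick orientation (not part of the commit): the sketch below shows how a dataset row would flow through yourbench_prompt, assuming the module above is importable; the row and the task_name value are invented for illustration.

# Hypothetical row, shaped like the rows this task reads from its "lighteval" subset.
sample = {
    "question": "Which planet is known as the Red Planet?",
    "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
    "gold": ["B"],  # a letter label; an integer index such as 1 is also accepted
}

doc = yourbench_prompt(sample, task_name="yourbench_mcq")
print(doc.gold_index)  # 1, because "B" maps to the second choice
print(doc.query)       # instruction + question + lettered options + "Answer:"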

src/lighteval/metrics/utils/extractive_match_utils.py

Lines changed: 19 additions & 4 deletions
@@ -286,18 +286,22 @@ def lazy_indices_regex(
     translation_literal = TRANSLATION_LITERALS[language]
     # First get indices to predict
     indices = get_prefix(indices_config.prefix_for_extraction, translation_literal)[:len_choices]
-    indice_str_re = f"(?P<indices>{'|'.join([re.escape(i) for i in indices])})"
+    indices_escaped = [re.escape(i) for i in indices]
+    # We allow both (A) and A
+    indices_wrapped = [rf"(?:{i}|\({i}\))" for i in indices_escaped]
+    indice_str_re = f"(?P<indices>{'|'.join(indices_wrapped)})"
 
     # The answer keys are either surrounded with <space>**answer**., or '<space>answer.' or the same without the dot
     full_stop_re = rf"[{re.escape(translation_literal.full_stop)}\.]"
     comma_re = rf"[{re.escape(translation_literal.comma)}\,]"
     colon_re = rf"[{re.escape(translation_literal.colon)}\:]"
     space_re = re.escape(translation_literal.sentence_space)
 
-    answer_prefix_re = rf"(^|{space_re})(?:\*\*)?"
+    answer_prefix_re = rf"(?:^|{space_re})(?:\*\*)?"
     answer_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|$)"
     answer_re = f"{answer_prefix_re}{indice_str_re}{answer_suffix_re}"
     answer_re_start = rf"^(?:\*\*)?{indice_str_re}{answer_suffix_re}"
+    answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}{answer_suffix_re}"
 
     answer_word = f"(?i:{translation_literal.answer})"
 
@@ -320,8 +324,10 @@ def lazy_indices_regex(
             (f"{answer_word}{colon_re}.{{0,50}}?{answer_re}", 100),
             # Answer word patterns
             (f"{answer_word}.{{0,50}}?{answer_re}", 150),
-            # Start of line patterns
+            # Start of the string
             (answer_re_start, 200),
+            # Start of the line
+            (answer_re_line_start, 210),
         ]
     )
 
@@ -490,6 +496,15 @@ def extract_latex(
     return latex_exprs[0], latex_strs[0]
 
 
+def extract_indices(
+    match: re.Match, target_type: IndicesExtractionConfig, timeout_seconds: int
+) -> tuple[str | None, str]:
+    def normalize_index(index: str) -> str:
+        return index.replace("(", "").replace(")", "").strip()
+
+    return normalize_index(match.group("indices")), normalize_index(match.group("indices"))
+
+
 def extract_match(
     match: re.Match, target_type: ExtractionTarget, timeout_seconds: int
 ) -> tuple[Basic | MatrixBase | str | None, str]:
@@ -510,7 +525,7 @@ def extract_match(
     elif isinstance(target_type, ExprExtractionConfig):
         return extract_expr(match, timeout_seconds=timeout_seconds)
     elif isinstance(target_type, IndicesExtractionConfig):
-        return match.group("indices"), match.group("indices")
+        return extract_indices(match, target_type, timeout_seconds=timeout_seconds)
 
 
 def extract_target_from_pred(
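
To illustrate the regex change above (a standalone sketch, not lighteval code: the character classes that lazy_indices_regex derives from TRANSLATION_LITERALS are hard-coded here as plain ASCII), the new wrapping lets a choice letter match either bare or parenthesized, and the line-start pattern plus extract_indices recover "C" from "(C)" at the beginning of a line:

import re

# Simplified stand-ins for the pattern pieces built in lazy_indices_regex.
indices = ["A", "B", "C", "D"]
indices_wrapped = [rf"(?:{re.escape(i)}|\({re.escape(i)}\))" for i in indices]  # both "C" and "(C)"
indice_str_re = f"(?P<indices>{'|'.join(indices_wrapped)})"

answer_suffix_re = r"(?:\*\*)?(?:[.,:]|\s|$)"  # crude ASCII stand-in for the suffix pattern
answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}{answer_suffix_re}"

def normalize_index(index: str) -> str:
    # Same normalization as extract_indices: strip parentheses around the letter.
    return index.replace("(", "").replace(")", "").strip()

m = re.search(answer_re_line_start, "Alina and the answer is\n(C) Cecile")
print(normalize_index(m.group("indices")))  # -> C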

tests/metrics/test_extractive_match.py

Lines changed: 7 additions & 1 deletion
@@ -80,9 +80,15 @@ def compare_strings(
         # Test answer with reasoning
         ("B", "Let's think step by step. It's not A because it doesn't make sense, therefore I think it's B", 1),
         ("D", "The answer is for sure D, it can't be A or B", 1),
-        ("D", "The answer: D, doesn't makese nsense for answer to be A or B", 1),
+        ("D", "The answer: D, it doesn't make sense for it to be A or B", 1),
         # Test minimal answer format
         ("D", "D. it can't be A or B", 1),
+        ("(D) Alina", "D", 1),
+        ("(A) Cecile", "C", 0),
+        ("C Cecile", "C", 1),
+        ("Alina and the answer is\n(C) Cecile", "C", 1),
+        ("Alina and the answer is\nC Cecile", "C", 1),
+        ("A Peter\nCelina bum", "A", 1),
     ],
 )
 def test_extraction_abc(gold, pred, expected):
