Commit 4c3d414

alozowski and hynky1999 authored
Add MCQ support to Yourbench evaluation (#734)
Add MCQ support to Yourbench evaluation

Co-authored-by: Hynek Kydlíček <kydlicek.hynek@gmail.com>
1 parent d4f5d09 commit 4c3d414

File tree

3 files changed: 128 additions, 5 deletions
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import logging

from aenum import extend_enum

from lighteval.metrics.dynamic_metrics import multilingual_extractive_match_metric
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.utils.extractive_match_utils import IndicesExtractionConfig
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


logger = logging.getLogger(__name__)


ZEROSHOT_QA_INSTRUCTION = """
Answer the following multiple-choice question by selecting only one letter: A, B, C, or D. Do not explain your answer.
"""

ZEROSHOT_QA_USER_PROMPT = (
    ZEROSHOT_QA_INSTRUCTION
    + """
Question: {question}

Choices:
{options}

Answer:
"""
)


def yourbench_prompt(line, task_name: str = ""):
    options = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(line["choices"]))

    gold_raw = line["gold"][0]

    if isinstance(gold_raw, str) and gold_raw.strip().isalpha():
        gold_index = ord(gold_raw.strip().upper()) - ord("A")
    elif isinstance(gold_raw, int):
        gold_index = gold_raw
    else:
        raise ValueError(f"Unexpected gold label format: {gold_raw!r}")

    return Doc(
        instruction=ZEROSHOT_QA_INSTRUCTION,
        task_name=task_name,
        query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"], options=options),
        choices=line["choices"],
        gold_index=gold_index,
    )


yourbench_metrics = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    precision=6,
)

extend_enum(Metrics, "yourbench_metrics", yourbench_metrics)

yourbench_mcq = LightevalTaskConfig(
    name="HF_TASK_NAME",  # noqa: F821
    suite=["custom"],
    prompt_function=yourbench_prompt,
    hf_repo="HF_DATASET_NAME",  # noqa: F821
    hf_subset="lighteval",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=8192,
    metric=[Metrics.yourbench_metrics],
    trust_dataset=True,
    version=0,
)

TASKS_TABLE = [yourbench_mcq]
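
As a quick orientation (not part of the commit): the sketch below shows how a dataset row would flow through yourbench_prompt, assuming the module above is importable; the row and the task_name value are invented for illustration.

# Hypothetical row, shaped like the rows this task reads from its "lighteval" subset.
sample = {
    "question": "Which planet is known as the Red Planet?",
    "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
    "gold": ["B"],  # a letter label; an integer index such as 1 is also accepted
}

doc = yourbench_prompt(sample, task_name="yourbench_mcq")
print(doc.gold_index)  # 1, because "B" maps to the second choice
print(doc.query)       # instruction + question + lettered options + "Answer:"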

src/lighteval/metrics/utils/extractive_match_utils.py

Lines changed: 19 additions & 4 deletions
@@ -286,18 +286,22 @@ def lazy_indices_regex(
     translation_literal = TRANSLATION_LITERALS[language]
     # First get indices to predict
     indices = get_prefix(indices_config.prefix_for_extraction, translation_literal)[:len_choices]
-    indice_str_re = f"(?P<indices>{'|'.join([re.escape(i) for i in indices])})"
+    indices_escaped = [re.escape(i) for i in indices]
+    # We allow both (A) and A
+    indices_wrapped = [rf"(?:{i}|\({i}\))" for i in indices_escaped]
+    indice_str_re = f"(?P<indices>{'|'.join(indices_wrapped)})"
 
     # The answer keys are either surrounded with <space>**answer**., or '<space>answer.' or the same without the dot
     full_stop_re = rf"[{re.escape(translation_literal.full_stop)}\.]"
     comma_re = rf"[{re.escape(translation_literal.comma)}\,]"
     colon_re = rf"[{re.escape(translation_literal.colon)}\:]"
     space_re = re.escape(translation_literal.sentence_space)
 
-    answer_prefix_re = rf"(^|{space_re})(?:\*\*)?"
+    answer_prefix_re = rf"(?:^|{space_re})(?:\*\*)?"
     answer_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|$)"
     answer_re = f"{answer_prefix_re}{indice_str_re}{answer_suffix_re}"
     answer_re_start = rf"^(?:\*\*)?{indice_str_re}{answer_suffix_re}"
+    answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}{answer_suffix_re}"
 
     answer_word = f"(?i:{translation_literal.answer})"
 
@@ -320,8 +324,10 @@ def lazy_indices_regex(
             (f"{answer_word}{colon_re}.{{0,50}}?{answer_re}", 100),
             # Answer word patterns
             (f"{answer_word}.{{0,50}}?{answer_re}", 150),
-            # Start of line patterns
+            # Start of the string
             (answer_re_start, 200),
+            # Start of the line
+            (answer_re_line_start, 210),
         ]
     )
 
@@ -490,6 +496,15 @@ def extract_latex(
     return latex_exprs[0], latex_strs[0]
 
 
+def extract_indices(
+    match: re.Match, target_type: IndicesExtractionConfig, timeout_seconds: int
+) -> tuple[str | None, str]:
+    def normalize_index(index: str) -> str:
+        return index.replace("(", "").replace(")", "").strip()
+
+    return normalize_index(match.group("indices")), normalize_index(match.group("indices"))
+
+
 def extract_match(
     match: re.Match, target_type: ExtractionTarget, timeout_seconds: int
 ) -> tuple[Basic | MatrixBase | str | None, str]:
@@ -510,7 +525,7 @@ def extract_match(
     elif isinstance(target_type, ExprExtractionConfig):
         return extract_expr(match, timeout_seconds=timeout_seconds)
     elif isinstance(target_type, IndicesExtractionConfig):
-        return match.group("indices"), match.group("indices")
+        return extract_indices(match, target_type, timeout_seconds=timeout_seconds)
 
 
 def extract_target_from_pred(
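
To illustrate the regex change above (a standalone sketch, not lighteval code: the character classes that lazy_indices_regex derives from TRANSLATION_LITERALS are hard-coded here as plain ASCII), the new wrapping lets a choice letter match either bare or parenthesized, and the line-start pattern plus extract_indices recover "C" from "(C)" at the beginning of a line:

import re

# Simplified stand-ins for the pattern pieces built in lazy_indices_regex.
indices = ["A", "B", "C", "D"]
indices_wrapped = [rf"(?:{re.escape(i)}|\({re.escape(i)}\))" for i in indices]  # both "C" and "(C)"
indice_str_re = f"(?P<indices>{'|'.join(indices_wrapped)})"

answer_suffix_re = r"(?:\*\*)?(?:[.,:]|\s|$)"  # crude ASCII stand-in for the suffix pattern
answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}{answer_suffix_re}"

def normalize_index(index: str) -> str:
    # Same normalization as extract_indices: strip parentheses around the letter.
    return index.replace("(", "").replace(")", "").strip()

m = re.search(answer_re_line_start, "Alina and the answer is\n(C) Cecile")
print(normalize_index(m.group("indices")))  # -> C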

tests/metrics/test_extractive_match.py

Lines changed: 7 additions & 1 deletion
@@ -80,9 +80,15 @@ def compare_strings(
         # Test answer with reasoning
         ("B", "Let's think step by step. It's not A because it doesn't make sense, therefore I think it's B", 1),
         ("D", "The answer is for sure D, it can't be A or B", 1),
-        ("D", "The answer: D, doesn't makese nsense for answer to be A or B", 1),
+        ("D", "The answer: D, it doesn't make sense for it to be A or B", 1),
         # Test minimal answer format
         ("D", "D. it can't be A or B", 1),
+        ("(D) Alina", "D", 1),
+        ("(A) Cecile", "C", 0),
+        ("C Cecile", "C", 1),
+        ("Alina and the answer is\n(C) Cecile", "C", 1),
+        ("Alina and the answer is\nC Cecile", "C", 1),
+        ("A Peter\nCelina bum", "A", 1),
     ],
 )
 def test_extraction_abc(gold, pred, expected):
