
Adds More Generative tasks #694

Merged: 6 commits merged on May 16, 2025
Changes from all commits
14 changes: 7 additions & 7 deletions src/lighteval/metrics/dynamic_metrics.py
@@ -61,8 +61,8 @@ def loglikelihood_acc_metric(normalization: LogProbNormalization | None = None)
Creates an accuracy (loglikelihood) metric, which returns accuracy given normalization.
"""

- normalization_str = normalization.name if normalization else ""
- metric_name = f"acc_{normalization_str}"
+ normalization_str = f"_{normalization.name}" if normalization else ""
+ metric_name = f"acc{normalization_str}"
return SampleLevelMetric(
metric_name=metric_name,
sample_level_fn=LoglikelihoodAcc(logprob_normalization=normalization).compute,
@@ -83,8 +83,8 @@ def normalized_multi_choice_prob_metric(
Creates a normalized multi-choice probability metric, which returns the probability of the gold choice / sum of probabilities of all choices (after logprobs are normalized).
"""

- normalization_str = normalization.name if normalization else ""
- metric_name = "_".join(filter(None, ["normalized_mc_prob_", normalization_str]))
+ normalization_str = f"_{normalization.name}" if normalization else ""
+ metric_name = f"normalized_mc_prob{normalization_str}"

return SampleLevelMetric(
metric_name=metric_name,
@@ -108,8 +108,8 @@ def probability_metric(
Creates a probability metric, which returns the probability of the gold choice given normalization.
"""

- normalization_str = normalization.name if normalization else ""
- metric_name = "_".join(filter(None, ["prob", normalization_str]))
+ normalization_str = f"_{normalization.name}" if normalization else ""
+ metric_name = f"prob{normalization_str}"

return SampleLevelMetric(
metric_name=metric_name,
@@ -188,7 +188,7 @@ def multilingual_quasi_exact_match_metric(
def multilingual_extractive_match_metric(
language: Language = Language.ENGLISH,
gold_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),),
- pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),),
+ pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(), LatexExtractionConfig()),
aggregation_function: Callable[[list[float]], float] = max,
fallback_mode: Literal["no_fallback", "first_match"] = "first_match",
extraction_mode: Literal["first_match", "any_match"] = "any_match",
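The metric-name changes in this file move the separating underscore into the conditional f-string, so names no longer end in a stray underscore when no normalization is used. A minimal sketch of the before/after behavior, using plain strings instead of the LogProbNormalization enum (the old_name/new_name helpers are illustrative, not from the PR):

def old_name(normalization):
    # Old logic: the underscore is always baked into the template.
    normalization_str = normalization if normalization else ""
    return f"acc_{normalization_str}"

def new_name(normalization):
    # New logic: the underscore only appears together with a normalization suffix.
    normalization_str = f"_{normalization}" if normalization else ""
    return f"acc{normalization_str}"

assert old_name(None) == "acc_"          # trailing underscore: the behavior being fixed
assert new_name(None) == "acc"           # clean name when no normalization is given
assert new_name("token") == "acc_token"  # suffixed name is unchanged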
7 changes: 7 additions & 0 deletions src/lighteval/tasks/default_prompts.py
@@ -2774,3 +2774,10 @@ def xsum(line, task_name: str = None):
choices=[str(line["summary"])],
specific={"text": line["article"]},
)


# Utility for drop task
def get_drop_date(x):
components = [x["day"], x["month"], x["year"]]
components = list(filter(lambda x: x, components))
return " ".join(components)
139 changes: 116 additions & 23 deletions src/lighteval/tasks/default_tasks.py
@@ -22,6 +22,8 @@
import lighteval.tasks.default_prompts as prompt
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.templates.qa import get_qa_prompt_function
from lighteval.utils.language import Language


abstract_narrative_understanding_bigbench = LightevalTaskConfig(
@@ -6627,21 +6629,28 @@
trust_dataset=True,
version=0,
)
coqa_lighteval = LightevalTaskConfig(
coqa_first_question = LightevalTaskConfig(
Inline review thread on this line:

Member: For all evals in this file, either don't remove the original version (so we keep lighteval + yours), or keep the same name and change the version to 1.

Collaborator Author: The original version didn't make any sense; the consecutive questions can depend on one another, so this should really be the "first question" version. Same for drop, it was outright broken iirc.

Member: This is a matter of reproducibility/continuity in the eval suite: if people search for these tasks in the future they will have completely disappeared. If you just say your version is v1, it makes it easy for people to update and see what to replace things with.

Collaborator Author (@hynky1999, May 16, 2025): Fixed it. Can I merge @clefourrier?

Member: I must be blind, but I still see the name as coqa_first_question, not the original coqa_lighteval.

Member: Maybe I'm not looking at the correct commit though, so if this is fixed, feel free to merge.

Collaborator Author: That's just the variable name...

Member: Yes XD, minimal changes in PRs to avoid confusing the users, especially on evals they might be using.
name="coqa",
suite=["lighteval"],
prompt_function=prompt.coqa,
hf_repo="coqa",
prompt_function=get_qa_prompt_function(
Language.ENGLISH,
lambda line: {
"question": line["questions"][0],
"context": line["story"],
"choices": [line["answers"]["input_text"][0]],
},
),
suite=("lighteval",),
hf_repo="stanfordnlp/coqa",
hf_subset="default",
hf_avail_splits=["train", "validation"],
evaluation_splits=["validation"],
few_shots_split=None,
few_shots_select=None,
generation_size=10,
metric=[Metrics.perfect_exact_match, Metrics.f1_score],
stop_sequence=["\n"],
trust_dataset=True,
version=0,
stop_sequence=["\n", "Question:", "question:"],
generation_size=100,
version=1,
metric=(
Metrics.prefix_quasi_exact_match,
Metrics.f1_score_quasi,
),
)
coqa_bb_lighteval = LightevalTaskConfig(
name="coqa_bb",
@@ -6835,21 +6844,43 @@
trust_dataset=True,
version=0,
)
drop_lighteval = LightevalTaskConfig(
drop_qa = LightevalTaskConfig(
name="drop",
suite=["lighteval"],
prompt_function=prompt.drop,
prompt_function=get_qa_prompt_function(
Language.ENGLISH,
lambda line: {
"context": line["passage"],
"question": line["question"],
"choices": list(
filter(
lambda x: x,
[line["answer"].get("number")]
+ line["answer"]["spans"]
+ [prompt.get_drop_date(line["answer"].get("date"))],
)
),
},
),
suite=("lighteval",),
hf_repo="lighteval/drop_harness",
hf_subset="default",
hf_avail_splits=["train", "validation"],
evaluation_splits=["validation"],
hf_filter=lambda line: list(
filter(
lambda x: x,
[line["answer"].get("number")]
+ line["answer"]["spans"]
+ [prompt.get_drop_date(line["answer"].get("date"))],
)
),
evaluation_splits=("validation",),
few_shots_split="train",
few_shots_select="random_sampling_from_train",
generation_size=None,
metric=[Metrics.drop],
stop_sequence=["."],
trust_dataset=True,
version=0,
generation_size=250,
stop_sequence=["Question:", "question:", "\n"],
metric=(
Metrics.prefix_quasi_exact_match,
Metrics.f1_score_quasi,
),
version=1,
)
dyck_language_2_helm = LightevalTaskConfig(
name="dyck_language:2",
@@ -8581,6 +8612,27 @@
trust_dataset=True,
version=0,
)
jeopardy = LightevalTaskConfig(
name="jeopardy",
prompt_function=get_qa_prompt_function(
Language.ENGLISH,
lambda line: {
"question": line["question"],
"choices": [line["answer"]],
},
),
suite=("lighteval",),
hf_repo="openaccess-ai-collective/jeopardy",
hf_subset="default",
evaluation_splits=("train",),
few_shots_split="train",
generation_size=250,
stop_sequence=["\n", "Question:", "question:"],
metric=(
Metrics.prefix_quasi_exact_match,
Metrics.f1_score_quasi,
),
)
kanji_ascii_bigbench = LightevalTaskConfig(
name="kanji_ascii",
suite=["bigbench", "bigbench_json"],
@@ -13665,6 +13717,24 @@
trust_dataset=True,
version=0,
)
natural_questions = LightevalTaskConfig(
name="natural_questions",
prompt_function=get_qa_prompt_function(
Language.ENGLISH,
lambda line: {"question": line["question"], "choices": [line["answer"]]},
),
suite=("lighteval",),
hf_repo="lighteval/small_natural_questions",
hf_subset="default",
evaluation_splits=("test",),
few_shots_split="few_shot",
generation_size=250,
stop_sequence=["\n", "Question:", "question:"],
metric=(
Metrics.prefix_quasi_exact_match,
Metrics.f1_score_quasi,
),
)
navigate_bigbench = LightevalTaskConfig(
name="navigate",
suite=["bigbench", "bigbench_json"],
@@ -14885,7 +14955,7 @@
hf_subset="default",
hf_avail_splits=["test"],
evaluation_splits=["test"],
- few_shots_split=None,
+ few_shots_split="few_shot",
few_shots_select=None,
generation_size=2048,
metric=[Metrics.simpleqa_judge],
@@ -15074,6 +15144,29 @@
trust_dataset=True,
version=0,
)
squad_v2 = LightevalTaskConfig(
name="squad_v2",
prompt_function=get_qa_prompt_function(
Language.ENGLISH,
lambda line: {
"question": line["question"],
"context": line["context"],
"choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
},
),
suite=("lighteval",),
hf_repo="rajpurkar/squad_v2",
hf_subset="squad_v2",
hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0),
evaluation_splits=("validation",),
few_shots_split="train",
stop_sequence=["\n", "Question:", "question:"],
generation_size=200,
metric=(
Metrics.prefix_quasi_exact_match,
Metrics.f1_score_quasi,
),
)
storycloze_2016_lighteval = LightevalTaskConfig(
name="storycloze:2016",
suite=["lighteval", "storycloze"],
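The new QA tasks in this PR share one pattern: a row adapter passed to get_qa_prompt_function plus, where needed, an hf_filter that drops rows with no usable gold answer. A self-contained sketch of the squad_v2 logic from the diff, run on made-up rows (the row variables and the keep/adapt helpers are illustrative only, not lighteval code):

row_answerable = {
    "question": "Where is the Eiffel Tower?",
    "context": "The Eiffel Tower is in Paris.",
    "answers": {"text": ["Paris", "in Paris"]},
}
row_unanswerable = {
    "question": "Who tore the tower down?",
    "context": "The Eiffel Tower is in Paris.",
    "answers": {"text": []},  # SQuAD v2 unanswerable questions have no gold spans
}

# Same predicate as the hf_filter in the config: keep only answerable rows.
keep = lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0)

# Same mapping as the prompt_function adapter: every non-empty gold string is a valid target.
adapt = lambda line: {
    "question": line["question"],
    "context": line["context"],
    "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
}

assert keep(row_answerable) and not keep(row_unanswerable)
print(adapt(row_answerable)["choices"])  # ['Paris', 'in Paris']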