@@ -22,6 +22,8 @@
 import lighteval.tasks.default_prompts as prompt
 from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
 
 
 abstract_narrative_understanding_bigbench = LightevalTaskConfig(
@@ -6627,21 +6629,28 @@
     trust_dataset=True,
     version=0,
 )
-coqa_lighteval = LightevalTaskConfig(
+coqa_first_question = LightevalTaskConfig(
     name="coqa",
-    suite=["lighteval"],
-    prompt_function=prompt.coqa,
-    hf_repo="coqa",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "question": line["questions"][0],
+            "context": line["story"],
+            "choices": [line["answers"]["input_text"][0]],
+        },
+    ),
+    suite=("lighteval",),
+    hf_repo="stanfordnlp/coqa",
     hf_subset="default",
     hf_avail_splits=["train", "validation"],
     evaluation_splits=["validation"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=10,
-    metric=[Metrics.perfect_exact_match, Metrics.f1_score],
-    stop_sequence=["\n"],
-    trust_dataset=True,
-    version=0,
+    stop_sequence=["\n", "Question:", "question:"],
+    generation_size=100,
+    version=1,
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
 )
 coqa_bb_lighteval = LightevalTaskConfig(
     name="coqa_bb",
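
As I read the new coqa_first_question config, the lambda passed to get_qa_prompt_function keeps only the first turn of each CoQA conversation (hence the rename). A minimal sketch of that row-to-fields mapping, using a made-up row with the field names from the diff above:

# Hypothetical CoQA-style row; real rows come from the stanfordnlp/coqa dataset.
row = {
    "story": "Ada Lovelace published the first algorithm intended for a machine.",
    "questions": ["Who published the first algorithm?", "For what kind of device?"],
    "answers": {"input_text": ["Ada Lovelace", "a machine"]},
}

# Same mapping as the lambda in the config: only the first question/answer pair is kept.
fields = {
    "question": row["questions"][0],
    "context": row["story"],
    "choices": [row["answers"]["input_text"][0]],
}
print(fields["question"])  # Who published the first algorithm?
print(fields["choices"])   # ['Ada Lovelace']
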
@@ -6835,21 +6844,43 @@
     trust_dataset=True,
     version=0,
 )
-drop_lighteval = LightevalTaskConfig(
+drop_qa = LightevalTaskConfig(
     name="drop",
-    suite=["lighteval"],
-    prompt_function=prompt.drop,
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "context": line["passage"],
+            "question": line["question"],
+            "choices": list(
+                filter(
+                    lambda x: x,
+                    [line["answer"].get("number")]
+                    + line["answer"]["spans"]
+                    + [prompt.get_drop_date(line["answer"].get("date"))],
+                )
+            ),
+        },
+    ),
+    suite=("lighteval",),
     hf_repo="lighteval/drop_harness",
     hf_subset="default",
-    hf_avail_splits=["train", "validation"],
-    evaluation_splits=["validation"],
+    hf_filter=lambda line: list(
+        filter(
+            lambda x: x,
+            [line["answer"].get("number")]
+            + line["answer"]["spans"]
+            + [prompt.get_drop_date(line["answer"].get("date"))],
+        )
+    ),
+    evaluation_splits=("validation",),
     few_shots_split="train",
-    few_shots_select="random_sampling_from_train",
-    generation_size=None,
-    metric=[Metrics.drop],
-    stop_sequence=["."],
-    trust_dataset=True,
-    version=0,
+    generation_size=250,
+    stop_sequence=["Question:", "question:", "\n"],
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+    version=1,
 )
 dyck_language_2_helm = LightevalTaskConfig(
     name="dyck_language:2",
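
The same answer-flattening expression appears twice in drop_qa: once in the prompt lambda to build the gold choices, and once as hf_filter, which (as I read it) drops any row whose answer has no number, no spans, and no usable date. A sketch of that logic on a made-up row; the inline date formatter is only a stand-in for prompt.get_drop_date, whose exact formatting is not reproduced here:

# Hypothetical DROP-style row: the gold answer is a single span, no number, no date.
row = {"answer": {"number": "", "spans": ["Carolina Panthers"], "date": {"day": "", "month": "", "year": ""}}}

# Stand-in for prompt.get_drop_date.
format_date = lambda date: " ".join(v for v in (date or {}).values() if v)

golds = list(
    filter(
        lambda x: x,
        [row["answer"].get("number")] + row["answer"]["spans"] + [format_date(row["answer"].get("date"))],
    )
)
print(golds)        # ['Carolina Panthers']
print(bool(golds))  # True, so the row would survive the hf_filter
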
@@ -8581,6 +8612,27 @@
     trust_dataset=True,
     version=0,
 )
+jeopardy = LightevalTaskConfig(
+    name="jeopardy",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "question": line["question"],
+            "choices": [line["answer"]],
+        },
+    ),
+    suite=("lighteval",),
+    hf_repo="openaccess-ai-collective/jeopardy",
+    hf_subset="default",
+    evaluation_splits=("train",),
+    few_shots_split="train",
+    generation_size=250,
+    stop_sequence=["\n", "Question:", "question:"],
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+)
 kanji_ascii_bigbench = LightevalTaskConfig(
     name="kanji_ascii",
     suite=["bigbench", "bigbench_json"],
@@ -13665,6 +13717,24 @@
     trust_dataset=True,
     version=0,
 )
+natural_questions = LightevalTaskConfig(
+    name="natural_questions",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {"question": line["question"], "choices": [line["answer"]]},
+    ),
+    suite=("lighteval",),
+    hf_repo="lighteval/small_natural_questions",
+    hf_subset="default",
+    evaluation_splits=("test",),
+    few_shots_split="few_shot",
+    generation_size=250,
+    stop_sequence=["\n", "Question:", "question:"],
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+)
 navigate_bigbench = LightevalTaskConfig(
     name="navigate",
     suite=["bigbench", "bigbench_json"],
@@ -14885,7 +14955,7 @@
     hf_subset="default",
     hf_avail_splits=["test"],
     evaluation_splits=["test"],
-    few_shots_split=None,
+    few_shots_split="few_shot",
     few_shots_select=None,
     generation_size=2048,
     metric=[Metrics.simpleqa_judge],
@@ -15074,6 +15144,29 @@
     trust_dataset=True,
     version=0,
 )
+squad_v2 = LightevalTaskConfig(
+    name="squad_v2",
+    prompt_function=get_qa_prompt_function(
+        Language.ENGLISH,
+        lambda line: {
+            "question": line["question"],
+            "context": line["context"],
+            "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+        },
+    ),
+    suite=("lighteval",),
+    hf_repo="rajpurkar/squad_v2",
+    hf_subset="squad_v2",
+    hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0),
+    evaluation_splits=("validation",),
+    few_shots_split="train",
+    stop_sequence=["\n", "Question:", "question:"],
+    generation_size=200,
+    metric=(
+        Metrics.prefix_quasi_exact_match,
+        Metrics.f1_score_quasi,
+    ),
+)
 storycloze_2016_lighteval = LightevalTaskConfig(
     name="storycloze:2016",
     suite=["lighteval", "storycloze"],
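
SQuAD v2 mixes in unanswerable questions whose answers["text"] list is empty, so the new hf_filter keeps only rows with at least one non-empty answer string and the prompt lambda never sees an empty choices list. A minimal sketch of that predicate on two made-up rows:

# Two hypothetical SQuAD v2 rows: one answerable, one unanswerable (empty answer list).
answerable = {"answers": {"text": ["Denver Broncos"], "answer_start": [177]}}
unanswerable = {"answers": {"text": [], "answer_start": []}}

# Same predicate as the hf_filter above.
keep = lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0)
print(keep(answerable))    # True
print(keep(unanswerable))  # False
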