-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add multilignaul dynamic generative metrics * draft * finish multichoice config * update tokenizers + install nltk reqs * use punkt tab * Update src/lighteval/utils/imports.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * Update src/lighteval/metrics/normalizations.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * fix imports * remove unused import * finish implementation of templates + move stuff around * resolve nits * when in rome do as romans do (handle error messages the same way) * fix utils * nicers tests + fix them * nicer todo * add nice doscrings 📃 * add even more docstring * nit * fix test * add multilingual to dev group * merge nli, add languagees to literals * translation literals * add nli * add rcb + chinese nli * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * add two new tasks + docs --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Hynek Kydlicek <kydliceh.hynek@gmail.com> Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
- Loading branch information
1 parent
170ed87
commit 551572a
Showing
1 changed file
with
324 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,324 @@ | ||
# MIT License | ||
|
||
# Copyright (c) 2024 The HuggingFace Team | ||
|
||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
|
||
# The above copyright notice and this permission notice shall be included in all | ||
# copies or substantial portions of the Software. | ||
|
||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
# SOFTWARE. | ||
|
||
from langcodes import Language as LangCodeLanguage | ||
from langcodes import standardize_tag | ||
|
||
from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric | ||
from lighteval.metrics.normalizations import LogProbTokenNorm | ||
from lighteval.tasks.lighteval_task import LightevalTaskConfig | ||
from lighteval.tasks.templates.nli import get_nli_prompt_function | ||
from lighteval.tasks.templates.utils.formulation import ( | ||
CFFormulation, | ||
HybridFormulation, | ||
MCFFormulation, | ||
) | ||
from lighteval.utils.language import Language | ||
|
||
|
||
# ------------------------------- NLI Tasks ------------------------------- # | ||
# NLI (Natural Language Inference) tasks involve determining the logical relationship | ||
# between two given sentences: a premise and a hypothesis. The goal is to classify | ||
# whether the hypothesis is entailed by, contradicts, or is neutral with respect to | ||
# the premise. After our inspection we found the neutral label to be quite ambiguous
# and decided to exclude it. You can easily add it back by extending the adapters
# (gold_idx mapping), the `relations` lists and the `hf_filter` predicates below.
|
||
|
||
# The XNLI dataset is a multilingual variant of MultiNLI
# https://aclanthology.org/D18-1269/
#
# One task config is generated per (language, formulation) pair; the task name
# encodes both, e.g. "xnli_<language.value>_<formulation name lowercased>".
xnli_tasks = [
    LightevalTaskConfig(
        name=f"xnli_{language.value}_{formulation.name.lower()}",
        suite=["lighteval"],
        metric=[loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
        prompt_function=get_nli_prompt_function(
            language=language,
            adapter=lambda line: {
                "premise": line["premise"],
                "hypothesis": line["hypothesis"],
                # hf_filter only keeps labels 0 and 2 (the neutral class is
                # ignored); remap them to positions in `relations`.
                "gold_idx": {0: 0, 2: 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        # Drop every row whose label is not one of the two kept classes.
        hf_filter=lambda line: line["label"] in [0, 2],
        hf_repo="facebook/xnli",
        hf_subset=standardize_tag(language.value),
        evaluation_splits=["validation"],
        few_shots_split="train",
    )
    # Each language must appear exactly once: a repeated entry would create a
    # second set of task configs under identical names.
    for language in [
        Language.ARABIC,
        Language.ENGLISH,
        Language.FRENCH,
        Language.SPANISH,
        Language.BULGARIAN,
        Language.GERMAN,
        Language.GREEK,
        Language.HINDI,
        Language.RUSSIAN,
        Language.SWAHILI,
        Language.THAI,
        Language.TURKISH,
        Language.URDU,
        Language.VIETNAMESE,
        Language.CHINESE,
    ]
    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
|
||
# Improvement on XNLI with better translation, from our experience models tend to
# perform better on XNLI2.0 than XNLI
# https://arxiv.org/abs/2301.06527
#
# One task config is generated per (language, formulation) pair. Each language
# lives in its own hub repository whose name ends with the lowercased English
# language name produced by langcodes.
xnli2_tasks = [
    LightevalTaskConfig(
        name=f"xnli2.0_{language.value}_{formulation.name.lower()}",
        suite=["lighteval"],
        metric=[loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
        prompt_function=get_nli_prompt_function(
            language=language,
            adapter=lambda line: {
                "premise": line["premise"],
                "hypothesis": line["hypothesis"],
                # hf_filter only keeps labels 0 and 2 (the neutral class is
                # ignored); remap them to positions in `relations`.
                "gold_idx": {0: 0, 2: 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        # Drop every row whose label is not one of the two kept classes.
        hf_filter=lambda line: line["label"] in [0, 2],
        hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}",
        hf_subset="default",
        # The repositories are named "..._train_..."; the train split is what gets evaluated.
        evaluation_splits=["train"],
    )
    # Each language must appear exactly once: a repeated entry would create a
    # second set of task configs under identical names.
    for language in [
        Language.ENGLISH,
        Language.FRENCH,
        Language.PUNJABI,
        Language.GUJARATI,
        Language.KANNADA,
        Language.ASSAMESE,
        Language.BENGALI,
        Language.MARATHI,
        Language.SANSKRIT,
        Language.TAMIL,
        Language.GERMAN,
        Language.URDU,
        Language.VIETNAMESE,
        Language.TURKISH,
        Language.THAI,
        Language.SWAHILI,
        Language.SPANISH,
        Language.RUSSIAN,
        Language.HINDI,
        Language.GREEK,
        Language.CHINESE,
        Language.BULGARIAN,
        Language.ARABIC,
        # Theoretically also: Bhojpuri, Odiya (Gujarati is already listed above)
    ]
    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
|
||
# IndicXNLI: an XNLI variant focused on Indic languages.
# https://arxiv.org/abs/2204.08776


def _indic_xnli_adapter(line):
    """Turn one dataset row into the fields consumed by the NLI prompt function.

    Only rows labelled 0 or 2 are expected here (the task's ``hf_filter``
    discards the rest, i.e. the ignored neutral class); those labels are
    remapped to indices 0 and 1 of the task's ``relations`` list.
    """
    gold_by_label = {0: 0, 2: 1}
    return {
        "premise": line["premise"],
        "hypothesis": line["hypothesis"],
        "gold_idx": gold_by_label[line["label"]],
    }


# One task config per (language, formulation) pair.
xnli_indic_tasks = [
    LightevalTaskConfig(
        name=f"indicnxnli_{language.value}_{formulation.name.lower()}",
        suite=["lighteval"],
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
        prompt_function=get_nli_prompt_function(
            language=language,
            adapter=_indic_xnli_adapter,
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        hf_repo="Divyanshu/indicxnli",
        hf_subset=standardize_tag(language.value),
        # Keep only the two classes we score; neutral rows are dropped.
        hf_filter=lambda x: int(x["label"]) in [0, 2],
        evaluation_splits=["validation"],
        few_shots_split="train",
        few_shots_select=None,
        generation_size=-1,
    )
    for language in (
        Language.ASSAMESE,
        Language.BENGALI,
        Language.GUJARATI,
        Language.HINDI,
        Language.KANNADA,
        Language.MALAYALAM,
        Language.MARATHI,
        Language.ORIYA,
        Language.PUNJABI,
        Language.TAMIL,
        Language.TELUGU,
    )
    for formulation in (MCFFormulation(), CFFormulation(), HybridFormulation())
]
|
||
# PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification
# This dataset contains paraphrase identification pairs in multiple languages.
# It's derived from PAWS (Paraphrase Adversaries from Word Scrambling).
# We treat paraphrase as entailment and non-paraphrase as contradiction
# https://arxiv.org/abs/1908.11828
#
# One task config is generated per (language, formulation) pair.
paws_x_tasks = [
    LightevalTaskConfig(
        name=f"pawsx_{language.value}_{formulation.name.lower()}",
        suite=("lighteval",),
        prompt_function=get_nli_prompt_function(
            language=language,
            adapter=lambda line: {
                "premise": line["sentence1"],
                "hypothesis": line["sentence2"],
                # PAWS-X is binary: label 1 = paraphrase, 0 = not a paraphrase.
                # `gold_idx` indexes into `relations`, so paraphrase (1) must map
                # to index 0 ("entailment") and non-paraphrase (0) to index 1
                # ("contradiction"); passing the raw label through would invert
                # the gold answer.
                "gold_idx": {1: 0, 0: 1}[int(line["label"])],
            },
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        hf_repo="google-research-datasets/paws-x",
        hf_subset=standardize_tag(language.value),
        evaluation_splits=("test",),
        few_shots_split="train",
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
    )
    for language in [
        Language.GERMAN,
        Language.ENGLISH,
        Language.SPANISH,
        Language.FRENCH,
        Language.JAPANESE,
        Language.KOREAN,
        Language.CHINESE,
    ]
    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
|
||
# Russian Commitment Bank (RCB): a large-scale Russian NLI dataset whose sentences
# were collected from the web and through crowdsourcing.
# https://arxiv.org/abs/2401.04531


def _rcb_adapter(line):
    """Turn one RCB row into the fields consumed by the NLI prompt function.

    The premise and hypothesis are nested under ``inputs``. ``outputs`` is
    restricted to 1 or 2 by the task's ``hf_filter`` (the neutral class is
    ignored), so subtracting one yields a 0-based index into ``relations``.
    """
    fields = line["inputs"]
    return {
        "premise": fields["premise"],
        "hypothesis": fields["hypothesis"],
        "gold_idx": int(line["outputs"]) - 1,
    }


# One task config per formulation.
rcb_tasks = [
    LightevalTaskConfig(
        name=f"rcb_{Language.RUSSIAN.value}_{formulation.name.lower()}",
        suite=("lighteval",),
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
        prompt_function=get_nli_prompt_function(
            language=Language.RUSSIAN,
            adapter=_rcb_adapter,
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        hf_repo="ai-forever/MERA",
        hf_subset="rcb",
        # Keep only outputs 1 and 2 (neutral is ignored); a falsy `outputs`
        # value is treated as "0" and therefore dropped as well.
        hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2],
        evaluation_splits=("train", "validation"),
    )
    for formulation in (MCFFormulation(), CFFormulation(), HybridFormulation())
]
|
||
# Native Chinese NLI dataset (OCNLI).
# https://arxiv.org/pdf/2010.05444
# We find this benchmark to have really good signal compared to other Chinese NLI


def _ocnli_adapter(line):
    """Turn one OCNLI row into the fields consumed by the NLI prompt function.

    Only rows labelled 1 or 2 are expected here (the task's ``hf_filter``
    discards the rest, i.e. the ignored neutral class); those labels are
    remapped to indices 0 and 1 of the task's ``relations`` list.
    """
    gold_by_label = {1: 0, 2: 1}
    return {
        "premise": line["sentence1"],
        "hypothesis": line["sentence2"],
        "gold_idx": gold_by_label[line["label"]],
    }


# One task config per formulation.
ocnli_tasks = [
    LightevalTaskConfig(
        name=f"ocnli_{Language.CHINESE.value}_{formulation.name.lower()}",
        suite=("lighteval",),
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
        prompt_function=get_nli_prompt_function(
            language=Language.CHINESE,
            adapter=_ocnli_adapter,
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        hf_repo="clue/clue",
        hf_subset="ocnli",
        # Only keep the positive and negative examples
        hf_filter=lambda x: int(x["label"]) in [1, 2],
        evaluation_splits=("validation",),
        few_shots_split="train",
    )
    for formulation in (MCFFormulation(), CFFormulation(), HybridFormulation())
]
|
||
# CMNLI: Chinese NLI dataset based on the MNLI approach (machine translated).
# https://arxiv.org/abs/2004.05986


def _cmnli_adapter(line):
    """Turn one CMNLI row into the fields consumed by the NLI prompt function.

    Labels are strings in this dataset. Only "entailment" and "contradiction"
    rows are expected here (the task's ``hf_filter`` discards the rest, i.e.
    the ignored neutral class); they are mapped to indices 0 and 1 of the
    task's ``relations`` list.
    """
    gold_by_label = {"entailment": 0, "contradiction": 1}
    return {
        "premise": line["sentence1"],
        "hypothesis": line["sentence2"],
        "gold_idx": gold_by_label[line["label"]],
    }


# One task config per formulation.
cmnli_tasks = [
    LightevalTaskConfig(
        name=f"cmnli_{Language.CHINESE.value}_{formulation.name.lower()}",
        suite=("lighteval",),
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
        prompt_function=get_nli_prompt_function(
            language=Language.CHINESE,
            adapter=_cmnli_adapter,
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        hf_repo="fenffef/cmnli",
        hf_subset="default",
        # Only keep the positive and negative examples
        hf_filter=lambda x: x["label"] in ["entailment", "contradiction"],
        evaluation_splits=("validation",),
        few_shots_split="train",
    )
    for formulation in (MCFFormulation(), CFFormulation(), HybridFormulation())
]
|
||
|
||
# Flat registry of every task config defined in this module, in definition order.
TASKS_TABLE = [
    *xnli_tasks,
    *xnli2_tasks,
    *xnli_indic_tasks,
    *paws_x_tasks,
    *rcb_tasks,
    *ocnli_tasks,
    *cmnli_tasks,
]