Skip to content

Commit 551572a

Browse files
hynky1999, NathanHB (Hynek Kydlicek, Nathan Habib), clefourrier
authored
Multilingual NLI Tasks (#329)
* add multilignaul dynamic generative metrics * draft * finish multichoice config * update tokenizers + install nltk reqs * use punkt tab * Update src/lighteval/utils/imports.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * Update src/lighteval/metrics/normalizations.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * fix imports * remove unused import * finish implementation of templates + move stuff around * resolve nits * when in rome do as romans do (handle error messages the same way) * fix utils * nicers tests + fix them * nicer todo * add nice doscrings 📃 * add even more docstring * nit * fix test * add multilingual to dev group * merge nli, add languagees to literals * translation literals * add nli * add rcb + chinese nli * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * add two new tasks + docs --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Hynek Kydlicek <kydliceh.hynek@gmail.com> Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
1 parent 170ed87 commit 551572a

File tree

1 file changed

+324
-0
lines changed
  • src/lighteval/tasks/multilingual

1 file changed

+324
-0
lines changed
Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
# MIT License
2+
3+
# Copyright (c) 2024 The HuggingFace Team
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
from langcodes import Language as LangCodeLanguage
24+
from langcodes import standardize_tag
25+
26+
from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric
27+
from lighteval.metrics.normalizations import LogProbTokenNorm
28+
from lighteval.tasks.lighteval_task import LightevalTaskConfig
29+
from lighteval.tasks.templates.nli import get_nli_prompt_function
30+
from lighteval.tasks.templates.utils.formulation import (
31+
CFFormulation,
32+
HybridFormulation,
33+
MCFFormulation,
34+
)
35+
from lighteval.utils.language import Language
36+
37+
38+
# ------------------------------- NLI Tasks ------------------------------- #
39+
# NLI (Natural Language Inference) tasks involve determining the logical relationship
40+
# between two given sentences: a premise and a hypothesis. The goal is to classify
41+
# whether the hypothesis is entailed by, contradicts, or is neutral with respect to
42+
# the premise. After our inspection we found the neutral label to be quite ambiguous
43+
# and decided to exclude it. But you can easily add it by modifying the adapters
44+
45+
46+
# The XNLI dataset is a multilingual variant of MultiNLI
47+
# https://aclanthology.org/D18-1269/
48+
xnli_tasks = [
    LightevalTaskConfig(
        name=f"xnli_{language.value}_{formulation.name.lower()}",
        suite=["lighteval"],
        metric=[loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
        prompt_function=get_nli_prompt_function(
            language=language,
            adapter=lambda line: {
                "premise": line["premise"],
                "hypothesis": line["hypothesis"],
                # Neutral (label 1) is excluded, so remap entailment 0 -> 0 and
                # contradiction 2 -> 1 to index into `relations` below.
                "gold_idx": {0: 0, 2: 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        # Drop neutral (label == 1) examples entirely.
        hf_filter=lambda line: line["label"] in [0, 2],
        hf_repo="facebook/xnli",
        hf_subset=standardize_tag(language.value),
        evaluation_splits=["validation"],
        few_shots_split="train",
    )
    # The 15 XNLI languages, each listed once. (The previous list repeated
    # ENGLISH and FRENCH, which generated duplicate task configs — and
    # colliding task names — for every formulation.)
    for language in [
        Language.ARABIC,
        Language.ENGLISH,
        Language.FRENCH,
        Language.SPANISH,
        Language.BULGARIAN,
        Language.GERMAN,
        Language.GREEK,
        Language.HINDI,
        Language.RUSSIAN,
        Language.SWAHILI,
        Language.THAI,
        Language.TURKISH,
        Language.URDU,
        Language.VIETNAMESE,
        Language.CHINESE,
    ]
    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
91+
92+
# Improvement on XNLI with better translation, from our experience models tend to
93+
# perform better on XNLI2.0 than XNLI
94+
# https://arxiv.org/abs/2301.06527
95+
xnli2_tasks = [
    LightevalTaskConfig(
        name=f"xnli2.0_{language.value}_{formulation.name.lower()}",
        suite=["lighteval"],
        metric=[loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
        prompt_function=get_nli_prompt_function(
            language=language,
            adapter=lambda line: {
                "premise": line["premise"],
                "hypothesis": line["hypothesis"],
                # Neutral (label 1) is excluded, so remap entailment 0 -> 0 and
                # contradiction 2 -> 1 to index into `relations` below.
                "gold_idx": {0: 0, 2: 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=formulation,
        ),
        # Drop neutral (label == 1) examples entirely.
        hf_filter=lambda line: line["label"] in [0, 2],
        # XNLI 2.0 publishes one repo per language, named by the English
        # name of the language (e.g. "french", "hindi").
        hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}",
        hf_subset="default",
        evaluation_splits=["train"],
    )
    # Each language listed once. (The previous list repeated ENGLISH, which
    # generated duplicate task configs per formulation.)
    for language in [
        Language.ENGLISH,
        Language.FRENCH,
        Language.PUNJABI,
        Language.GUJARATI,
        Language.KANNADA,
        Language.ASSAMESE,
        Language.BENGALI,
        Language.MARATHI,
        Language.SANSKRIT,
        Language.TAMIL,
        Language.GERMAN,
        Language.URDU,
        Language.VIETNAMESE,
        Language.TURKISH,
        Language.THAI,
        Language.SWAHILI,
        Language.SPANISH,
        Language.RUSSIAN,
        Language.HINDI,
        Language.GREEK,
        Language.CHINESE,
        Language.BULGARIAN,
        Language.ARABIC,
        # Theoretically also: Bhojpuri, Odiya
    ]
    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
145+
146+
# Another variant of XNLI, with emphasis on Indic languages
147+
# https://arxiv.org/abs/2204.08776
148+
xnli_indic_tasks = [
    LightevalTaskConfig(
        name=f"indicnxnli_{lang.value}_{form.name.lower()}",
        suite=["lighteval"],
        prompt_function=get_nli_prompt_function(
            language=lang,
            adapter=lambda line: {
                "premise": line["premise"],
                "hypothesis": line["hypothesis"],
                # Neutral (label 1) is excluded; remap 0 -> 0 and 2 -> 1 so
                # gold_idx indexes into `relations` below.
                "gold_idx": {0: 0, 2: 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=form,
        ),
        hf_repo="Divyanshu/indicxnli",
        hf_subset=standardize_tag(lang.value),
        # Keep only entailment (0) and contradiction (2) rows.
        hf_filter=lambda x: int(x["label"]) in [0, 2],
        evaluation_splits=["validation"],
        few_shots_split="train",
        few_shots_select=None,
        generation_size=-1,
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
    )
    for lang in [
        Language.ASSAMESE,
        Language.BENGALI,
        Language.GUJARATI,
        Language.HINDI,
        Language.KANNADA,
        Language.MALAYALAM,
        Language.MARATHI,
        Language.ORIYA,
        Language.PUNJABI,
        Language.TAMIL,
        Language.TELUGU,
    ]
    for form in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
190+
191+
# PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification
192+
# This dataset contains paraphrase identification pairs in multiple languages.
193+
# It's derived from PAWS (Paraphrase Adversaries from Word Scrambling) and
194+
# We treat paraphrase as entailment and non-paraphrase as contradiction
195+
# https://arxiv.org/abs/1908.11828
196+
197+
paws_x_tasks = [
    LightevalTaskConfig(
        name=f"pawsx_{lang.value}_{form.name.lower()}",
        suite=("lighteval",),
        prompt_function=get_nli_prompt_function(
            language=lang,
            adapter=lambda line: {
                "premise": line["sentence1"],
                "hypothesis": line["sentence2"],
                # NOTE(review): with relations ["entailment", "contradiction"],
                # gold_idx = int(label) sends label 1 to "contradiction". PAWS-X
                # uses label 1 for the paraphrase pair, which the section comment
                # says should be treated as entailment — confirm this mapping
                # (or the relations ordering) is intended.
                "gold_idx": int(line["label"]),
            },
            relations=["entailment", "contradiction"],
            formulation=form,
        ),
        hf_repo="google-research-datasets/paws-x",
        hf_subset=standardize_tag(lang.value),
        evaluation_splits=("test",),
        few_shots_split="train",
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
    )
    for lang in [
        Language.GERMAN,
        Language.ENGLISH,
        Language.SPANISH,
        Language.FRENCH,
        Language.JAPANESE,
        Language.KOREAN,
        Language.CHINESE,
    ]
    for form in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
231+
232+
# Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian sentences,
233+
# collected from the web and crowdsourcing.
234+
# https://arxiv.org/abs/2401.04531
235+
rcb_tasks = [
    LightevalTaskConfig(
        name=f"rcb_{Language.RUSSIAN.value}_{form.name.lower()}",
        suite=("lighteval",),
        prompt_function=get_nli_prompt_function(
            language=Language.RUSSIAN,
            adapter=lambda line: {
                "premise": line["inputs"]["premise"],
                "hypothesis": line["inputs"]["hypothesis"],
                # Labels are 1-based strings; shift to 0-based so that
                # "1" -> relations[0] and "2" -> relations[1].
                "gold_idx": int(line["outputs"]) - 1,
            },
            relations=["entailment", "contradiction"],
            formulation=form,
        ),
        hf_repo="ai-forever/MERA",
        hf_subset="rcb",
        # `outputs` can be empty — fall back to "0" so such rows are dropped
        # along with the neutral label.
        hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2],
        evaluation_splits=("train", "validation"),
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
    )
    for form in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
261+
262+
# Native Chinese NLI dataset based.
263+
# https://arxiv.org/pdf/2010.05444
264+
# We find this benchmark to have really good signal compared to other Chinese NLI
265+
ocnli_tasks = [
    LightevalTaskConfig(
        name=f"ocnli_{Language.CHINESE.value}_{form.name.lower()}",
        suite=("lighteval",),
        prompt_function=get_nli_prompt_function(
            language=Language.CHINESE,
            adapter=lambda line: {
                "premise": line["sentence1"],
                "hypothesis": line["sentence2"],
                # Map label 1 -> relations[0] and label 2 -> relations[1];
                # the neutral label is excluded by the filter below.
                "gold_idx": {1: 0, 2: 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=form,
        ),
        hf_repo="clue/clue",
        hf_subset="ocnli",
        # Keep only labels 1 and 2.
        hf_filter=lambda x: int(x["label"]) in [1, 2],
        evaluation_splits=("validation",),
        few_shots_split="train",
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
    )
    for form in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
292+
293+
# https://arxiv.org/abs/2004.05986
294+
# Native Chinese NLI dataset based on MNLI approach (Machine Translated)
295+
cmnli_tasks = [
    LightevalTaskConfig(
        name=f"cmnli_{Language.CHINESE.value}_{form.name.lower()}",
        suite=("lighteval",),
        prompt_function=get_nli_prompt_function(
            language=Language.CHINESE,
            adapter=lambda line: {
                "premise": line["sentence1"],
                "hypothesis": line["sentence2"],
                # Labels here are strings; map them onto the positions of
                # `relations` below. Neutral rows are filtered out.
                "gold_idx": {"entailment": 0, "contradiction": 1}[line["label"]],
            },
            relations=["entailment", "contradiction"],
            formulation=form,
        ),
        hf_repo="fenffef/cmnli",
        hf_subset="default",
        # Keep only entailment/contradiction rows.
        hf_filter=lambda x: x["label"] in ["entailment", "contradiction"],
        evaluation_splits=("validation",),
        few_shots_split="train",
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
        ],
    )
    for form in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
322+
323+
324+
TASKS_TABLE = [*xnli_tasks, *xnli2_tasks, *xnli_indic_tasks, *paws_x_tasks, *rcb_tasks, *ocnli_tasks, *cmnli_tasks]

0 commit comments

Comments
 (0)