Multilingual NLI Tasks (#329)

hynky1999 · NathanHB · Hynek Kydlicek · web-flow · commit 551572ae66aa · 2024-09-30T20:13:41.000+02:00
* add multilignaul dynamic generative metrics

* draft

* finish multichoice config

* update tokenizers + install nltk reqs

* use punkt tab

* Update src/lighteval/utils/imports.py

Co-authored-by: Nathan Habib &lt;30601243+NathanHB@users.noreply.github.com&gt;

* Update src/lighteval/metrics/normalizations.py

Co-authored-by: Nathan Habib &lt;30601243+NathanHB@users.noreply.github.com&gt;

* fix imports

* remove unused import

* finish implementation of templates + move stuff around

* resolve nits

* when in rome do as romans do (handle error messages the same way)

* fix utils

* nicers tests + fix them

* nicer todo

* add nice doscrings 📃

* add even more docstring

* nit

* fix test

* add multilingual to dev group

* merge nli, add languagees to literals

* translation literals

* add nli

* add rcb + chinese nli

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;

* add two new tasks + docs

---------

Co-authored-by: Nathan Habib &lt;30601243+NathanHB@users.noreply.github.com&gt;
Co-authored-by: Hynek Kydlicek &lt;kydliceh.hynek@gmail.com&gt;
Co-authored-by: Clémentine Fourrier &lt;22726840+clefourrier@users.noreply.github.com&gt;
diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py
@@ -0,0 +1,324 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from langcodes import Language as LangCodeLanguage
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric
+from lighteval.metrics.normalizations import LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+    CFFormulation,
+    HybridFormulation,
+    MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+# ------------------------------- NLI Tasks ------------------------------- #
+# NLI (Natural Language Inference) tasks involve determining the logical relationship
+# between two given sentences: a premise and a hypothesis. The goal is to classify
+# whether the hypothesis is entailed by, contradicts, or is neutral with respect to
+# the premise. After our inspection we found the neutral label to be quite ambiguous
+# and decided to exclude it. But you can easily add it by modifying the adapters
+
+
+# The XNLI dataset is a multilingual variant of MultiNLI
+# https://aclanthology.org/D18-1269/
+xnli_tasks = [
+    LightevalTaskConfig(
+        name=f"xnli_{language.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        metric=[loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
+        prompt_function=get_nli_prompt_function(
+            language=language,
+            adapter=lambda line: {
+                "premise": line["premise"],
+                "hypothesis": line["hypothesis"],
+                # Since we ignore the neutral label
+                "gold_idx": {0: 0, 2: 1}[line["label"]],
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        hf_filter=lambda line: line["label"] in [0, 2],
+        hf_repo="facebook/xnli",
+        hf_subset=standardize_tag(language.value),
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+    )
+    for language in [
+        Language.ARABIC,
+        Language.ENGLISH,
+        Language.FRENCH,
+        Language.SPANISH,
+        Language.BULGARIAN,
+        Language.GERMAN,
+        Language.GREEK,
+        Language.ENGLISH,
+        Language.FRENCH,
+        Language.HINDI,
+        Language.RUSSIAN,
+        Language.SWAHILI,
+        Language.THAI,
+        Language.TURKISH,
+        Language.URDU,
+        Language.VIETNAMESE,
+        Language.CHINESE,
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Improvement on XNLI with better translation, from our experience models tend to
+# perform better on XNLI2.0 than XNLI
+# https://arxiv.org/abs/2301.06527
+xnli2_tasks = [
+    LightevalTaskConfig(
+        name=f"xnli2.0_{language.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        metric=[loglikelihood_acc_metric(normalization=LogProbTokenNorm())],
+        prompt_function=get_nli_prompt_function(
+            language=language,
+            adapter=lambda line: {
+                "premise": line["premise"],
+                "hypothesis": line["hypothesis"],
+                # Since we ignore the neutral label
+                "gold_idx": {0: 0, 2: 1}[line["label"]],
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        hf_filter=lambda line: line["label"] in [0, 2],
+        hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}",
+        hf_subset="default",
+        evaluation_splits=["train"],
+    )
+    for language in [
+        Language.ENGLISH,
+        Language.FRENCH,
+        Language.PUNJABI,
+        Language.GUJARATI,
+        Language.KANNADA,
+        Language.ASSAMESE,
+        Language.BENGALI,
+        Language.MARATHI,
+        Language.SANSKRIT,
+        Language.TAMIL,
+        Language.GERMAN,
+        Language.ENGLISH,
+        Language.URDU,
+        Language.VIETNAMESE,
+        Language.TURKISH,
+        Language.THAI,
+        Language.SWAHILI,
+        Language.SPANISH,
+        Language.RUSSIAN,
+        Language.HINDI,
+        Language.GREEK,
+        Language.CHINESE,
+        Language.BULGARIAN,
+        Language.ARABIC,
+        # Theoretically also: Bhojpuri, Gujarati, Odiya
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Another variant of XNLI, with emphasis on Indic languages
+# https://arxiv.org/abs/2204.08776
+xnli_indic_tasks = [
+    LightevalTaskConfig(
+        name=f"indicnxnli_{language.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_nli_prompt_function(
+            language=language,
+            adapter=lambda line: {
+                "premise": line["premise"],
+                "hypothesis": line["hypothesis"],
+                # Since we ignore the neutral label
+                "gold_idx": {0: 0, 2: 1}[line["label"]],
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        hf_repo="Divyanshu/indicxnli",
+        hf_subset=standardize_tag(language.value),
+        # Ignore neutral
+        hf_filter=lambda x: int(x["label"]) in [0, 2],
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        few_shots_select=None,
+        generation_size=-1,
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for language in [
+        Language.ASSAMESE,
+        Language.BENGALI,
+        Language.GUJARATI,
+        Language.HINDI,
+        Language.KANNADA,
+        Language.MALAYALAM,
+        Language.MARATHI,
+        Language.ORIYA,
+        Language.PUNJABI,
+        Language.TAMIL,
+        Language.TELUGU,
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification
+# This dataset contains paraphrase identification pairs in multiple languages.
+# It's derived from PAWS (Paraphrase Adversaries from Word Scrambling) and
+# We treat paraphrase as entailment and non-paraphrase as contradiction
+# https://arxiv.org/abs/1908.11828
+
+paws_x_tasks = [
+    LightevalTaskConfig(
+        name=f"pawsx_{language.value}_{formulation.name.lower()}",
+        suite=("lighteval",),
+        prompt_function=get_nli_prompt_function(
+            language=language,
+            adapter=lambda line: {
+                "premise": line["sentence1"],
+                "hypothesis": line["sentence2"],
+                # Since we ignore the neutral label
+                "gold_idx": int(line["label"]),
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        hf_repo="google-research-datasets/paws-x",
+        hf_subset=standardize_tag(language.value),
+        evaluation_splits=("test",),
+        few_shots_split="train",
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for language in [
+        Language.GERMAN,
+        Language.ENGLISH,
+        Language.SPANISH,
+        Language.FRENCH,
+        Language.JAPANESE,
+        Language.KOREAN,
+        Language.CHINESE,
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian sentences,
+# collected from the web and crowdsourcing.
+# https://arxiv.org/abs/2401.04531
+rcb_tasks = [
+    LightevalTaskConfig(
+        name=f"rcb_{Language.RUSSIAN.value}_{formulation.name.lower()}",
+        prompt_function=get_nli_prompt_function(
+            language=Language.RUSSIAN,
+            adapter=lambda line: {
+                "premise": line["inputs"]["premise"],
+                "hypothesis": line["inputs"]["hypothesis"],
+                # Since we ignore the neutral label
+                "gold_idx": int(line["outputs"]) - 1,
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        suite=("lighteval",),
+        hf_repo="ai-forever/MERA",
+        hf_subset="rcb",
+        # Ignore neutral label
+        hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2],
+        evaluation_splits=("train", "validation"),
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Native Chinese NLI dataset based.
+# https://arxiv.org/pdf/2010.05444
+# We find this benchmark to have really good signal compared to other Chinese NLI
+ocnli_tasks = [
+    LightevalTaskConfig(
+        name=f"ocnli_{Language.CHINESE.value}_{formulation.name.lower()}",
+        prompt_function=get_nli_prompt_function(
+            language=Language.CHINESE,
+            adapter=lambda line: {
+                "premise": line["sentence1"],
+                "hypothesis": line["sentence2"],
+                # Since we ignore the neutral label
+                "gold_idx": {1: 0, 2: 1}[line["label"]],
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        suite=("lighteval",),
+        hf_repo="clue/clue",
+        hf_subset="ocnli",
+        # Only keep the positive and negative examples
+        hf_filter=lambda x: int(x["label"]) in [1, 2],
+        evaluation_splits=("validation",),
+        few_shots_split="train",
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# https://arxiv.org/abs/2004.05986
+# Native Chinese NLI dataset based on MNLI approach (Machine Translated)
+cmnli_tasks = [
+    LightevalTaskConfig(
+        name=f"cmnli_{Language.CHINESE.value}_{formulation.name.lower()}",
+        prompt_function=get_nli_prompt_function(
+            language=Language.CHINESE,
+            adapter=lambda line: {
+                "premise": line["sentence1"],
+                "hypothesis": line["sentence2"],
+                # Since we ignore the neutral label
+                "gold_idx": {"entailment": 0, "contradiction": 1}[line["label"]],
+            },
+            relations=["entailment", "contradiction"],
+            formulation=formulation,
+        ),
+        suite=("lighteval",),
+        hf_repo="fenffef/cmnli",
+        hf_subset="default",
+        hf_filter=lambda x: x["label"] in ["entailment", "contradiction"],
+        # Only keep the positive and negative examples
+        evaluation_splits=("validation",),
+        few_shots_split="train",
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+
+TASKS_TABLE = [*xnli_tasks, *xnli2_tasks, *xnli_indic_tasks, *paws_x_tasks, *rcb_tasks, *ocnli_tasks, *cmnli_tasks]