From 6c1058fe8c03fc8e215b71cbaa8e0da4d6422822 Mon Sep 17 00:00:00 2001 From: Parth Shandilya Date: Tue, 2 Apr 2019 09:18:47 +0530 Subject: [PATCH] Added Multi NLI Mismatched Dataset --- .gitignore | 5 +- .../multinli_1.0_dev_mismatched.txt | 3 + .../multinli_1.0/multinli_1.0_train.txt | 4 + tensorflow_datasets/text/__init__.py | 1 + .../text/multi_nli_mismatch.py | 158 ++++++++++++++++++ .../text/multi_nli_mismatch_test.py | 36 ++++ .../url_checksums/multi_nli_mismatch.txt | 1 + 7 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt create mode 100644 tensorflow_datasets/text/multi_nli_mismatch.py create mode 100644 tensorflow_datasets/text/multi_nli_mismatch_test.py create mode 100644 tensorflow_datasets/url_checksums/multi_nli_mismatch.txt diff --git a/.gitignore b/.gitignore index 9e0fbc8ee58..13271b0bd6b 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,7 @@ dist/ .pytest_cache/ # Other -*.DS_Store \ No newline at end of file +*.DS_Store + +.vscode/ +data/ \ No newline at end of file diff --git a/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt new file mode 100644 index 00000000000..e6d5bbab5af --- /dev/null +++ b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt @@ -0,0 +1,3 @@ +skip header +entailment 1 2 4 4 inflammable flammable 8 +neutral 1 5 6 4 5 legal moral 7 8 diff --git a/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt 
b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt new file mode 100644 index 00000000000..1d6732db5e5 --- /dev/null +++ b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt @@ -0,0 +1,4 @@ +skip header +neutral 1 2 3 4 correlation causation 7 +entailment 1 2 3 4 correlation correlation 7 +contradiction 1 2 3 4 correlation independence 7 diff --git a/tensorflow_datasets/text/__init__.py b/tensorflow_datasets/text/__init__.py index 552df4b0ae5..72f569ca51a 100644 --- a/tensorflow_datasets/text/__init__.py +++ b/tensorflow_datasets/text/__init__.py @@ -20,5 +20,6 @@ from tensorflow_datasets.text.lm1b import Lm1b from tensorflow_datasets.text.lm1b import Lm1bConfig from tensorflow_datasets.text.multi_nli import MultiNLI +from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch from tensorflow_datasets.text.squad import Squad from tensorflow_datasets.text.wikipedia import Wikipedia diff --git a/tensorflow_datasets/text/multi_nli_mismatch.py b/tensorflow_datasets/text/multi_nli_mismatch.py new file mode 100644 index 00000000000..1efffa2d8ef --- /dev/null +++ b/tensorflow_datasets/text/multi_nli_mismatch.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2019 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""The Multi-Genre NLI Corpus with the mismatched dev set for validation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+from tensorflow_datasets.core import api_utils
+import tensorflow_datasets.public_api as tfds
+
+_CITATION = """\
+@InProceedings{N18-1101,
+  author = "Williams, Adina
+            and Nangia, Nikita
+            and Bowman, Samuel",
+  title = "A Broad-Coverage Challenge Corpus for
+           Sentence Understanding through Inference",
+  booktitle = "Proceedings of the 2018 Conference of
+               the North American Chapter of the
+               Association for Computational Linguistics:
+               Human Language Technologies, Volume 1 (Long
+               Papers)",
+  year = "2018",
+  publisher = "Association for Computational Linguistics",
+  pages = "1112--1122",
+  location = "New Orleans, Louisiana",
+  url = "http://aclweb.org/anthology/N18-1101"
+}
+"""
+
+_DESCRIPTION = """\
+The Multi-Genre Natural Language Inference (MultiNLI) corpus is a
+crowd-sourced collection of 433k sentence pairs annotated with textual
+entailment information. The corpus is modeled on the SNLI corpus, but differs in
+that covers a range of genres of spoken and written text, and supports a
+distinctive cross-genre generalization evaluation. The corpus served as the
+basis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.
+"""
+
+ROOT_URL = "http://storage.googleapis.com/tfds-data/downloads/multi_nli/multinli_1.0.zip"
+
+
+class MultiNLIMismatchConfig(tfds.core.BuilderConfig):
+  """BuilderConfig for MultiNLI Mismatch."""
+
+  @api_utils.disallow_positional_args
+  def __init__(self, text_encoder_config=None, **kwargs):
+    """BuilderConfig for MultiNLI Mismatch.
+
+    Args:
+      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
+        for the `tfds.features.text.TextEncoder` used for the text features.
+      **kwargs: keyword arguments forwarded to super.
+    """
+    super(MultiNLIMismatchConfig, self).__init__(**kwargs)
+    self.text_encoder_config = (
+        text_encoder_config or tfds.features.text.TextEncoderConfig())
+
+
+class MultiNLIMismatch(tfds.core.GeneratorBasedBuilder):
+  """MultiNLI corpus using the mismatched dev set as the validation split."""
+
+  BUILDER_CONFIGS = [
+      MultiNLIMismatchConfig(
+          name="plain_text",
+          version="0.0.1",
+          description="Plain text",
+      ),
+  ]
+
+  def _info(self):
+    """Returns the `DatasetInfo` with the three text features."""
+    encoder_config = self.builder_config.text_encoder_config
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            "premise": tfds.features.Text(encoder_config=encoder_config),
+            "hypothesis": tfds.features.Text(encoder_config=encoder_config),
+            "label": tfds.features.Text(encoder_config=encoder_config),
+        }),
+        # No default supervised_keys (as we have to pass both premise
+        # and hypothesis as input).
+        supervised_keys=None,
+        urls=["https://www.nyu.edu/projects/bowman/multinli/"],
+        citation=_CITATION,
+    )
+
+  def _vocab_text_gen(self, filepath):
+    """Yields each example's fields joined by spaces, for vocab building."""
+    for ex in self._generate_examples(filepath):
+      yield " ".join([ex["premise"], ex["hypothesis"], ex["label"]])
+
+  def _split_generators(self, dl_manager):
+    """Downloads the corpus and defines the train/validation splits."""
+    downloaded_dir = dl_manager.download_and_extract(ROOT_URL)
+    mnli_path = os.path.join(downloaded_dir, "multinli_1.0")
+    train_path = os.path.join(mnli_path, "multinli_1.0_train.txt")
+    validation_path = os.path.join(mnli_path, "multinli_1.0_dev_mismatched.txt")
+
+    # Generate shared vocabulary
+    # maybe_build_from_corpus uses SubwordTextEncoder if that's configured
+    self.info.features["premise"].maybe_build_from_corpus(
+        self._vocab_text_gen(train_path))
+    encoder = self.info.features["premise"].encoder
+    self.info.features["premise"].maybe_set_encoder(encoder)
+    self.info.features["hypothesis"].maybe_set_encoder(encoder)
+    self.info.features["label"].maybe_set_encoder(encoder)
+
+    return [
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TRAIN,
+            num_shards=10,
+            gen_kwargs={"filepath": train_path}),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.VALIDATION,
+            num_shards=1,
+            gen_kwargs={"filepath": validation_path}),
+    ]
+
+  def _generate_examples(self, filepath):
+    """Yields MultiNLI examples parsed from a tab-separated corpus file.
+
+    Args:
+      filepath: `str`, path to the TSV file; the first (header) line is skipped.
+    Yields:
+      `dict` with "premise", "hypothesis" and "label" strings, read from
+      tab-separated columns 5, 6 and 0 of every non-blank data row.
+    """
+    # `with` closes the handle even when the consumer stops iterating early.
+    with tf.io.gfile.GFile(filepath, "rb") as f:
+      for idx, line in enumerate(f):
+        line = line.strip()
+        if idx == 0 or not line:  # Skip the header row and blank lines.
+          continue
+        split_line = tf.compat.as_text(line).split("\t")
+        yield {
+            "premise": split_line[5],
+            "hypothesis": split_line[6],
+            "label": split_line[0],
+        }
diff --git a/tensorflow_datasets/text/multi_nli_mismatch_test.py b/tensorflow_datasets/text/multi_nli_mismatch_test.py
new file mode 100644
index 00000000000..730b64a0bd2
--- /dev/null
+++ b/tensorflow_datasets/text/multi_nli_mismatch_test.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the MultiNLI mismatched dataset builder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.text import multi_nli_mismatch
+
+
+class MultiNLIMismatchTest(testing.DatasetBuilderTestCase):
+  """Checks `MultiNLIMismatch` against the fake multinli_1.0 fixture files."""
+
+  DATASET_CLASS = multi_nli_mismatch.MultiNLIMismatch
+  # Values equal the number of data rows (header line excluded) in the fake
+  # multinli_1.0_train.txt and multinli_1.0_dev_mismatched.txt fixtures.
+  SPLITS = dict(train=3, validation=2)
+
+
+
+if __name__ == "__main__":
+  testing.test_main()
diff --git a/tensorflow_datasets/url_checksums/multi_nli_mismatch.txt b/tensorflow_datasets/url_checksums/multi_nli_mismatch.txt
new file mode 100644
index 00000000000..397a556c890
--- /dev/null
+++ b/tensorflow_datasets/url_checksums/multi_nli_mismatch.txt
@@ -0,0 +1 @@
+http://storage.googleapis.com/tfds-data/downloads/multi_nli/multinli_1.0.zip 226850426 049f507b9e36b1fcb756cfd5aeb3b7a0cfcb84bf023793652987f7e7e0957822