From 6c1058fe8c03fc8e215b71cbaa8e0da4d6422822 Mon Sep 17 00:00:00 2001
From: Parth Shandilya <parth1989shandilya@gmail.com>
Date: Tue, 2 Apr 2019 09:18:47 +0530
Subject: [PATCH] Added Multi NLI Mismatched Dataset

---
 .gitignore                                    |   5 +-
 .../multinli_1.0_dev_mismatched.txt           |   3 +
 .../multinli_1.0/multinli_1.0_train.txt       |   4 +
 tensorflow_datasets/text/__init__.py          |   1 +
 .../text/multi_nli_mismatch.py                | 158 ++++++++++++++++++
 .../text/multi_nli_mismatch_test.py           |  36 ++++
 .../url_checksums/multi_nli_mismatch.txt      |   1 +
 7 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt
 create mode 100644 tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt
 create mode 100644 tensorflow_datasets/text/multi_nli_mismatch.py
 create mode 100644 tensorflow_datasets/text/multi_nli_mismatch_test.py
 create mode 100644 tensorflow_datasets/url_checksums/multi_nli_mismatch.txt

diff --git a/.gitignore b/.gitignore
index 9e0fbc8ee58..13271b0bd6b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,4 +17,7 @@ dist/
 .pytest_cache/
 
 # Other
-*.DS_Store
\ No newline at end of file
+*.DS_Store
+
+.vscode/
+data/
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt
new file mode 100644
index 00000000000..e6d5bbab5af
--- /dev/null
+++ b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_dev_mismatched.txt
@@ -0,0 +1,3 @@
+skip header
+entailment	1	2	4	4	inflammable	flammable	8
+neutral	1	5	6	4	5	legal	moral	7	8
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt
new file mode 100644
index 00000000000..1d6732db5e5
--- /dev/null
+++ b/tensorflow_datasets/testing/test_data/fake_examples/multi_nli_mismatch/multinli_1.0/multinli_1.0_train.txt
@@ -0,0 +1,4 @@
+skip header
+neutral	1	2	3	4	correlation	causation	7
+entailment	1	2	3	4	correlation	correlation	7
+contradiction	1	2	3	4	correlation	independence	7
diff --git a/tensorflow_datasets/text/__init__.py b/tensorflow_datasets/text/__init__.py
index 552df4b0ae5..72f569ca51a 100644
--- a/tensorflow_datasets/text/__init__.py
+++ b/tensorflow_datasets/text/__init__.py
@@ -20,5 +20,6 @@
 from tensorflow_datasets.text.lm1b import Lm1b
 from tensorflow_datasets.text.lm1b import Lm1bConfig
 from tensorflow_datasets.text.multi_nli import MultiNLI
+from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
 from tensorflow_datasets.text.squad import Squad
 from tensorflow_datasets.text.wikipedia import Wikipedia
diff --git a/tensorflow_datasets/text/multi_nli_mismatch.py b/tensorflow_datasets/text/multi_nli_mismatch.py
new file mode 100644
index 00000000000..1efffa2d8ef
--- /dev/null
+++ b/tensorflow_datasets/text/multi_nli_mismatch.py
@@ -0,0 +1,158 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The Multi-Genre NLI Corpus, with the mismatched dev set for validation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+from tensorflow_datasets.core import api_utils
+import tensorflow_datasets.public_api as tfds
+
+_CITATION = """\
+@InProceedings{N18-1101,
+  author = "Williams, Adina
+            and Nangia, Nikita
+            and Bowman, Samuel",
+  title = "A Broad-Coverage Challenge Corpus for
+           Sentence Understanding through Inference",
+  booktitle = "Proceedings of the 2018 Conference of
+               the North American Chapter of the
+               Association for Computational Linguistics:
+               Human Language Technologies, Volume 1 (Long
+               Papers)",
+  year = "2018",
+  publisher = "Association for Computational Linguistics",
+  pages = "1112--1122",
+  location = "New Orleans, Louisiana",
+  url = "http://aclweb.org/anthology/N18-1101"
+}
+"""
+
+_DESCRIPTION = """\
+The Multi-Genre Natural Language Inference (MultiNLI) corpus is a
+crowd-sourced collection of 433k sentence pairs annotated with textual
+entailment information. The corpus is modeled on the SNLI corpus, but differs in
+that covers a range of genres of spoken and written text, and supports a
+distinctive cross-genre generalization evaluation. The corpus served as the
+basis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.
+"""
+
+ROOT_URL = "http://storage.googleapis.com/tfds-data/downloads/multi_nli/multinli_1.0.zip"
+
+class MultiNLIMismatchConfig(tfds.core.BuilderConfig):
+  """BuilderConfig for MultiNLI Mismatch."""
+
+  @api_utils.disallow_positional_args
+  def __init__(self, text_encoder_config=None, **kwargs):
+    """Stores the text encoder config, building a default one if not given.
+
+    Args:
+      text_encoder_config: optional `tfds.features.text.TextEncoderConfig`.
+      **kwargs: keyword arguments forwarded to super.
+    """
+    super(MultiNLIMismatchConfig, self).__init__(**kwargs)
+    if not text_encoder_config:
+      text_encoder_config = tfds.features.text.TextEncoderConfig()
+    self.text_encoder_config = text_encoder_config
+
+
+class MultiNLIMismatch(tfds.core.GeneratorBasedBuilder):
+  """MultiNLI corpus using the mismatched dev set as the validation split."""
+
+  BUILDER_CONFIGS = [
+      MultiNLIMismatchConfig(
+          name="plain_text",
+          version="0.0.1",
+          description="Plain text",
+      ),
+  ]
+
+  def _info(self):
+    """Returns DatasetInfo with premise/hypothesis/label text features."""
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            "premise":
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
+            "hypothesis":
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
+            "label":
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
+        }),
+        # No default supervised_keys (as we have to pass both premise
+        # and hypothesis as input).
+        supervised_keys=None,
+        urls=["https://www.nyu.edu/projects/bowman/multinli/"],
+        citation=_CITATION,
+    )
+
+  def _vocab_text_gen(self, filepath):
+    """Yields premise, hypothesis and label of each example as one string."""
+    for ex in self._generate_examples(filepath):
+      yield " ".join([ex["premise"], ex["hypothesis"], ex["label"]])
+
+  def _split_generators(self, dl_manager):
+    """Downloads the corpus and returns train/validation SplitGenerators."""
+    downloaded_dir = dl_manager.download_and_extract(ROOT_URL)
+    mnli_path = os.path.join(downloaded_dir, "multinli_1.0")
+    train_path = os.path.join(mnli_path, "multinli_1.0_train.txt")
+    validation_path = os.path.join(mnli_path, "multinli_1.0_dev_mismatched.txt")
+
+    # Generate shared vocabulary
+    # maybe_build_from_corpus uses SubwordTextEncoder if that's configured
+    self.info.features["premise"].maybe_build_from_corpus(
+        self._vocab_text_gen(train_path))
+    encoder = self.info.features["premise"].encoder
+    self.info.features["premise"].maybe_set_encoder(encoder)
+    self.info.features["hypothesis"].maybe_set_encoder(encoder)
+    self.info.features["label"].maybe_set_encoder(encoder)
+
+    return [
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TRAIN,
+            num_shards=10,
+            gen_kwargs={"filepath": train_path}),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.VALIDATION,
+            num_shards=1,
+            gen_kwargs={"filepath": validation_path}),
+    ]
+
+  def _generate_examples(self, filepath):
+    """Generate mnli mismatch examples.
+
+    Args:
+      filepath: path to a tab-separated file; its first line is skipped
+    Yields:
+      dictionaries containing "premise", "hypothesis" and "label" strings
+    """
+    with tf.io.gfile.GFile(filepath, "rb") as f:  # close the handle when done
+      next(f, None)  # skip the header row
+      for line in f:
+        fields = tf.compat.as_text(line.strip()).split("\t")
+        yield {
+            "premise": fields[5],
+            "hypothesis": fields[6],
+            "label": fields[0],
+        }
diff --git a/tensorflow_datasets/text/multi_nli_mismatch_test.py b/tensorflow_datasets/text/multi_nli_mismatch_test.py
new file mode 100644
index 00000000000..730b64a0bd2
--- /dev/null
+++ b/tensorflow_datasets/text/multi_nli_mismatch_test.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the MultiNLI mismatched dataset module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.text import multi_nli_mismatch
+
+
+class MultiNLIMismatchTest(testing.DatasetBuilderTestCase):
+  """Dataset builder test case for MultiNLIMismatch."""
+
+  DATASET_CLASS = multi_nli_mismatch.MultiNLIMismatch
+
+  # Expected number of examples per split.
+  SPLITS = {"train": 3, "validation": 2}
+
+
+if __name__ == "__main__":
+  testing.test_main()  # entry point when this file is run as a script
diff --git a/tensorflow_datasets/url_checksums/multi_nli_mismatch.txt b/tensorflow_datasets/url_checksums/multi_nli_mismatch.txt
new file mode 100644
index 00000000000..397a556c890
--- /dev/null
+++ b/tensorflow_datasets/url_checksums/multi_nli_mismatch.txt
@@ -0,0 +1 @@
+http://storage.googleapis.com/tfds-data/downloads/multi_nli/multinli_1.0.zip 226850426 049f507b9e36b1fcb756cfd5aeb3b7a0cfcb84bf023793652987f7e7e0957822