Adds the Corr2Cause dataset

PiperOrigin-RevId: 549923326
9Olive · Jul 21, 2023 · 8c40872 · 8c40872
1 parent 1f156e3
commit 8c40872
Show file tree

Hide file tree

Showing 10 changed files with 165 additions and 0 deletions.
diff --git a/tensorflow_datasets/datasets/corr2cause/CITATIONS.bib b/tensorflow_datasets/datasets/corr2cause/CITATIONS.bib
@@ -0,0 +1,8 @@
+@misc{jin2023large,
+      title={Can Large Language Models Infer Causation from Correlation?}, 
+      author={Zhijing Jin and Jiarui Liu and Zhiheng Lyu and Spencer Poff and Mrinmaya Sachan and Rada Mihalcea and Mona Diab and Bernhard Schölkopf},
+      year={2023},
+      eprint={2306.05836},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
diff --git a/tensorflow_datasets/datasets/corr2cause/README.md b/tensorflow_datasets/datasets/corr2cause/README.md
@@ -0,0 +1,13 @@
+# Corr2cause
+
+Causal inference is one of the hallmarks of human intelligence.
+
+Corr2cause is a large-scale dataset of more than 400K samples, on which
+seventeen existing LLMs are evaluated in the related paper.
+
+Overall, Corr2cause contains 415,944 samples, of which 18.57% are valid samples.
+The average length of the premise is 424.11 tokens, and that of the hypothesis
+is 10.83 tokens. The data is split into 411,452 training samples and 2,246
+samples each for the development and test sets. Since the main purpose of the
+dataset is to benchmark the performance of LLMs, the test and development sets
+have been prioritized to have comprehensive coverage of all graph sizes.
diff --git a/tensorflow_datasets/datasets/corr2cause/TAGS.txt b/tensorflow_datasets/datasets/corr2cause/TAGS.txt
@@ -0,0 +1,2 @@
+content.data-type.text # Contains text data.
+content.language.en # Contains text in language English / en.
diff --git a/tensorflow_datasets/datasets/corr2cause/__init__.py b/tensorflow_datasets/datasets/corr2cause/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2023 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/tensorflow_datasets/datasets/corr2cause/checksums.tsv b/tensorflow_datasets/datasets/corr2cause/checksums.tsv
@@ -0,0 +1,3 @@
+https://huggingface.co/datasets/causalnlp/corr2cause/raw/main/dev.csv	2951531	4c93d696a61b4bbf71deffdf1652e69358829e8488b83eff7defe3ee6ec9e4b2	dev.csv
+https://huggingface.co/datasets/causalnlp/corr2cause/raw/main/test.csv	2226955	ce4e5487b37fc4b36b38b5e682ec94d5fc30ae29cdf8a6a5c379200893af3550	test.csv
+https://huggingface.co/datasets/causalnlp/corr2cause/resolve/main/train.csv	757363839	358786f0a76449c990d3102302baa5e51781d3316c26e4c5a539a0d5e63898aa	train.csv
diff --git a/tensorflow_datasets/datasets/corr2cause/corr2cause_dataset_builder.py b/tensorflow_datasets/datasets/corr2cause/corr2cause_dataset_builder.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2023 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""corr2cause dataset."""
+import csv
+from etils import epath
+import numpy as np
+import tensorflow_datasets.public_api as tfds
+
+
+# Base URL of the dataset repository; per-split file paths are appended to it.
+_URL_PATH = 'https://huggingface.co/datasets/causalnlp/corr2cause/'
+
+
+class Builder(tfds.core.GeneratorBasedBuilder):
+  """DatasetBuilder for corr2cause dataset.
+
+  Each example has a text `input` and an integer `label`, read from the
+  `train`, `dev` and `test` CSV files of the dataset.
+  """
+
+  VERSION = tfds.core.Version('1.0.0')
+  RELEASE_NOTES = {
+      '1.0.0': 'Initial release.',
+  }
+
+  def _info(self) -> tfds.core.DatasetInfo:
+    """Returns the dataset metadata."""
+    return self.dataset_info_from_configs(
+        features=tfds.features.FeaturesDict({
+            'input': tfds.features.Text(),
+            'label': np.int64,
+        }),
+        supervised_keys=None,
+        homepage='https://github.com/causalNLP/corr2cause/tree/main',
+        license='https://github.com/causalNLP/corr2cause/blob/main/LICENSE',
+    )
+
+  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+    """Returns SplitGenerators.
+
+    Args:
+      dl_manager: Download manager used to fetch the per-split CSV files.
+
+    Returns:
+      A dict mapping each split name to its example generator.
+    """
+    # Single source of truth for the splits: the keys are the split names and
+    # the values are the URLs listed in `checksums.tsv`.
+    data_dict = {
+        'train': _URL_PATH + 'resolve/main/train.csv',
+        'dev': _URL_PATH + 'raw/main/dev.csv',
+        'test': _URL_PATH + 'raw/main/test.csv',
+    }
+
+    path = dl_manager.download_and_extract(data_dict)
+
+    return {
+        split: self._generate_examples(filepath=path[split])
+        for split in data_dict
+    }
+
+  def _generate_examples(self, filepath):
+    """Yields examples.
+
+    Args:
+      filepath: Path to a CSV file with `input` and `label` columns.
+
+    Yields:
+      `(key, example)` tuples, where `key` is the zero-based row index.
+    """
+    with epath.Path(filepath).open() as csvfile:
+      reader = csv.DictReader(csvfile)
+      for i, row in enumerate(reader):
+        yield i, {
+            'input': row['input'],
+            # The CSV reader yields strings; convert explicitly so the value
+            # matches the declared `np.int64` feature.
+            'label': int(row['label']),
+        }
diff --git a/tensorflow_datasets/datasets/corr2cause/corr2cause_dataset_builder_test.py b/tensorflow_datasets/datasets/corr2cause/corr2cause_dataset_builder_test.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2023 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""corr2cause dataset."""
+
+from tensorflow_datasets.datasets.corr2cause import corr2cause_dataset_builder
+import tensorflow_datasets.public_api as tfds
+
+
+class Corr2causeTest(tfds.testing.DatasetBuilderTestCase):
+  """Checks the corr2cause builder against the files in `dummy_data/`."""
+
+  DATASET_CLASS = corr2cause_dataset_builder.Builder
+
+  # Expected number of fake examples in each split.
+  SPLITS = dict(train=3, dev=1, test=1)
+
+  # Fake files keyed by the same names the builder uses for its downloads.
+  DL_EXTRACT_RESULT = dict(
+      train='train.csv',
+      dev='dev.csv',
+      test='test.csv',
+  )
+
+
+# Run the tests when this file is executed directly.
+if __name__ == '__main__':
+  tfds.testing.test_main()
diff --git a/tensorflow_datasets/datasets/corr2cause/dummy_data/dev.csv b/tensorflow_datasets/datasets/corr2cause/dummy_data/dev.csv
@@ -0,0 +1,3 @@
+input,label
+"Premise: Suppose there is a closed system of 2 variables, A and B. All the statistical relations among these 2 variables are as follows: A correlates with B.
+Hypothesis: A directly affects B.",0
diff --git a/tensorflow_datasets/datasets/corr2cause/dummy_data/test.csv b/tensorflow_datasets/datasets/corr2cause/dummy_data/test.csv
@@ -0,0 +1,3 @@
+input,label
+"Premise: Suppose there is a closed system of 2 variables, A and B. All the statistical relations among these 2 variables are as follows: A is independent of B.
+Hypothesis: A directly affects B.",0
diff --git a/tensorflow_datasets/datasets/corr2cause/dummy_data/train.csv b/tensorflow_datasets/datasets/corr2cause/dummy_data/train.csv
@@ -0,0 +1,7 @@
+input,label
+"Premise: Suppose there is a closed system of 4 variables, A, B, C and D. All the statistical relations among these 4 variables are as follows: A correlates with B. A correlates with C. A correlates with D. B correlates with C. B correlates with D. C correlates with D. However, B and D are independent given A. B and D are independent given A and C. C and D are independent given A. C and D are independent given A and B.
+Hypothesis: A directly affects B.",0
+"Premise: Suppose there is a closed system of 4 variables, A, B, C and D. All the statistical relations among these 4 variables are as follows: A correlates with B. A correlates with C. A correlates with D. B correlates with C. B correlates with D. C correlates with D. However, B and D are independent given A. B and D are independent given A and C. C and D are independent given A. C and D are independent given A and B.
+Hypothesis: A influences B through some mediator(s).",0
+"Premise: Suppose there is a closed system of 4 variables, A, B, C and D. All the statistical relations among these 4 variables are as follows: A correlates with B. A correlates with C. A correlates with D. B correlates with C. B correlates with D. C correlates with D. However, B and D are independent given A. B and D are independent given A and C. C and D are independent given A. C and D are independent given A and B.
+Hypothesis: B directly affects A.",0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		content.data-type.text # Contains text data.
		content.language.en # Contains text in language English / en.