From 2f6c1565eaf9d0c79720c7fb1bba28c89e34c650 Mon Sep 17 00:00:00 2001 From: Carlos Esteves Date: Fri, 15 Mar 2024 04:26:30 -0700 Subject: [PATCH] Add QM9 checksums, README.md and CITATIONS.bib. PiperOrigin-RevId: 616084769 --- .../datasets/qm9/CITATIONS.bib | 8 +++++ tensorflow_datasets/datasets/qm9/README.md | 5 +++ tensorflow_datasets/datasets/qm9/__init__.py | 15 +++++++++ .../datasets/qm9/checksums.tsv | 3 ++ .../datasets/qm9/qm9_dataset_builder.py | 31 +++++-------------- .../datasets/qm9/qm9_dataset_builder_test.py | 1 - 6 files changed, 38 insertions(+), 25 deletions(-) create mode 100644 tensorflow_datasets/datasets/qm9/CITATIONS.bib create mode 100644 tensorflow_datasets/datasets/qm9/README.md create mode 100644 tensorflow_datasets/datasets/qm9/__init__.py create mode 100644 tensorflow_datasets/datasets/qm9/checksums.tsv diff --git a/tensorflow_datasets/datasets/qm9/CITATIONS.bib b/tensorflow_datasets/datasets/qm9/CITATIONS.bib new file mode 100644 index 00000000000..2a876d01f41 --- /dev/null +++ b/tensorflow_datasets/datasets/qm9/CITATIONS.bib @@ -0,0 +1,8 @@ +@article{ramakrishnan2014quantum, + title={Quantum chemistry structures and properties of 134 kilo molecules}, + author={Ramakrishnan, Raghunathan and Dral, Pavlo O and Rupp, Matthias and von Lilienfeld, O Anatole}, + journal={Scientific Data}, + volume={1}, + year={2014}, + publisher={Nature Publishing Group} +} \ No newline at end of file diff --git a/tensorflow_datasets/datasets/qm9/README.md b/tensorflow_datasets/datasets/qm9/README.md new file mode 100644 index 00000000000..2cce221e4d4 --- /dev/null +++ b/tensorflow_datasets/datasets/qm9/README.md @@ -0,0 +1,5 @@ +QM9 consists of computed geometric, energetic, electronic, and thermodynamic +properties for 134k stable small organic molecules made up of CHONF. As usual, +we remove the uncharacterized molecules and provide the remaining 130,831 in the +original order (not shuffled). We provide a single 'train' split, users are +expected to make their own validation/test splits. diff --git a/tensorflow_datasets/datasets/qm9/__init__.py b/tensorflow_datasets/datasets/qm9/__init__.py new file mode 100644 index 00000000000..5310ec58c7d --- /dev/null +++ b/tensorflow_datasets/datasets/qm9/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2024 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensorflow_datasets/datasets/qm9/checksums.tsv b/tensorflow_datasets/datasets/qm9/checksums.tsv new file mode 100644 index 00000000000..a2a048cb2ca --- /dev/null +++ b/tensorflow_datasets/datasets/qm9/checksums.tsv @@ -0,0 +1,3 @@ +https://figshare.com/ndownloader/files/3195395 964 af739d4a0fbe894a56f346ad6045ee1b74c58f8a8c9ef3fbfed920c9ad8b00f9 atomref.txt +https://springernature.figshare.com/ndownloader/files/3195389 86144227 3a63848ac80691bdb8d41834b575afad345b9300d7a2db0c38adb7f6eaa8360c dsgdb9nsd.xyz.tar.bz2 +https://springernature.figshare.com/ndownloader/files/3195404 486752 3aa5115d540b356de94791d4a74c3bf1ed91c469ecf52a4f5d7cc0506fe02e24 uncharacterized.txt diff --git a/tensorflow_datasets/datasets/qm9/qm9_dataset_builder.py b/tensorflow_datasets/datasets/qm9/qm9_dataset_builder.py index 1b73098269e..01ff0847490 100644 --- a/tensorflow_datasets/datasets/qm9/qm9_dataset_builder.py +++ b/tensorflow_datasets/datasets/qm9/qm9_dataset_builder.py @@ -30,22 +30,6 @@ pd = tfds.core.lazy_imports.pandas -_DESCRIPTION = """\ -QM9 consists of computed geometric, energetic, electronic, and thermodynamic -properties for 134k stable small organic molecules made up of CHONF. -""" - -_CITATION = """\ -@article{ramakrishnan2014quantum, - title={Quantum chemistry structures and properties of 134 kilo molecules}, - author={Ramakrishnan, Raghunathan and Dral, Pavlo O and Rupp, Matthias and von Lilienfeld, O Anatole}, - journal={Scientific Data}, - volume={1}, - year={2014}, - publisher={Nature Publishing Group} -} -""" - _HOMEPAGE = 'https://doi.org/10.6084/m9.figshare.c.978904.v5' _ATOMREF_URL = 'https://figshare.com/ndownloader/files/3195395' @@ -145,15 +129,14 @@ class Builder(tfds.core.GeneratorBasedBuilder): def _info(self) -> tfds.core.DatasetInfo: """Returns the dataset metadata.""" - return tfds.core.DatasetInfo( - builder=self, - description=_DESCRIPTION, + return self.dataset_info_from_configs( disable_shuffling=True, features=tfds.features.FeaturesDict({ 'num_atoms': tfds.features.Tensor(shape=(), dtype=np.int64), 'charges': tfds.features.Tensor(shape=(29,), dtype=np.int64), - 'Mulliken_charges': tfds.features.Tensor(shape=(29,), - dtype=np.float32), + 'Mulliken_charges': tfds.features.Tensor( + shape=(29,), dtype=np.float32 + ), 'positions': tfds.features.Tensor(shape=(29, 3), dtype=np.float32), 'index': tfds.features.Tensor(shape=(), dtype=np.int64), 'A': tfds.features.Tensor(shape=(), dtype=np.float32), @@ -180,13 +163,13 @@ def _info(self) -> tfds.core.DatasetInfo: 'SMILES_relaxed': tfds.features.Tensor(shape=(), dtype=np.str_), 'InChI': tfds.features.Tensor(shape=(), dtype=np.str_), 'InChI_relaxed': tfds.features.Tensor(shape=(), dtype=np.str_), - 'frequencies': tfds.features.Tensor(shape=(None,), - dtype=np.float32), + 'frequencies': tfds.features.Tensor( + shape=(None,), dtype=np.float32 + ), }), # These are returned if `as_supervised=True` in `builder.as_dataset`. supervised_keys=None, homepage=_HOMEPAGE, - citation=_CITATION, ) def _split_generators( diff --git a/tensorflow_datasets/datasets/qm9/qm9_dataset_builder_test.py b/tensorflow_datasets/datasets/qm9/qm9_dataset_builder_test.py index ae4919ffb38..6c964a93aec 100644 --- a/tensorflow_datasets/datasets/qm9/qm9_dataset_builder_test.py +++ b/tensorflow_datasets/datasets/qm9/qm9_dataset_builder_test.py @@ -29,7 +29,6 @@ class Qm9Test(testing.DatasetBuilderTestCase): qm9_dataset_builder._VALIDATION_SIZE = 1 qm9_dataset_builder._TEST_SIZE = 1 - SKIP_CHECKSUMS = True DATASET_CLASS = qm9_dataset_builder.Builder DL_EXTRACT_RESULT = {