Use discrete versioning instead of datetime
PiperOrigin-RevId: 223138202
Conchylicultor authored and Copybara-Service committed Nov 28, 2018
1 parent 3fffe5f commit 68cafe1
Showing 13 changed files with 101 additions and 31 deletions.
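
In short: before this commit, every `download_and_prepare` run wrote into a fresh directory named from the current time (`v_%Y%m%d_%H%M`); it now writes into a directory named after the dataset's declared semantic version, and raises an error rather than overwrite an existing version. A minimal sketch of the naming change (the dataset name and root path are illustrative, not from this commit):

```python
import datetime
import os

root = os.path.join("~", "tensorflow_datasets", "mnist")

# Before: a new directory per run, named from the generation time.
old_dir = os.path.join(root, datetime.datetime.now().strftime("v_%Y%m%d_%H%M"))

# After: one directory per declared version (DatasetInfo.version),
# e.g. ~/tensorflow_datasets/mnist/1.0.0
new_dir = os.path.join(root, "1.0.0")
```
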
27 changes: 19 additions & 8 deletions tensorflow_datasets/core/dataset_builder.py
@@ -20,7 +20,6 @@
from __future__ import print_function

import abc
import datetime
import functools
import os

@@ -151,9 +150,15 @@ def download_and_prepare(
)

# Otherwise, create a new version in a new data_dir.
curr_date = datetime.datetime.now()
version_str = curr_date.strftime("v_%Y%m%d_%H%M")
data_dir = self._get_data_dir(version=version_str)
data_dir = self._get_data_dir(version=self.info.version)
if tf.gfile.Exists(data_dir):
# If generation is deterministic, the dataset could be re-generated and an
# error raised only if the generated files differ
raise ValueError(
"Trying to overwrite an existing dataset {} at {}. A dataset with "
"the same version {} already exists. If the dataset has changed, "
"please update the version number.".format(
self.name, data_dir, self.info.version))
tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

self._check_available_size(data_dir)
@@ -274,11 +279,17 @@ def _get_data_dir(self, version=None):

# Get the most recent directory
if tf.gfile.Exists(data_root_dir):
version_dirnames = [
f for f in sorted(tf.gfile.ListDirectory(data_root_dir))
if ".incomplete" not in f
]
version_dirnames = {}
for filename in tf.gfile.ListDirectory(data_root_dir):
try:
version_dirnames[filename] = utils.str_to_version(filename)
except ValueError: # Invalid version (ex: incomplete data dir)
pass
# If valid data directories were found, take the highest version
if version_dirnames:
version_dirnames = [
k for k, _ in sorted(version_dirnames.items(), key=lambda x: x[-1])
]
return os.path.join(data_root_dir, version_dirnames[-1])

# No directory found
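
The rewritten `_get_data_dir` parses each directory name into a `(major, minor, patch)` tuple and returns the path with the highest version, silently skipping names that do not parse (e.g. `.incomplete` directories). Parsing before sorting matters: plain string order would rank "10.0.0" below "9.0.0". A self-contained sketch of the same selection logic, with a local parser standing in for `utils.str_to_version`:

```python
def pick_latest_version_dir(dirnames):
  """Return the dirname with the highest x.y.z version, or None."""
  versions = {}
  for name in dirnames:
    parts = name.split(".")
    if len(parts) != 3:
      continue  # e.g. "14.0.0.invalid" or "foo.incomplete"
    try:
      versions[name] = tuple(int(p) for p in parts)
    except ValueError:  # non-numeric component
      continue
  if not versions:
    return None
  return max(versions, key=versions.get)

# String sort would pick "9.0.0"; tuple comparison picks "10.0.0".
assert pick_latest_version_dir(["9.0.0", "10.0.0", "x.incomplete"]) == "10.0.0"
```
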
14 changes: 14 additions & 0 deletions tensorflow_datasets/core/dataset_builder_test.py
@@ -100,6 +100,20 @@ def test_load(self):
self.assertEqual(20, len(data))
self.assertLess(data[0]["x"], 30)

def test_get_data_dir(self):
# Test that the dataset loads the most recent dir
with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
builder = DummyDatasetSharedGenerator(data_dir=tmp_dir)

# The dataset folder contains multiple versions
tf.gfile.MakeDirs(os.path.join(tmp_dir, builder.name, "14.0.0.invalid"))
tf.gfile.MakeDirs(os.path.join(tmp_dir, builder.name, "10.0.0"))
tf.gfile.MakeDirs(os.path.join(tmp_dir, builder.name, "9.0.0"))

# The highest valid version is chosen by default
most_recent_dir = os.path.join(tmp_dir, builder.name, "10.0.0")
self.assertEqual(builder._get_data_dir(), most_recent_dir)


class DatasetBuilderReadTest(tf.test.TestCase):

25 changes: 10 additions & 15 deletions tensorflow_datasets/core/dataset_info.py
@@ -42,6 +42,7 @@
from tensorflow_datasets.core import api_utils
from tensorflow_datasets.core import dataset_utils
from tensorflow_datasets.core import splits as splits_lib
from tensorflow_datasets.core import utils
from tensorflow_datasets.core.proto import dataset_info_pb2
from google.protobuf import json_format
from tensorflow_metadata.proto.v0 import schema_pb2
@@ -65,6 +66,7 @@ class DatasetInfo(object):
Properties:
name: `str`, name of this dataset.
description: `str`, description of this dataset.
version: `str`, semantic version of the dataset (ex: '1.2.0').
features: `tfds.features.FeaturesDict`: Information on the feature dict of
the `tf.data.Dataset` object from the `builder.as_dataset()` method.
splits: `SplitDict`, the available Splits for this dataset.
@@ -83,6 +85,7 @@
def __init__(self,
name=None,
description=None,
version=None,
features=None,
supervised_keys=None,
splits=None,
@@ -94,6 +97,7 @@ def __init__(self,
Args:
name: (`str`) Name of the dataset, usually set to builder.name.
description: `str`, description of this dataset.
version: `str`, semantic version of the dataset (ex: '1.2.0').
features: (`tfds.features.FeaturesDict`) Information on the feature dict
of the `tf.data.Dataset()` object from the `builder.as_dataset()`
method.
@@ -105,9 +109,13 @@ def __init__(self,
size of the dataset that we will be downloading from the internet.
citation: `str`, optional, the citation to use for this dataset.
"""
version = version or "0.0.0"
utils.str_to_version(version) # Ensure that the version is valid

self._info_proto = dataset_info_pb2.DatasetInfo(
name=name,
description=description,
version=version,
size_in_bytes=int(size_in_bytes),
citation=citation)
if urls:
@@ -133,17 +141,8 @@ def as_proto(self):

return self._info_proto

@property
def name(self):
return self._info_proto.name

@property
def description(self):
return self._info_proto.description

@property
def citation(self):
return self._info_proto.citation
def __getattr__(self, key):
return getattr(self.as_proto, key)

@property
def features(self):
@@ -179,10 +178,6 @@ def splits(self, split_dict):
def urls(self):
return self._info_proto.location.urls

@property
def size_in_bytes(self):
return self._info_proto.size_in_bytes

@property
def num_examples(self):
return sum(s.num_examples for s in self.splits.values())
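
The repeated pass-through properties (`name`, `description`, `citation`, `size_in_bytes`) are collapsed into a single `__getattr__` that forwards to the backing proto. Python invokes `__getattr__` only after normal attribute lookup fails, so the explicit properties that remain (`features`, `splits`, `urls`, ...) still take precedence. A sketch of the pattern, with a stand-in object instead of the generated proto:

```python
class _FakeProto(object):
  """Stand-in for dataset_info_pb2.DatasetInfo (illustrative only)."""

  def __init__(self):
    self.name = "mnist"
    self.version = "1.0.0"

class Info(object):
  def __init__(self):
    self._info_proto = _FakeProto()

  @property
  def num_examples(self):
    # Explicit attributes win; __getattr__ is never consulted here.
    return 70000

  def __getattr__(self, key):
    # Only reached for attributes not found by normal lookup.
    return getattr(self._info_proto, key)

info = Info()
assert (info.name, info.version) == ("mnist", "1.0.0")  # forwarded to proto
assert info.num_examples == 70000                       # served by property
```
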
3 changes: 3 additions & 0 deletions tensorflow_datasets/core/proto/dataset_info.proto
@@ -43,6 +43,9 @@ message DatasetInfo {
string name = 1;
string description = 2;

// Version string of the dataset (ex: '1.0.5')
string version = 9;

// A citation string if one exists for this dataset.
string citation = 3;

23 changes: 15 additions & 8 deletions tensorflow_datasets/core/proto/dataset_info_generated_pb2.py
@@ -37,7 +37,7 @@
package='tensorflow_datasets',
syntax='proto3',
serialized_options=_b('\370\001\001'),
serialized_pb=_b('\n\x12\x64\x61taset_info.proto\x12\x13tensorflow_datasets\x1a-tensorflow_metadata/proto/v0/statistics.proto\x1a)tensorflow_metadata/proto/v0/schema.proto\"\x1f\n\x0f\x44\x61tasetLocation\x12\x0c\n\x04urls\x18\x01 \x03(\t\"s\n\tSplitInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x12\n\nnum_shards\x18\x02 \x01(\x03\x12\x44\n\nstatistics\x18\x03 \x01(\x0b\x32\x30.tensorflow.metadata.v0.DatasetFeatureStatistics\"/\n\x0eSupervisedKeys\x12\r\n\x05input\x18\x01 \x01(\t\x12\x0e\n\x06output\x18\x02 \x01(\t\"\xaf\x02\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x10\n\x08\x63itation\x18\x03 \x01(\t\x12\x15\n\rsize_in_bytes\x18\x04 \x01(\x03\x12\x36\n\x08location\x18\x05 \x01(\x0b\x32$.tensorflow_datasets.DatasetLocation\x12.\n\x06schema\x18\x06 \x01(\x0b\x32\x1e.tensorflow.metadata.v0.Schema\x12.\n\x06splits\x18\x07 \x03(\x0b\x32\x1e.tensorflow_datasets.SplitInfo\x12<\n\x0fsupervised_keys\x18\x08 \x01(\x0b\x32#.tensorflow_datasets.SupervisedKeysB\x03\xf8\x01\x01\x62\x06proto3')
serialized_pb=_b('\n\x12\x64\x61taset_info.proto\x12\x13tensorflow_datasets\x1a-tensorflow_metadata/proto/v0/statistics.proto\x1a)tensorflow_metadata/proto/v0/schema.proto\"\x1f\n\x0f\x44\x61tasetLocation\x12\x0c\n\x04urls\x18\x01 \x03(\t\"s\n\tSplitInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x12\n\nnum_shards\x18\x02 \x01(\x03\x12\x44\n\nstatistics\x18\x03 \x01(\x0b\x32\x30.tensorflow.metadata.v0.DatasetFeatureStatistics\"/\n\x0eSupervisedKeys\x12\r\n\x05input\x18\x01 \x01(\t\x12\x0e\n\x06output\x18\x02 \x01(\t\"\xc0\x02\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x0f\n\x07version\x18\t \x01(\t\x12\x10\n\x08\x63itation\x18\x03 \x01(\t\x12\x15\n\rsize_in_bytes\x18\x04 \x01(\x03\x12\x36\n\x08location\x18\x05 \x01(\x0b\x32$.tensorflow_datasets.DatasetLocation\x12.\n\x06schema\x18\x06 \x01(\x0b\x32\x1e.tensorflow.metadata.v0.Schema\x12.\n\x06splits\x18\x07 \x03(\x0b\x32\x1e.tensorflow_datasets.SplitInfo\x12<\n\x0fsupervised_keys\x18\x08 \x01(\x0b\x32#.tensorflow_datasets.SupervisedKeysB\x03\xf8\x01\x01\x62\x06proto3')
,
dependencies=[tensorflow__metadata_dot_proto_dot_v0_dot_statistics__pb2.DESCRIPTOR,tensorflow__metadata_dot_proto_dot_v0_dot_schema__pb2.DESCRIPTOR,])

@@ -180,42 +180,49 @@
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='citation', full_name='tensorflow_datasets.DatasetInfo.citation', index=2,
name='version', full_name='tensorflow_datasets.DatasetInfo.version', index=2,
number=9, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='citation', full_name='tensorflow_datasets.DatasetInfo.citation', index=3,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='size_in_bytes', full_name='tensorflow_datasets.DatasetInfo.size_in_bytes', index=3,
name='size_in_bytes', full_name='tensorflow_datasets.DatasetInfo.size_in_bytes', index=4,
number=4, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='location', full_name='tensorflow_datasets.DatasetInfo.location', index=4,
name='location', full_name='tensorflow_datasets.DatasetInfo.location', index=5,
number=5, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='schema', full_name='tensorflow_datasets.DatasetInfo.schema', index=5,
name='schema', full_name='tensorflow_datasets.DatasetInfo.schema', index=6,
number=6, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='splits', full_name='tensorflow_datasets.DatasetInfo.splits', index=6,
name='splits', full_name='tensorflow_datasets.DatasetInfo.splits', index=7,
number=7, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='supervised_keys', full_name='tensorflow_datasets.DatasetInfo.supervised_keys', index=7,
name='supervised_keys', full_name='tensorflow_datasets.DatasetInfo.supervised_keys', index=8,
number=8, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
@@ -234,7 +241,7 @@
oneofs=[
],
serialized_start=333,
serialized_end=636,
serialized_end=653,
)

_SPLITINFO.fields_by_name['statistics'].message_type = tensorflow__metadata_dot_proto_dot_v0_dot_statistics__pb2._DATASETFEATURESTATISTICS
1 change: 1 addition & 0 deletions tensorflow_datasets/core/proto/install_protoc.sh
@@ -1,6 +1,7 @@
#!/bin/bash
# Install the .protoc compiler on Linux


# Make sure you grab the latest version
curl -OL https://github.com/google/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip

16 changes: 16 additions & 0 deletions tensorflow_datasets/core/utils/py_utils.py
@@ -167,3 +167,19 @@ def get_proto(self):
# Class cannot be wrapped because __doc__ is not overwritable with python2
return decorator_cls
return decorator


def str_to_version(version_str):
"""Return the tuple (major, minor, patch) version extracted from the str."""
version_ids = version_str.split(".")
if len(version_ids) != 3 or "-" in version_str:
raise ValueError(
"Could not convert the {} to version. Format should be x.y.z".format(
version_str))
try:
version_ids = tuple(int(v) for v in version_ids)
except ValueError:
raise ValueError(
"Could not convert the {} to version. Format should be x.y.z".format(
version_str))
return version_ids
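
Returning an integer tuple rather than the raw string is what makes the version comparison in `_get_data_dir` correct: tuples compare component-wise, strings character-wise. For example:

```python
assert (10, 0, 0) > (9, 0, 0)  # tuple comparison: 10 > 9, as intended
assert "10.0.0" < "9.0.0"      # string comparison: '1' < '9', wrong order
```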
16 changes: 16 additions & 0 deletions tensorflow_datasets/core/utils/py_utils_test.py
@@ -118,6 +118,22 @@ def map_fn(x):
},
})

def test_str_to_version(self):
"""Test the zip nested function."""

self.assertEqual(py_utils.str_to_version('1.3.534'), (1, 3, 534))

with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1.3.-534')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1.3')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1.3.')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1..5')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('a.b.c')


if __name__ == '__main__':
tf.test.main()
1 change: 1 addition & 0 deletions tensorflow_datasets/image/celeba.py
@@ -44,6 +44,7 @@ def _info(self):
description=("Large-scale CelebFaces Attributes, CelebA."
"Set of ~30k celebrities pictures. "
"These pictures are cropped."),
version="0.1.0",
features=tfds.features.FeaturesDict({
"image":
tfds.features.Image(
2 changes: 2 additions & 0 deletions tensorflow_datasets/image/cifar.py
@@ -58,6 +58,7 @@ def _info(self):
description=("The CIFAR-10 dataset consists of 60000 32x32 colour "
"images in 10 classes, with 6000 images per class. There "
"are 50000 training images and 10000 test images."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_CIFAR_IMAGE_SHAPE),
"label": tfds.features.ClassLabel(num_classes=10),
@@ -188,6 +189,7 @@ def _info(self):
"superclasses. Each image comes with a \"fine\" label "
"(the class to which it belongs) and a \"coarse\" label "
"(the superclass to which it belongs)."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_CIFAR_IMAGE_SHAPE),
"label": tfds.features.OneOf(choice=label_to_use, feature_dict={
@@ -36,6 +36,7 @@ def _info(self):
name=self.name,
description="A large set of high-resolution retina images taken under "
"a variety of imaging conditions.",
version="1.0.0",
features=tfds.features.FeaturesDict({
"name": tfds.features.Text(), # patient ID + eye. eg: "4_left".
"image": tfds.features.Image(),
2 changes: 2 additions & 0 deletions tensorflow_datasets/image/mnist.py
@@ -49,6 +49,7 @@ def _info(self):
description=("The MNIST database of handwritten digits, has a training "
"set of 60,000 examples, and a test set of 10,000 "
"examples."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_MNIST_IMAGE_SHAPE),
"label": tfds.features.ClassLabel(num_classes=10),
@@ -131,6 +132,7 @@ def _info(self):
"test set of 10,000 examples. Each example is a 28x28 "
"grayscale image, associated with a label from 10 "
"classes."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_MNIST_IMAGE_SHAPE),
"label": tfds.features.ClassLabel(num_classes=10),
1 change: 1 addition & 0 deletions tensorflow_datasets/video/bair_robot_pushing.py
@@ -74,6 +74,7 @@ def _info(self):
"pushing motions, including one training set (train) and "
"two test sets of previously seen (testseen) and unseen "
"(testnovel) objects.",
version="0.1.0",
features=features,
urls=["https://sites.google.com/site/brainrobotdata/home/push-dataset"],
size_in_bytes=30.0 * tfds.units.GiB,
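
Finally, each builder's `_info` now pins an explicit `version`. A sketch of the pattern — the enclosing builder class and the feature dict are illustrative, and `tfds` is assumed to be the usual `tensorflow_datasets.public_api` alias used in the builders above:

```python
from tensorflow_datasets.core import dataset_info
import tensorflow_datasets.public_api as tfds  # assumed public-API alias

def _info(self):  # method of a DatasetBuilder subclass (class elided)
  return dataset_info.DatasetInfo(
      name=self.name,
      description="A toy dataset.",
      version="1.0.0",  # bump whenever the generated files change
      features=tfds.features.FeaturesDict({
          "image": tfds.features.Image(),
      }),
  )
```

With this in place, re-running `download_and_prepare` for an already-generated `1.0.0` directory raises the `ValueError` added in `dataset_builder.py` instead of silently creating a new timestamped copy.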
