Use discrete versioning instead of datetime
PiperOrigin-RevId: 223138202
Conchylicultor authored and Copybara-Service committed Nov 28, 2018
1 parent 3fffe5f commit 68cafe1
Showing 13 changed files with 101 additions and 31 deletions.
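
In short: before this commit, every `download_and_prepare` run wrote into a fresh directory named from the current time (`v_%Y%m%d_%H%M`); it now writes into a directory named after the dataset's declared semantic version, and raises an error rather than overwrite an existing version. A minimal sketch of the naming change (the dataset name and root path are illustrative, not from this commit):

```python
import datetime
import os

root = os.path.join("~", "tensorflow_datasets", "mnist")

# Before: a new directory per run, named from the generation time.
old_dir = os.path.join(root, datetime.datetime.now().strftime("v_%Y%m%d_%H%M"))

# After: one directory per declared version (DatasetInfo.version),
# e.g. ~/tensorflow_datasets/mnist/1.0.0
new_dir = os.path.join(root, "1.0.0")
```
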
27 changes: 19 additions & 8 deletions tensorflow_datasets/core/dataset_builder.py
@@ -20,7 +20,6 @@
from __future__ import print_function

import abc
import datetime
import functools
import os

@@ -151,9 +150,15 @@ def download_and_prepare(
)

# Otherwise, create a new version in a new data_dir.
curr_date = datetime.datetime.now()
version_str = curr_date.strftime("v_%Y%m%d_%H%M")
data_dir = self._get_data_dir(version=version_str)
data_dir = self._get_data_dir(version=self.info.version)
if tf.gfile.Exists(data_dir):
# If generation is deterministic, the dataset could be re-generated and an
# error raised only if the generated files differ
raise ValueError(
"Trying to overwrite an existing dataset {} at {}. A dataset with "
"the same version {} already exists. If the dataset has changed, "
"please update the version number.".format(
self.name, data_dir, self.info.version))
tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

self._check_available_size(data_dir)
@@ -274,11 +279,17 @@ def _get_data_dir(self, version=None):

# Get the most recent directory
if tf.gfile.Exists(data_root_dir):
version_dirnames = [
f for f in sorted(tf.gfile.ListDirectory(data_root_dir))
if ".incomplete" not in f
]
version_dirnames = {}
for filename in tf.gfile.ListDirectory(data_root_dir):
try:
version_dirnames[filename] = utils.str_to_version(filename)
except ValueError: # Invalid version (ex: incomplete data dir)
pass
# If valid data directories were found, take the highest version
if version_dirnames:
version_dirnames = [
k for k, _ in sorted(version_dirnames.items(), key=lambda x: x[-1])
]
return os.path.join(data_root_dir, version_dirnames[-1])

# No directory found
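
The rewritten `_get_data_dir` parses each directory name into a `(major, minor, patch)` tuple and returns the path with the highest version, silently skipping names that do not parse (e.g. `.incomplete` directories). Parsing before sorting matters: plain string order would rank "10.0.0" below "9.0.0". A self-contained sketch of the same selection logic, with a local parser standing in for `utils.str_to_version`:

```python
def pick_latest_version_dir(dirnames):
  """Return the dirname with the highest x.y.z version, or None."""
  versions = {}
  for name in dirnames:
    parts = name.split(".")
    if len(parts) != 3:
      continue  # e.g. "14.0.0.invalid" or "foo.incomplete"
    try:
      versions[name] = tuple(int(p) for p in parts)
    except ValueError:  # non-numeric component
      continue
  if not versions:
    return None
  return max(versions, key=versions.get)

# String sort would pick "9.0.0"; tuple comparison picks "10.0.0".
assert pick_latest_version_dir(["9.0.0", "10.0.0", "x.incomplete"]) == "10.0.0"
```
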
14 changes: 14 additions & 0 deletions tensorflow_datasets/core/dataset_builder_test.py
@@ -100,6 +100,20 @@ def test_load(self):
self.assertEqual(20, len(data))
self.assertLess(data[0]["x"], 30)

def test_get_data_dir(self):
# Test that the dataset loads the most recent dir
with test_utils.tmp_dir(self.get_temp_dir()) as tmp_dir:
builder = DummyDatasetSharedGenerator(data_dir=tmp_dir)

# The dataset folder contains multiple versions
tf.gfile.MakeDirs(os.path.join(tmp_dir, builder.name, "14.0.0.invalid"))
tf.gfile.MakeDirs(os.path.join(tmp_dir, builder.name, "10.0.0"))
tf.gfile.MakeDirs(os.path.join(tmp_dir, builder.name, "9.0.0"))

# The highest valid version is chosen by default
most_recent_dir = os.path.join(tmp_dir, builder.name, "10.0.0")
self.assertEqual(builder._get_data_dir(), most_recent_dir)


class DatasetBuilderReadTest(tf.test.TestCase):

25 changes: 10 additions & 15 deletions tensorflow_datasets/core/dataset_info.py
@@ -42,6 +42,7 @@
from tensorflow_datasets.core import api_utils
from tensorflow_datasets.core import dataset_utils
from tensorflow_datasets.core import splits as splits_lib
from tensorflow_datasets.core import utils
from tensorflow_datasets.core.proto import dataset_info_pb2
from google.protobuf import json_format
from tensorflow_metadata.proto.v0 import schema_pb2
@@ -65,6 +66,7 @@ class DatasetInfo(object):
Properties:
name: `str`, name of this dataset.
description: `str`, description of this dataset.
version: `str`, semantic version of the dataset (ex: '1.2.0').
features: `tfds.features.FeaturesDict`: Information on the feature dict of
the `tf.data.Dataset` object from the `builder.as_dataset()` method.
splits: `SplitDict`, the available Splits for this dataset.
@@ -83,6 +85,7 @@
def __init__(self,
name=None,
description=None,
version=None,
features=None,
supervised_keys=None,
splits=None,
@@ -94,6 +97,7 @@ def __init__(self,
Args:
name: (`str`) Name of the dataset, usually set to builder.name.
description: `str`, description of this dataset.
version: `str`, semantic version of the dataset (ex: '1.2.0').
features: (`tfds.features.FeaturesDict`) Information on the feature dict
of the `tf.data.Dataset()` object from the `builder.as_dataset()`
method.
@@ -105,9 +109,13 @@ def __init__(self,
size of the dataset that we will be downloading from the internet.
citation: `str`, optional, the citation to use for this dataset.
"""
version = version or "0.0.0"
utils.str_to_version(version) # Ensure that the version is valid

self._info_proto = dataset_info_pb2.DatasetInfo(
name=name,
description=description,
version=version,
size_in_bytes=int(size_in_bytes),
citation=citation)
if urls:
@@ -133,17 +141,8 @@ def as_proto(self):

return self._info_proto

@property
def name(self):
return self._info_proto.name

@property
def description(self):
return self._info_proto.description

@property
def citation(self):
return self._info_proto.citation
def __getattr__(self, key):
return getattr(self.as_proto, key)

@property
def features(self):
@@ -179,10 +178,6 @@ def splits(self, split_dict):
def urls(self):
return self._info_proto.location.urls

@property
def size_in_bytes(self):
return self._info_proto.size_in_bytes

@property
def num_examples(self):
return sum(s.num_examples for s in self.splits.values())
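
The repeated pass-through properties (`name`, `description`, `citation`, `size_in_bytes`) are collapsed into a single `__getattr__` that forwards to the backing proto. Python invokes `__getattr__` only after normal attribute lookup fails, so the explicit properties that remain (`features`, `splits`, `urls`, ...) still take precedence. A sketch of the pattern, with a stand-in object instead of the generated proto:

```python
class _FakeProto(object):
  """Stand-in for dataset_info_pb2.DatasetInfo (illustrative only)."""

  def __init__(self):
    self.name = "mnist"
    self.version = "1.0.0"

class Info(object):
  def __init__(self):
    self._info_proto = _FakeProto()

  @property
  def num_examples(self):
    # Explicit attributes win; __getattr__ is never consulted here.
    return 70000

  def __getattr__(self, key):
    # Only reached for attributes not found by normal lookup.
    return getattr(self._info_proto, key)

info = Info()
assert (info.name, info.version) == ("mnist", "1.0.0")  # forwarded to proto
assert info.num_examples == 70000                       # served by property
```
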
3 changes: 3 additions & 0 deletions tensorflow_datasets/core/proto/dataset_info.proto
@@ -43,6 +43,9 @@ message DatasetInfo {
string name = 1;
string description = 2;

// Version string of the dataset (ex: '1.0.5')
string version = 9;

// A citation string if one exists for this dataset.
string citation = 3;

23 changes: 15 additions & 8 deletions tensorflow_datasets/core/proto/dataset_info_generated_pb2.py
@@ -37,7 +37,7 @@
package='tensorflow_datasets',
syntax='proto3',
serialized_options=_b('\370\001\001'),
serialized_pb=_b('\n\x12\x64\x61taset_info.proto\x12\x13tensorflow_datasets\x1a-tensorflow_metadata/proto/v0/statistics.proto\x1a)tensorflow_metadata/proto/v0/schema.proto\"\x1f\n\x0f\x44\x61tasetLocation\x12\x0c\n\x04urls\x18\x01 \x03(\t\"s\n\tSplitInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x12\n\nnum_shards\x18\x02 \x01(\x03\x12\x44\n\nstatistics\x18\x03 \x01(\x0b\x32\x30.tensorflow.metadata.v0.DatasetFeatureStatistics\"/\n\x0eSupervisedKeys\x12\r\n\x05input\x18\x01 \x01(\t\x12\x0e\n\x06output\x18\x02 \x01(\t\"\xaf\x02\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x10\n\x08\x63itation\x18\x03 \x01(\t\x12\x15\n\rsize_in_bytes\x18\x04 \x01(\x03\x12\x36\n\x08location\x18\x05 \x01(\x0b\x32$.tensorflow_datasets.DatasetLocation\x12.\n\x06schema\x18\x06 \x01(\x0b\x32\x1e.tensorflow.metadata.v0.Schema\x12.\n\x06splits\x18\x07 \x03(\x0b\x32\x1e.tensorflow_datasets.SplitInfo\x12<\n\x0fsupervised_keys\x18\x08 \x01(\x0b\x32#.tensorflow_datasets.SupervisedKeysB\x03\xf8\x01\x01\x62\x06proto3')
serialized_pb=_b('\n\x12\x64\x61taset_info.proto\x12\x13tensorflow_datasets\x1a-tensorflow_metadata/proto/v0/statistics.proto\x1a)tensorflow_metadata/proto/v0/schema.proto\"\x1f\n\x0f\x44\x61tasetLocation\x12\x0c\n\x04urls\x18\x01 \x03(\t\"s\n\tSplitInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x12\n\nnum_shards\x18\x02 \x01(\x03\x12\x44\n\nstatistics\x18\x03 \x01(\x0b\x32\x30.tensorflow.metadata.v0.DatasetFeatureStatistics\"/\n\x0eSupervisedKeys\x12\r\n\x05input\x18\x01 \x01(\t\x12\x0e\n\x06output\x18\x02 \x01(\t\"\xc0\x02\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x0f\n\x07version\x18\t \x01(\t\x12\x10\n\x08\x63itation\x18\x03 \x01(\t\x12\x15\n\rsize_in_bytes\x18\x04 \x01(\x03\x12\x36\n\x08location\x18\x05 \x01(\x0b\x32$.tensorflow_datasets.DatasetLocation\x12.\n\x06schema\x18\x06 \x01(\x0b\x32\x1e.tensorflow.metadata.v0.Schema\x12.\n\x06splits\x18\x07 \x03(\x0b\x32\x1e.tensorflow_datasets.SplitInfo\x12<\n\x0fsupervised_keys\x18\x08 \x01(\x0b\x32#.tensorflow_datasets.SupervisedKeysB\x03\xf8\x01\x01\x62\x06proto3')
,
dependencies=[tensorflow__metadata_dot_proto_dot_v0_dot_statistics__pb2.DESCRIPTOR,tensorflow__metadata_dot_proto_dot_v0_dot_schema__pb2.DESCRIPTOR,])

@@ -180,42 +180,49 @@
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='citation', full_name='tensorflow_datasets.DatasetInfo.citation', index=2,
name='version', full_name='tensorflow_datasets.DatasetInfo.version', index=2,
number=9, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='citation', full_name='tensorflow_datasets.DatasetInfo.citation', index=3,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='size_in_bytes', full_name='tensorflow_datasets.DatasetInfo.size_in_bytes', index=3,
name='size_in_bytes', full_name='tensorflow_datasets.DatasetInfo.size_in_bytes', index=4,
number=4, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='location', full_name='tensorflow_datasets.DatasetInfo.location', index=4,
name='location', full_name='tensorflow_datasets.DatasetInfo.location', index=5,
number=5, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='schema', full_name='tensorflow_datasets.DatasetInfo.schema', index=5,
name='schema', full_name='tensorflow_datasets.DatasetInfo.schema', index=6,
number=6, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='splits', full_name='tensorflow_datasets.DatasetInfo.splits', index=6,
name='splits', full_name='tensorflow_datasets.DatasetInfo.splits', index=7,
number=7, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='supervised_keys', full_name='tensorflow_datasets.DatasetInfo.supervised_keys', index=7,
name='supervised_keys', full_name='tensorflow_datasets.DatasetInfo.supervised_keys', index=8,
number=8, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
@@ -234,7 +241,7 @@
oneofs=[
],
serialized_start=333,
serialized_end=636,
serialized_end=653,
)

_SPLITINFO.fields_by_name['statistics'].message_type = tensorflow__metadata_dot_proto_dot_v0_dot_statistics__pb2._DATASETFEATURESTATISTICS
1 change: 1 addition & 0 deletions tensorflow_datasets/core/proto/install_protoc.sh
@@ -1,6 +1,7 @@
#!/bin/bash
# Install the .protoc compiler on Linux


# Make sure you grab the latest version
curl -OL https://github.com/google/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip

16 changes: 16 additions & 0 deletions tensorflow_datasets/core/utils/py_utils.py
@@ -167,3 +167,19 @@ def get_proto(self):
# Class cannot be wrapped because __doc__ is not overwritable with python2
return decorator_cls
return decorator


def str_to_version(version_str):
"""Return the tuple (major, minor, patch) version extracted from the str."""
version_ids = version_str.split(".")
if len(version_ids) != 3 or "-" in version_str:
raise ValueError(
"Could not convert the {} to version. Format should be x.y.z".format(
version_str))
try:
version_ids = tuple(int(v) for v in version_ids)
except ValueError:
raise ValueError(
"Could not convert the {} to version. Format should be x.y.z".format(
version_str))
return version_ids
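
Returning an integer tuple rather than the raw string is what makes the version comparison in `_get_data_dir` correct: tuples compare component-wise, strings character-wise. For example:

```python
assert (10, 0, 0) > (9, 0, 0)  # tuple comparison: 10 > 9, as intended
assert "10.0.0" < "9.0.0"      # string comparison: '1' < '9', wrong order
```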
16 changes: 16 additions & 0 deletions tensorflow_datasets/core/utils/py_utils_test.py
@@ -118,6 +118,22 @@ def map_fn(x):
},
})

def test_str_to_version(self):
"""Test the zip nested function."""

self.assertEqual(py_utils.str_to_version('1.3.534'), (1, 3, 534))

with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1.3.-534')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1.3')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1.3.')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('1..5')
with self.assertRaisesWithPredicateMatch(ValueError, 'Format should be '):
py_utils.str_to_version('a.b.c')


if __name__ == '__main__':
tf.test.main()
1 change: 1 addition & 0 deletions tensorflow_datasets/image/celeba.py
@@ -44,6 +44,7 @@ def _info(self):
description=("Large-scale CelebFaces Attributes, CelebA."
"Set of ~30k celebrities pictures. "
"These pictures are cropped."),
version="0.1.0",
features=tfds.features.FeaturesDict({
"image":
tfds.features.Image(
2 changes: 2 additions & 0 deletions tensorflow_datasets/image/cifar.py
@@ -58,6 +58,7 @@ def _info(self):
description=("The CIFAR-10 dataset consists of 60000 32x32 colour "
"images in 10 classes, with 6000 images per class. There "
"are 50000 training images and 10000 test images."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_CIFAR_IMAGE_SHAPE),
"label": tfds.features.ClassLabel(num_classes=10),
@@ -188,6 +189,7 @@ def _info(self):
"superclasses. Each image comes with a \"fine\" label "
"(the class to which it belongs) and a \"coarse\" label "
"(the superclass to which it belongs)."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_CIFAR_IMAGE_SHAPE),
"label": tfds.features.OneOf(choice=label_to_use, feature_dict={
@@ -36,6 +36,7 @@ def _info(self):
name=self.name,
description="A large set of high-resolution retina images taken under "
"a variety of imaging conditions.",
version="1.0.0",
features=tfds.features.FeaturesDict({
"name": tfds.features.Text(), # patient ID + eye. eg: "4_left".
"image": tfds.features.Image(),
2 changes: 2 additions & 0 deletions tensorflow_datasets/image/mnist.py
@@ -49,6 +49,7 @@ def _info(self):
description=("The MNIST database of handwritten digits, has a training "
"set of 60,000 examples, and a test set of 10,000 "
"examples."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_MNIST_IMAGE_SHAPE),
"label": tfds.features.ClassLabel(num_classes=10),
@@ -131,6 +132,7 @@ def _info(self):
"test set of 10,000 examples. Each example is a 28x28 "
"grayscale image, associated with a label from 10 "
"classes."),
version="1.0.0",
features=tfds.features.FeaturesDict({
"image": tfds.features.Image(shape=_MNIST_IMAGE_SHAPE),
"label": tfds.features.ClassLabel(num_classes=10),
1 change: 1 addition & 0 deletions tensorflow_datasets/video/bair_robot_pushing.py
@@ -74,6 +74,7 @@ def _info(self):
"pushing motions, including one training set (train) and "
"two test sets of previously seen (testseen) and unseen "
"(testnovel) objects.",
version="0.1.0",
features=features,
urls=["https://sites.google.com/site/brainrobotdata/home/push-dataset"],
size_in_bytes=30.0 * tfds.units.GiB,
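
Finally, each builder's `_info` now pins an explicit `version`. A sketch of the pattern — the enclosing builder class and the feature dict are illustrative, and `tfds` is assumed to be the usual `tensorflow_datasets.public_api` alias used in the builders above:

```python
from tensorflow_datasets.core import dataset_info
import tensorflow_datasets.public_api as tfds  # assumed public-API alias

def _info(self):  # method of a DatasetBuilder subclass (class elided)
  return dataset_info.DatasetInfo(
      name=self.name,
      description="A toy dataset.",
      version="1.0.0",  # bump whenever the generated files change
      features=tfds.features.FeaturesDict({
          "image": tfds.features.Image(),
      }),
  )
```

With this in place, re-running `download_and_prepare` for an already-generated `1.0.0` directory raises the `ValueError` added in `dataset_builder.py` instead of silently creating a new timestamped copy.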
