Skip to content

Commit

Permalink
Dark launch S3 (new sharding, slicing, shuffling mechanism) on all image…
Browse files Browse the repository at this point in the history
… datasets.

PiperOrigin-RevId: 253956508
  • Loading branch information
pierrot0 authored and copybara-github committed Jun 19, 2019
1 parent 4d0e144 commit 7dd97e2
Show file tree
Hide file tree
Showing 67 changed files with 600 additions and 104 deletions.
11 changes: 10 additions & 1 deletion tensorflow_datasets/image/caltech.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ class Caltech101(tfds.core.GeneratorBasedBuilder):
"""Caltech-101."""

VERSION = tfds.core.Version("1.1.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
]
# Version history:
# 2.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
names_file = tfds.core.get_tfds_path(_LABELS_FNAME)
Expand Down Expand Up @@ -129,10 +134,14 @@ def _generate_examples(self, images_dir_path, is_train_split):
for image_file in fnames_to_emit:
if image_file.endswith(".jpg"):
image_path = os.path.join(full_path, image_file)
yield {
record = {
"image": image_path,
"label": d.lower(),
"image/file_name": image_file,
}
if self.version.implements(tfds.core.Experiment.S3):
yield "%s/%s" % (d, image_file), record
else:
yield record
# Resets the seeds to their previous states.
np.random.set_state(numpy_original_state)
5 changes: 5 additions & 0 deletions tensorflow_datasets/image/caltech_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,10 @@ def setUp(self):
super(Caltech101Test, self).setUp()
caltech._TRAIN_POINTS_PER_CLASS = 1


class Caltech101S3Test(Caltech101Test):
  """Re-runs every Caltech101Test case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield key, record` (S3) branch of _generate_examples.
  VERSION = '2.0.0'


if __name__ == '__main__':
testing.test_main()
16 changes: 12 additions & 4 deletions tensorflow_datasets/image/cats_vs_dogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,13 @@ class CatsVsDogs(tfds.core.GeneratorBasedBuilder):
"""Cats vs Dogs."""

VERSION = tfds.core.Version("2.0.1")
# From 1.0.0 to 2.0.0:
# - _NUM_CORRUPT_IMAGES: 1800->1738.
# - add 'image/filename' feature.
SUPPORTED_VERSIONS = [
tfds.core.Version("3.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("2.0.1"),
]
# Version history:
# 3.0.0: S3 (new shuffling, sharding and slicing mechanism).
# 2.0.0: _NUM_CORRUPT_IMAGES: 1800->1738, add 'image/filename' feature.

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -95,11 +99,15 @@ def _generate_examples(self, archive):
if tf.compat.as_bytes("JFIF") not in fobj.peek(10):
num_skipped += 1
continue
yield {
record = {
"image": fobj,
"image/filename": fname,
"label": label,
}
if self.version.implements(tfds.core.Experiment.S3):
yield fname, record
else:
yield record

if num_skipped != _NUM_CORRUPT_IMAGES:
raise ValueError("Expected %d corrupt images, but found %d" % (
Expand Down
5 changes: 5 additions & 0 deletions tensorflow_datasets/image/cats_vs_dogs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,10 @@ class CatsVsDogsTest(testing.DatasetBuilderTestCase):
}
DL_EXTRACT_RESULT = 'cats_vs_dogs.zip'


class CatsVsDogsS3Test(CatsVsDogsTest):
  """Re-runs every CatsVsDogsTest case against the S3-enabled 3.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield key, record` (S3) branch of _generate_examples.
  VERSION = '3.0.0'


if __name__ == '__main__':
testing.test_main()
30 changes: 23 additions & 7 deletions tensorflow_datasets/image/cbis_ddsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ class CuratedBreastImagingDDSMConfig(tfds.core.BuilderConfig):
"""BuilderConfig for CuratedBreastImagingDDSM."""

def __init__(self, image_size=None, patch_size=None, **kwargs):
kwargs['supported_versions'] = [
tfds.core.Version('1.0.0', experiments={tfds.core.Experiment.S3: True}),
]
super(CuratedBreastImagingDDSMConfig, self).__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
Expand All @@ -131,20 +134,20 @@ class CuratedBreastImagingDDSM(tfds.core.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
CuratedBreastImagingDDSMConfig(
name='patches',
version='0.2.0',
version=tfds.core.Version('0.2.0'),
description=('Patches containing both calcification and mass cases, '
             'plus patches with no abnormalities. Designed as a '
'traditional 5-class classification task.'),
image_size=(1152, 896), # Note: (height, width).
patch_size=(224, 224)),
CuratedBreastImagingDDSMConfig(
name='original-calc',
version='0.1.0',
version=tfds.core.Version('0.1.0'),
description=('Original images of the calcification cases compressed '
'in lossless PNG.')),
CuratedBreastImagingDDSMConfig(
name='original-mass',
version='0.1.0',
version=tfds.core.Version('0.1.0'),
description=('Original images of the mass cases compressed in '
'lossless PNG.')),
]
Expand Down Expand Up @@ -363,7 +366,7 @@ def _include_example_in_split(example):
for _, patient_examples in sorted(patients_data.items()):
for _, example in sorted(patient_examples.items()):
if _include_example_in_split(example):
yield {
record = {
'id': example['id'],
'patient': example['patient'],
'image': example['image'],
Expand All @@ -375,6 +378,10 @@ def _include_example_in_split(example):
} for abnormality in example['abnormalities']],
# pylint: enable=g-complex-comprehension
}
if self.version.implements(tfds.core.Experiment.S3):
yield example['id'], record
else:
yield record

def _generate_examples_patches(self,
patients_data,
Expand Down Expand Up @@ -417,25 +424,34 @@ def _generate_examples_patches(self,
num_positive_patches_per_abnormality)):
patch_id = ('%s/abnorm_%s/patch_%d' %
(example['id'], abnormality['id'], k))
yield {
record = {
'id': patch_id,
# Note: TFDS needs the shape to be (?, ?, 1).
'image': np.expand_dims(patch, axis=-1),
'label': label,
}
if self.version.implements(tfds.core.Experiment.S3):
yield patch_id, record
else:
yield record

# Sample background patches from the given mammography.
for k, patch in enumerate(
_sample_negative_patches(image, example['image'],
abnormalities_masks, abnormalities_areas,
patch_size,
num_background_patches_per_image)):
yield {
'id': '%s/background_%d' % (example['id'], k),
id_ = '%s/background_%d' % (example['id'], k)
record = {
'id': id_,
# Note: TFDS needs the shape to be (?, ?, 1).
'image': np.expand_dims(patch, axis=-1),
'label': 'BACKGROUND',
}
if self.version.implements(tfds.core.Experiment.S3):
yield id_, record
else:
yield record


def _load_csv_files(manual_dir, dictionary_of_csv_files):
Expand Down
15 changes: 15 additions & 0 deletions tensorflow_datasets/image/cbis_ddsm_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ class CuratedBreastImagingDDSMOriginalCalcTest(testing.DatasetBuilderTestCase):
}


class CuratedBreastImagingDDSMOriginalCalcS3Test(
    CuratedBreastImagingDDSMOriginalCalcTest):
  """Re-runs the 'original-calc' config tests against the S3 1.0.0 version."""
  # 1.0.0 is the S3-experiment version declared in the builder config's
  # supported_versions; exercises the `yield key, record` branch.
  VERSION = '1.0.0'


class CuratedBreastImagingDDSMOriginalMassTest(testing.DatasetBuilderTestCase):
DATASET_CLASS = cbis_ddsm.CuratedBreastImagingDDSM
BUILDER_CONFIG_NAMES_TO_TEST = ['original-mass']
Expand All @@ -48,6 +53,11 @@ class CuratedBreastImagingDDSMOriginalMassTest(testing.DatasetBuilderTestCase):
}


class CuratedBreastImagingDDSMOriginalMassS3Test(
    CuratedBreastImagingDDSMOriginalMassTest):
  """Re-runs the 'original-mass' config tests against the S3 1.0.0 version."""
  # 1.0.0 is the S3-experiment version declared in the builder config's
  # supported_versions; exercises the `yield key, record` branch.
  VERSION = '1.0.0'


class CuratedBreastImagingDDSMPatchesTest(testing.DatasetBuilderTestCase):
DATASET_CLASS = cbis_ddsm.CuratedBreastImagingDDSM
BUILDER_CONFIG_NAMES_TO_TEST = ['patches']
Expand All @@ -65,5 +75,10 @@ class CuratedBreastImagingDDSMPatchesTest(testing.DatasetBuilderTestCase):
}


class CuratedBreastImagingDDSMPatchesS3Test(
    CuratedBreastImagingDDSMPatchesTest):
  """Re-runs the 'patches' config tests against the S3 1.0.0 version."""
  # 1.0.0 is the S3-experiment version declared in the builder config's
  # supported_versions; exercises the keyed-yield patch generators.
  VERSION = '1.0.0'


if __name__ == '__main__':
testing.test_main()
11 changes: 10 additions & 1 deletion tensorflow_datasets/image/celeba.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ class CelebA(tfds.core.GeneratorBasedBuilder):
"""CelebA dataset. Aligned and cropped. With metadata."""

VERSION = tfds.core.Version("0.3.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: True}),
]
# Version history:
# 1.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -195,7 +200,7 @@ def _generate_examples(self, file_id, extracted_dirs):
for file_name in sorted(files):
path = os.path.join(filedir, file_name)

yield {
record = {
"image": path,
"landmarks": {
k: v for k, v in zip(landmarks[0], landmarks[1][file_name])
Expand All @@ -205,3 +210,7 @@ def _generate_examples(self, file_id, extracted_dirs):
k: v > 0 for k, v in zip(attributes[0], attributes[1][file_name])
},
}
if self.version.implements(tfds.core.Experiment.S3):
yield file_name, record
else:
yield record
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/celeba_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,9 @@ class CelebATest(testing.DatasetBuilderTestCase):
}


class CelebAS3Test(CelebATest):
  """Re-runs every CelebATest case against the S3-enabled 1.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield file_name, record` (S3) branch.
  VERSION = "1.0.0"


if __name__ == "__main__":
testing.test_main()
9 changes: 8 additions & 1 deletion tensorflow_datasets/image/celebahq.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def __init__(self, resolution, **kwargs):
1024.
**kwargs: keyword arguments forwarded to super.
"""
kwargs["supported_versions"] = [
tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: True}),
]
super(CelebaHQConfig, self).__init__(
name="%d" % resolution,
description=("CelebaHQ images in %d x %d resolution" %
Expand Down Expand Up @@ -125,4 +128,8 @@ def _split_generators(self, dl_manager):

def _generate_examples(self, archive):
for fname, fobj in archive:
yield {"image": fobj, "image/filename": fname}
record = {"image": fobj, "image/filename": fname}
if self.version.implements(tfds.core.Experiment.S3):
yield fname, record
else:
yield record
5 changes: 5 additions & 0 deletions tensorflow_datasets/image/celebahq_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,10 @@ class CelebAHQTest(tfds_test.DatasetBuilderTestCase):
"train": 3,
}


class CelebAHQS3Test(CelebAHQTest):
  """Re-runs every CelebAHQTest case against the S3-enabled 1.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield fname, record` (S3) branch.
  VERSION = "1.0.0"


if __name__ == "__main__":
tfds_test.test_main()
10 changes: 9 additions & 1 deletion tensorflow_datasets/image/chexpert.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ class Chexpert(tfds.core.GeneratorBasedBuilder):
"""CheXpert 2019."""

VERSION = tfds.core.Version("1.0.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("1.0.0"),
]

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -134,8 +138,12 @@ def _generate_examples(self, imgs_path, csv_path):
data.append((name, labels))

for name, labels in data:
yield {
record = {
"name": name,
"image": os.path.join(imgs_path, name),
"label": labels
}
if self.version.implements(tfds.core.Experiment.S3):
yield name, record
else:
yield record
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/chexpert_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@ class ChexpertTest(testing.DatasetBuilderTestCase):
}


class ChexpertS3Test(ChexpertTest):
  """Re-runs every ChexpertTest case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield name, record` (S3) branch.
  VERSION = "2.0.0"


if __name__ == "__main__":
testing.test_main()
15 changes: 13 additions & 2 deletions tensorflow_datasets/image/clevr.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ class CLEVR(tfds.core.GeneratorBasedBuilder):
"""CLEVR dataset."""

VERSION = tfds.core.Version("1.0.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("1.0.0"),
]
# Version history:
# 2.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -112,8 +118,13 @@ def _generate_examples(self, images_dir_path, scenes_description_file):
"rotation", "pixel_coords", "3d_coords"]
for image_path, scene in zip(image_paths, scenes_json["scenes"]):
objects = scene["objects"]
yield {
fname = os.path.basename(image_path)
record = {
"image": image_path,
"file_name": os.path.basename(image_path),
"file_name": fname,
"objects": [{attr: obj[attr] for attr in attrs} for obj in objects] # pylint: disable=g-complex-comprehension
}
if self.version.implements(tfds.core.Experiment.S3):
yield fname, record
else:
yield record
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/clevr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@ class CLEVRTest(testing.DatasetBuilderTestCase):
}


class CLEVRS3Test(CLEVRTest):
  """Re-runs every CLEVRTest case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield fname, record` (S3) branch.
  VERSION = "2.0.0"


if __name__ == "__main__":
testing.test_main()
12 changes: 11 additions & 1 deletion tensorflow_datasets/image/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ class Coco2014(tfds.core.GeneratorBasedBuilder):
"""MS Coco dataset."""

VERSION = tfds.core.Version("1.0.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("1.0.0"),
]
# Version history:
# 2.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -236,7 +242,7 @@ def build_bbox(x, y, width, height):
)
# pylint: enable=cell-var-from-loop

yield {
record = {
"image": os.path.join(image_dir, split_type, image_info["file_name"]),
"image/filename": image_info["file_name"],
"objects": [{ # pylint: disable=g-complex-comprehension
Expand All @@ -245,6 +251,10 @@ def build_bbox(x, y, width, height):
"is_crowd": bool(instance_info["iscrowd"]),
} for instance_info in instances],
}
if self.version.implements(tfds.core.Experiment.S3):
yield image_info["file_name"], record
else:
yield record
logging.info(
"%d/%d images do not contains any annotations",
annotation_skipped,
Expand Down
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/coco_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,9 @@ class Coco2014Test(testing.DatasetBuilderTestCase):
}


class Coco2014S3Test(Coco2014Test):
  """Re-runs every Coco2014Test case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield image_info["file_name"], record` (S3) branch.
  VERSION = "2.0.0"


if __name__ == "__main__":
testing.test_main()
Loading

0 comments on commit 7dd97e2

Please sign in to comment.