Skip to content

Commit

Permalink
Dark launch S3 (new sharding, slicing, shuffling mechanism) on all image…
Browse files Browse the repository at this point in the history
… datasets.

PiperOrigin-RevId: 253956508
  • Loading branch information
pierrot0 authored and copybara-github committed Jun 19, 2019
1 parent 4d0e144 commit 7dd97e2
Show file tree
Hide file tree
Showing 67 changed files with 600 additions and 104 deletions.
11 changes: 10 additions & 1 deletion tensorflow_datasets/image/caltech.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ class Caltech101(tfds.core.GeneratorBasedBuilder):
"""Caltech-101."""

VERSION = tfds.core.Version("1.1.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
]
# Version history:
# 2.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
names_file = tfds.core.get_tfds_path(_LABELS_FNAME)
Expand Down Expand Up @@ -129,10 +134,14 @@ def _generate_examples(self, images_dir_path, is_train_split):
for image_file in fnames_to_emit:
if image_file.endswith(".jpg"):
image_path = os.path.join(full_path, image_file)
yield {
record = {
"image": image_path,
"label": d.lower(),
"image/file_name": image_file,
}
if self.version.implements(tfds.core.Experiment.S3):
yield "%s/%s" % (d, image_file), record
else:
yield record
# Resets the seeds to their previous states.
np.random.set_state(numpy_original_state)
5 changes: 5 additions & 0 deletions tensorflow_datasets/image/caltech_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,10 @@ def setUp(self):
super(Caltech101Test, self).setUp()
caltech._TRAIN_POINTS_PER_CLASS = 1


class Caltech101S3Test(Caltech101Test):
  """Re-runs every Caltech101Test case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield key, record` (S3) branch of _generate_examples.
  VERSION = '2.0.0'


if __name__ == '__main__':
testing.test_main()
16 changes: 12 additions & 4 deletions tensorflow_datasets/image/cats_vs_dogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,13 @@ class CatsVsDogs(tfds.core.GeneratorBasedBuilder):
"""Cats vs Dogs."""

VERSION = tfds.core.Version("2.0.1")
# From 1.0.0 to 2.0.0:
# - _NUM_CORRUPT_IMAGES: 1800->1738.
# - add 'image/filename' feature.
SUPPORTED_VERSIONS = [
tfds.core.Version("3.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("2.0.1"),
]
# Version history:
# 3.0.0: S3 (new shuffling, sharding and slicing mechanism).
# 2.0.0: _NUM_CORRUPT_IMAGES: 1800->1738, add 'image/filename' feature.

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -95,11 +99,15 @@ def _generate_examples(self, archive):
if tf.compat.as_bytes("JFIF") not in fobj.peek(10):
num_skipped += 1
continue
yield {
record = {
"image": fobj,
"image/filename": fname,
"label": label,
}
if self.version.implements(tfds.core.Experiment.S3):
yield fname, record
else:
yield record

if num_skipped != _NUM_CORRUPT_IMAGES:
raise ValueError("Expected %d corrupt images, but found %d" % (
Expand Down
5 changes: 5 additions & 0 deletions tensorflow_datasets/image/cats_vs_dogs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,10 @@ class CatsVsDogsTest(testing.DatasetBuilderTestCase):
}
DL_EXTRACT_RESULT = 'cats_vs_dogs.zip'


class CatsVsDogsS3Test(CatsVsDogsTest):
  """Re-runs every CatsVsDogsTest case against the S3-enabled 3.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield key, record` (S3) branch of _generate_examples.
  VERSION = '3.0.0'


if __name__ == '__main__':
testing.test_main()
30 changes: 23 additions & 7 deletions tensorflow_datasets/image/cbis_ddsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ class CuratedBreastImagingDDSMConfig(tfds.core.BuilderConfig):
"""BuilderConfig for CuratedBreastImagingDDSM."""

def __init__(self, image_size=None, patch_size=None, **kwargs):
kwargs['supported_versions'] = [
tfds.core.Version('1.0.0', experiments={tfds.core.Experiment.S3: True}),
]
super(CuratedBreastImagingDDSMConfig, self).__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
Expand All @@ -131,20 +134,20 @@ class CuratedBreastImagingDDSM(tfds.core.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
CuratedBreastImagingDDSMConfig(
name='patches',
version='0.2.0',
version=tfds.core.Version('0.2.0'),
description=('Patches containing both calcification and mass cases, '
             'plus patches with no abnormalities. Designed as a '
'traditional 5-class classification task.'),
image_size=(1152, 896), # Note: (height, width).
patch_size=(224, 224)),
CuratedBreastImagingDDSMConfig(
name='original-calc',
version='0.1.0',
version=tfds.core.Version('0.1.0'),
description=('Original images of the calcification cases compressed '
'in lossless PNG.')),
CuratedBreastImagingDDSMConfig(
name='original-mass',
version='0.1.0',
version=tfds.core.Version('0.1.0'),
description=('Original images of the mass cases compressed in '
'lossless PNG.')),
]
Expand Down Expand Up @@ -363,7 +366,7 @@ def _include_example_in_split(example):
for _, patient_examples in sorted(patients_data.items()):
for _, example in sorted(patient_examples.items()):
if _include_example_in_split(example):
yield {
record = {
'id': example['id'],
'patient': example['patient'],
'image': example['image'],
Expand All @@ -375,6 +378,10 @@ def _include_example_in_split(example):
} for abnormality in example['abnormalities']],
# pylint: enable=g-complex-comprehension
}
if self.version.implements(tfds.core.Experiment.S3):
yield example['id'], record
else:
yield record

def _generate_examples_patches(self,
patients_data,
Expand Down Expand Up @@ -417,25 +424,34 @@ def _generate_examples_patches(self,
num_positive_patches_per_abnormality)):
patch_id = ('%s/abnorm_%s/patch_%d' %
(example['id'], abnormality['id'], k))
yield {
record = {
'id': patch_id,
# Note: TFDS needs the shape to be (?, ?, 1).
'image': np.expand_dims(patch, axis=-1),
'label': label,
}
if self.version.implements(tfds.core.Experiment.S3):
yield patch_id, record
else:
yield record

# Sample background patches from the given mammography.
for k, patch in enumerate(
_sample_negative_patches(image, example['image'],
abnormalities_masks, abnormalities_areas,
patch_size,
num_background_patches_per_image)):
yield {
'id': '%s/background_%d' % (example['id'], k),
id_ = '%s/background_%d' % (example['id'], k)
record = {
'id': id_,
# Note: TFDS needs the shape to be (?, ?, 1).
'image': np.expand_dims(patch, axis=-1),
'label': 'BACKGROUND',
}
if self.version.implements(tfds.core.Experiment.S3):
yield id_, record
else:
yield record


def _load_csv_files(manual_dir, dictionary_of_csv_files):
Expand Down
15 changes: 15 additions & 0 deletions tensorflow_datasets/image/cbis_ddsm_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ class CuratedBreastImagingDDSMOriginalCalcTest(testing.DatasetBuilderTestCase):
}


class CuratedBreastImagingDDSMOriginalCalcS3Test(
    CuratedBreastImagingDDSMOriginalCalcTest):
  """Re-runs the 'original-calc' config tests against the S3 1.0.0 version."""
  # 1.0.0 is the S3-experiment version declared in the builder config's
  # supported_versions; exercises the `yield key, record` branch.
  VERSION = '1.0.0'


class CuratedBreastImagingDDSMOriginalMassTest(testing.DatasetBuilderTestCase):
DATASET_CLASS = cbis_ddsm.CuratedBreastImagingDDSM
BUILDER_CONFIG_NAMES_TO_TEST = ['original-mass']
Expand All @@ -48,6 +53,11 @@ class CuratedBreastImagingDDSMOriginalMassTest(testing.DatasetBuilderTestCase):
}


class CuratedBreastImagingDDSMOriginalMassS3Test(
    CuratedBreastImagingDDSMOriginalMassTest):
  """Re-runs the 'original-mass' config tests against the S3 1.0.0 version."""
  # 1.0.0 is the S3-experiment version declared in the builder config's
  # supported_versions; exercises the `yield key, record` branch.
  VERSION = '1.0.0'


class CuratedBreastImagingDDSMPatchesTest(testing.DatasetBuilderTestCase):
DATASET_CLASS = cbis_ddsm.CuratedBreastImagingDDSM
BUILDER_CONFIG_NAMES_TO_TEST = ['patches']
Expand All @@ -65,5 +75,10 @@ class CuratedBreastImagingDDSMPatchesTest(testing.DatasetBuilderTestCase):
}


class CuratedBreastImagingDDSMPatchesS3Test(
    CuratedBreastImagingDDSMPatchesTest):
  """Re-runs the 'patches' config tests against the S3 1.0.0 version."""
  # 1.0.0 is the S3-experiment version declared in the builder config's
  # supported_versions; exercises the keyed-yield patch generators.
  VERSION = '1.0.0'


if __name__ == '__main__':
testing.test_main()
11 changes: 10 additions & 1 deletion tensorflow_datasets/image/celeba.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ class CelebA(tfds.core.GeneratorBasedBuilder):
"""CelebA dataset. Aligned and cropped. With metadata."""

VERSION = tfds.core.Version("0.3.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: True}),
]
# Version history:
# 1.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -195,7 +200,7 @@ def _generate_examples(self, file_id, extracted_dirs):
for file_name in sorted(files):
path = os.path.join(filedir, file_name)

yield {
record = {
"image": path,
"landmarks": {
k: v for k, v in zip(landmarks[0], landmarks[1][file_name])
Expand All @@ -205,3 +210,7 @@ def _generate_examples(self, file_id, extracted_dirs):
k: v > 0 for k, v in zip(attributes[0], attributes[1][file_name])
},
}
if self.version.implements(tfds.core.Experiment.S3):
yield file_name, record
else:
yield record
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/celeba_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,9 @@ class CelebATest(testing.DatasetBuilderTestCase):
}


class CelebAS3Test(CelebATest):
  """Re-runs every CelebATest case against the S3-enabled 1.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield file_name, record` (S3) branch.
  VERSION = "1.0.0"


if __name__ == "__main__":
testing.test_main()
9 changes: 8 additions & 1 deletion tensorflow_datasets/image/celebahq.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def __init__(self, resolution, **kwargs):
1024.
**kwargs: keyword arguments forwarded to super.
"""
kwargs["supported_versions"] = [
tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: True}),
]
super(CelebaHQConfig, self).__init__(
name="%d" % resolution,
description=("CelebaHQ images in %d x %d resolution" %
Expand Down Expand Up @@ -125,4 +128,8 @@ def _split_generators(self, dl_manager):

def _generate_examples(self, archive):
for fname, fobj in archive:
yield {"image": fobj, "image/filename": fname}
record = {"image": fobj, "image/filename": fname}
if self.version.implements(tfds.core.Experiment.S3):
yield fname, record
else:
yield record
5 changes: 5 additions & 0 deletions tensorflow_datasets/image/celebahq_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,10 @@ class CelebAHQTest(tfds_test.DatasetBuilderTestCase):
"train": 3,
}


class CelebAHQS3Test(CelebAHQTest):
  """Re-runs every CelebAHQTest case against the S3-enabled 1.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield fname, record` (S3) branch.
  VERSION = "1.0.0"


if __name__ == "__main__":
tfds_test.test_main()
10 changes: 9 additions & 1 deletion tensorflow_datasets/image/chexpert.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ class Chexpert(tfds.core.GeneratorBasedBuilder):
"""CheXpert 2019."""

VERSION = tfds.core.Version("1.0.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("1.0.0"),
]

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -134,8 +138,12 @@ def _generate_examples(self, imgs_path, csv_path):
data.append((name, labels))

for name, labels in data:
yield {
record = {
"name": name,
"image": os.path.join(imgs_path, name),
"label": labels
}
if self.version.implements(tfds.core.Experiment.S3):
yield name, record
else:
yield record
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/chexpert_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@ class ChexpertTest(testing.DatasetBuilderTestCase):
}


class ChexpertS3Test(ChexpertTest):
  """Re-runs every ChexpertTest case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield name, record` (S3) branch.
  VERSION = "2.0.0"


if __name__ == "__main__":
testing.test_main()
15 changes: 13 additions & 2 deletions tensorflow_datasets/image/clevr.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ class CLEVR(tfds.core.GeneratorBasedBuilder):
"""CLEVR dataset."""

VERSION = tfds.core.Version("1.0.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("1.0.0"),
]
# Version history:
# 2.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -112,8 +118,13 @@ def _generate_examples(self, images_dir_path, scenes_description_file):
"rotation", "pixel_coords", "3d_coords"]
for image_path, scene in zip(image_paths, scenes_json["scenes"]):
objects = scene["objects"]
yield {
fname = os.path.basename(image_path)
record = {
"image": image_path,
"file_name": os.path.basename(image_path),
"file_name": fname,
"objects": [{attr: obj[attr] for attr in attrs} for obj in objects] # pylint: disable=g-complex-comprehension
}
if self.version.implements(tfds.core.Experiment.S3):
yield fname, record
else:
yield record
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/clevr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@ class CLEVRTest(testing.DatasetBuilderTestCase):
}


class CLEVRS3Test(CLEVRTest):
  """Re-runs every CLEVRTest case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield fname, record` (S3) branch.
  VERSION = "2.0.0"


if __name__ == "__main__":
testing.test_main()
12 changes: 11 additions & 1 deletion tensorflow_datasets/image/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ class Coco2014(tfds.core.GeneratorBasedBuilder):
"""MS Coco dataset."""

VERSION = tfds.core.Version("1.0.0")
SUPPORTED_VERSIONS = [
tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
tfds.core.Version("1.0.0"),
]
# Version history:
# 2.0.0: S3 (new shuffling, sharding and slicing mechanism).

def _info(self):
return tfds.core.DatasetInfo(
Expand Down Expand Up @@ -236,7 +242,7 @@ def build_bbox(x, y, width, height):
)
# pylint: enable=cell-var-from-loop

yield {
record = {
"image": os.path.join(image_dir, split_type, image_info["file_name"]),
"image/filename": image_info["file_name"],
"objects": [{ # pylint: disable=g-complex-comprehension
Expand All @@ -245,6 +251,10 @@ def build_bbox(x, y, width, height):
"is_crowd": bool(instance_info["iscrowd"]),
} for instance_info in instances],
}
if self.version.implements(tfds.core.Experiment.S3):
yield image_info["file_name"], record
else:
yield record
logging.info(
"%d/%d images do not contains any annotations",
annotation_skipped,
Expand Down
4 changes: 4 additions & 0 deletions tensorflow_datasets/image/coco_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,9 @@ class Coco2014Test(testing.DatasetBuilderTestCase):
}


class Coco2014S3Test(Coco2014Test):
  """Re-runs every Coco2014Test case against the S3-enabled 2.0.0 version."""
  # Overriding VERSION makes the test harness build this dataset version,
  # exercising the `yield image_info["file_name"], record` (S3) branch.
  VERSION = "2.0.0"


if __name__ == "__main__":
testing.test_main()
Loading

0 comments on commit 7dd97e2

Please sign in to comment.