Add PatchCamelyon dataset.

https://patchcamelyon.grand-challenge.org/ PiperOrigin-RevId: 253198741
liwohj · Jun 14, 2019 · e9d6c9b · e9d6c9b
1 parent 27f1e7f
commit e9d6c9b
Show file tree

Hide file tree

Showing 11 changed files with 228 additions and 0 deletions.
diff --git a/tensorflow_datasets/image/__init__.py b/tensorflow_datasets/image/__init__.py
@@ -50,6 +50,7 @@
 from tensorflow_datasets.image.open_images import OpenImagesV4
 from tensorflow_datasets.image.oxford_flowers102 import OxfordFlowers102
 from tensorflow_datasets.image.oxford_iiit_pet import OxfordIIITPet
+from tensorflow_datasets.image.patch_camelyon import PatchCamelyon
 from tensorflow_datasets.image.quickdraw import QuickdrawBitmap
 from tensorflow_datasets.image.resisc45 import Resisc45
 from tensorflow_datasets.image.rock_paper_scissors import RockPaperScissors

diff --git a/tensorflow_datasets/image/patch_camelyon.py b/tensorflow_datasets/image/patch_camelyon.py
@@ -0,0 +1,109 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PatchCamelyon images dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import h5py
+import tensorflow_datasets.public_api as tfds
+
+# Human-readable summary of the dataset; passed to `DatasetInfo` as
+# `description`.
+_DESCRIPTION = """\
+The PatchCamelyon benchmark is a new and challenging image classification
+dataset. It consists of 327.680 color images (96 x 96px) extracted from
+histopathologic scans of lymph node sections. Each image is annoted with a
+binary label indicating presence of metastatic tissue. PCam provides a new
+benchmark for machine learning models: bigger than CIFAR10, smaller than
+Imagenet, trainable on a single GPU.
+"""
+# BibTeX entry for the dataset; passed to `DatasetInfo` as `citation`.
+_CITATION = """\
+@misc{b_s_veeling_j_linmans_j_winkens_t_cohen_2018_2546921,
+  author       = {B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling},
+  title        = {Rotation Equivariant CNNs for Digital Pathology},
+  month        = sep,
+  year         = 2018,
+  doi          = {10.1007/978-3-030-00934-2_24},
+  url          = {https://doi.org/10.1007/978-3-030-00934-2_24}
+}
+"""
+# Dataset homepage; listed in the `urls` of `DatasetInfo`.
+_URL = 'https://patchcamelyon.grand-challenge.org/'
+
+
+class PatchCamelyon(tfds.core.GeneratorBasedBuilder):
+  """Builder for the PatchCamelyon (PCam) image classification dataset.
+
+  Every example holds a string `id`, a 96 x 96 x 3 `image` and a binary
+  `label`.
+  """
+
+  VERSION = tfds.core.Version('0.1.0')
+
+  def _info(self):
+    """Returns the `DatasetInfo` with features, supervised keys and citation."""
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            'id':
+                tfds.features.Text(),
+            'image':
+                tfds.features.Image(shape=(96, 96, 3), encoding_format='png'),
+            'label':
+                tfds.features.ClassLabel(num_classes=2),
+        }),
+        supervised_keys=('image', 'label'),
+        urls=[_URL],
+        citation=_CITATION)
+
+  def _split_generators(self, dl_manager):
+    """Downloads and extracts the h5 files and defines the dataset splits.
+
+    Args:
+      dl_manager: download manager used to fetch and extract the resources.
+
+    Returns:
+      A list with the test, train and validation `SplitGenerator`s.
+    """
+    base_url = 'https://zenodo.org/record/2546921/files/'
+    # Keys follow the '<split>_<x|y>' pattern that `_generate_examples` uses
+    # to look up the image ('x') and label ('y') file of a split.
+    resources = {
+        'test_x': base_url + 'camelyonpatch_level_2_split_test_x.h5.gz',
+        'test_y': base_url + 'camelyonpatch_level_2_split_test_y.h5.gz',
+        'train_x': base_url + 'camelyonpatch_level_2_split_train_x.h5.gz',
+        'train_y': base_url + 'camelyonpatch_level_2_split_train_y.h5.gz',
+        'valid_x': base_url + 'camelyonpatch_level_2_split_valid_x.h5.gz',
+        'valid_y': base_url + 'camelyonpatch_level_2_split_valid_y.h5.gz',
+    }
+    paths = dl_manager.download_and_extract(resources)
+    return [
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TEST,
+            num_shards=1,
+            gen_kwargs=dict(split='test', paths=paths)),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TRAIN,
+            num_shards=10,
+            gen_kwargs=dict(split='train', paths=paths)),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.VALIDATION,
+            num_shards=1,
+            gen_kwargs=dict(split='valid', paths=paths)),
+    ]
+
+  def _generate_examples(self, split, paths):
+    """Generates ids, images and labels from the h5 files of one split.
+
+    Args:
+      split: name of the split to generate examples for (test, train, valid).
+      paths: dictionary with the paths to the h5 files for each split.
+
+    Yields:
+      A dictionary with the example id, the image and the corresponding label.
+    """
+    # Open both files through context managers so the HDF5 handles are
+    # released once the generator is exhausted or closed, instead of leaking.
+    with h5py.File(paths[split + '_x'], 'r') as h5x_file:
+      with h5py.File(paths[split + '_y'], 'r') as h5y_file:
+        images = h5x_file['x']
+        labels = h5y_file['y']  # Note: Labels are in a N x 1 x 1 x 1 tensor.
+        for i, (image, label) in enumerate(zip(images, labels)):
+          # Collapse the 1 x 1 x 1 label entry to a single scalar value.
+          label = label.flatten()[0]
+          yield {'id': '%s_%d' % (split, i), 'image': image, 'label': label}
diff --git a/tensorflow_datasets/image/patch_camelyon_test.py b/tensorflow_datasets/image/patch_camelyon_test.py
@@ -0,0 +1,47 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for patch_camelyon.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.image import patch_camelyon
+
+
+# NOTE(review): the class name looks copy-pasted from the Caltech101 tests;
+# this case exercises `PatchCamelyon`. Consider renaming it to
+# `PatchCamelyonTest` once it is confirmed nothing refers to the old name.
+class Caltech101Test(testing.DatasetBuilderTestCase):
+  """Runs the shared dataset builder test case against `PatchCamelyon`."""
+
+  DATASET_CLASS = patch_camelyon.PatchCamelyon
+
+  # Presumably the number of examples expected in each split; the values
+  # match the counts written by the fake data script
+  # (testing/patch_camelyon.py).
+  SPLITS = {
+      'train': 5,
+      'test': 4,
+      'validation': 3,
+  }
+
+  # Fake h5 file names, keyed like the `resources` dictionary that the builder
+  # passes to `download_and_extract`.
+  DL_EXTRACT_RESULT = {
+      'train_x': 'camelyonpatch_level_2_split_train_x.h5',
+      'train_y': 'camelyonpatch_level_2_split_train_y.h5',
+      'test_x': 'camelyonpatch_level_2_split_test_x.h5',
+      'test_y': 'camelyonpatch_level_2_split_test_y.h5',
+      'valid_x': 'camelyonpatch_level_2_split_valid_x.h5',
+      'valid_y': 'camelyonpatch_level_2_split_valid_y.h5',
+  }
+
+
+# Run the dataset tests when this file is executed as a script.
+if __name__ == '__main__':
+  testing.test_main()
diff --git a/tensorflow_datasets/testing/patch_camelyon.py b/tensorflow_datasets/testing/patch_camelyon.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Create fake data for the PatchCamelyon dataset.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import google_type_annotations
+from __future__ import print_function
+
+import os
+
+from absl import app
+from absl import flags
+
+import h5py
+import numpy as np
+from tensorflow_datasets.core.utils import py_utils
+
+
+# Root of the tensorflow_datasets tree under which the fake examples are
+# written; defaults to the directory reported by `py_utils.tfds_dir()`.
+flags.DEFINE_string(
+    name='tfds_dir',
+    default=py_utils.tfds_dir(),
+    help='Path to tensorflow_datasets directory')
+FLAGS = flags.FLAGS
+
+
+def get_output_file_prefix(split):
+  """Returns the path prefix shared by the fake h5 files of `split`.
+
+  Args:
+    split: name of the split as used in the file names (train, test, valid).
+
+  Returns:
+    The path of the fake example files for `split`, without the trailing
+    '_x.h5' / '_y.h5' suffix.
+  """
+  fake_examples_dir = os.path.join(
+      FLAGS.tfds_dir, 'testing', 'test_data', 'fake_examples',
+      'patch_camelyon')
+  file_prefix = 'camelyonpatch_level_2_split_%s' % split
+  return os.path.join(fake_examples_dir, file_prefix)
+
+
+def write_to_h5_file(filepath, dataset_name, content):
+  """Stores `content` as the dataset `dataset_name` of a new h5 file.
+
+  Args:
+    filepath: path of the h5 file to write; it is opened in 'w' mode.
+    dataset_name: name of the dataset created inside the file.
+    content: data stored in the created dataset.
+  """
+  with h5py.File(filepath, 'w') as output_file:
+    output_file.create_dataset(dataset_name, data=content)
+
+
+def main(_):
+  """Writes fake image and label h5 files for the train, test and valid splits.
+
+  For each split, an 'x' file with random uint8 images of shape
+  (num_examples, 96, 96, 3) and a 'y' file with random 0/1 labels of shape
+  (num_examples, 1, 1, 1) are written below the `--tfds_dir` directory.
+
+  Args:
+    _: unused positional command-line arguments passed by `app.run`.
+  """
+  # Fixed seed so that regenerating the fake data gives the same random
+  # content every time.
+  np.random.seed(0x12345)
+  for split, num_examples in [('train', 5), ('test', 4), ('valid', 3)]:
+    x = np.random.randint(
+        low=0, high=256, size=(num_examples, 96, 96, 3), dtype=np.uint8)
+    y = np.random.randint(
+        low=0, high=2, size=(num_examples, 1, 1, 1), dtype=np.uint32)
+    output_prefix = get_output_file_prefix(split)
+    # h5py does not create missing parent directories, so make sure the
+    # output directory exists (e.g. the first time the script is run).
+    output_dir = os.path.dirname(output_prefix)
+    if not os.path.isdir(output_dir):
+      os.makedirs(output_dir)
+    write_to_h5_file(output_prefix + '_x.h5', dataset_name='x', content=x)
+    write_to_h5_file(output_prefix + '_y.h5', dataset_name='y', content=y)
+
+
+# Script entry point: hand `main` to absl's application runner.
+if __name__ == '__main__':
+  app.run(main)
diff --git a/...sets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_test_x.h5 b/...sets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_test_x.h5
diff --git a/...sets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_test_y.h5 b/...sets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_test_y.h5
diff --git a/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_train_x.h5 b/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_train_x.h5
diff --git a/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_train_y.h5 b/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_train_y.h5
diff --git a/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_valid_x.h5 b/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_valid_x.h5
diff --git a/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_valid_y.h5 b/...ets/testing/test_data/fake_examples/patch_camelyon/camelyonpatch_level_2_split_valid_y.h5
diff --git a/tensorflow_datasets/url_checksums/patch_camelyon.txt b/tensorflow_datasets/url_checksums/patch_camelyon.txt
@@ -0,0 +1,6 @@
+https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_test_x.h5.gz 800875929 79174c2201ad521602a5888be8f36ee10875f37403dd3f2086caf2182ef87245
+https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_test_y.h5.gz 3040 0a522005fccc8bbd04c5a117bfaf81d8da2676f03a29d7499f71d0a0bd6068ef
+https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_train_x.h5.gz 6421353462 d619e741468a7ab35c7e4a75e6821b7e7e6c9411705d45708f2a0efc8960656c
+https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_train_y.h5.gz 21378 b74126d2c01b20d3661f9b46765d29cf4e4fba6faba29c8e0d09d406331ab75a
+https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_valid_x.h5.gz 805965320 f82ee1670d027b4ec388048d9eabc2186b77c009655dae76d624c0ecb053ccb2
+https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_valid_y.h5.gz 3038 ce1ae30f08feb468447971cfd0472e7becd0ad96d877c64120c72571439ae48c