Skip to content

Commit

Permalink
Add PatchCamelyon dataset.
Browse files Browse the repository at this point in the history
https://patchcamelyon.grand-challenge.org/

PiperOrigin-RevId: 253198741
  • Loading branch information
jpuigcerver authored and copybara-github committed Jun 14, 2019
1 parent 27f1e7f commit e9d6c9b
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 0 deletions.
1 change: 1 addition & 0 deletions tensorflow_datasets/image/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from tensorflow_datasets.image.open_images import OpenImagesV4
from tensorflow_datasets.image.oxford_flowers102 import OxfordFlowers102
from tensorflow_datasets.image.oxford_iiit_pet import OxfordIIITPet
from tensorflow_datasets.image.patch_camelyon import PatchCamelyon
from tensorflow_datasets.image.quickdraw import QuickdrawBitmap
from tensorflow_datasets.image.resisc45 import Resisc45
from tensorflow_datasets.image.rock_paper_scissors import RockPaperScissors
Expand Down
109 changes: 109 additions & 0 deletions tensorflow_datasets/image/patch_camelyon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# coding=utf-8
# Copyright 2019 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""PatchCamelyon images dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import h5py
import tensorflow_datasets.public_api as tfds

# Human-readable summary of the dataset; passed to DatasetInfo(description=...).
_DESCRIPTION = """\
The PatchCamelyon benchmark is a new and challenging image classification
dataset. It consists of 327.680 color images (96 x 96px) extracted from
histopathologic scans of lymph node sections. Each image is annoted with a
binary label indicating presence of metastatic tissue. PCam provides a new
benchmark for machine learning models: bigger than CIFAR10, smaller than
Imagenet, trainable on a single GPU.
"""
# BibTeX entry for the dataset; passed to DatasetInfo(citation=...).
_CITATION = """\
@misc{b_s_veeling_j_linmans_j_winkens_t_cohen_2018_2546921,
author = {B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling},
title = {Rotation Equivariant CNNs for Digital Pathology},
month = sep,
year = 2018,
doi = {10.1007/978-3-030-00934-2_24},
url = {https://doi.org/10.1007/978-3-030-00934-2_24}
}
"""
# Dataset homepage; listed in DatasetInfo(urls=...).
_URL = 'https://patchcamelyon.grand-challenge.org/'


class PatchCamelyon(tfds.core.GeneratorBasedBuilder):
  """Dataset builder for PatchCamelyon (PCam).

  Each example is a dictionary with a string `id` of the form
  `<split>_<index>`, a 96 x 96 x 3 `image` and a binary `label`.
  """

  VERSION = tfds.core.Version('0.1.0')

  def _info(self):
    """Returns the `DatasetInfo` with the feature spec and metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'id':
                tfds.features.Text(),
            'image':
                tfds.features.Image(shape=(96, 96, 3), encoding_format='png'),
            'label':
                tfds.features.ClassLabel(num_classes=2),
        }),
        supervised_keys=('image', 'label'),
        urls=[_URL],
        citation=_CITATION)

  def _split_generators(self, dl_manager):
    """Downloads the HDF5 files and declares the test/train/validation splits.

    Args:
      dl_manager: download manager used to fetch and extract the gzipped HDF5
        files.

    Returns:
      A list with one `SplitGenerator` per split (test, train, validation).
    """
    base_url = 'https://zenodo.org/record/2546921/files/'
    # Keys follow the '<split>_<x|y>' pattern that `_generate_examples` uses
    # to look up the images ('x') and labels ('y') file of a split.
    resources = {
        'test_x': base_url + 'camelyonpatch_level_2_split_test_x.h5.gz',
        'test_y': base_url + 'camelyonpatch_level_2_split_test_y.h5.gz',
        'train_x': base_url + 'camelyonpatch_level_2_split_train_x.h5.gz',
        'train_y': base_url + 'camelyonpatch_level_2_split_train_y.h5.gz',
        'valid_x': base_url + 'camelyonpatch_level_2_split_valid_x.h5.gz',
        'valid_y': base_url + 'camelyonpatch_level_2_split_valid_y.h5.gz',
    }
    paths = dl_manager.download_and_extract(resources)
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            num_shards=1,
            gen_kwargs=dict(split='test', paths=paths)),
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            num_shards=10,
            gen_kwargs=dict(split='train', paths=paths)),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            num_shards=1,
            gen_kwargs=dict(split='valid', paths=paths)),
    ]

  def _generate_examples(self, split, paths):
    """Generates images and labels given the paths to the HDF5 files.

    Args:
      split: name of the split to generate examples for (test, train, valid).
      paths: dictionary with the paths to the h5 files for each split, keyed
        by '<split>_x' (images) and '<split>_y' (labels).

    Yields:
      A dictionary with the example id, the image and the corresponding label.
    """
    # Open both files as context managers so the HDF5 handles are released
    # once the generator is exhausted or closed, instead of leaking.
    with h5py.File(paths[split + '_x'], 'r') as h5x_file, \
         h5py.File(paths[split + '_y'], 'r') as h5y_file:
      images = h5x_file['x']
      labels = h5y_file['y']  # Note: Labels are in a N x 1 x 1 x 1 tensor.
      for i, (image, label) in enumerate(zip(images, labels)):
        # Each label is a 1 x 1 x 1 slice; extract its single scalar value.
        label = label.flatten()[0]
        yield {'id': '%s_%d' % (split, i), 'image': image, 'label': label}
47 changes: 47 additions & 0 deletions tensorflow_datasets/image/patch_camelyon_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# coding=utf-8
# Copyright 2019 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for camelyon_patch.py."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow_datasets import testing
from tensorflow_datasets.image import patch_camelyon


class Caltech101Test(testing.DatasetBuilderTestCase):
  """Builder test for `patch_camelyon.PatchCamelyon` on fake HDF5 files.

  NOTE(review): the class name looks like a copy-paste leftover from another
  dataset's test; it is kept unchanged here so the test id stays stable.
  """

  DATASET_CLASS = patch_camelyon.PatchCamelyon

  # Expected number of examples per split.
  SPLITS = dict(train=5, test=4, validation=3)

  # Stand-in for the download/extract result: resource key -> fake file name.
  DL_EXTRACT_RESULT = dict(
      train_x='camelyonpatch_level_2_split_train_x.h5',
      train_y='camelyonpatch_level_2_split_train_y.h5',
      test_x='camelyonpatch_level_2_split_test_x.h5',
      test_y='camelyonpatch_level_2_split_test_y.h5',
      valid_x='camelyonpatch_level_2_split_valid_x.h5',
      valid_y='camelyonpatch_level_2_split_valid_y.h5',
  )


# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
  testing.test_main()
65 changes: 65 additions & 0 deletions tensorflow_datasets/testing/patch_camelyon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# coding=utf-8
# Copyright 2019 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Create fake data for Camelyon Patch dataset.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import google_type_annotations
from __future__ import print_function

import os

from absl import app
from absl import flags

import h5py
import numpy as np
from tensorflow_datasets.core.utils import py_utils


# Root directory under which the fake example files are written; defaults to
# the value returned by py_utils.tfds_dir().
flags.DEFINE_string('tfds_dir', py_utils.tfds_dir(),
                    'Path to tensorflow_datasets directory')
FLAGS = flags.FLAGS


def get_output_file_prefix(split):
  """Returns the path prefix shared by the fake files of `split`.

  Args:
    split: name of the split (e.g. 'train', 'test', 'valid').

  Returns:
    The path under `FLAGS.tfds_dir` up to and including
    'camelyonpatch_level_2_split_<split>'; callers append the '_x.h5' or
    '_y.h5' suffix.
  """
  fake_examples_dir = os.path.join(
      FLAGS.tfds_dir, 'testing', 'test_data', 'fake_examples',
      'patch_camelyon')
  file_prefix = 'camelyonpatch_level_2_split_%s' % split
  return os.path.join(fake_examples_dir, file_prefix)


def write_to_h5_file(filepath, dataset_name, content):
  """Writes `content` as a single dataset into a new HDF5 file.

  Args:
    filepath: path of the HDF5 file to create; opened in 'w' mode.
    dataset_name: name of the dataset to create inside the file.
    content: data to store in the dataset.
  """
  with h5py.File(filepath, 'w') as output_file:
    output_file.create_dataset(dataset_name, data=content)


def main(_):
  """Writes random fake image and label HDF5 files for every split.

  The random seed is fixed so repeated runs produce the same files.
  """
  np.random.seed(0x12345)
  examples_per_split = (('train', 5), ('test', 4), ('valid', 3))
  for split, num_examples in examples_per_split:
    # Images are drawn before labels; keep this order so the seeded output
    # stays the same.
    images = np.random.randint(
        low=0, high=256, size=(num_examples, 96, 96, 3), dtype=np.uint8)
    labels = np.random.randint(
        low=0, high=2, size=(num_examples, 1, 1, 1), dtype=np.uint32)
    prefix = get_output_file_prefix(split)
    write_to_h5_file(prefix + '_x.h5', dataset_name='x', content=images)
    write_to_h5_file(prefix + '_y.h5', dataset_name='y', content=labels)


# Script entry point: hand `main` to absl's app runner.
if __name__ == '__main__':
  app.run(main)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 6 additions & 0 deletions tensorflow_datasets/url_checksums/patch_camelyon.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_test_x.h5.gz 800875929 79174c2201ad521602a5888be8f36ee10875f37403dd3f2086caf2182ef87245
https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_test_y.h5.gz 3040 0a522005fccc8bbd04c5a117bfaf81d8da2676f03a29d7499f71d0a0bd6068ef
https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_train_x.h5.gz 6421353462 d619e741468a7ab35c7e4a75e6821b7e7e6c9411705d45708f2a0efc8960656c
https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_train_y.h5.gz 21378 b74126d2c01b20d3661f9b46765d29cf4e4fba6faba29c8e0d09d406331ab75a
https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_valid_x.h5.gz 805965320 f82ee1670d027b4ec388048d9eabc2186b77c009655dae76d624c0ecb053ccb2
https://zenodo.org/record/2546921/files/camelyonpatch_level_2_split_valid_y.h5.gz 3038 ce1ae30f08feb468447971cfd0472e7becd0ad96d877c64120c72571439ae48c

0 comments on commit e9d6c9b

Please sign in to comment.