Skip to content

Add option to split data into train/test/validate sets #149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Nov 5, 2019
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 43 additions & 13 deletions label_maker/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from label_maker.utils import is_tif


def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'],
split_vals=[0.8, .2], **kwargs):
"""Generate an .npz file containing arrays for training machine learning algorithms

Parameters
Expand All @@ -28,16 +29,25 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
ml_type: str
Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
seed: int
Random generator seed. Optional, use to make results reproducable.
train_size: float
Portion of the data to use in training, the remainder is used as test data (default 0.8)
Random generator seed. Optional, use to make results reproducible.

split_vals: list
Fraction of the data to put in each category listed in split_names. Values must be floats and must sum to one.

split_names: list
List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val']

**kwargs: dict
Other properties from CLI config passed as keywords to other utility functions
"""
# if a seed is given, use it
if seed:
np.random.seed(seed)

assert len(split_names) == 2 or len(split_names) == 3.
assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
assert np.isclose(sum(split_vals), 1), "split_vals must sum to one."

# open labels file, create tile array
labels_file = op.join(dest_folder, 'labels.npz')
labels = np.load(labels_file)
Expand All @@ -60,7 +70,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
# open the images and load those plus the labels into the final arrays
o = urlparse(imagery)
_, image_format = op.splitext(o.path)
if is_tif(imagery): # if a TIF is provided, use jpg as tile format
if is_tif(imagery): # if a TIF is provided, use jpg as tile format
image_format = '.jpg'
for tile in tiles:
image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
Expand All @@ -86,16 +96,36 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
elif ml_type == 'segmentation':
y_vals.append(labels[tile][..., np.newaxis]) # Add grayscale channel

# split into train and test
split_index = int(len(x_vals) * train_size)

# convert lists to numpy arrays
x_vals = np.array(x_vals, dtype=np.uint8)
y_vals = np.array(y_vals, dtype=np.uint8)

x_vals_split_lst = np.split(x_vals,
[int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))])

if len(x_vals_split_lst[-1]) == 0:
x_vals_split_lst = x_vals_split_lst[:-1]

y_vals_split_lst = np.split(y_vals,
[int(split_vals[0] * len(y_vals)), int((split_vals[0] + split_vals[1]) * len(y_vals))])

if len(y_vals_split_lst[-1]) == 0:
y_vals_split_lst = y_vals_split_lst[:-1]

print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
np.savez(op.join(dest_folder, 'data.npz'),
x_train=x_vals[:split_index, ...],
y_train=y_vals[:split_index, ...],
x_test=x_vals[split_index:, ...],
y_test=y_vals[split_index:, ...])

if len(split_vals) == 2:
np.savez(op.join(dest_folder, 'data.npz'),
x_train=x_vals_split_lst[0],
y_train=y_vals_split_lst[0],
x_test=x_vals_split_lst[1],
y_test=y_vals_split_lst[1])

if len(split_vals) == 3:
np.savez(op.join(dest_folder, 'data.npz'),
x_train=x_vals_split_lst[0],
y_train=y_vals_split_lst[0],
x_test=x_vals_split_lst[1],
y_test=y_vals_split_lst[1],
x_val=x_vals_split_lst[2],
y_val=y_vals_split_lst[2])
4 changes: 3 additions & 1 deletion label_maker/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,7 @@
'background_ratio': {'type': 'float'},
'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True},
'seed': {'type': 'integer'},
'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}
'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2},
'split_vals': {'type': 'list', 'schema': {'type': 'float'}, 'minlength': 2, 'maxlength': 3},
'split_names': {'type': 'list', 'schema': {'type': 'string'}, 'minlength': 2, 'maxlength': 3}
}
23 changes: 23 additions & 0 deletions test/fixtures/integration/config_3way.integration.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{"country": "portugal",
"bounding_box": [
-9.4575,
38.8467,
-9.4510,
38.8513
],
"zoom": 17,
"classes": [
{ "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] },
{ "name": "Building", "filter": ["has", "building"] },
{ "name": "Farmland", "filter": ["==", "landuse", "farmland"] },
{ "name": "Ruins", "filter": ["==", "historic", "ruins"] },
{ "name": "Parking", "filter": ["==", "amenity", "parking"] },
{ "name": "Roads", "filter": ["has", "highway"] }
],
"imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN",
"background_ratio": 1,
"ml_type": "classification",
"seed": 19,
"split_names": ["train", "test", "val"],
"split_vals": [0.7, 0.2, 0.1]
}
27 changes: 27 additions & 0 deletions test/integration/test_classification_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,19 @@ class TestClassificationPackage(unittest.TestCase):
"""Tests for classification package creation"""
@classmethod
def setUpClass(cls):
    """Create the scratch directories and seed each with the fixture labels and tiles."""
    # One entry per scratch directory: (directory, labels destination, tiles destination).
    scratch_layouts = (
        ('integration-cl', 'integration-cl/labels.npz', 'integration-cl/tiles'),
        ('integration-cl-split', 'integration-cl-split/labels.npz', 'integration-cl-split/tiles'),
    )
    for workdir, labels_dest, tiles_dest in scratch_layouts:
        makedirs(workdir)
        copyfile('test/fixtures/integration/labels-cl.npz', labels_dest)
        copytree('test/fixtures/integration/tiles', tiles_dest)

@classmethod
def tearDownClass(cls):
    """Delete the scratch directories used by the tests in this class."""
    for workdir in ('integration-cl', 'integration-cl-split'):
        rmtree(workdir)

def test_cli(self):
"""Verify data.npz produced by CLI"""
Expand Down Expand Up @@ -48,3 +54,24 @@ def test_cli(self):
[0, 0, 0, 0, 0, 0, 1]]
)
self.assertTrue(np.array_equal(data['y_test'], expected_y_test))

def test_cli_3way_split(self):
    """Verify data.npz produced by CLI when split into train/test/val"""
    # Pass the command as an argument list so no string splitting is needed.
    cmd = [
        'label-maker', 'package',
        '--dest', 'integration-cl-split',
        '--config', 'test/fixtures/integration/config_3way.integration.json',
    ]
    # check=True makes a failing CLI run fail the test right here with its
    # exit status, instead of surfacing later as a missing-file error.
    subprocess.run(cmd, universal_newlines=True, check=True)

    # np.load on an .npz keeps the archive open; the context manager closes it.
    with np.load('integration-cl-split/data.npz') as data:
        # validate our image data with shapes
        self.assertEqual(data['x_train'].shape, (5, 256, 256, 3))
        self.assertEqual(data['x_test'].shape, (2, 256, 256, 3))
        self.assertEqual(data['x_val'].shape, (1, 256, 256, 3))

        # validate label data with shapes
        self.assertEqual(data['y_train'].shape, (5, 7))
        self.assertEqual(data['y_test'].shape, (2, 7))
        self.assertEqual(data['y_val'].shape, (1, 7))