developmentseed · martham93 · Nov 5, 2019 · Oct 24, 2019 · Oct 25, 2019 · Oct 25, 2019
diff --git a/label_maker/package.py b/label_maker/package.py
@@ -9,7 +9,8 @@
 from label_maker.utils import is_tif
 
 
-def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
+def package_directory(dest_folder, classes, imagery, ml_type, seed=False, split_names=['train', 'test'],
+                      split_vals=[0.8, .2], **kwargs):
     """Generate an .npz file containing arrays for training machine learning algorithms
 
     Parameters
@@ -28,16 +29,25 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
     ml_type: str
         Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
     seed: int
-        Random generator seed. Optional, use to make results reproducable.
-    train_size: float
-        Portion of the data to use in training, the remainder is used as test data (default 0.8)
+        Random generator seed. Optional, use to make results reproducible.
+
+    split_vals: list
+        Fraction of the data to put in each category listed in split_names. Must be floats and must sum to one.
+
+    split_names: list
+        List of names for each subset of the data, either ['train', 'test'] or ['train', 'test', 'val']
+
     **kwargs: dict
         Other properties from CLI config passed as keywords to other utility functions
     """
     # if a seed is given, use it
     if seed:
         np.random.seed(seed)
 
+    assert len(split_names) == 2 or len(split_names) == 3.
+    assert len(split_names) == len(split_vals), "split_names and split_vals must be the same length."
+    assert np.isclose(sum(split_vals), 1), "split_vals must sum to one."
+
     # open labels file, create tile array
     labels_file = op.join(dest_folder, 'labels.npz')
     labels = np.load(labels_file)
@@ -60,7 +70,7 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
     # open the images and load those plus the labels into the final arrays
     o = urlparse(imagery)
     _, image_format = op.splitext(o.path)
-    if is_tif(imagery): # if a TIF is provided, use jpg as tile format
+    if is_tif(imagery):  # if a TIF is provided, use jpg as tile format
         image_format = '.jpg'
     for tile in tiles:
         image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
@@ -86,16 +96,36 @@ def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_
         elif ml_type == 'segmentation':
             y_vals.append(labels[tile][..., np.newaxis])  # Add grayscale channel
 
-    # split into train and test
-    split_index = int(len(x_vals) * train_size)
-
     # convert lists to numpy arrays
     x_vals = np.array(x_vals, dtype=np.uint8)
     y_vals = np.array(y_vals, dtype=np.uint8)
 
+    x_vals_split_lst = np.split(x_vals,
+                                [int(split_vals[0] * len(x_vals)), int((split_vals[0] + split_vals[1]) * len(x_vals))])
+
+    if len(x_vals_split_lst[-1]) == 0:
+        x_vals_split_lst = x_vals_split_lst[:-1]
+
+    y_vals_split_lst = np.split(y_vals,
+                                [int(split_vals[0] * len(y_vals)), int((split_vals[0] + split_vals[1]) * len(y_vals))])
+
+    if len(y_vals_split_lst[-1]) == 0:
+        y_vals_split_lst = y_vals_split_lst[:-1]
+
     print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
-    np.savez(op.join(dest_folder, 'data.npz'),
-             x_train=x_vals[:split_index, ...],
-             y_train=y_vals[:split_index, ...],
-             x_test=x_vals[split_index:, ...],
-             y_test=y_vals[split_index:, ...])
+
+    if len(split_vals) == 2:
+        np.savez(op.join(dest_folder, 'data.npz'),
+                 x_train=x_vals_split_lst[0],
+                 y_train=y_vals_split_lst[0],
+                 x_test=x_vals_split_lst[1],
+                 y_test=y_vals_split_lst[1])
+
+    if len(split_vals) == 3:
+        np.savez(op.join(dest_folder, 'data.npz'),
+                 x_train=x_vals_split_lst[0],
+                 y_train=y_vals_split_lst[0],
+                 x_test=x_vals_split_lst[1],
+                 y_test=y_vals_split_lst[1],
+                 x_val=x_vals_split_lst[2],
+                 y_val=y_vals_split_lst[2])
diff --git a/label_maker/validate.py b/label_maker/validate.py
@@ -30,5 +30,7 @@
     'background_ratio': {'type': 'float'},
     'ml_type': {'allowed': ['classification', 'object-detection', 'segmentation'], 'required': True},
     'seed': {'type': 'integer'},
-    'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2}
+    'imagery_offset': {'type': 'list', 'schema': {'type': 'integer'}, 'minlength': 2, 'maxlength': 2},
+    'split_vals': {'type': 'list', 'schema': {'type': 'float'}, 'minlength': 2, 'maxlength': 3},
+    'split_names': {'type': 'list', 'schema': {'type': 'string'}, 'minlength': 2, 'maxlength': 3}
 }
diff --git a/test/fixtures/integration/config_3way.integration.json b/test/fixtures/integration/config_3way.integration.json
@@ -0,0 +1,23 @@
+{"country": "portugal",
+  "bounding_box": [
+    -9.4575,
+    38.8467,
+    -9.4510,
+    38.8513
+  ],
+  "zoom": 17,
+  "classes": [
+    { "name": "Water Tower", "filter": ["==", "man_made", "water_tower"] },
+    { "name": "Building", "filter": ["has", "building"] },
+    { "name": "Farmland", "filter": ["==", "landuse", "farmland"] },
+    { "name": "Ruins", "filter": ["==", "historic", "ruins"] },
+    { "name": "Parking", "filter": ["==", "amenity", "parking"] },
+    { "name": "Roads", "filter": ["has", "highway"] }
+  ],
+  "imagery": "https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN",
+  "background_ratio": 1,
+  "ml_type": "classification",
+  "seed": 19,
+  "split_names": ["train", "test", "val"],
+  "split_vals": [0.7, 0.2, 0.1]
+}
diff --git a/test/integration/test_classification_package.py b/test/integration/test_classification_package.py
@@ -11,13 +11,19 @@ class TestClassificationPackage(unittest.TestCase):
     """Tests for classification package creation"""
     @classmethod
     def setUpClass(cls):
+
         makedirs('integration-cl')
         copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl/labels.npz')
         copytree('test/fixtures/integration/tiles', 'integration-cl/tiles')
 
+        makedirs('integration-cl-split')
+        copyfile('test/fixtures/integration/labels-cl.npz', 'integration-cl-split/labels.npz')
+        copytree('test/fixtures/integration/tiles', 'integration-cl-split/tiles')
+
     @classmethod
     def tearDownClass(cls):
         rmtree('integration-cl')
+        rmtree('integration-cl-split')
 
     def test_cli(self):
         """Verify data.npz produced by CLI"""
@@ -48,3 +54,24 @@ def test_cli(self):
              [0, 0, 0, 0, 0, 0, 1]]
         )
         self.assertTrue(np.array_equal(data['y_test'], expected_y_test))
+
+    def test_cli_3way_split(self):
+        """Verify data.npz produced by CLI when split into train/test/val"""
+
+        cmd = 'label-maker package --dest integration-cl-split --config test/fixtures/integration/config_3way.integration.json'
+        cmd = cmd.split(' ')
+        subprocess.run(cmd, universal_newlines=True)
+
+        data = np.load('integration-cl-split/data.npz')
+
+        # validate our image data with shapes
+        self.assertEqual(data['x_train'].shape, (5, 256, 256, 3))
+        self.assertEqual(data['x_test'].shape, (2, 256, 256, 3))
+        self.assertEqual(data['x_val'].shape, (1, 256, 256, 3))
+
+        # validate label data with shapes
+        self.assertEqual(data['y_train'].shape, (5, 7))
+        self.assertEqual(data['y_test'].shape, (2, 7))
+        self.assertEqual(data['y_val'].shape, (1, 7))
+
+