Added tests for decision tree

tirkarthi · Aug 6, 2021 · 579f993 · 579f993
1 parent 0445030
commit 579f993
Show file tree

Hide file tree

Showing 2 changed files with 201 additions and 47 deletions.
diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py
@@ -0,0 +1,171 @@
+import unittest
+
+import numpy as np
+from pycompss.api.api import compss_wait_on
+
+import dislib as ds
+import dislib.commons.rf.decision_tree as dt
+import dislib.commons.rf.data as data
+
+
+class DecisionTreeTest(unittest.TestCase):
+    def test_decision_tree(self):
+        x1 = np.array(
+            [
+                [0.3, -0.3],
+                [0.4, -0.5],
+                [0.5, -0.4],
+                [0.3, 0.3],
+                [0.4, 0.5],
+                [0.5, 0.4],
+                [-0.3, -0.3],
+                [-0.4, -0.5],
+                [-0.5, -0.4],
+            ]
+        )
+        x2 = np.array([[0.4, -0.3], [0.4, 0.3], [-0.4, -0.3]])
+        y1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
+        y2 = np.array([0, 1, 2])
+
+        x1_ds = ds.array(x1, (3, 2))
+        x2_ds = ds.array(x2, (3, 2))
+        y1_ds = ds.array(y1[:, np.newaxis], (3, 1))
+
+        data1 = data.transform_to_rf_dataset(
+            x1_ds, y1_ds, "classification", features_file=True
+        )
+
+        # Tree hyperparameters and values derived from the training data
+        try_features = 2
+        max_depth = np.inf
+        distr_depth = 2
+        sklearn_max = 1e8
+        bootstrap = True
+        seed = 0
+        random_state = np.random.RandomState(seed)
+        n_samples, n_features = x1.shape
+        n_classes = np.bincount(y1).shape[0]
+        features_mmap = x1.T
+
+        # Test sample selection with and without bootstrap (fixed seed)
+        sample1, y_s1 = compss_wait_on(
+            dt._sample_selection(n_samples, y1, True, seed)
+        )
+        sample2, y_s2 = compss_wait_on(
+            dt._sample_selection(n_samples, y1, False, seed)
+        )
+        self.assertTrue(
+            np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7]))
+        )
+        self.assertTrue(
+            np.array_equal(sample2, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]))
+        )
+        self.assertTrue(
+            np.array_equal(y_s1, np.array([0, 0, 1, 1, 1, 1, 1, 1, 2]))
+        )
+        self.assertTrue(
+            np.array_equal(y_s2, np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]))
+        )
+
+        # Split wrapper must raise ValueError when no data files are given
+        sample, y_s = sample2, y_s2
+        with self.assertRaises(ValueError):
+            dt._split_node_wrapper(
+                sample,
+                n_features,
+                y_s,
+                n_classes,
+                try_features,
+                random_state,
+                samples_file=None,
+                features_file=None,
+            )
+
+        split = dt._split_node_wrapper(
+            sample,
+            n_features,
+            y_s,
+            n_classes,
+            try_features,
+            random_state,
+            samples_file=data1.samples_path,
+            features_file=data1.features_path,
+        )
+        split = compss_wait_on(split)
+        node_info, left_group, y_l, right_group, y_r = split
+        self.assertTrue(node_info.index in (0, 1))
+        if node_info.index == 0:
+            self.assertTrue(np.array_equal(left_group, np.array([6, 7, 8])))
+            self.assertTrue(np.array_equal(y_l, np.array([2, 2, 2])))
+            self.assertTrue(
+                np.array_equal(right_group, np.array([0, 1, 2, 3, 4, 5]))
+            )
+            self.assertTrue(np.array_equal(y_r, np.array([0, 0, 0, 1, 1, 1])))
+            self.assertAlmostEqual(node_info.value, 0.0)
+            split_l = dt._compute_split(
+                left_group,
+                n_features,
+                y_l,
+                n_classes,
+                try_features,
+                features_mmap,
+                random_state,
+            )
+            node_info, left_group, y_l, right_group, y_r = split_l
+            self.assertTrue(np.array_equal(left_group, np.array([6, 7, 8])))
+            self.assertTrue(np.array_equal(y_l, np.array([2, 2, 2])))
+            self.assertTrue(np.array_equal(right_group, np.array([])))
+            self.assertTrue(np.array_equal(y_r, np.array([])))
+            self.assertTrue(
+                np.array_equal(node_info.frequencies, np.array([0, 0, 3]))
+            )
+            self.assertEqual(node_info.size, 3)
+            self.assertEqual(node_info.target, 2)
+        elif node_info.index == 1:
+            self.assertTrue(
+                np.array_equal(left_group, np.array([0, 1, 2, 6, 7, 8]))
+            )
+            self.assertTrue(np.array_equal(y_l, np.array([0, 0, 0, 2, 2, 2])))
+            self.assertTrue(np.array_equal(right_group, np.array([3, 4, 5])))
+            self.assertTrue(np.array_equal(y_r, np.array([1, 1, 1])))
+            self.assertAlmostEqual(node_info.value, 0.0)
+            split_r = dt._compute_split(
+                right_group,
+                n_features,
+                y_r,
+                n_classes,
+                try_features,
+                features_mmap,
+                random_state,
+            )
+            node_info, left_group, y_l, right_group, y_r = split_r
+            self.assertTrue(np.array_equal(left_group, np.array([3, 4, 5])))
+            self.assertTrue(np.array_equal(y_l, np.array([1, 1, 1])))
+            self.assertTrue(np.array_equal(right_group, np.array([])))
+            self.assertTrue(np.array_equal(y_r, np.array([])))
+            self.assertTrue(
+                np.array_equal(node_info.frequencies, np.array([0, 3, 0]))
+            )
+            self.assertEqual(node_info.size, 3)
+            self.assertEqual(node_info.target, 1)
+
+        # Fit the full tree on data1 and check its predictions for x2
+        tree = dt.DecisionTreeClassifier(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+        )
+        tree.fit(data1)
+        y_pred = compss_wait_on(tree.predict(x2_ds))
+        self.assertTrue(np.array_equal(y_pred, y2))
+
+
+def main():
+    unittest.main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py
@@ -104,10 +104,10 @@ def test_rf_dataset(self):
 
         # Dataset creation
         rf_regr = data.transform_to_rf_dataset(
-            x_ds_1, y_ds_1, "regression"
+            x_ds_1, y_ds_1, "regression", features_file=True
         )
         rf_class = data.transform_to_rf_dataset(
-            x_ds_1, y_ds_1, "classification"
+            x_ds_1, y_ds_1, "classification", features_file=True
         )
         self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900)
         self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10)
@@ -137,34 +137,22 @@ def test_rf_dataset(self):
         self.assertEqual(value, np.float64(np.inf))
 
 
-def _fill_samples_file(
-    samples_path, row_blocks, start_idx, fortran_order
-):
+def _fill_samples_file(samples_path, row_blocks, start_idx, fortran_order):
     rows_samples = Array._merge_blocks(row_blocks)
-    rows_samples = rows_samples.astype(
-        dtype="float32", casting="same_kind"
-    )
+    rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
     samples = np.lib.format.open_memmap(
         samples_path, mode="r+", fortran_order=fortran_order
     )
-    samples[start_idx: start_idx + rows_samples.shape[0]] = (
-        rows_samples
-    )
+    samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples
 
 
-def _fill_features_file(
-    samples_path, row_blocks, start_idx, fortran_order
-):
-    rows_samples = Array._merge_blocks(row_blocks).T
-    rows_samples = rows_samples.astype(
-        dtype="float32", casting="same_kind"
-    )
+def _fill_features_file(samples_path, row_blocks, start_idx, fortran_order):
+    rows_samples = Array._merge_blocks(row_blocks)
+    rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
     samples = np.lib.format.open_memmap(
         samples_path, mode="r+", fortran_order=fortran_order
     )
-    samples[start_idx: start_idx + rows_samples.shape[1]] = (
-        rows_samples
-    )
+    samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T
 
 
 def _fill_targets_file(targets_path, row_blocks):
@@ -177,7 +165,7 @@ def save_samples(x, samples_path, fortran_order):
     n_samples = x.shape[0]
     n_features = x.shape[1]
 
-    open(samples_path, 'w').close()
+    open(samples_path, "w").close()
     np.lib.format.open_memmap(
         samples_path,
         mode="w+",
@@ -188,9 +176,7 @@ def save_samples(x, samples_path, fortran_order):
     start_idx = 0
     row_blocks_iterator = x._iterator(axis=0)
     top_row = next(row_blocks_iterator)
-    _fill_samples_file(
-        samples_path, top_row._blocks, start_idx, fortran_order
-    )
+    _fill_samples_file(samples_path, top_row._blocks, start_idx, fortran_order)
     start_idx += x._top_left_shape[0]
     for x_row in row_blocks_iterator:
         _fill_samples_file(
@@ -200,7 +186,7 @@ def save_samples(x, samples_path, fortran_order):
 
 
 def save_targets(y, targets_path):
-    open(targets_path, 'w').close()
+    open(targets_path, "w").close()
     for y_row in y._iterator(axis=0):
         _fill_targets_file(targets_path, y_row._blocks)
 
@@ -209,33 +195,30 @@ def save_features(x, features_path, fortran_order):
     n_samples = x.shape[0]
     n_features = x.shape[1]
 
-    if features_path is not None:
-        np.lib.format.open_memmap(
-            features_path,
-            mode="w+",
-            dtype="float32",
-            fortran_order=fortran_order,
-            shape=(int(n_features), int(n_samples)),
-        )
-        start_idx = 0
-        col_blocks_iterator = x._iterator(axis=1)
-        left_col = next(col_blocks_iterator)
+    np.lib.format.open_memmap(
+        features_path,
+        mode="w+",
+        dtype="float32",
+        fortran_order=fortran_order,
+        shape=(int(n_features), int(n_samples)),
+    )
+    start_idx = 0
+    row_blocks_iterator = x._iterator(axis=0)
+    top_row = next(row_blocks_iterator)
+    _fill_features_file(
+        features_path, top_row._blocks, start_idx, fortran_order
+    )
+    start_idx += x._top_left_shape[0]
+    for x_row in row_blocks_iterator:
         _fill_features_file(
-            features_path, left_col._blocks,
-            start_idx, fortran_order
+            features_path, x_row._blocks, start_idx, fortran_order
         )
-        start_idx += x._top_left_shape[1]
-        for x_row in col_blocks_iterator:
-            _fill_features_file(
-                features_path, x_row._blocks,
-                start_idx, fortran_order
-            )
-            start_idx += x._reg_shape[1]
+        start_idx += x._reg_shape[0]
 
 
 def main():
     unittest.main()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()