From 579f9935ff5b6ae85a8ee7329e9f2620995e2d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 6 Aug 2021 19:37:23 +0200 Subject: [PATCH] Added tests for decision tree --- tests/test_decision_tree.py | 171 ++++++++++++++++++++++++++++++++++++ tests/test_rf_dataset.py | 77 +++++++--------- 2 files changed, 201 insertions(+), 47 deletions(-) create mode 100644 tests/test_decision_tree.py diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py new file mode 100644 index 00000000..e935dc56 --- /dev/null +++ b/tests/test_decision_tree.py @@ -0,0 +1,171 @@ +import unittest + +import numpy as np +from pycompss.api.api import compss_wait_on + +import dislib as ds +import dislib.commons.rf.decision_tree as dt +import dislib.commons.rf.data as data + + +class DecisionTreeTest(unittest.TestCase): + def test_decision_tree(self): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + x2 = np.array([[0.4, -0.3], [0.4, 0.3], [-0.4, -0.3]]) + y1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + y2 = np.array([0, 1, 2]) + + x1_ds = ds.array(x1, (3, 2)) + x2_ds = ds.array(x2, (3, 2)) + y1_ds = ds.array(y1[:, np.newaxis], (3, 1)) + + data1 = data.transform_to_rf_dataset( + x1_ds, y1_ds, "classification", features_file=True + ) + + # Model + try_features = 2 + max_depth = np.inf + distr_depth = 2 + sklearn_max = 1e8 + bootstrap = True + seed = 0 + random_state = np.random.RandomState(seed) + n_samples, n_features = x1.shape + n_classes = np.bincount(y1).shape[0] + features_mmap = x1.T + + # Test bootstrap + sample1, y_s1 = compss_wait_on( + dt._sample_selection(n_samples, y1, True, seed) + ) + sample2, y_s2 = compss_wait_on( + dt._sample_selection(n_samples, y1, False, seed) + ) + self.assertTrue( + np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7])) + ) + self.assertTrue( + np.array_equal(sample2, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])) + ) + self.assertTrue( + np.array_equal(y_s1, np.array([0, 0, 1, 1, 1, 1, 1, 1, 2])) + ) + self.assertTrue( + np.array_equal(y_s2, np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) + ) + + # Assert split wrapper + sample, y_s = sample2, y_s2 + with self.assertRaises(ValueError): + dt._split_node_wrapper( + sample, + n_features, + y_s, + n_classes, + try_features, + random_state, + samples_file=None, + features_file=None, + ) + + split = dt._split_node_wrapper( + sample, + n_features, + y_s, + n_classes, + try_features, + random_state, + samples_file=data1.samples_path, + features_file=data1.features_path, + ) + split = compss_wait_on(split) + node_info, left_group, y_l, right_group, y_r = split + self.assertTrue(node_info.index in (0, 1)) + if node_info.index == 0: + self.assertTrue(np.array_equal(left_group, np.array([6, 7, 8]))) + self.assertTrue(np.array_equal(y_l, np.array([2, 2, 2]))) + self.assertTrue( + np.array_equal(right_group, np.array([0, 1, 2, 3, 4, 5])) + ) + self.assertTrue(np.array_equal(y_r, np.array([0, 0, 0, 1, 1, 1]))) + self.assertAlmostEqual(node_info.value, 0.0) + split_l = dt._compute_split( + left_group, + n_features, + y_l, + n_classes, + try_features, + features_mmap, + random_state, + ) + node_info, left_group, y_l, right_group, y_r = split_l + self.assertTrue(np.array_equal(left_group, np.array([6, 7, 8]))) + self.assertTrue(np.array_equal(y_l, np.array([2, 2, 2]))) + self.assertTrue(np.array_equal(right_group, np.array([]))) + self.assertTrue(np.array_equal(y_r, np.array([]))) + self.assertTrue( + np.array_equal(node_info.frequencies, np.array([0, 0, 3])) + ) + self.assertEqual(node_info.size, 3) + self.assertEqual(node_info.target, 2) + elif node_info.index == 1: + self.assertTrue( + np.array_equal(left_group, np.array([0, 1, 2, 6, 7, 8])) + ) + self.assertTrue(np.array_equal(y_l, np.array([0, 0, 0, 2, 2, 2]))) + self.assertTrue(np.array_equal(right_group, np.array([3, 4, 5]))) + self.assertTrue(np.array_equal(y_r, np.array([1, 1, 1]))) + self.assertAlmostEqual(node_info.value, 0.0) + split_r = dt._compute_split( + right_group, + n_features, + y_r, + n_classes, + try_features, + features_mmap, + random_state, + ) + node_info, left_group, y_l, right_group, y_r = split_r + self.assertTrue(np.array_equal(left_group, np.array([3, 4, 5]))) + self.assertTrue(np.array_equal(y_l, np.array([1, 1, 1]))) + self.assertTrue(np.array_equal(right_group, np.array([]))) + self.assertTrue(np.array_equal(y_r, np.array([]))) + self.assertTrue( + np.array_equal(node_info.frequencies, np.array([0, 3, 0])) + ) + self.assertEqual(node_info.size, 3) + self.assertEqual(node_info.target, 1) + + # Test tree + tree = dt.DecisionTreeClassifier( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + ) + tree.fit(data1) + y_pred = compss_wait_on(tree.predict(x2_ds)) + self.assertTrue(np.array_equal(y_pred, y2)) + + +def main(): + unittest.main() + + +if __name__ == "__main__": + main() diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py index c70664e1..86eceaf8 100644 --- a/tests/test_rf_dataset.py +++ b/tests/test_rf_dataset.py @@ -104,10 +104,10 @@ def test_rf_dataset(self): # Dataset creation rf_regr = data.transform_to_rf_dataset( - x_ds_1, y_ds_1, "regression" + x_ds_1, y_ds_1, "regression", features_file=True ) rf_class = data.transform_to_rf_dataset( - x_ds_1, y_ds_1, "classification" + x_ds_1, y_ds_1, "classification", features_file=True ) self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900) self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10) @@ -137,34 +137,22 @@ def test_rf_dataset(self): self.assertEqual(value, np.float64(np.inf)) -def _fill_samples_file( - samples_path, row_blocks, start_idx, fortran_order -): +def _fill_samples_file(samples_path, row_blocks, start_idx, fortran_order): rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype( - dtype="float32", casting="same_kind" - ) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap( samples_path, mode="r+", fortran_order=fortran_order ) - samples[start_idx: start_idx + rows_samples.shape[0]] = ( - rows_samples - ) + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples -def _fill_features_file( - samples_path, row_blocks, start_idx, fortran_order -): - rows_samples = Array._merge_blocks(row_blocks).T - rows_samples = rows_samples.astype( - dtype="float32", casting="same_kind" - ) +def _fill_features_file(samples_path, row_blocks, start_idx, fortran_order): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap( samples_path, mode="r+", fortran_order=fortran_order ) - samples[start_idx: start_idx + rows_samples.shape[1]] = ( - rows_samples - ) + samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T def _fill_targets_file(targets_path, row_blocks): @@ -177,7 +165,7 @@ def save_samples(x, samples_path, fortran_order): n_samples = x.shape[0] n_features = x.shape[1] - open(samples_path, 'w').close() + open(samples_path, "w").close() np.lib.format.open_memmap( samples_path, mode="w+", @@ -188,9 +176,7 @@ def save_samples(x, samples_path, fortran_order): start_idx = 0 row_blocks_iterator = x._iterator(axis=0) top_row = next(row_blocks_iterator) - _fill_samples_file( - samples_path, top_row._blocks, start_idx, fortran_order - ) + _fill_samples_file(samples_path, top_row._blocks, start_idx, fortran_order) start_idx += x._top_left_shape[0] for x_row in row_blocks_iterator: _fill_samples_file( @@ -200,7 +186,7 @@ def save_samples(x, samples_path, fortran_order): def save_targets(y, targets_path): - open(targets_path, 'w').close() + open(targets_path, "w").close() for y_row in y._iterator(axis=0): _fill_targets_file(targets_path, y_row._blocks) @@ -209,33 +195,30 @@ def save_features(x, features_path, fortran_order): n_samples = x.shape[0] n_features = x.shape[1] - if features_path is not None: - np.lib.format.open_memmap( - features_path, - mode="w+", - dtype="float32", - fortran_order=fortran_order, - shape=(int(n_features), int(n_samples)), - ) - start_idx = 0 - col_blocks_iterator = x._iterator(axis=1) - left_col = next(col_blocks_iterator) + np.lib.format.open_memmap( + features_path, + mode="w+", + dtype="float32", + fortran_order=fortran_order, + shape=(int(n_features), int(n_samples)), + ) + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_features_file( + features_path, top_row._blocks, start_idx, fortran_order + ) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: _fill_features_file( - features_path, left_col._blocks, - start_idx, fortran_order + features_path, x_row._blocks, start_idx, fortran_order ) - start_idx += x._top_left_shape[1] - for x_row in col_blocks_iterator: - _fill_features_file( - features_path, x_row._blocks, - start_idx, fortran_order - ) - start_idx += x._reg_shape[1] + start_idx += x._reg_shape[0] def main(): unittest.main() -if __name__ == '__main__': +if __name__ == "__main__": main()