Skip to content

Commit

Permalink
Added tests for decision tree
Browse files Browse the repository at this point in the history
  • Loading branch information
gcasadesus committed Aug 6, 2021
1 parent 0445030 commit 579f993
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 47 deletions.
171 changes: 171 additions & 0 deletions tests/test_decision_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import unittest

import numpy as np
from pycompss.api.api import compss_wait_on

import dislib as ds
import dislib.commons.rf.decision_tree as dt
import dislib.commons.rf.data as data


class DecisionTreeTest(unittest.TestCase):
    """Exercise sample selection, node splitting and fitting from ``dt``."""

    def _assert_array_equal(self, actual, expected):
        """Assert ``np.array_equal(actual, expected)`` with a useful message.

        A bare ``assertTrue(np.array_equal(...))`` only reports
        "False is not true" when it fails; this reports both operands so
        a failing run shows what was actually produced.
        """
        self.assertTrue(
            np.array_equal(actual, expected),
            msg="arrays differ: got {!r}, expected {!r}".format(
                actual, expected
            ),
        )

    def test_decision_tree(self):
        """Check sample selection, the first split, a pure child and a fit.

        The training data has three points per class: class 0 has a
        positive first and negative second coordinate, class 1 has both
        positive and class 2 has both negative.  The first split may pick
        either feature, so the expected groups are defined for both cases.
        """
        x1 = np.array(
            [
                [0.3, -0.3],
                [0.4, -0.5],
                [0.5, -0.4],
                [0.3, 0.3],
                [0.4, 0.5],
                [0.5, 0.4],
                [-0.3, -0.3],
                [-0.4, -0.5],
                [-0.5, -0.4],
            ]
        )
        x2 = np.array([[0.4, -0.3], [0.4, 0.3], [-0.4, -0.3]])
        y1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
        y2 = np.array([0, 1, 2])

        x1_ds = ds.array(x1, (3, 2))
        x2_ds = ds.array(x2, (3, 2))
        y1_ds = ds.array(y1[:, np.newaxis], (3, 1))

        data1 = data.transform_to_rf_dataset(
            x1_ds, y1_ds, "classification", features_file=True
        )

        # Model
        try_features = 2
        max_depth = np.inf
        distr_depth = 2
        sklearn_max = 1e8
        bootstrap = True
        seed = 0
        random_state = np.random.RandomState(seed)
        n_samples, n_features = x1.shape
        n_classes = np.bincount(y1).shape[0]
        # Features-by-samples view of the training data, passed straight
        # to ``dt._compute_split`` below.
        features_mmap = x1.T

        # Test bootstrap: the same call with the third argument set to
        # True and then False (presumably the bootstrap flag -- confirm
        # against ``dt._sample_selection``).
        sample1, y_s1 = compss_wait_on(
            dt._sample_selection(n_samples, y1, True, seed)
        )
        sample2, y_s2 = compss_wait_on(
            dt._sample_selection(n_samples, y1, False, seed)
        )
        self._assert_array_equal(
            sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7])
        )
        self._assert_array_equal(
            sample2, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
        )
        self._assert_array_equal(
            y_s1, np.array([0, 0, 1, 1, 1, 1, 1, 1, 2])
        )
        self._assert_array_equal(
            y_s2, np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
        )

        # Assert split wrapper: with neither file given it must raise.
        sample, y_s = sample2, y_s2
        with self.assertRaises(ValueError):
            dt._split_node_wrapper(
                sample,
                n_features,
                y_s,
                n_classes,
                try_features,
                random_state,
                samples_file=None,
                features_file=None,
            )

        split = compss_wait_on(
            dt._split_node_wrapper(
                sample,
                n_features,
                y_s,
                n_classes,
                try_features,
                random_state,
                samples_file=data1.samples_path,
                features_file=data1.features_path,
            )
        )
        node_info, left_group, y_l, right_group, y_r = split
        self.assertIn(node_info.index, (0, 1))

        # Expected outcome of the first split for each feature it may
        # choose.  One child holds a single class; ``pure_is_left`` says
        # which one, and that child is split again further down.
        if node_info.index == 0:
            expected_left = np.array([6, 7, 8])
            expected_y_l = np.array([2, 2, 2])
            expected_right = np.array([0, 1, 2, 3, 4, 5])
            expected_y_r = np.array([0, 0, 0, 1, 1, 1])
            pure_is_left = True
            expected_frequencies = np.array([0, 0, 3])
            expected_target = 2
        else:
            expected_left = np.array([0, 1, 2, 6, 7, 8])
            expected_y_l = np.array([0, 0, 0, 2, 2, 2])
            expected_right = np.array([3, 4, 5])
            expected_y_r = np.array([1, 1, 1])
            pure_is_left = False
            expected_frequencies = np.array([0, 3, 0])
            expected_target = 1

        self._assert_array_equal(left_group, expected_left)
        self._assert_array_equal(y_l, expected_y_l)
        self._assert_array_equal(right_group, expected_right)
        self._assert_array_equal(y_r, expected_y_r)
        self.assertAlmostEqual(node_info.value, 0.0)

        if pure_is_left:
            pure_group, pure_y = left_group, y_l
            expected_group, expected_y = expected_left, expected_y_l
        else:
            pure_group, pure_y = right_group, y_r
            expected_group, expected_y = expected_right, expected_y_r

        # Splitting the single-class child again: all of its samples are
        # expected back in the left group, with an empty right group and
        # a node describing that one class.
        pure_split = dt._compute_split(
            pure_group,
            n_features,
            pure_y,
            n_classes,
            try_features,
            features_mmap,
            random_state,
        )
        node_info, left_group, y_l, right_group, y_r = pure_split
        self._assert_array_equal(left_group, expected_group)
        self._assert_array_equal(y_l, expected_y)
        self._assert_array_equal(right_group, np.array([]))
        self._assert_array_equal(y_r, np.array([]))
        self._assert_array_equal(node_info.frequencies, expected_frequencies)
        self.assertEqual(node_info.size, 3)
        self.assertEqual(node_info.target, expected_target)

        # Test tree
        tree = dt.DecisionTreeClassifier(
            try_features,
            max_depth,
            distr_depth,
            sklearn_max,
            bootstrap,
            random_state,
        )
        tree.fit(data1)
        y_pred = compss_wait_on(tree.predict(x2_ds))
        self._assert_array_equal(y_pred, y2)


def main():
    """Hand control to the ``unittest`` command-line runner."""
    unittest.main()


# Run the tests only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
77 changes: 30 additions & 47 deletions tests/test_rf_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ def test_rf_dataset(self):

# Dataset creation
rf_regr = data.transform_to_rf_dataset(
x_ds_1, y_ds_1, "regression"
x_ds_1, y_ds_1, "regression", features_file=True
)
rf_class = data.transform_to_rf_dataset(
x_ds_1, y_ds_1, "classification"
x_ds_1, y_ds_1, "classification", features_file=True
)
self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900)
self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10)
Expand Down Expand Up @@ -137,34 +137,22 @@ def test_rf_dataset(self):
self.assertEqual(value, np.float64(np.inf))


def _fill_samples_file(
samples_path, row_blocks, start_idx, fortran_order
):
def _fill_samples_file(samples_path, row_blocks, start_idx, fortran_order):
rows_samples = Array._merge_blocks(row_blocks)
rows_samples = rows_samples.astype(
dtype="float32", casting="same_kind"
)
rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
samples = np.lib.format.open_memmap(
samples_path, mode="r+", fortran_order=fortran_order
)
samples[start_idx: start_idx + rows_samples.shape[0]] = (
rows_samples
)
samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples


def _fill_features_file(
samples_path, row_blocks, start_idx, fortran_order
):
rows_samples = Array._merge_blocks(row_blocks).T
rows_samples = rows_samples.astype(
dtype="float32", casting="same_kind"
)
def _fill_features_file(samples_path, row_blocks, start_idx, fortran_order):
rows_samples = Array._merge_blocks(row_blocks)
rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
samples = np.lib.format.open_memmap(
samples_path, mode="r+", fortran_order=fortran_order
)
samples[start_idx: start_idx + rows_samples.shape[1]] = (
rows_samples
)
samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T


def _fill_targets_file(targets_path, row_blocks):
Expand All @@ -177,7 +165,7 @@ def save_samples(x, samples_path, fortran_order):
n_samples = x.shape[0]
n_features = x.shape[1]

open(samples_path, 'w').close()
open(samples_path, "w").close()
np.lib.format.open_memmap(
samples_path,
mode="w+",
Expand All @@ -188,9 +176,7 @@ def save_samples(x, samples_path, fortran_order):
start_idx = 0
row_blocks_iterator = x._iterator(axis=0)
top_row = next(row_blocks_iterator)
_fill_samples_file(
samples_path, top_row._blocks, start_idx, fortran_order
)
_fill_samples_file(samples_path, top_row._blocks, start_idx, fortran_order)
start_idx += x._top_left_shape[0]
for x_row in row_blocks_iterator:
_fill_samples_file(
Expand All @@ -200,7 +186,7 @@ def save_samples(x, samples_path, fortran_order):


def save_targets(y, targets_path):
open(targets_path, 'w').close()
open(targets_path, "w").close()
for y_row in y._iterator(axis=0):
_fill_targets_file(targets_path, y_row._blocks)

Expand All @@ -209,33 +195,30 @@ def save_features(x, features_path, fortran_order):
n_samples = x.shape[0]
n_features = x.shape[1]

if features_path is not None:
np.lib.format.open_memmap(
features_path,
mode="w+",
dtype="float32",
fortran_order=fortran_order,
shape=(int(n_features), int(n_samples)),
)
start_idx = 0
col_blocks_iterator = x._iterator(axis=1)
left_col = next(col_blocks_iterator)
np.lib.format.open_memmap(
features_path,
mode="w+",
dtype="float32",
fortran_order=fortran_order,
shape=(int(n_features), int(n_samples)),
)
start_idx = 0
row_blocks_iterator = x._iterator(axis=0)
top_row = next(row_blocks_iterator)
_fill_features_file(
features_path, top_row._blocks, start_idx, fortran_order
)
start_idx += x._top_left_shape[0]
for x_row in row_blocks_iterator:
_fill_features_file(
features_path, left_col._blocks,
start_idx, fortran_order
features_path, x_row._blocks, start_idx, fortran_order
)
start_idx += x._top_left_shape[1]
for x_row in col_blocks_iterator:
_fill_features_file(
features_path, x_row._blocks,
start_idx, fortran_order
)
start_idx += x._reg_shape[1]
start_idx += x._reg_shape[0]


def main():
unittest.main()


if __name__ == '__main__':
if __name__ == "__main__":
main()

0 comments on commit 579f993

Please sign in to comment.