From d357af555f4483c0e9a1a65d16fd5e6b17caf6b4 Mon Sep 17 00:00:00 2001
From: Guillem Casadesús Vila
Date: Tue, 17 Aug 2021 18:48:08 +0200
Subject: [PATCH 1/3] Added module for trees and changed forest classes.

---
 dislib/classification/__init__.py             |   2 +-
 dislib/regression/__init__.py                 |   2 +-
 dislib/{commons/rf => trees}/__init__.py      |   0
 dislib/{commons/rf => trees}/data.py          |  27 +-
 dislib/{commons/rf => trees}/decision_tree.py |  49 ++-
 dislib/{commons/rf => trees}/forest.py        | 358 ++++++++++--------
 dislib/{commons/rf => trees}/test_split.py    |   0
 dislib/utils/saving.py                        |   5 +-
 docs/source/user-guide.rst                    |   4 +-
 tests/test_rf_classifier.py                   |  97 +++--
 tests/test_rf_dataset.py                      |  11 +-
 tests/test_rf_regressor.py                    |  30 +-
 12 files changed, 320 insertions(+), 265 deletions(-)
 rename dislib/{commons/rf => trees}/__init__.py (100%)
 rename dislib/{commons/rf => trees}/data.py (96%)
 rename dislib/{commons/rf => trees}/decision_tree.py (96%)
 rename dislib/{commons/rf => trees}/forest.py (76%)
 rename dislib/{commons/rf => trees}/test_split.py (100%)

diff --git a/dislib/classification/__init__.py b/dislib/classification/__init__.py
index 695dd571..31745ea9 100644
--- a/dislib/classification/__init__.py
+++ b/dislib/classification/__init__.py
@@ -1,4 +1,4 @@
 from dislib.classification.csvm.base import CascadeSVM
-from dislib.commons.rf.forest import RandomForestClassifier
+from dislib.trees.forest import RandomForestClassifier

 __all__ = ["CascadeSVM", "RandomForestClassifier"]
diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py
index a47cd17d..651aa23d 100644
--- a/dislib/regression/__init__.py
+++ b/dislib/regression/__init__.py
@@ -1,5 +1,5 @@
 from dislib.regression.linear.base import LinearRegression
 from dislib.regression.lasso.base import Lasso
-from dislib.commons.rf.forest import RandomForestRegressor
+from dislib.trees.forest import RandomForestRegressor

 __all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]
diff --git a/dislib/commons/rf/__init__.py b/dislib/trees/__init__.py
similarity index 100%
rename from dislib/commons/rf/__init__.py
rename to dislib/trees/__init__.py
diff --git a/dislib/commons/rf/data.py b/dislib/trees/data.py
similarity index 96%
rename from dislib/commons/rf/data.py
rename to dislib/trees/data.py
index e5155bdc..b5ca9058 100644
--- a/dislib/commons/rf/data.py
+++ b/dislib/trees/data.py
@@ -1,5 +1,4 @@
 import tempfile
-
 import numpy as np
 from numpy.lib import format
 from pycompss.api.parameter import (
     COLLECTION_IN,
     Depth,
     FILE_IN,
     Type,
 )
 from pycompss.api.task import task
-
 from dislib.data.array import Array
@@ -43,7 +41,6 @@ def get_n_samples(self):
             If self.n_samples is None and self.samples_path is not a string.
         ValueError
             If invalid content is encountered in the samples file.
-
         """
         if self.n_samples is None:
             if not isinstance(self.samples_path, str):
@@ -68,7 +65,6 @@ def get_n_features(self):
             If self.n_features is None and self.samples_path is not a string.
         ValueError
            If invalid content is encountered in the samples file.
-
         """
         if self.n_features is None:
             if not isinstance(self.samples_path, str):
@@ -88,7 +84,6 @@ def validate_features_file(self):
         ValueError
             If the shape of the array in the features_file doesn't match this
             class n_samples and n_features or if the array is in fortran
            order.
- """ features_npy_file = _NpyFile(self.features_path) shape = features_npy_file.get_shape() @@ -156,7 +151,6 @@ def get_y_targets(self): Returns ------- y_targets: ndarray - """ if self.y_targets is None: labels = _get_labels(self.targets_path) @@ -169,7 +163,6 @@ def get_classes(self): Returns ------- y_categories: ndarray - """ if self.y_categories is None: labels = _get_labels(self.targets_path) @@ -182,7 +175,6 @@ def get_n_classes(self): Returns ------- n_classes: int - """ if self.n_classes is None: labels = _get_labels(self.targets_path) @@ -238,7 +230,6 @@ def get_y_targets(self): Returns ------- y_targets: ndarray - """ if self.y_targets is None: targets = _get_values(self.targets_path) @@ -253,11 +244,14 @@ def get_classes(self): def transform_to_rf_dataset( - x: Array, y: Array, task: str, features_file=False + x: Array, + y: Array, + base_dataset: RfRegressorDataset or RfClassifierDataset, + features_file=False, ) -> RfRegressorDataset or RfClassifierDataset: """Creates a RfDataset object from samples x and targets y. - This function creates a dislib.commons.rf.data.RfDataset by saving + This function creates a `RfDataset` by saving x and y in files. Parameters @@ -323,16 +317,7 @@ def transform_to_rf_dataset( else: features_path = None - if task == "classification": - rf_dataset = RfClassifierDataset( - samples_path, targets_path, features_path - ) - elif task == "regression": - rf_dataset = RfRegressorDataset( - samples_path, targets_path, features_path - ) - else: - raise ValueError("task must be either classification or regression.") + rf_dataset = base_dataset(samples_path, targets_path, features_path) rf_dataset.n_samples = n_samples rf_dataset.n_features = n_features return rf_dataset diff --git a/dislib/commons/rf/decision_tree.py b/dislib/trees/decision_tree.py similarity index 96% rename from dislib/commons/rf/decision_tree.py rename to dislib/trees/decision_tree.py index 751983d4..5b2b6dfd 100644 --- a/dislib/commons/rf/decision_tree.py +++ b/dislib/trees/decision_tree.py @@ -8,7 +8,7 @@ from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor -from dislib.commons.rf.test_split import test_split +from dislib.trees.test_split import test_split from dislib.data.array import Array @@ -27,6 +27,8 @@ def __init__( sklearn_max, bootstrap, random_state, + base_node, + base_tree, ): self.try_features = try_features self.max_depth = max_depth @@ -34,6 +36,8 @@ def __init__( self.sklearn_max = sklearn_max self.bootstrap = bootstrap self.random_state = random_state + self.base_node = base_node + self.base_tree = base_tree self.n_features = None self.n_classes = None @@ -48,7 +52,6 @@ def fit(self, dataset): Parameters ---------- dataset : dislib.classification.rf._data.RfDataset - """ self.n_features = dataset.get_n_features() @@ -63,9 +66,8 @@ def fit(self, dataset): sample, y_s = _sample_selection( n_samples, y_targets, self.bootstrap, seed ) - Node = _ClassificationNode if self.n_classes else _RegressionNode - self.tree = Node() + self.tree = self.base_node() self.nodes_info = [] self.subtrees = [] tree_traversal = [(self.tree, sample, y_s, 0)] @@ -87,8 +89,8 @@ def fit(self, dataset): compss_delete_object(y_s) node.content = len(self.nodes_info) self.nodes_info.append(node_info) - node.left = Node() - node.right = Node() + node.left = self.base_node() + node.right = self.base_node() depth = depth + 1 tree_traversal.append((node.right, right_group, y_r, depth)) 
                tree_traversal.append((node.left, left_group, y_l, depth))
@@ -102,6 +104,8 @@ def fit(self, dataset):
                 self.try_features,
                 self.sklearn_max,
                 self.random_state,
+                self.base_node,
+                self.base_tree,
                 samples_path,
                 features_path,
             )
@@ -216,6 +220,8 @@ def __init__(
             sklearn_max,
             bootstrap,
             random_state,
+            _ClassificationNode,
+            SklearnDTClassifier,
         )

     def predict_proba(self, x_row):
@@ -234,7 +240,6 @@ def predict_proba(self, x_row):
             of the column being codes of the fitted
             dislib.classification.rf.data.RfDataset. The returned object can
             be a pycompss.runtime.Future object.
-
         """
         assert self.tree is not None, "The decision tree is not fitted."
@@ -319,6 +324,8 @@ def __init__(
             sklearn_max,
             bootstrap,
             random_state,
+            _RegressionNode,
+            SklearnDTRegressor,
         )
@@ -539,6 +546,8 @@ def _build_subtree_wrapper(
     m_try,
     sklearn_max,
     random_state,
+    base_node,
+    base_tree,
     samples_file,
     features_file,
 ):
@@ -553,6 +562,8 @@ def _build_subtree_wrapper(
             m_try,
             sklearn_max,
             seed,
+            base_node,
+            base_tree,
             samples_file,
             features_file,
         )
@@ -566,6 +577,8 @@ def _build_subtree_wrapper(
             m_try,
             sklearn_max,
             seed,
+            base_node,
+            base_tree,
             samples_file,
         )
@@ -580,6 +593,8 @@ def _build_subtree_using_features(
     m_try,
     sklearn_max,
     seed,
+    base_node,
+    base_tree,
     samples_file,
     features_file,
 ):
@@ -593,6 +608,8 @@ def _build_subtree_using_features(
         m_try,
         sklearn_max,
         random_state,
+        base_node,
+        base_tree,
         samples_file,
         features_file=features_file,
     )
@@ -608,6 +625,8 @@ def _build_subtree(
     m_try,
     sklearn_max,
     seed,
+    base_node,
+    base_tree,
     samples_file,
 ):
     random_state = RandomState(seed)
@@ -620,6 +639,8 @@ def _build_subtree(
         m_try,
         sklearn_max,
         random_state,
+        base_node,
+        base_tree,
         samples_file,
     )
@@ -633,19 +654,19 @@ def _compute_build_subtree(
     m_try,
     sklearn_max,
     random_state,
+    base_node,
+    base_tree,
     samples_file,
     features_file=None,
     use_sklearn=True,
 ):
-    Node = _ClassificationNode if n_classes else _RegressionNode
-    SklearnDT = SklearnDTClassifier if n_classes else SklearnDTRegressor
     if not sample.size:
-        return Node()
+        return base_node()
     if features_file is not None:
         mmap = np.load(features_file, mmap_mode="r", allow_pickle=False)
     else:
         mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T
-    subtree = Node()
+    subtree = base_node()
     tree_traversal = [(subtree, sample, y_s, 0)]
     while tree_traversal:
         node, sample, y_s, depth = tree_traversal.pop()
@@ -655,7 +676,7 @@ def _compute_build_subtree(
                 sklearn_max_depth = None
             else:
                 sklearn_max_depth = max_depth - depth
-            dt = SklearnDT(
+            dt = base_tree(
                 max_features=m_try,
                 max_depth=sklearn_max_depth,
                 random_state=random_state,
@@ -681,8 +702,8 @@ def _compute_build_subtree(
             node_info, left_group, y_l, right_group, y_r = split
             node.content = node_info
             if isinstance(node_info, _InnerNodeInfo):
-                node.left = Node()
-                node.right = Node()
+                node.left = base_node()
+                node.right = base_node()
                 tree_traversal.append(
                     (node.right, right_group, y_r, depth + 1)
                 )
diff --git a/dislib/commons/rf/forest.py b/dislib/trees/forest.py
similarity index 76%
rename from dislib/commons/rf/forest.py
rename to dislib/trees/forest.py
index be2e668c..0fd22274 100644
--- a/dislib/commons/rf/forest.py
+++ b/dislib/trees/forest.py
@@ -8,13 +8,14 @@
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_random_state

-from dislib.commons.rf.decision_tree import (
+from dislib.trees.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
 )
+from dislib.trees.data import RfClassifierDataset, RfRegressorDataset
 from dislib.data.array import Array
 from dislib.utils.base import _paired_partition
-from dislib.commons.rf.data import transform_to_rf_dataset
+from dislib.trees.data import transform_to_rf_dataset


 class BaseRandomForest(BaseEstimator):
@@ -33,6 +34,8 @@ def __init__(
         sklearn_max,
         hard_vote,
         random_state,
+        base_tree,
+        base_dataset,
     ):
         self.n_estimators = n_estimators
         self.try_features = try_features
@@ -41,9 +44,11 @@ def __init__(
         self.sklearn_max = sklearn_max
         self.hard_vote = hard_vote
         self.random_state = random_state
+        self.base_tree = base_tree
+        self.base_dataset = base_dataset

     def fit(self, x, y):
-        """Fits the RandomForest.
+        """Fits a RandomForest.

         Parameters
         ----------
@@ -56,21 +61,9 @@ def fit(self, x, y):
         Returns
         -------
         self : RandomForest
-
         """
-        self.classes = None
-        self.trees = []
-
-        if self.hard_vote is not None:
-            # Classification
-            task = "classification"
-            Tree = DecisionTreeClassifier
-        else:
-            # Regression
-            task = "regression"
-            Tree = DecisionTreeRegressor
-        dataset = transform_to_rf_dataset(x, y, task)
+        dataset = transform_to_rf_dataset(x, y, self.base_dataset)

         n_features = dataset.get_n_features()
         try_features = _resolve_try_features(self.try_features, n_features)
@@ -85,8 +78,9 @@ def fit(self, x, y):
         else:
             distr_depth = self.distr_depth

-        for i in range(self.n_estimators):
-            tree = Tree(
+        self.trees = []
+        for _ in range(self.n_estimators):
+            tree = self.base_tree(
                 try_features,
                 self.max_depth,
                 distr_depth,
@@ -101,128 +95,6 @@ def fit(self, x, y):

         return self

-    def predict(self, x):
-        """Predicts target classes or values using a fitted forest.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The input samples.
-
-        Returns
-        -------
-        y_pred : ds-array, shape=(n_samples, 1)
-            Predicted class labels or values for x.
-
-        """
-        assert self.trees is not None, "The random forest is not fitted."
-        pred_blocks = []
-        if self.hard_vote is not None:
-            # Classification
-            if self.hard_vote:
-                for x_row in x._iterator(axis=0):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict(x_row))
-                    pred_blocks.append(
-                        _hard_vote(self.classes, *tree_predictions)
-                    )
-            else:
-                for x_row in x._iterator(axis=0):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict_proba(x_row))
-                    pred_blocks.append(
-                        _soft_vote(self.classes, *tree_predictions)
-                    )
-        else:
-            # Regression
-            for x_row in x._iterator(axis=0):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict(x_row))
-                pred_blocks.append(_join_predictions(*tree_predictions))
-
-        y_pred = Array(
-            blocks=[pred_blocks],
-            top_left_shape=(x._top_left_shape[0], 1),
-            reg_shape=(x._reg_shape[0], 1),
-            shape=(x.shape[0], 1),
-            sparse=False,
-        )
-
-        return y_pred
-
-    def score(self, x, y, collect=False):
-        """Accuracy classification score.
-
-        For classification returns the mean accuracy on the given test data.
-
-        For regression returns the coefficient of determination $R^2$ of
-        the prediction.
-        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
-        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
-        $v$ is the total sum of squares
-        `((y_true - y_true.mean()) ** 2).sum()`.
-        The best possible score is 1.0 and it can be negative
-        if the model is arbitrarily worse.
-        A constant model that always predicts the expected value of y,
-        disregarding the input features, would get a $R^2$ score of 0.0.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The training input samples.
-        y : ds-array, shape (n_samples, 1)
-            The true labels.
-        collect : bool, optional (default=False)
-            When True, a synchronized result is returned.
-
-
-        Returns
-        -------
-        score : float (as future object)
-            Fraction of correctly classified samples for classification
-            or coefficient of determination $R^2$ for regression.
-
-        """
-        assert self.trees is not None, "The random forest is not fitted."
-        partial_scores = []
-        if self.hard_vote is not None:
-            # Classification
-            if self.hard_vote:
-                for x_row, y_row in _paired_partition(x, y):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict(x_row))
-                    subset_score = _hard_vote_score(
-                        y_row._blocks, self.classes, *tree_predictions
-                    )
-                    partial_scores.append(subset_score)
-            else:
-                for x_row, y_row in _paired_partition(x, y):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict_proba(x_row))
-                    subset_score = _soft_vote_score(
-                        y_row._blocks, self.classes, *tree_predictions
-                    )
-                    partial_scores.append(subset_score)
-            score = _merge_classification_scores(*partial_scores)
-        else:
-            # Regression
-            for x_row, y_row in _paired_partition(x, y):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict(x_row))
-                subset_score = _regression_score(
-                    y_row._blocks, *tree_predictions
-                )
-                partial_scores.append(subset_score)
-            score = _merge_regression_scores(*partial_scores)
-
-        return compss_wait_on(score) if collect else score
-

 class RandomForestClassifier(BaseRandomForest):
     """A distributed random forest classifier.
@@ -291,8 +163,49 @@ def __init__(
             sklearn_max,
             hard_vote,
             random_state,
+            base_tree=DecisionTreeClassifier,
+            base_dataset=RfClassifierDataset,
         )

+    def predict(self, x):
+        """Predicts target classes using a fitted forest.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ds-array, shape=(n_samples, 1)
+            Predicted class labels for x.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        pred_blocks = []
+        if self.hard_vote:
+            for x_row in x._iterator(axis=0):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict(x_row))
+                pred_blocks.append(_hard_vote(self.classes, *tree_predictions))
+        else:
+            for x_row in x._iterator(axis=0):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict_proba(x_row))
+                pred_blocks.append(_soft_vote(self.classes, *tree_predictions))
+
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
+        )
+
+        return y_pred
+
     def predict_proba(self, x):
         """Predicts class probabilities using a fitted forest.
@@ -311,9 +224,9 @@ def predict_proba(self, x):
             Predicted probabilities for the samples to belong to each class.
             The columns of the array correspond to the classes given at
             self.classes.
-
         """
         assert self.trees is not None, "The random forest is not fitted."
+
         prob_blocks = []
         for x_row in x._iterator(axis=0):
             tree_predictions = []
@@ -332,6 +245,53 @@ def predict_proba(self, x):
         )
         return probabilities

+    def score(self, x, y, collect=False):
+        """Accuracy classification score.
+
+        Returns the mean accuracy of the predictions on the given test data.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The training input samples.
+        y : ds-array, shape (n_samples, 1)
+            The true labels.
+        collect : bool, optional (default=False)
+            When True, a synchronized result is returned.
+
+
+        Returns
+        -------
+        score : float (as future object)
+            Fraction of correctly classified samples.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        partial_scores = []
+        if self.hard_vote:
+            for x_row, y_row in _paired_partition(x, y):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict(x_row))
+                subset_score = _hard_vote_score(
+                    y_row._blocks, self.classes, *tree_predictions
+                )
+                partial_scores.append(subset_score)
+
+        else:
+            for x_row, y_row in _paired_partition(x, y):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict_proba(x_row))
+                subset_score = _soft_vote_score(
+                    y_row._blocks, self.classes, *tree_predictions
+                )
+                partial_scores.append(subset_score)
+
+        score = _merge_classification_scores(*partial_scores)
+
+        return compss_wait_on(score) if collect else score
+

 class RandomForestRegressor(BaseRandomForest):
     """A distributed random forest regressor.
@@ -393,8 +353,100 @@ def __init__(
             sklearn_max,
             hard_vote,
             random_state,
+            base_tree=DecisionTreeRegressor,
+            base_dataset=RfRegressorDataset,
         )

+    def predict(self, x):
+        """Predicts target values using a fitted forest.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ds-array, shape=(n_samples, 1)
+            Predicted values for x.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        pred_blocks = []
+        for x_row in x._iterator(axis=0):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict(x_row))
+            pred_blocks.append(_join_predictions(*tree_predictions))
+
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
+        )
+
+        return y_pred
+
+    def score(self, x, y, collect=False):
+        """R2 regression score.
+
+        Returns the coefficient of determination $R^2$ of the prediction.
+        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
+        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
+        $v$ is the total sum of squares
+        `((y_true - y_true.mean()) ** 2).sum()`.
+        The best possible score is 1.0 and it can be negative
+        if the model is arbitrarily worse.
+        A constant model that always predicts the expected value of y,
+        disregarding the input features, would get a $R^2$ score of 0.0.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The training input samples.
+        y : ds-array, shape (n_samples, 1)
+            The true values.
+        collect : bool, optional (default=False)
+            When True, a synchronized result is returned.
+
+
+        Returns
+        -------
+        score : float (as future object)
+            Coefficient of determination $R^2$.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        partial_scores = []
+        for x_row, y_row in _paired_partition(x, y):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict(x_row))
+            subset_score = _regression_score(y_row._blocks, *tree_predictions)
+            partial_scores.append(subset_score)
+
+        score = _merge_regression_scores(*partial_scores)
+
+        return compss_wait_on(score) if collect else score
+
+
+def _base_soft_vote(classes, *predictions):
+    aggregate = predictions[0]
+    for p in predictions[1:]:
+        aggregate += p
+    predicted_labels = classes[np.argmax(aggregate, axis=1)]
+    return predicted_labels
+
+
+def _base_hard_vote(classes, *predictions):
+    mode = np.empty((len(predictions[0]),), dtype=int)
+    for sample_i, votes in enumerate(zip(*predictions)):
+        mode[sample_i] = Counter(votes).most_common(1)[0][0]
+    labels = classes[mode]
+    return labels
+

 @task(returns=1)
 def _resolve_try_features(try_features, n_features):
@@ -419,40 +471,28 @@ def _join_predictions(*predictions):

 @task(returns=1)
 def _soft_vote(classes, *predictions):
-    aggregate = predictions[0]
-    for p in predictions[1:]:
-        aggregate += p
-    labels = classes[np.argmax(aggregate, axis=1)]
-    return labels
+    predicted_labels = _base_soft_vote(classes, *predictions)
+    return predicted_labels


 @task(returns=1)
 def _hard_vote(classes, *predictions):
-    mode = np.empty((len(predictions[0]),), dtype=int)
-    for sample_i, votes in enumerate(zip(*predictions)):
-        mode[sample_i] = Counter(votes).most_common(1)[0][0]
-    labels = classes[mode]
-    return labels
+    predicted_labels = _base_hard_vote(classes, *predictions)
+    return predicted_labels


 @task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
 def _soft_vote_score(y_blocks, classes, *predictions):
+    predicted_labels = _base_soft_vote(classes, *predictions)
     real_labels = Array._merge_blocks(y_blocks).flatten()
-    aggregate = predictions[0]
-    for p in predictions[1:]:
-        aggregate += p
-    predicted_labels = classes[np.argmax(aggregate, axis=1)]
     correct = np.count_nonzero(predicted_labels == real_labels)
     return correct, len(real_labels)


 @task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
 def _hard_vote_score(y_blocks, classes, *predictions):
+    predicted_labels = _base_hard_vote(classes, *predictions)
     real_labels = Array._merge_blocks(y_blocks).flatten()
-    mode = np.empty((len(predictions[0]),), dtype=int)
-    for sample_i, votes in enumerate(zip(*predictions)):
-        mode[sample_i] = Counter(votes).most_common(1)[0][0]
-    predicted_labels = classes[mode]
     correct = np.count_nonzero(predicted_labels == real_labels)
     return correct, len(real_labels)
diff --git a/dislib/commons/rf/test_split.py b/dislib/trees/test_split.py
similarity index 100%
rename from dislib/commons/rf/test_split.py
rename to dislib/trees/test_split.py
diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index 02ecfb8a..a0f36abd 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -16,7 +16,7 @@
 import dislib.recommendation
 import dislib.regression
 from dislib.data.array import Array
-from dislib.commons.rf.decision_tree import (
+from dislib.trees.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
     _Node,
@@ -296,7 +296,8 @@ def _decode_helper(obj):
         ):
             dict_ = _decode_helper(obj["items"])
             if class_name in (
-                "DecisionTreeClassifier", "DecisionTreeRegressor"
+                "DecisionTreeClassifier",
+                "DecisionTreeRegressor",
             ):
                 model = DISLIB_CLASSES[obj["class_name"]](
                     try_features=dict_.pop("try_features"),
diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst
index 3fb02dc1..4f16e981 100644
--- a/docs/source/user-guide.rst
+++ b/docs/source/user-guide.rst
@@ -294,7 +294,7 @@ scalability of the estimator is limited by the reduction phase of the cascade.

 Random forest classifier
 ........................
-:class:`RandomForestClassifier <dislib.commons.rf.forest.RandomForestClassifier>`
+:class:`RandomForestClassifier <dislib.trees.forest.RandomForestClassifier>`
 is a classifier that uses an ensemble of decision trees and aggregates their
 predictions. The process of building each decision tree includes some
 randomization in order to make them different. The accuracy of the joint
@@ -569,7 +569,7 @@ linear equations.)

 Random forest regressor
 ........................
-:class:`RandomForestRegressor <dislib.commons.rf.forest.RandomForestRegressor>`
+:class:`RandomForestRegressor <dislib.trees.forest.RandomForestRegressor>`
 is a regressor that uses an ensemble of decision trees and aggregates their
 predictions. The process of building each decision tree includes some
 randomization in order to make them different. The accuracy of the joint
diff --git a/tests/test_rf_classifier.py b/tests/test_rf_classifier.py
index 6b4648a9..9b17712e 100644
--- a/tests/test_rf_classifier.py
+++ b/tests/test_rf_classifier.py
@@ -22,11 +22,12 @@ def test_make_classification_score(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))

         rf = RandomForestClassifier(random_state=0)

@@ -45,11 +46,12 @@ def test_make_classification_predict_and_distr_depth(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]

         rf = RandomForestClassifier(distr_depth=2, random_state=0)

@@ -69,9 +71,10 @@ def test_make_classification_fit_predict(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))

         rf = RandomForestClassifier(random_state=0)

@@ -91,11 +94,12 @@ def test_make_classification_sklearn_max_predict(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]

         rf = RandomForestClassifier(random_state=0, sklearn_max=10)

@@ -115,11 +119,12 @@ def test_make_classification_sklearn_max_predict_proba(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]

         rf = RandomForestClassifier(random_state=0, sklearn_max=10)

@@ -141,14 +146,16 @@ def test_make_classification_hard_vote_predict(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]

-        rf = RandomForestClassifier(random_state=0, sklearn_max=10,
-                                    hard_vote=True)
+        rf = RandomForestClassifier(
+            random_state=0, sklearn_max=10, hard_vote=True
+        )

         rf.fit(x_train, y_train)
         y_pred = rf.predict(x_test).collect()

@@ -167,15 +174,20 @@ def test_make_classification_hard_vote_score_mix(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
-
-        rf = RandomForestClassifier(random_state=0, sklearn_max=100,
-                                    distr_depth=2, max_depth=12,
-                                    hard_vote=True)
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))
+
+        rf = RandomForestClassifier(
+            random_state=0,
+            sklearn_max=100,
+            distr_depth=2,
+            max_depth=12,
+            hard_vote=True,
+        )

         rf.fit(x_train, y_train)
         accuracy = compss_wait_on(rf.score(x_test, y_test))

@@ -190,8 +202,9 @@ def test_score_on_iris(self, collect):
         ds_validate = ds.array(x[1::2], block_size=(30, 2))
         validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1))

-        rf = RandomForestClassifier(n_estimators=1, max_depth=1,
-                                    random_state=0)
+        rf = RandomForestClassifier(
+            n_estimators=1, max_depth=1, random_state=0
+        )
         rf.fit(ds_fit, fit_y)
         accuracy = rf.score(ds_validate, validate_y, collect)
         if not collect:
@@ -205,5 +218,5 @@ def main():
     unittest.main()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py
index de55fc76..a08532a0 100644
--- a/tests/test_rf_dataset.py
+++ b/tests/test_rf_dataset.py
@@ -4,8 +4,7 @@
 import shutil
 from sklearn.datasets import make_classification
 import dislib as ds
-from dislib.commons.rf import data
-from dislib.commons.rf import test_split
+from dislib.trees import data, test_split
 from dislib.data.array import Array
 import numpy as np
 from sys import float_info
@@ -83,10 +82,6 @@ def test_rf_dataset(self):
         with self.assertRaises(TypeError):
             rf_dataset.get_n_features()

-        # Task must be classification or regression
-        with self.assertRaises(ValueError):
-            rf_dataset = data.transform_to_rf_dataset(x_ds_1, y_ds_1, "aaa")
-
         # Validate dimension
         rf_dataset = data.RfBaseDataset(
             samples_path_1, targets_path_1, features_path_f
         )
@@ -104,10 +99,10 @@ def test_rf_dataset(self):
         # Dataset creation
         rf_regr = data.transform_to_rf_dataset(
-            x_ds_1, y_ds_1, "regression", features_file=True
+            x_ds_1, y_ds_1, data.RfRegressorDataset, features_file=True
         )
         rf_class = data.transform_to_rf_dataset(
-            x_ds_1, y_ds_1, "classification", features_file=True
+            x_ds_1, y_ds_1, data.RfClassifierDataset, features_file=True
         )
         self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900)
         self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10)
diff --git a/tests/test_rf_regressor.py b/tests/test_rf_regressor.py
index 36da50f7..83c40ba4 100644
--- a/tests/test_rf_regressor.py
+++ b/tests/test_rf_regressor.py
@@ -24,10 +24,10 @@ def test_make_regression(self):
             shuffle=True,
             random_state=0,
         )
-        x_train = ds.array(x[: len(x) // 2], (300, 10))
-        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))

         rf = RandomForestRegressor(random_state=0)

@@ -35,7 +35,7 @@ def test_make_regression(self):
         accuracy1 = compss_wait_on(rf.score(x_test, y_test))

         y_pred = rf.predict(x_test).collect()
-        y_true = y[len(y) // 2:]
+        y_true = y[1::2]
         accuracy2 = _determination_coefficient(y_true, y_pred)

         self.assertGreater(accuracy1, 0.85)
@@ -51,10 +51,10 @@ def test_make_regression_predict_and_distr_depth(self):
             shuffle=True,
             random_state=0,
         )
-        x_train = ds.array(x[: len(x) // 2], (300, 10))
-        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))

         rf = RandomForestRegressor(distr_depth=2, random_state=0)

@@ -62,7 +62,7 @@ def test_make_regression_predict_and_distr_depth(self):
         accuracy1 = compss_wait_on(rf.score(x_test, y_test))

         y_pred = rf.predict(x_test).collect()
-        y_true = y[len(y) // 2:]
+        y_true = y[1::2]
         accuracy2 = _determination_coefficient(y_true, y_pred)

         self.assertGreater(accuracy1, 0.85)
@@ -78,10 +78,10 @@ def test_make_regression_sklearn_max_predict(self):
             shuffle=True,
             random_state=0,
         )
-        x_train = ds.array(x[: len(x) // 2], (300, 10))
-        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))

         rf = RandomForestRegressor(random_state=0, sklearn_max=10)

@@ -89,7 +89,7 @@ def test_make_regression_sklearn_max_predict(self):
         accuracy1 = compss_wait_on(rf.score(x_test, y_test))

         y_pred = rf.predict(x_test).collect()
-        y_true = y[len(y) // 2:]
+        y_true = y[1::2]
         accuracy2 = _determination_coefficient(y_true, y_pred)

         self.assertGreater(accuracy1, 0.85)

From 197ea853dfdd8eccb62600eab3153ea79d76408f Mon Sep 17 00:00:00 2001
From: Guillem Casadesús Vila
Date: Wed, 18 Aug 2021 10:18:24 +0200
Subject: [PATCH 2/3] Changed imports.
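
This patch only touches import paths: the test modules now import from the
dislib.trees package introduced in the previous patch. For reference, a
minimal sketch of the calling convention after this series, assuming the
modules from these diffs are installed (the make_classification arguments
and block sizes below are illustrative only, not part of the patch):

    import numpy as np
    from sklearn.datasets import make_classification

    import dislib as ds
    import dislib.trees.data as data

    x, y = make_classification(n_samples=30, n_features=10, random_state=0)
    x_ds = ds.array(x, (10, 10))
    y_ds = ds.array(y[:, np.newaxis], (10, 1))

    # Before this series, the task was selected with a string:
    #     data.transform_to_rf_dataset(x_ds, y_ds, "classification")
    # Now the caller passes the dataset class itself:
    rf_dataset = data.transform_to_rf_dataset(
        x_ds, y_ds, data.RfClassifierDataset, features_file=True
    )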
---
 tests/test_decision_tree.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py
index e935dc56..bbf559c6 100644
--- a/tests/test_decision_tree.py
+++ b/tests/test_decision_tree.py
@@ -4,8 +4,8 @@
 from pycompss.api.api import compss_wait_on

 import dislib as ds
-import dislib.commons.rf.decision_tree as dt
-import dislib.commons.rf.data as data
+import dislib.trees.decision_tree as dt
+import dislib.trees.data as data


 class DecisionTreeTest(unittest.TestCase):
@@ -32,7 +32,7 @@ def test_decision_tree(self):
         y1_ds = ds.array(y1[:, np.newaxis], (3, 1))

         data1 = data.transform_to_rf_dataset(
-            x1_ds, y1_ds, "classification", features_file=True
+            x1_ds, y1_ds, data.RfClassifierDataset, features_file=True
         )

         # Model

From 1ddfeed8b318178a50964c95d1c5ccaf6d7b64ff Mon Sep 17 00:00:00 2001
From: Guillem Casadesús Vila
Date: Wed, 18 Aug 2021 17:56:36 +0200
Subject: [PATCH 3/3] Changed class exposure.

---
 dislib/trees/__init__.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/dislib/trees/__init__.py b/dislib/trees/__init__.py
index e69de29b..1eb1dc2b 100644
--- a/dislib/trees/__init__.py
+++ b/dislib/trees/__init__.py
@@ -0,0 +1,14 @@
+from dislib.trees.forest import RandomForestClassifier, RandomForestRegressor
+from dislib.trees.decision_tree import (
+    DecisionTreeClassifier,
+    DecisionTreeRegressor,
+)
+from dislib.trees.data import transform_to_rf_dataset
+
+__all__ = [
+    "RandomForestClassifier",
+    "RandomForestRegressor",
+    "DecisionTreeClassifier",
+    "DecisionTreeRegressor",
+    "transform_to_rf_dataset",
+]
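
With the class exposure in place, the distributed estimators are importable
directly from dislib.trees. A short usage sketch, assuming a dislib build
that includes this series (dataset sizes and block shapes are illustrative
and mirror the tests above):

    import numpy as np
    from sklearn.datasets import make_classification

    import dislib as ds
    from dislib.trees import RandomForestClassifier

    x, y = make_classification(n_samples=600, n_features=10, random_state=0)
    x_train = ds.array(x[::2], (100, 10))
    y_train = ds.array(y[::2][:, np.newaxis], (100, 1))
    x_test = ds.array(x[1::2], (100, 10))

    # fit() schedules the tree builds as PyCOMPSs tasks; collect()
    # synchronizes the distributed prediction into a local numpy array.
    rf = RandomForestClassifier(n_estimators=2, random_state=0)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test).collect()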