Merge pull request bsc-wdc#351 from gcasadesus/trees-module
Trees Module
michal-choinski authored Aug 19, 2021
2 parents 8b34f9c + 1ddfeed commit bb6303d
Showing 14 changed files with 337 additions and 268 deletions.
2 changes: 1 addition & 1 deletion dislib/classification/__init__.py
@@ -1,4 +1,4 @@
from dislib.classification.csvm.base import CascadeSVM
from dislib.commons.rf.forest import RandomForestClassifier
from dislib.trees.forest import RandomForestClassifier

__all__ = ["CascadeSVM", "RandomForestClassifier"]
Empty file removed dislib/commons/rf/__init__.py
2 changes: 1 addition & 1 deletion dislib/regression/__init__.py
@@ -1,5 +1,5 @@
from dislib.regression.linear.base import LinearRegression
from dislib.regression.lasso.base import Lasso
from dislib.commons.rf.forest import RandomForestRegressor
from dislib.trees.forest import RandomForestRegressor

__all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]
14 changes: 14 additions & 0 deletions dislib/trees/__init__.py
@@ -0,0 +1,14 @@
from dislib.trees.forest import RandomForestClassifier, RandomForestRegressor
from dislib.trees.decision_tree import (
DecisionTreeClassifier,
DecisionTreeRegressor,
)
from dislib.trees.data import transform_to_rf_dataset

__all__ = [
"RandomForestClassifier",
"RandomForestRegressor",
"DecisionTreeClassifier",
"DecisionTreeRegressor",
"transform_to_rf_dataset",
]
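
The new dislib/trees/__init__.py exposes the former internal commons.rf code as a public dislib.trees package. A minimal usage sketch, assuming the forest keeps its scikit-learn-style fit/predict interface and an n_estimators parameter (neither is shown in this diff) and that a PyCOMPSs runtime is available:

import numpy as np

import dislib as ds
from dislib.trees import RandomForestClassifier

# Toy data wrapped into ds-arrays; block sizes are illustrative only.
x = ds.array(np.random.rand(16, 4), block_size=(4, 4))
y = ds.array(np.random.randint(0, 2, size=(16, 1)), block_size=(4, 1))

forest = RandomForestClassifier(n_estimators=3)
forest.fit(x, y)
pred = forest.predict(x)  # ds-array with one predicted label per row
print(pred.collect())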
27 changes: 6 additions & 21 deletions dislib/commons/rf/data.py → dislib/trees/data.py
@@ -1,5 +1,4 @@
import tempfile

import numpy as np
from numpy.lib import format
from pycompss.api.parameter import (
@@ -10,7 +9,6 @@
Type,
)
from pycompss.api.task import task

from dislib.data.array import Array


@@ -43,7 +41,6 @@ def get_n_samples(self):
If self.n_samples is None and self.samples_path is not a string.
ValueError
If invalid content is encountered in the samples file.
"""
if self.n_samples is None:
if not isinstance(self.samples_path, str):
@@ -68,7 +65,6 @@ def get_n_features(self):
If self.n_features is None and self.samples_path is not a string.
ValueError
If invalid content is encountered in the samples file.
"""
if self.n_features is None:
if not isinstance(self.samples_path, str):
@@ -88,7 +84,6 @@ def validate_features_file(self):
ValueError
If the shape of the array in the features_file doesn't match this
class n_samples and n_features or if the array is in fortran order.
"""
features_npy_file = _NpyFile(self.features_path)
shape = features_npy_file.get_shape()
@@ -156,7 +151,6 @@ def get_y_targets(self):
Returns
-------
y_targets: ndarray
"""
if self.y_targets is None:
labels = _get_labels(self.targets_path)
@@ -169,7 +163,6 @@ def get_classes(self):
Returns
-------
y_categories: ndarray
"""
if self.y_categories is None:
labels = _get_labels(self.targets_path)
@@ -182,7 +175,6 @@ def get_n_classes(self):
Returns
-------
n_classes: int
"""
if self.n_classes is None:
labels = _get_labels(self.targets_path)
@@ -238,7 +230,6 @@ def get_y_targets(self):
Returns
-------
y_targets: ndarray
"""
if self.y_targets is None:
targets = _get_values(self.targets_path)
@@ -253,11 +244,14 @@ def get_classes(self):


def transform_to_rf_dataset(
x: Array, y: Array, task: str, features_file=False
x: Array,
y: Array,
base_dataset: RfRegressorDataset or RfClassifierDataset,
features_file=False,
) -> RfRegressorDataset or RfClassifierDataset:
"""Creates a RfDataset object from samples x and targets y.
This function creates a dislib.commons.rf.data.RfDataset by saving
This function creates a `RfDataset` by saving
x and y in files.
Parameters
@@ -323,16 +317,7 @@ def transform_to_rf_dataset(
else:
features_path = None

if task == "classification":
rf_dataset = RfClassifierDataset(
samples_path, targets_path, features_path
)
elif task == "regression":
rf_dataset = RfRegressorDataset(
samples_path, targets_path, features_path
)
else:
raise ValueError("task must be either classification or regression.")
rf_dataset = base_dataset(samples_path, targets_path, features_path)
rf_dataset.n_samples = n_samples
rf_dataset.n_features = n_features
return rf_dataset
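
The new signature drops the task string (and its ValueError branch): the caller passes the dataset class itself and the function instantiates it as-is. A hedged sketch of the new calling convention, assuming a PyCOMPSs runtime and ds-array inputs:

import numpy as np

import dislib as ds
from dislib.trees.data import (
    RfClassifierDataset,
    RfRegressorDataset,
    transform_to_rf_dataset,
)

x = ds.array(np.random.rand(8, 3), block_size=(4, 3))
y = ds.array(np.random.randint(0, 2, size=(8, 1)), block_size=(4, 1))

# Before this PR: transform_to_rf_dataset(x, y, "classification")
# After: the dataset class is passed in and instantiated directly.
clf_dataset = transform_to_rf_dataset(x, y, RfClassifierDataset)
reg_dataset = transform_to_rf_dataset(x, y, RfRegressorDataset)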
dislib/commons/rf/decision_tree.py → dislib/trees/decision_tree.py
@@ -8,7 +8,7 @@
from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor

from dislib.commons.rf.test_split import test_split
from dislib.trees.test_split import test_split
from dislib.data.array import Array


@@ -27,13 +27,17 @@ def __init__(
sklearn_max,
bootstrap,
random_state,
base_node,
base_tree,
):
self.try_features = try_features
self.max_depth = max_depth
self.distr_depth = distr_depth
self.sklearn_max = sklearn_max
self.bootstrap = bootstrap
self.random_state = random_state
self.base_node = base_node
self.base_tree = base_tree

self.n_features = None
self.n_classes = None
@@ -48,7 +52,6 @@ def fit(self, dataset):
Parameters
----------
dataset : dislib.classification.rf._data.RfDataset
"""

self.n_features = dataset.get_n_features()
@@ -63,9 +66,8 @@
sample, y_s = _sample_selection(
n_samples, y_targets, self.bootstrap, seed
)
Node = _ClassificationNode if self.n_classes else _RegressionNode

self.tree = Node()
self.tree = self.base_node()
self.nodes_info = []
self.subtrees = []
tree_traversal = [(self.tree, sample, y_s, 0)]
@@ -87,8 +89,8 @@
compss_delete_object(y_s)
node.content = len(self.nodes_info)
self.nodes_info.append(node_info)
node.left = Node()
node.right = Node()
node.left = self.base_node()
node.right = self.base_node()
depth = depth + 1
tree_traversal.append((node.right, right_group, y_r, depth))
tree_traversal.append((node.left, left_group, y_l, depth))
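
fit() grows the upper, distributed part of the tree with an explicit stack instead of recursion, and now takes fresh nodes from self.base_node rather than choosing a node class on the fly. A stripped-down sketch of that traversal; grow and split_fn are placeholder names for this illustration, not dislib API:

def grow(base_node, root_sample, root_y, distr_depth, split_fn):
    """Illustrative version of the stack-based loop used in fit()."""
    tree = base_node()
    nodes_info, subtrees = [], []
    stack = [(tree, root_sample, root_y, 0)]
    while stack:
        node, sample, y_s, depth = stack.pop()
        if depth < distr_depth:
            # In dislib the split itself is computed by COMPSs tasks.
            node_info, left, y_l, right, y_r = split_fn(sample, y_s)
            node.content = len(nodes_info)
            nodes_info.append(node_info)
            node.left, node.right = base_node(), base_node()
            stack.append((node.right, right, y_r, depth + 1))
            stack.append((node.left, left, y_l, depth + 1))
        else:
            # Below distr_depth the rest of the subtree is built in one task.
            subtrees.append((node, sample, y_s))
    return tree, nodes_info, subtrees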
@@ -102,6 +104,8 @@
self.try_features,
self.sklearn_max,
self.random_state,
self.base_node,
self.base_tree,
samples_path,
features_path,
)
@@ -216,6 +220,8 @@ def __init__(
sklearn_max,
bootstrap,
random_state,
_ClassificationNode,
SklearnDTClassifier,
)

def predict_proba(self, x_row):
@@ -234,7 +240,6 @@ def predict_proba(self, x_row):
of the column being codes of the fitted
dislib.classification.rf.data.RfDataset. The returned object can be
a pycompss.runtime.Future object.
"""

assert self.tree is not None, "The decision tree is not fitted."
@@ -319,6 +324,8 @@ def __init__(
sklearn_max,
bootstrap,
random_state,
_RegressionNode,
SklearnDTRegressor,
)
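
With base_node and base_tree added to the shared constructor, the classifier and regressor subclasses simply inject _ClassificationNode/SklearnDTClassifier and _RegressionNode/SklearnDTRegressor, so the base class no longer switches on n_classes. A condensed sketch of that pattern; only the injected arguments come from this diff, the remaining constructor parameters are elided:

from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor


class _ClassificationNode:  # stand-ins for the dislib node classes
    pass


class _RegressionNode:
    pass


class BaseDecisionTree:
    def __init__(self, base_node, base_tree):
        # The concrete node and sklearn estimator classes are injected once,
        # so tree-growing code just calls self.base_node() / self.base_tree().
        self.base_node = base_node
        self.base_tree = base_tree


class DecisionTreeClassifier(BaseDecisionTree):
    def __init__(self):
        super().__init__(_ClassificationNode, SklearnDTClassifier)


class DecisionTreeRegressor(BaseDecisionTree):
    def __init__(self):
        super().__init__(_RegressionNode, SklearnDTRegressor)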


@@ -539,6 +546,8 @@ def _build_subtree_wrapper(
m_try,
sklearn_max,
random_state,
base_node,
base_tree,
samples_file,
features_file,
):
@@ -553,6 +562,8 @@
m_try,
sklearn_max,
seed,
base_node,
base_tree,
samples_file,
features_file,
)
@@ -566,6 +577,8 @@
m_try,
sklearn_max,
seed,
base_node,
base_tree,
samples_file,
)

@@ -580,6 +593,8 @@ def _build_subtree_using_features(
m_try,
sklearn_max,
seed,
base_node,
base_tree,
samples_file,
features_file,
):
@@ -593,6 +608,8 @@
m_try,
sklearn_max,
random_state,
base_node,
base_tree,
samples_file,
features_file=features_file,
)
@@ -608,6 +625,8 @@ def _build_subtree(
m_try,
sklearn_max,
seed,
base_node,
base_tree,
samples_file,
):
random_state = RandomState(seed)
@@ -620,6 +639,8 @@
m_try,
sklearn_max,
random_state,
base_node,
base_tree,
samples_file,
)

@@ -633,19 +654,19 @@ def _compute_build_subtree(
m_try,
sklearn_max,
random_state,
base_node,
base_tree,
samples_file,
features_file=None,
use_sklearn=True,
):
Node = _ClassificationNode if n_classes else _RegressionNode
SklearnDT = SklearnDTClassifier if n_classes else SklearnDTRegressor
if not sample.size:
return Node()
return base_node()
if features_file is not None:
mmap = np.load(features_file, mmap_mode="r", allow_pickle=False)
else:
mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T
subtree = Node()
subtree = base_node()
tree_traversal = [(subtree, sample, y_s, 0)]
while tree_traversal:
node, sample, y_s, depth = tree_traversal.pop()
@@ -655,7 +676,7 @@
sklearn_max_depth = None
else:
sklearn_max_depth = max_depth - depth
dt = SklearnDT(
dt = base_tree(
max_features=m_try,
max_depth=sklearn_max_depth,
random_state=random_state,
@@ -681,8 +702,8 @@
node_info, left_group, y_l, right_group, y_r = split
node.content = node_info
if isinstance(node_info, _InnerNodeInfo):
node.left = Node()
node.right = Node()
node.left = base_node()
node.right = base_node()
tree_traversal.append(
(node.right, right_group, y_r, depth + 1)
)
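
Inside _compute_build_subtree, the injected base_tree class replaces the old SklearnDTClassifier/SklearnDTRegressor choice based on n_classes, and is instantiated with the same keyword arguments as before. A self-contained illustration of that handoff, with made-up data and parameters:

import numpy as np
from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier

rng = np.random.RandomState(0)
x, y = rng.rand(64, 4), rng.randint(0, 2, 64)
m_try, max_depth, depth = "sqrt", 8, 3

# In dislib, base_tree arrives as a task parameter; here the classifier
# variant is picked by hand to mirror the diff.
base_tree = SklearnDTClassifier
dt = base_tree(
    max_features=m_try,
    max_depth=None if np.isinf(max_depth) else max_depth - depth,
    random_state=rng,
)
dt.fit(x, y)
print(dt.get_depth())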