Trees Module #351

Merged
merged 3 commits into from
Aug 19, 2021
Changes from 2 commits
2 changes: 1 addition & 1 deletion dislib/classification/__init__.py
@@ -1,4 +1,4 @@
from dislib.classification.csvm.base import CascadeSVM
-from dislib.commons.rf.forest import RandomForestClassifier
+from dislib.trees.forest import RandomForestClassifier

__all__ = ["CascadeSVM", "RandomForestClassifier"]
2 changes: 1 addition & 1 deletion dislib/regression/__init__.py
@@ -1,5 +1,5 @@
from dislib.regression.linear.base import LinearRegression
from dislib.regression.lasso.base import Lasso
-from dislib.commons.rf.forest import RandomForestRegressor
+from dislib.trees.forest import RandomForestRegressor

__all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]
File renamed without changes.
27 changes: 6 additions & 21 deletions dislib/commons/rf/data.py → dislib/trees/data.py
@@ -1,5 +1,4 @@
import tempfile
Collaborator: please consider exposing only objects that are necessary in dislib/regression/__init__.py (currently all of them are exposed).
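If it helps, a minimal sketch of what that could look like (generic example; which names should remain public is the maintainers' call, so the module and export list below are placeholders, not the dislib API):

```python
# hypothetical mypackage/__init__.py: re-export only the intended public classes
from mypackage.forest import RandomForestRegressor
from mypackage.decision_tree import DecisionTreeRegressor

# wildcard imports (and tools that honour __all__) now see only these names;
# internal helpers such as mypackage.data are not re-exported
__all__ = ["RandomForestRegressor", "DecisionTreeRegressor"]
```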


import numpy as np
from numpy.lib import format
from pycompss.api.parameter import (
@@ -10,7 +9,6 @@
Type,
)
from pycompss.api.task import task

from dislib.data.array import Array


@@ -43,7 +41,6 @@ def get_n_samples(self):
If self.n_samples is None and self.samples_path is not a string.
ValueError
If invalid content is encountered in the samples file.

"""
if self.n_samples is None:
if not isinstance(self.samples_path, str):
@@ -68,7 +65,6 @@ def get_n_features(self):
If self.n_features is None and self.samples_path is not a string.
ValueError
If invalid content is encountered in the samples file.

"""
if self.n_features is None:
if not isinstance(self.samples_path, str):
@@ -88,7 +84,6 @@ def validate_features_file(self):
ValueError
If the shape of the array in the features_file doesn't match this
class n_samples and n_features or if the array is in fortran order.

"""
features_npy_file = _NpyFile(self.features_path)
shape = features_npy_file.get_shape()
@@ -156,7 +151,6 @@ def get_y_targets(self):
Returns
-------
y_targets: ndarray

"""
if self.y_targets is None:
labels = _get_labels(self.targets_path)
@@ -169,7 +163,6 @@ def get_classes(self):
Returns
-------
y_categories: ndarray

"""
if self.y_categories is None:
labels = _get_labels(self.targets_path)
@@ -182,7 +175,6 @@ def get_n_classes(self):
Returns
-------
n_classes: int

"""
if self.n_classes is None:
labels = _get_labels(self.targets_path)
@@ -238,7 +230,6 @@ def get_y_targets(self):
Returns
-------
y_targets: ndarray

"""
if self.y_targets is None:
targets = _get_values(self.targets_path)
@@ -253,11 +244,14 @@ def get_classes(self):


def transform_to_rf_dataset(
-x: Array, y: Array, task: str, features_file=False
+x: Array,
+y: Array,
+base_dataset: RfRegressorDataset or RfClassifierDataset,
+features_file=False,
) -> RfRegressorDataset or RfClassifierDataset:
"""Creates a RfDataset object from samples x and targets y.

-This function creates a dislib.commons.rf.data.RfDataset by saving
+This function creates a `RfDataset` by saving
x and y in files.

Parameters
@@ -323,16 +317,7 @@ def transform_to_rf_dataset(
else:
features_path = None

-if task == "classification":
-rf_dataset = RfClassifierDataset(
-samples_path, targets_path, features_path
-)
-elif task == "regression":
-rf_dataset = RfRegressorDataset(
-samples_path, targets_path, features_path
-)
-else:
-raise ValueError("task must be either classification or regression.")
+rf_dataset = base_dataset(samples_path, targets_path, features_path)
rf_dataset.n_samples = n_samples
rf_dataset.n_features = n_features
return rf_dataset
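With the classification/regression branch removed from this helper, the caller now chooses the dataset class. A hedged sketch of the call-site change (assumes `x` and `y` are dislib arrays already in scope; the forest estimators are the expected callers, but their call sites are not part of this hunk):

```python
from dislib.trees.data import (
    RfClassifierDataset,
    RfRegressorDataset,
    transform_to_rf_dataset,
)

# Before this PR: the task was passed as a string and dispatched inside
# rf_dataset = transform_to_rf_dataset(x, y, "classification")

# After this PR: the dataset class itself is passed in
rf_dataset = transform_to_rf_dataset(x, y, RfClassifierDataset)
# or, for regression:
# rf_dataset = transform_to_rf_dataset(x, y, RfRegressorDataset)
```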
@@ -8,7 +8,7 @@
from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor

-from dislib.commons.rf.test_split import test_split
+from dislib.trees.test_split import test_split
from dislib.data.array import Array


@@ -27,13 +27,17 @@ def __init__(
sklearn_max,
bootstrap,
random_state,
+base_node,
+base_tree,
):
self.try_features = try_features
self.max_depth = max_depth
self.distr_depth = distr_depth
self.sklearn_max = sklearn_max
self.bootstrap = bootstrap
self.random_state = random_state
+self.base_node = base_node
+self.base_tree = base_tree

self.n_features = None
self.n_classes = None
@@ -48,7 +52,6 @@ def fit(self, dataset):
Parameters
----------
dataset : dislib.classification.rf._data.RfDataset

"""

self.n_features = dataset.get_n_features()
@@ -63,9 +66,8 @@
sample, y_s = _sample_selection(
n_samples, y_targets, self.bootstrap, seed
)
-Node = _ClassificationNode if self.n_classes else _RegressionNode

-self.tree = Node()
+self.tree = self.base_node()
self.nodes_info = []
self.subtrees = []
tree_traversal = [(self.tree, sample, y_s, 0)]
@@ -87,8 +89,8 @@
compss_delete_object(y_s)
node.content = len(self.nodes_info)
self.nodes_info.append(node_info)
-node.left = Node()
-node.right = Node()
+node.left = self.base_node()
+node.right = self.base_node()
depth = depth + 1
tree_traversal.append((node.right, right_group, y_r, depth))
tree_traversal.append((node.left, left_group, y_l, depth))
@@ -102,6 +104,8 @@
self.try_features,
self.sklearn_max,
self.random_state,
+self.base_node,
+self.base_tree,
samples_path,
features_path,
)
@@ -216,6 +220,8 @@ def __init__(
sklearn_max,
bootstrap,
random_state,
+_ClassificationNode,
+SklearnDTClassifier,
)

def predict_proba(self, x_row):
@@ -234,7 +240,6 @@ def predict_proba(self, x_row):
of the column being codes of the fitted
dislib.classification.rf.data.RfDataset. The returned object can be
a pycompss.runtime.Future object.

"""

assert self.tree is not None, "The decision tree is not fitted."
@@ -319,6 +324,8 @@ def __init__(
sklearn_max,
bootstrap,
random_state,
+_RegressionNode,
+SklearnDTRegressor,
)
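Taken together, the refactor replaces the old runtime checks (`Node = _ClassificationNode if n_classes else _RegressionNode`, and the analogous sklearn estimator choice) with constructor injection: each subclass hands its node class and sklearn estimator class to the shared code. A self-contained toy sketch of the pattern (names are illustrative only, not the dislib API):

```python
from sklearn.tree import DecisionTreeClassifier as SkClassifier
from sklearn.tree import DecisionTreeRegressor as SkRegressor


class _ToyNode:
    """Stands in for _ClassificationNode / _RegressionNode."""


class ToyBaseTree:
    """Shared logic never asks 'classification or regression?';
    it just instantiates whatever classes were injected."""

    def __init__(self, base_node, base_tree):
        self.base_node = base_node
        self.base_tree = base_tree

    def fit(self):
        root = self.base_node()          # e.g. a fresh tree node
        sklearn_tree = self.base_tree()  # e.g. SkClassifier() or SkRegressor()
        return root, sklearn_tree


class ToyClassifierTree(ToyBaseTree):
    def __init__(self):
        super().__init__(_ToyNode, SkClassifier)


class ToyRegressorTree(ToyBaseTree):
    def __init__(self):
        super().__init__(_ToyNode, SkRegressor)
```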


@@ -539,6 +546,8 @@ def _build_subtree_wrapper(
m_try,
sklearn_max,
random_state,
+base_node,
+base_tree,
samples_file,
features_file,
):
@@ -553,6 +562,8 @@
m_try,
sklearn_max,
seed,
+base_node,
+base_tree,
samples_file,
features_file,
)
@@ -566,6 +577,8 @@
m_try,
sklearn_max,
seed,
+base_node,
+base_tree,
samples_file,
)

@@ -580,6 +593,8 @@ def _build_subtree_using_features(
m_try,
sklearn_max,
seed,
+base_node,
+base_tree,
samples_file,
features_file,
):
@@ -593,6 +608,8 @@
m_try,
sklearn_max,
random_state,
+base_node,
+base_tree,
samples_file,
features_file=features_file,
)
@@ -608,6 +625,8 @@ def _build_subtree(
m_try,
sklearn_max,
seed,
+base_node,
+base_tree,
samples_file,
):
random_state = RandomState(seed)
@@ -620,6 +639,8 @@
m_try,
sklearn_max,
random_state,
+base_node,
+base_tree,
samples_file,
)

@@ -633,19 +654,19 @@ def _compute_build_subtree(
m_try,
sklearn_max,
random_state,
+base_node,
+base_tree,
samples_file,
features_file=None,
use_sklearn=True,
):
-Node = _ClassificationNode if n_classes else _RegressionNode
-SklearnDT = SklearnDTClassifier if n_classes else SklearnDTRegressor
if not sample.size:
-return Node()
+return base_node()
if features_file is not None:
mmap = np.load(features_file, mmap_mode="r", allow_pickle=False)
else:
mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T
-subtree = Node()
+subtree = base_node()
tree_traversal = [(subtree, sample, y_s, 0)]
while tree_traversal:
node, sample, y_s, depth = tree_traversal.pop()
@@ -655,7 +676,7 @@
sklearn_max_depth = None
else:
sklearn_max_depth = max_depth - depth
-dt = SklearnDT(
+dt = base_tree(
max_features=m_try,
max_depth=sklearn_max_depth,
random_state=random_state,
@@ -681,8 +702,8 @@
node_info, left_group, y_l, right_group, y_r = split
node.content = node_info
if isinstance(node_info, _InnerNodeInfo):
-node.left = Node()
-node.right = Node()
+node.left = base_node()
+node.right = base_node()
tree_traversal.append(
(node.right, right_group, y_r, depth + 1)
)