diff --git a/dislib/classification/__init__.py b/dislib/classification/__init__.py
index 695dd571..31745ea9 100644
--- a/dislib/classification/__init__.py
+++ b/dislib/classification/__init__.py
@@ -1,4 +1,4 @@
 from dislib.classification.csvm.base import CascadeSVM
-from dislib.commons.rf.forest import RandomForestClassifier
+from dislib.trees.forest import RandomForestClassifier
 
 __all__ = ["CascadeSVM", "RandomForestClassifier"]
diff --git a/dislib/commons/rf/__init__.py b/dislib/commons/rf/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py
index a47cd17d..651aa23d 100644
--- a/dislib/regression/__init__.py
+++ b/dislib/regression/__init__.py
@@ -1,5 +1,5 @@
 from dislib.regression.linear.base import LinearRegression
 from dislib.regression.lasso.base import Lasso
-from dislib.commons.rf.forest import RandomForestRegressor
+from dislib.trees.forest import RandomForestRegressor
 
 __all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]
diff --git a/dislib/trees/__init__.py b/dislib/trees/__init__.py
new file mode 100644
index 00000000..1eb1dc2b
--- /dev/null
+++ b/dislib/trees/__init__.py
@@ -0,0 +1,14 @@
+from dislib.trees.forest import RandomForestClassifier, RandomForestRegressor
+from dislib.trees.decision_tree import (
+    DecisionTreeClassifier,
+    DecisionTreeRegressor,
+)
+from dislib.trees.data import transform_to_rf_dataset
+
+__all__ = [
+    "RandomForestClassifier",
+    "RandomForestRegressor",
+    "DecisionTreeClassifier",
+    "DecisionTreeRegressor",
+    "transform_to_rf_dataset",
+]
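For downstream code, the rename only changes the internal import path; the top-level entry points are untouched. A minimal migration sketch, assuming a dislib build that includes this change:

```python
# Old internal location (removed by this patch):
#   from dislib.commons.rf.forest import RandomForestClassifier
# New public module, as exported by dislib/trees/__init__.py:
from dislib.trees import (
    RandomForestClassifier,
    RandomForestRegressor,
    DecisionTreeClassifier,
    DecisionTreeRegressor,
)

# The existing top-level imports keep working unchanged:
from dislib.classification import RandomForestClassifier  # noqa: F811
from dislib.regression import RandomForestRegressor  # noqa: F811
```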
- """ features_npy_file = _NpyFile(self.features_path) shape = features_npy_file.get_shape() @@ -156,7 +151,6 @@ def get_y_targets(self): Returns ------- y_targets: ndarray - """ if self.y_targets is None: labels = _get_labels(self.targets_path) @@ -169,7 +163,6 @@ def get_classes(self): Returns ------- y_categories: ndarray - """ if self.y_categories is None: labels = _get_labels(self.targets_path) @@ -182,7 +175,6 @@ def get_n_classes(self): Returns ------- n_classes: int - """ if self.n_classes is None: labels = _get_labels(self.targets_path) @@ -238,7 +230,6 @@ def get_y_targets(self): Returns ------- y_targets: ndarray - """ if self.y_targets is None: targets = _get_values(self.targets_path) @@ -253,11 +244,14 @@ def get_classes(self): def transform_to_rf_dataset( - x: Array, y: Array, task: str, features_file=False + x: Array, + y: Array, + base_dataset: RfRegressorDataset or RfClassifierDataset, + features_file=False, ) -> RfRegressorDataset or RfClassifierDataset: """Creates a RfDataset object from samples x and targets y. - This function creates a dislib.commons.rf.data.RfDataset by saving + This function creates a `RfDataset` by saving x and y in files. Parameters @@ -323,16 +317,7 @@ def transform_to_rf_dataset( else: features_path = None - if task == "classification": - rf_dataset = RfClassifierDataset( - samples_path, targets_path, features_path - ) - elif task == "regression": - rf_dataset = RfRegressorDataset( - samples_path, targets_path, features_path - ) - else: - raise ValueError("task must be either classification or regression.") + rf_dataset = base_dataset(samples_path, targets_path, features_path) rf_dataset.n_samples = n_samples rf_dataset.n_features = n_features return rf_dataset diff --git a/dislib/commons/rf/decision_tree.py b/dislib/trees/decision_tree.py similarity index 96% rename from dislib/commons/rf/decision_tree.py rename to dislib/trees/decision_tree.py index 751983d4..5b2b6dfd 100644 --- a/dislib/commons/rf/decision_tree.py +++ b/dislib/trees/decision_tree.py @@ -8,7 +8,7 @@ from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor -from dislib.commons.rf.test_split import test_split +from dislib.trees.test_split import test_split from dislib.data.array import Array @@ -27,6 +27,8 @@ def __init__( sklearn_max, bootstrap, random_state, + base_node, + base_tree, ): self.try_features = try_features self.max_depth = max_depth @@ -34,6 +36,8 @@ def __init__( self.sklearn_max = sklearn_max self.bootstrap = bootstrap self.random_state = random_state + self.base_node = base_node + self.base_tree = base_tree self.n_features = None self.n_classes = None @@ -48,7 +52,6 @@ def fit(self, dataset): Parameters ---------- dataset : dislib.classification.rf._data.RfDataset - """ self.n_features = dataset.get_n_features() @@ -63,9 +66,8 @@ def fit(self, dataset): sample, y_s = _sample_selection( n_samples, y_targets, self.bootstrap, seed ) - Node = _ClassificationNode if self.n_classes else _RegressionNode - self.tree = Node() + self.tree = self.base_node() self.nodes_info = [] self.subtrees = [] tree_traversal = [(self.tree, sample, y_s, 0)] @@ -87,8 +89,8 @@ def fit(self, dataset): compss_delete_object(y_s) node.content = len(self.nodes_info) self.nodes_info.append(node_info) - node.left = Node() - node.right = Node() + node.left = self.base_node() + node.right = self.base_node() depth = depth + 1 tree_traversal.append((node.right, right_group, y_r, depth)) 
diff --git a/dislib/commons/rf/decision_tree.py b/dislib/trees/decision_tree.py
similarity index 96%
rename from dislib/commons/rf/decision_tree.py
rename to dislib/trees/decision_tree.py
index 751983d4..5b2b6dfd 100644
--- a/dislib/commons/rf/decision_tree.py
+++ b/dislib/trees/decision_tree.py
@@ -8,7 +8,7 @@
 from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
 from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor
 
-from dislib.commons.rf.test_split import test_split
+from dislib.trees.test_split import test_split
 from dislib.data.array import Array
 
 
@@ -27,6 +27,8 @@ def __init__(
         sklearn_max,
         bootstrap,
         random_state,
+        base_node,
+        base_tree,
     ):
         self.try_features = try_features
         self.max_depth = max_depth
@@ -34,6 +36,8 @@ def __init__(
         self.sklearn_max = sklearn_max
         self.bootstrap = bootstrap
         self.random_state = random_state
+        self.base_node = base_node
+        self.base_tree = base_tree
         self.n_features = None
         self.n_classes = None
 
@@ -48,7 +52,6 @@ def fit(self, dataset):
         Parameters
         ----------
         dataset : dislib.classification.rf._data.RfDataset
-
         """
         self.n_features = dataset.get_n_features()
 
@@ -63,9 +66,8 @@ def fit(self, dataset):
         sample, y_s = _sample_selection(
             n_samples, y_targets, self.bootstrap, seed
         )
-        Node = _ClassificationNode if self.n_classes else _RegressionNode
 
-        self.tree = Node()
+        self.tree = self.base_node()
         self.nodes_info = []
         self.subtrees = []
         tree_traversal = [(self.tree, sample, y_s, 0)]
@@ -87,8 +89,8 @@ def fit(self, dataset):
                 compss_delete_object(y_s)
                 node.content = len(self.nodes_info)
                 self.nodes_info.append(node_info)
-                node.left = Node()
-                node.right = Node()
+                node.left = self.base_node()
+                node.right = self.base_node()
                 depth = depth + 1
                 tree_traversal.append((node.right, right_group, y_r, depth))
                 tree_traversal.append((node.left, left_group, y_l, depth))
@@ -102,6 +104,8 @@ def fit(self, dataset):
                     self.try_features,
                     self.sklearn_max,
                     self.random_state,
+                    self.base_node,
+                    self.base_tree,
                     samples_path,
                     features_path,
                 )
@@ -216,6 +220,8 @@ def __init__(
             sklearn_max,
             bootstrap,
             random_state,
+            _ClassificationNode,
+            SklearnDTClassifier,
         )
 
     def predict_proba(self, x_row):
@@ -234,7 +240,6 @@ def predict_proba(self, x_row):
             of the column being codes of the fitted
             dislib.classification.rf.data.RfDataset. The returned object can
             be a pycompss.runtime.Future object.
-
         """
         assert self.tree is not None, "The decision tree is not fitted."
 
@@ -319,6 +324,8 @@ def __init__(
             sklearn_max,
             bootstrap,
             random_state,
+            _RegressionNode,
+            SklearnDTRegressor,
         )
 
 
@@ -539,6 +546,8 @@ def _build_subtree_wrapper(
     m_try,
     sklearn_max,
     random_state,
+    base_node,
+    base_tree,
     samples_file,
     features_file,
 ):
@@ -553,6 +562,8 @@ def _build_subtree_wrapper(
             m_try,
             sklearn_max,
             seed,
+            base_node,
+            base_tree,
             samples_file,
             features_file,
         )
@@ -566,6 +577,8 @@ def _build_subtree_wrapper(
             m_try,
             sklearn_max,
             seed,
+            base_node,
+            base_tree,
             samples_file,
         )
 
@@ -580,6 +593,8 @@ def _build_subtree_using_features(
     m_try,
     sklearn_max,
     seed,
+    base_node,
+    base_tree,
    samples_file,
     features_file,
 ):
@@ -593,6 +608,8 @@ def _build_subtree_using_features(
         m_try,
         sklearn_max,
         random_state,
+        base_node,
+        base_tree,
         samples_file,
         features_file=features_file,
     )
@@ -608,6 +625,8 @@ def _build_subtree(
     m_try,
     sklearn_max,
     seed,
+    base_node,
+    base_tree,
     samples_file,
 ):
     random_state = RandomState(seed)
@@ -620,6 +639,8 @@ def _build_subtree(
         m_try,
         sklearn_max,
         random_state,
+        base_node,
+        base_tree,
         samples_file,
     )
 
@@ -633,19 +654,19 @@ def _compute_build_subtree(
     m_try,
     sklearn_max,
     random_state,
+    base_node,
+    base_tree,
     samples_file,
     features_file=None,
     use_sklearn=True,
 ):
-    Node = _ClassificationNode if n_classes else _RegressionNode
-    SklearnDT = SklearnDTClassifier if n_classes else SklearnDTRegressor
     if not sample.size:
-        return Node()
+        return base_node()
     if features_file is not None:
         mmap = np.load(features_file, mmap_mode="r", allow_pickle=False)
     else:
         mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T
-    subtree = Node()
+    subtree = base_node()
     tree_traversal = [(subtree, sample, y_s, 0)]
     while tree_traversal:
         node, sample, y_s, depth = tree_traversal.pop()
@@ -655,7 +676,7 @@ def _compute_build_subtree(
                 sklearn_max_depth = None
             else:
                 sklearn_max_depth = max_depth - depth
-            dt = SklearnDT(
+            dt = base_tree(
                 max_features=m_try,
                 max_depth=sklearn_max_depth,
                 random_state=random_state,
@@ -681,8 +702,8 @@ def _compute_build_subtree(
             node_info, left_group, y_l, right_group, y_r = split
             node.content = node_info
             if isinstance(node_info, _InnerNodeInfo):
-                node.left = Node()
-                node.right = Node()
+                node.left = base_node()
+                node.right = base_node()
                 tree_traversal.append(
                     (node.right, right_group, y_r, depth + 1)
                 )
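The recurring change in this file is dependency injection: instead of every helper re-deriving `Node`/`SklearnDT` from `n_classes`, the concrete node class and sklearn estimator class are chosen once in the subclass constructors and threaded through. A simplified, self-contained sketch of the pattern; the names `BaseTree`, `_Node`, and `ClassifierTree` are illustrative, not the dislib internals:

```python
from sklearn.tree import DecisionTreeClassifier


class _Node:
    # Stand-in for dislib's tree node type.
    def __init__(self):
        self.left = None
        self.right = None


class BaseTree:
    def __init__(self, base_node, base_tree):
        # The subclass decides which node type and which sklearn
        # estimator class every downstream helper should instantiate.
        self.base_node = base_node
        self.base_tree = base_tree

    def fit(self):
        # No more `Node = _ClassificationNode if ... else _RegressionNode`:
        # the injected class is used directly.
        self.tree = self.base_node()
        return self


class ClassifierTree(BaseTree):
    def __init__(self):
        super().__init__(base_node=_Node, base_tree=DecisionTreeClassifier)
```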
diff --git a/dislib/commons/rf/forest.py b/dislib/trees/forest.py
similarity index 76%
rename from dislib/commons/rf/forest.py
rename to dislib/trees/forest.py
index be2e668c..0fd22274 100644
--- a/dislib/commons/rf/forest.py
+++ b/dislib/trees/forest.py
@@ -8,13 +8,14 @@
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_random_state
 
-from dislib.commons.rf.decision_tree import (
+from dislib.trees.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
 )
+from dislib.trees.data import RfClassifierDataset, RfRegressorDataset
 from dislib.data.array import Array
 from dislib.utils.base import _paired_partition
-from dislib.commons.rf.data import transform_to_rf_dataset
+from dislib.trees.data import transform_to_rf_dataset
 
 
 class BaseRandomForest(BaseEstimator):
@@ -33,6 +34,8 @@ def __init__(
         sklearn_max,
         hard_vote,
         random_state,
+        base_tree,
+        base_dataset,
     ):
         self.n_estimators = n_estimators
         self.try_features = try_features
@@ -41,9 +44,11 @@ def __init__(
         self.sklearn_max = sklearn_max
         self.hard_vote = hard_vote
         self.random_state = random_state
+        self.base_tree = base_tree
+        self.base_dataset = base_dataset
 
     def fit(self, x, y):
-        """Fits the RandomForest.
+        """Fits a RandomForest.
 
         Parameters
         ----------
@@ -56,21 +61,9 @@ def fit(self, x, y):
         Returns
         -------
         self : RandomForest
-
         """
-        self.classes = None
-        self.trees = []
-
-        if self.hard_vote is not None:
-            # Classification
-            task = "classification"
-            Tree = DecisionTreeClassifier
-        else:
-            # Regression
-            task = "regression"
-            Tree = DecisionTreeRegressor
 
-        dataset = transform_to_rf_dataset(x, y, task)
+        dataset = transform_to_rf_dataset(x, y, self.base_dataset)
 
         n_features = dataset.get_n_features()
         try_features = _resolve_try_features(self.try_features, n_features)
@@ -85,8 +78,9 @@ def fit(self, x, y):
         else:
             distr_depth = self.distr_depth
 
-        for i in range(self.n_estimators):
-            tree = Tree(
+        self.trees = []
+        for _ in range(self.n_estimators):
+            tree = self.base_tree(
                 try_features,
                 self.max_depth,
                 distr_depth,
@@ -101,128 +95,6 @@ def fit(self, x, y):
 
         return self
 
-    def predict(self, x):
-        """Predicts target classes or values using a fitted forest.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The input samples.
-
-        Returns
-        -------
-        y_pred : ds-array, shape=(n_samples, 1)
-            Predicted class labels or values for x.
-
-        """
-        assert self.trees is not None, "The random forest is not fitted."
-
-        pred_blocks = []
-        if self.hard_vote is not None:
-            # Classification
-            if self.hard_vote:
-                for x_row in x._iterator(axis=0):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict(x_row))
-                    pred_blocks.append(
-                        _hard_vote(self.classes, *tree_predictions)
-                    )
-            else:
-                for x_row in x._iterator(axis=0):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict_proba(x_row))
-                    pred_blocks.append(
-                        _soft_vote(self.classes, *tree_predictions)
-                    )
-        else:
-            # Regression
-            for x_row in x._iterator(axis=0):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict(x_row))
-                pred_blocks.append(_join_predictions(*tree_predictions))
-
-        y_pred = Array(
-            blocks=[pred_blocks],
-            top_left_shape=(x._top_left_shape[0], 1),
-            reg_shape=(x._reg_shape[0], 1),
-            shape=(x.shape[0], 1),
-            sparse=False,
-        )
-
-        return y_pred
-
-    def score(self, x, y, collect=False):
-        """Accuracy classification score.
-
-        For classification returns the mean accuracy on the given test data.
-
-        For regression returns the coefficient of determination $R^2$ of
-        the prediction.
-        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
-        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
-        $v$ is the total sum of squares
-        `((y_true - y_true.mean()) ** 2).sum()`.
-        The best possible score is 1.0 and it can be negative
-        if the model is arbitrarily worse.
-        A constant model that always predicts the expected value of y,
-        disregarding the input features, would get a $R^2$ score of 0.0.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The training input samples.
-        y : ds-array, shape (n_samples, 1)
-            The true labels.
-        collect : bool, optional (default=False)
-            When True, a synchronized result is returned.
-
-
-        Returns
-        -------
-        score : float (as future object)
-            Fraction of correctly classified samples for classification
-            or coefficient of determination $R^2$ for regression.
-
-        """
-        assert self.trees is not None, "The random forest is not fitted."
-
-        partial_scores = []
-        if self.hard_vote is not None:
-            # Classification
-            if self.hard_vote:
-                for x_row, y_row in _paired_partition(x, y):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict(x_row))
-                    subset_score = _hard_vote_score(
-                        y_row._blocks, self.classes, *tree_predictions
-                    )
-                    partial_scores.append(subset_score)
-            else:
-                for x_row, y_row in _paired_partition(x, y):
-                    tree_predictions = []
-                    for tree in self.trees:
-                        tree_predictions.append(tree.predict_proba(x_row))
-                    subset_score = _soft_vote_score(
-                        y_row._blocks, self.classes, *tree_predictions
-                    )
-                    partial_scores.append(subset_score)
-            score = _merge_classification_scores(*partial_scores)
-        else:
-            # Regression
-            for x_row, y_row in _paired_partition(x, y):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict(x_row))
-                subset_score = _regression_score(
-                    y_row._blocks, *tree_predictions
-                )
-                partial_scores.append(subset_score)
-            score = _merge_regression_scores(*partial_scores)
-
-        return compss_wait_on(score) if collect else score
-
 
 class RandomForestClassifier(BaseRandomForest):
     """A distributed random forest classifier.
@@ -291,8 +163,49 @@ def __init__(
             sklearn_max,
             hard_vote,
             random_state,
+            base_tree=DecisionTreeClassifier,
+            base_dataset=RfClassifierDataset,
         )
 
+    def predict(self, x):
+        """Predicts target classes using a fitted forest.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ds-array, shape=(n_samples, 1)
+            Predicted class labels for x.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        pred_blocks = []
+        if self.hard_vote:
+            for x_row in x._iterator(axis=0):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict(x_row))
+                pred_blocks.append(_hard_vote(self.classes, *tree_predictions))
+        else:
+            for x_row in x._iterator(axis=0):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict_proba(x_row))
+                pred_blocks.append(_soft_vote(self.classes, *tree_predictions))
+
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
+        )
+
+        return y_pred
+
     def predict_proba(self, x):
         """Predicts class probabilities using a fitted forest.
 
@@ -311,9 +224,9 @@ def predict_proba(self, x):
             Predicted probabilities for the samples to belong to each class.
             The columns of the array correspond to the classes given at
             self.classes.
-
         """
         assert self.trees is not None, "The random forest is not fitted."
+
         prob_blocks = []
         for x_row in x._iterator(axis=0):
             tree_predictions = []
@@ -332,6 +245,53 @@ def predict_proba(self, x):
         )
         return probabilities
 
+    def score(self, x, y, collect=False):
+        """Accuracy classification score.
+
+        Returns the mean accuracy of the predictions on the given test data.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The training input samples.
+        y : ds-array, shape (n_samples, 1)
+            The true labels.
+        collect : bool, optional (default=False)
+            When True, a synchronized result is returned.
+
+
+        Returns
+        -------
+        score : float (as future object)
+            Fraction of correctly classified samples.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        partial_scores = []
+        if self.hard_vote:
+            for x_row, y_row in _paired_partition(x, y):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict(x_row))
+                subset_score = _hard_vote_score(
+                    y_row._blocks, self.classes, *tree_predictions
+                )
+                partial_scores.append(subset_score)
+
+        else:
+            for x_row, y_row in _paired_partition(x, y):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict_proba(x_row))
+                subset_score = _soft_vote_score(
+                    y_row._blocks, self.classes, *tree_predictions
+                )
+                partial_scores.append(subset_score)
+
+        score = _merge_classification_scores(*partial_scores)
+
+        return compss_wait_on(score) if collect else score
+
 
 class RandomForestRegressor(BaseRandomForest):
     """A distributed random forest regressor.
@@ -393,8 +353,100 @@ def __init__(
             sklearn_max,
             hard_vote,
             random_state,
+            base_tree=DecisionTreeRegressor,
+            base_dataset=RfRegressorDataset,
+        )
+
+    def predict(self, x):
+        """Predicts target values using a fitted forest.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ds-array, shape=(n_samples, 1)
+            Predicted values for x.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        pred_blocks = []
+        for x_row in x._iterator(axis=0):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict(x_row))
+            pred_blocks.append(_join_predictions(*tree_predictions))
+
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
         )
 
+        return y_pred
+
+    def score(self, x, y, collect=False):
+        """R2 regression score.
+
+        Returns the coefficient of determination $R^2$ of the prediction.
+        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
+        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
+        $v$ is the total sum of squares
+        `((y_true - y_true.mean()) ** 2).sum()`.
+        The best possible score is 1.0 and it can be negative
+        if the model is arbitrarily worse.
+        A constant model that always predicts the expected value of y,
+        disregarding the input features, would get a $R^2$ score of 0.0.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The training input samples.
+        y : ds-array, shape (n_samples, 1)
+            The true values.
+        collect : bool, optional (default=False)
+            When True, a synchronized result is returned.
+
+
+        Returns
+        -------
+        score : float (as future object)
+            Coefficient of determination $R^2$.
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+
+        partial_scores = []
+        for x_row, y_row in _paired_partition(x, y):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict(x_row))
+            subset_score = _regression_score(y_row._blocks, *tree_predictions)
+            partial_scores.append(subset_score)
+
+        score = _merge_regression_scores(*partial_scores)
+
+        return compss_wait_on(score) if collect else score
+
+
+def _base_soft_vote(classes, *predictions):
+    aggregate = predictions[0]
+    for p in predictions[1:]:
+        aggregate += p
+    predicted_labels = classes[np.argmax(aggregate, axis=1)]
+    return predicted_labels
+
+
+def _base_hard_vote(classes, *predictions):
+    mode = np.empty((len(predictions[0]),), dtype=int)
+    for sample_i, votes in enumerate(zip(*predictions)):
+        mode[sample_i] = Counter(votes).most_common(1)[0][0]
+    labels = classes[mode]
+    return labels
+
 
 @task(returns=1)
 def _resolve_try_features(try_features, n_features):
@@ -419,40 +471,28 @@ def _join_predictions(*predictions):
 
 @task(returns=1)
 def _soft_vote(classes, *predictions):
-    aggregate = predictions[0]
-    for p in predictions[1:]:
-        aggregate += p
-    labels = classes[np.argmax(aggregate, axis=1)]
-    return labels
+    predicted_labels = _base_soft_vote(classes, *predictions)
+    return predicted_labels
 
 
 @task(returns=1)
 def _hard_vote(classes, *predictions):
-    mode = np.empty((len(predictions[0]),), dtype=int)
-    for sample_i, votes in enumerate(zip(*predictions)):
-        mode[sample_i] = Counter(votes).most_common(1)[0][0]
-    labels = classes[mode]
-    return labels
+    predicted_labels = _base_hard_vote(classes, *predictions)
+    return predicted_labels
 
 
 @task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
 def _soft_vote_score(y_blocks, classes, *predictions):
+    predicted_labels = _base_soft_vote(classes, *predictions)
     real_labels = Array._merge_blocks(y_blocks).flatten()
-    aggregate = predictions[0]
-    for p in predictions[1:]:
-        aggregate += p
-    predicted_labels = classes[np.argmax(aggregate, axis=1)]
     correct = np.count_nonzero(predicted_labels == real_labels)
     return correct, len(real_labels)
 
 
 @task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
 def _hard_vote_score(y_blocks, classes, *predictions):
+    predicted_labels = _base_hard_vote(classes, *predictions)
    real_labels = Array._merge_blocks(y_blocks).flatten()
-    mode = np.empty((len(predictions[0]),), dtype=int)
-    for sample_i, votes in enumerate(zip(*predictions)):
-        mode[sample_i] = Counter(votes).most_common(1)[0][0]
-    predicted_labels = classes[mode]
     correct = np.count_nonzero(predicted_labels == real_labels)
     return correct, len(real_labels)
diff --git a/dislib/commons/rf/test_split.py b/dislib/trees/test_split.py
similarity index 100%
rename from dislib/commons/rf/test_split.py
rename to dislib/trees/test_split.py
diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index 02ecfb8a..a0f36abd 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -16,7 +16,7 @@
 import dislib.recommendation
 import dislib.regression
 from dislib.data.array import Array
-from dislib.commons.rf.decision_tree import (
+from dislib.trees.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
     _Node,
@@ -296,7 +296,8 @@ def _decode_helper(obj):
         ):
             dict_ = _decode_helper(obj["items"])
             if class_name in (
-                "DecisionTreeClassifier", "DecisionTreeRegressor"
+                "DecisionTreeClassifier",
+                "DecisionTreeRegressor",
             ):
                 model = DISLIB_CLASSES[obj["class_name"]](
                     try_features=dict_.pop("try_features"),
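The four voting tasks now share one implementation via `_base_soft_vote`/`_base_hard_vote`, so their semantics can be checked in plain NumPy outside any PyCOMPSs task. A toy demo with made-up inputs (the helper bodies are copied from the patch above):

```python
from collections import Counter

import numpy as np


def _base_soft_vote(classes, *predictions):
    # Sum per-class probabilities across trees, then pick the argmax.
    aggregate = predictions[0]
    for p in predictions[1:]:
        aggregate += p
    return classes[np.argmax(aggregate, axis=1)]


def _base_hard_vote(classes, *predictions):
    # Majority vote over each tree's predicted class codes.
    mode = np.empty((len(predictions[0]),), dtype=int)
    for sample_i, votes in enumerate(zip(*predictions)):
        mode[sample_i] = Counter(votes).most_common(1)[0][0]
    return classes[mode]


classes = np.array(["a", "b"])
proba_1 = np.array([[0.9, 0.1], [0.4, 0.6]])  # tree 1 class probabilities
proba_2 = np.array([[0.6, 0.4], [0.1, 0.9]])  # tree 2 class probabilities
print(_base_soft_vote(classes, proba_1, proba_2))                  # ['a' 'b']
print(_base_hard_vote(classes, np.array([0, 1]), np.array([0, 1])))  # ['a' 'b']
```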
diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst
index 3fb02dc1..4f16e981 100644
--- a/docs/source/user-guide.rst
+++ b/docs/source/user-guide.rst
@@ -294,7 +294,7 @@ scalability of the estimator is limited by the reduction phase of the cascade.
 Random forest classifier
 ........................
 
-:class:`RandomForestClassifier <dislib.classification.RandomForestClassifier>`
+:class:`RandomForestClassifier <dislib.trees.RandomForestClassifier>`
 is a classifier that uses an ensemble of decision trees and aggregates their
 predictions. The process of building each decision tree includes some
 randomization in order to make them different. The accuracy of the joint
@@ -569,7 +569,7 @@ linear equations.)
 Random forest regressor
 ........................
 
-:class:`RandomForestRegressor <dislib.regression.RandomForestRegressor>`
+:class:`RandomForestRegressor <dislib.trees.RandomForestRegressor>`
 is a regressor that uses an ensemble of decision trees and aggregates their
 predictions. The process of building each decision tree includes some
 randomization in order to make them different. The accuracy of the joint
diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py
index e935dc56..bbf559c6 100644
--- a/tests/test_decision_tree.py
+++ b/tests/test_decision_tree.py
@@ -4,8 +4,8 @@
 from pycompss.api.api import compss_wait_on
 
 import dislib as ds
-import dislib.commons.rf.decision_tree as dt
-import dislib.commons.rf.data as data
+import dislib.trees.decision_tree as dt
+import dislib.trees.data as data
 
 
 class DecisionTreeTest(unittest.TestCase):
@@ -32,7 +32,7 @@ def test_decision_tree(self):
         y1_ds = ds.array(y1[:, np.newaxis], (3, 1))
 
         data1 = data.transform_to_rf_dataset(
-            x1_ds, y1_ds, "classification", features_file=True
+            x1_ds, y1_ds, data.RfClassifierDataset, features_file=True
         )
 
         # Model
diff --git a/tests/test_rf_classifier.py b/tests/test_rf_classifier.py
index 6b4648a9..9b17712e 100644
--- a/tests/test_rf_classifier.py
+++ b/tests/test_rf_classifier.py
@@ -22,11 +22,12 @@ def test_make_classification_score(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))
 
         rf = RandomForestClassifier(random_state=0)
 
@@ -45,11 +46,12 @@ def test_make_classification_predict_and_distr_depth(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]
 
         rf = RandomForestClassifier(distr_depth=2, random_state=0)
 
@@ -69,9 +71,10 @@ def test_make_classification_fit_predict(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
 
         rf = RandomForestClassifier(random_state=0)
 
@@ -91,11 +94,12 @@ def test_make_classification_sklearn_max_predict(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]
 
         rf = RandomForestClassifier(random_state=0, sklearn_max=10)
 
@@ -115,11 +119,12 @@ def test_make_classification_sklearn_max_predict_proba(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]
 
         rf = RandomForestClassifier(random_state=0, sklearn_max=10)
 
@@ -141,14 +146,16 @@ def test_make_classification_hard_vote_predict(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = y[len(y) // 2:]
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = y[1::2]
 
-        rf = RandomForestClassifier(random_state=0, sklearn_max=10,
-                                    hard_vote=True)
+        rf = RandomForestClassifier(
+            random_state=0, sklearn_max=10, hard_vote=True
+        )
 
         rf.fit(x_train, y_train)
         y_pred = rf.predict(x_test).collect()
@@ -167,15 +174,20 @@ def test_make_classification_hard_vote_score_mix(self):
             n_repeated=1,
             n_clusters_per_class=2,
             shuffle=True,
-            random_state=0)
-        x_train = ds.array(x[:len(x) // 2], (300, 10))
-        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
-
-        rf = RandomForestClassifier(random_state=0, sklearn_max=100,
-                                    distr_depth=2, max_depth=12,
-                                    hard_vote=True)
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))
+
+        rf = RandomForestClassifier(
+            random_state=0,
+            sklearn_max=100,
+            distr_depth=2,
+            max_depth=12,
+            hard_vote=True,
+        )
 
         rf.fit(x_train, y_train)
         accuracy = compss_wait_on(rf.score(x_test, y_test))
@@ -190,8 +202,9 @@ def test_score_on_iris(self, collect):
         ds_validate = ds.array(x[1::2], block_size=(30, 2))
         validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1))
 
-        rf = RandomForestClassifier(n_estimators=1, max_depth=1,
-                                    random_state=0)
+        rf = RandomForestClassifier(
+            n_estimators=1, max_depth=1, random_state=0
+        )
         rf.fit(ds_fit, fit_y)
         accuracy = rf.score(ds_validate, validate_y, collect)
         if not collect:
@@ -205,5 +218,5 @@ def main():
     unittest.main()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
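Across these tests, the contiguous half split (`x[:len(x) // 2]` / `x[len(x) // 2:]`) is replaced by interleaved even/odd slicing, which always yields two halves of equal size and matches the split already used in `test_score_on_iris`. A quick illustration of the slicing, with a toy array:

```python
import numpy as np

x = np.arange(10).reshape(-1, 1)
x_train = x[::2]   # rows 0, 2, 4, 6, 8
x_test = x[1::2]   # rows 1, 3, 5, 7, 9
assert len(x_train) == len(x_test) == 5
```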
diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py
index de55fc76..a08532a0 100644
--- a/tests/test_rf_dataset.py
+++ b/tests/test_rf_dataset.py
@@ -4,8 +4,7 @@
 import shutil
 from sklearn.datasets import make_classification
 import dislib as ds
-from dislib.commons.rf import data
-from dislib.commons.rf import test_split
+from dislib.trees import data, test_split
 from dislib.data.array import Array
 import numpy as np
 from sys import float_info
@@ -83,10 +82,6 @@ def test_rf_dataset(self):
         with self.assertRaises(TypeError):
             rf_dataset.get_n_features()
 
-        # Task must be classification or regression
-        with self.assertRaises(ValueError):
-            rf_dataset = data.transform_to_rf_dataset(x_ds_1, y_ds_1, "aaa")
-
         # Validate dimension
         rf_dataset = data.RfBaseDataset(
             samples_path_1, targets_path_1, features_path_f
@@ -104,10 +99,10 @@ def test_rf_dataset(self):
 
         # Dataset creation
         rf_regr = data.transform_to_rf_dataset(
-            x_ds_1, y_ds_1, "regression", features_file=True
+            x_ds_1, y_ds_1, data.RfRegressorDataset, features_file=True
         )
         rf_class = data.transform_to_rf_dataset(
-            x_ds_1, y_ds_1, "classification", features_file=True
+            x_ds_1, y_ds_1, data.RfClassifierDataset, features_file=True
         )
         self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900)
         self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10)
diff --git a/tests/test_rf_regressor.py b/tests/test_rf_regressor.py
index 36da50f7..83c40ba4 100644
--- a/tests/test_rf_regressor.py
+++ b/tests/test_rf_regressor.py
@@ -24,10 +24,10 @@ def test_make_regression(self):
             shuffle=True,
             random_state=0,
         )
-        x_train = ds.array(x[: len(x) // 2], (300, 10))
-        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))
 
         rf = RandomForestRegressor(random_state=0)
 
@@ -35,7 +35,7 @@ def test_make_regression(self):
         accuracy1 = compss_wait_on(rf.score(x_test, y_test))
 
         y_pred = rf.predict(x_test).collect()
-        y_true = y[len(y) // 2:]
+        y_true = y[1::2]
         accuracy2 = _determination_coefficient(y_true, y_pred)
 
         self.assertGreater(accuracy1, 0.85)
@@ -51,10 +51,10 @@ def test_make_regression_predict_and_distr_depth(self):
             shuffle=True,
             random_state=0,
         )
-        x_train = ds.array(x[: len(x) // 2], (300, 10))
-        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))
 
         rf = RandomForestRegressor(distr_depth=2, random_state=0)
 
@@ -62,7 +62,7 @@ def test_make_regression_predict_and_distr_depth(self):
         accuracy1 = compss_wait_on(rf.score(x_test, y_test))
 
         y_pred = rf.predict(x_test).collect()
-        y_true = y[len(y) // 2:]
+        y_true = y[1::2]
         accuracy2 = _determination_coefficient(y_true, y_pred)
 
         self.assertGreater(accuracy1, 0.85)
@@ -78,10 +78,10 @@ def test_make_regression_sklearn_max_predict(self):
             shuffle=True,
             random_state=0,
         )
-        x_train = ds.array(x[: len(x) // 2], (300, 10))
-        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
-        x_test = ds.array(x[len(x) // 2:], (300, 10))
-        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+        x_train = ds.array(x[::2], (300, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[1::2], (300, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))
 
         rf = RandomForestRegressor(random_state=0, sklearn_max=10)
 
@@ -89,7 +89,7 @@ def test_make_regression_sklearn_max_predict(self):
         accuracy1 = compss_wait_on(rf.score(x_test, y_test))
 
         y_pred = rf.predict(x_test).collect()
-        y_true = y[len(y) // 2:]
+        y_true = y[1::2]
         accuracy2 = _determination_coefficient(y_true, y_pred)
 
         self.assertGreater(accuracy1, 0.85)