From 7430eef67675cf2f501702b2ecc876f485de67e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 16 Jul 2021 16:15:32 +0200 Subject: [PATCH 01/46] Added saving and loading utils --- dislib/utils/__init__.py | 3 +- dislib/utils/saving.py | 363 ++++++++++ tests/test_saving_cbor.py | 1403 +++++++++++++++++++++++++++++++++++++ tests/test_saving_json.py | 1403 +++++++++++++++++++++++++++++++++++++ 4 files changed, 3171 insertions(+), 1 deletion(-) create mode 100644 dislib/utils/saving.py create mode 100644 tests/test_saving_cbor.py create mode 100644 tests/test_saving_json.py diff --git a/dislib/utils/__init__.py b/dislib/utils/__init__.py index 34b84166..299601a7 100644 --- a/dislib/utils/__init__.py +++ b/dislib/utils/__init__.py @@ -1,3 +1,4 @@ from dislib.utils.base import shuffle +from dislib.utils.saving import save_model, load_model -__all__ = ['shuffle'] +__all__ = ["shuffle", "save_model", "load_model"] diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py new file mode 100644 index 00000000..31159a7f --- /dev/null +++ b/dislib/utils/saving.py @@ -0,0 +1,363 @@ +import json +import os +import numpy as np + +from pycompss.runtime.management.classes import Future +from pycompss.api.api import compss_wait_on + +from sklearn.svm import SVC as SklearnSVC +from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree._tree import Tree as SklearnTree +from scipy.sparse import csr_matrix + +import dislib as ds +import dislib.classification +import dislib.cluster +import dislib.recommendation +import dislib.regression +from dislib.data.array import Array +from dislib.classification.rf.decision_tree import ( + DecisionTreeClassifier, + _Node, + _InnerNodeInfo, + _LeafInfo, + _SkTreeWrapper, +) + +try: + import cbor2 +except ImportError: + cbor2 = None + +# Dislib models with saving tested (model: str -> module: str) +_implemented_models = { + "KMeans": "cluster", + "GaussianMixture": "cluster", + "CascadeSVM": "classification", + "RandomForestClassifier": "classification", + "ALS": "recommendation", + "LinearRegression": "regression", + "Lasso": "regression", +} + +# Classes used by models +_dislib_classes = { + "KMeans": dislib.cluster.KMeans, + "DecisionTreeClassifier": DecisionTreeClassifier, + "_Node": _Node, + "_InnerNodeInfo": _InnerNodeInfo, + "_LeafInfo": _LeafInfo, + "_SkTreeWrapper": _SkTreeWrapper, +} + +_sklearn_classes = { + "SVC": SklearnSVC, + "DecisionTreeClassifier": SklearnDTClassifier, +} + + +def save_model(model, filepath, overwrite=True, save_format=None): + """Saves a model to a file. + Usage: + >>> from dislib.cluster import KMeans + >>> from dislib.utils import save_model, load_model + >>> import numpy as np + >>> import dislib as ds + >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]]) + >>> x_train = ds.array(x, (2, 2)) + >>> model = KMeans(n_clusters=2, random_state=0) + >>> model.fit(x_train) + >>> save_model(model, '/tmp/model') + >>> loaded_model = load_model('/tmp/model') + >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2)) + >>> model_pred = model.predict(x_test) + >>> loaded_model_pred = loaded_model.predict(x_test) + >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect()) + + The file contains: + - the model's class + - the model's attributes + The model is synchronized before saving and can be reinstantiated in the + exact same state, without any of the code used for model definition or + fitting. + Args: + model: `dislib` model instance to be saved. 
+        filepath: String path where to save the model.
+        overwrite: Whether to overwrite any existing model at the target
+            location. If False and a file already exists there, the model
+            is not saved.
+        save_format: Format used to save the model. Defaults to 'json'.
+    """
+    # Check overwrite
+    if not overwrite and os.path.isfile(filepath):
+        return
+
+    # Check for dislib model
+    model_name = model.__class__.__name__
+    if model_name not in _implemented_models.keys():
+        raise NotImplementedError(
+            "Saving has only been implemented for the following models:\n%s"
+            % _implemented_models.keys()
+        )
+
+    # Synchronize model
+    if model_name == "RandomForestClassifier":
+        _sync_rf(model)
+
+    _sync_obj(model.__dict__)
+    model_metadata = model.__dict__.copy()
+    model_metadata["model_name"] = model_name
+
+    # Save model
+    default_format = "json"
+    save_format = save_format or default_format
+    if save_format == "json":
+        with open(filepath, "w") as f:
+            json.dump(model_metadata, f, default=_encode_helper)
+    elif save_format == "cbor":
+        if cbor2 is None:
+            raise ModuleNotFoundError("No module named 'cbor2'")
+        with open(filepath, "wb") as f:
+            cbor2.dump(model_metadata, f, default=_encode_helper_cbor)
+    else:
+        raise ValueError("Save format must be either 'json' or 'cbor'.")
+
+
+def load_model(filepath, load_format=None):
+    """Loads a model from a file.
+    Usage:
+    >>> from dislib.cluster import KMeans
+    >>> from dislib.utils import save_model, load_model
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
+    >>> x_train = ds.array(x, (2, 2))
+    >>> model = KMeans(n_clusters=2, random_state=0)
+    >>> model.fit(x_train)
+    >>> save_model(model, '/tmp/model')
+    >>> loaded_model = load_model('/tmp/model')
+    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
+    >>> model_pred = model.predict(x_test)
+    >>> loaded_model_pred = loaded_model.predict(x_test)
+    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
+
+    The file must contain:
+        - the model's class
+        - the model's attributes
+    The model is reinstantiated in the exact same state in which it was saved,
+    without any of the code used for model definition or fitting.
+    Args:
+        filepath: String path of the saved model.
+        load_format: Format used to load the model. Defaults to 'json'.
+    """
+    # Load model
+    default_format = "json"
+    load_format = load_format or default_format
+
+    if load_format == "json":
+        with open(filepath, "r") as f:
+            model_metadata = json.load(f, object_hook=_decode_helper)
+    elif load_format == "cbor":
+        if cbor2 is None:
+            raise ModuleNotFoundError("No module named 'cbor2'")
+        with open(filepath, "rb") as f:
+            model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor)
+    else:
+        raise ValueError("Load format must be either 'json' or 'cbor'.")
+
+    # Check for dislib model
+    model_name = model_metadata["model_name"]
+    if model_name not in _implemented_models.keys():
+        raise NotImplementedError(
+            "Loading has only been implemented for the following models:\n%s"
+            % _implemented_models.keys()
+        )
+    del model_metadata["model_name"]
+
+    # Create model
+    model_module = getattr(ds, _implemented_models[model_name])
+    model_class = getattr(model_module, model_name)
+    model = model_class()
+    model.__dict__.update(model_metadata)
+
+    # Set class methods
+    if model_name == "CascadeSVM" and "kernel" in model_metadata:
+        try:
+            model._kernel_f = getattr(
+                model, model._name_to_kernel[model_metadata["kernel"]]
+            )
+        except AttributeError:
+            model._kernel_f = getattr(model, "_rbf_kernel")
+
+    return model
+
+
+def _encode_helper_cbor(encoder, obj):
+    """Special encoder wrapper for dislib objects using CBOR."""
+    encoder.encode(_encode_helper(obj))
+
+
+def _decode_helper_cbor(decoder, obj):
+    """Special decoder wrapper for dislib objects using CBOR."""
+    return _decode_helper(obj)
+
+
+def _encode_helper(obj):
+    """Special encoder for dislib objects."""
+    if isinstance(obj, np.generic):
+        return obj.item()
+    elif isinstance(obj, range):
+        return {
+            "class_name": "range",
+            "start": obj.start,
+            "stop": obj.stop,
+            "step": obj.step,
+        }
+    elif isinstance(obj, csr_matrix):
+        return {
+            "class_name": "csr_matrix",
+            **obj.__dict__,
+        }
+    elif isinstance(obj, np.ndarray):
+        return {
+            "class_name": "ndarray",
+            "dtype_list": len(obj.dtype.descr) > 1,
+            "dtype": str(obj.dtype),
+            "items": obj.tolist(),
+        }
+    elif isinstance(obj, Array):
+        return {"class_name": "dsarray", **obj.__dict__}
+    elif isinstance(obj, np.random.RandomState):
+        return {"class_name": "RandomState", "items": obj.get_state()}
+    elif callable(obj):
+        return {
+            "class_name": "callable",
+            "module": obj.__module__,
+            "name": obj.__name__,
+        }
+    elif isinstance(obj, SklearnTree):
+        return {
+            "class_name": obj.__class__.__name__,
+            "n_features": obj.n_features,
+            "n_classes": obj.n_classes,
+            "n_outputs": obj.n_outputs,
+            "items": obj.__getstate__(),
+        }
+    elif isinstance(
+        obj, tuple(_dislib_classes.values()) + tuple(_sklearn_classes.values())
+    ):
+        return {
+            "class_name": obj.__class__.__name__,
+            "module_name": obj.__module__,
+            "items": obj.__dict__,
+        }
+    raise TypeError("Not serializable:", obj)
+
+
+def _decode_helper(obj):
+    """Special decoder for dislib objects."""
+    if isinstance(obj, dict) and "class_name" in obj:
+
+        class_name = obj["class_name"]
+        if class_name == "range":
+            return range(obj["start"], obj["stop"], obj["step"])
+        elif class_name == "tuple":
+            return tuple(obj["items"])
+        elif class_name == "ndarray":
+            if obj["dtype_list"]:
+                items = list(map(tuple, obj["items"]))
+                return np.rec.fromrecords(items, dtype=eval(obj["dtype"]))
+            else:
+                return np.array(obj["items"], dtype=obj["dtype"])
+        elif class_name == "csr_matrix":
+            return csr_matrix(
+                (obj["data"], obj["indices"], obj["indptr"]),
+                shape=obj["_shape"],
+            )
+        elif class_name == "dsarray":
+            return Array(
+                blocks=obj["_blocks"],
+                
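# note: nested values (ndarray blocks, csr matrices) have already
+                # been decoded bottom-up by the object_hook, so they can be
+                # passed to the Array constructor as-is
+                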
top_left_shape=obj["_top_left_shape"], + reg_shape=obj["_reg_shape"], + shape=obj["_shape"], + sparse=obj["_sparse"], + delete=obj["_delete"], + ) + elif class_name == "RandomState": + random_state = np.random.RandomState() + random_state.set_state(_decode_helper(obj["items"])) + return random_state + elif class_name == "Tree": + dict_ = _decode_helper(obj["items"]) + model = SklearnTree( + obj["n_features"], obj["n_classes"], obj["n_outputs"] + ) + model.__setstate__(dict_) + return model + elif ( + class_name in _dislib_classes.keys() + and "dislib" in obj["module_name"] + ): + dict_ = _decode_helper(obj["items"]) + if class_name == "DecisionTreeClassifier": + model = _dislib_classes[obj["class_name"]]( + try_features=dict_.pop("try_features"), + max_depth=dict_.pop("max_depth"), + distr_depth=dict_.pop("distr_depth"), + sklearn_max=dict_.pop("sklearn_max"), + bootstrap=dict_.pop("bootstrap"), + random_state=dict_.pop("random_state"), + ) + elif class_name == "_SkTreeWrapper": + sk_tree = _decode_helper(dict_.pop("sk_tree")) + model = _dislib_classes[obj["class_name"]](sk_tree) + else: + model = _dislib_classes[obj["class_name"]]() + model.__dict__.update(dict_) + return model + elif ( + class_name in _sklearn_classes.keys() + and "sklearn" in obj["module_name"] + ): + dict_ = _decode_helper(obj["items"]) + model = _sklearn_classes[obj["class_name"]]() + model.__dict__.update(dict_) + return model + elif class_name == "callable": + if obj["module"] == "numpy": + return getattr(np, obj["name"]) + return None + + return obj + + +def _sync_obj(obj): + """Recursively synchronizes the Future objects of a list or dictionary.""" + if isinstance(obj, dict): + iterator = iter(obj.items()) + elif isinstance(obj, list): + iterator = iter(enumerate(obj)) + else: + raise ValueError("Expected dict or list and received %s." % type(obj)) + + for key, val in iterator: + if isinstance(val, (dict, list)): + _sync_obj(obj[key]) + else: + obj[key] = compss_wait_on(val) + if isinstance(obj[key], Future): + raise TypeError( + "Could not synchronize Future (%s, %s)." 
% (key, val)
+                )
+            if hasattr(obj[key], "__dict__"):
+                _sync_obj(obj[key].__dict__)
+
+
+def _sync_rf(rf):
+    """Synchronizes the `try_features` and `n_classes` attributes of the
+    forest's trees.
+    """
+    if isinstance(rf.trees[0].try_features, Future):
+        try_features = compss_wait_on(rf.trees[0].try_features)
+        n_classes = compss_wait_on(rf.trees[0].n_classes)
+        for tree in rf.trees:
+            tree.try_features = try_features
+            tree.n_classes = n_classes
diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py
new file mode 100644
index 00000000..64cd534a
--- /dev/null
+++ b/tests/test_saving_cbor.py
@@ -0,0 +1,1403 @@
+import unittest
+
+import numpy as np
+from numpy.random.mtrand import RandomState
+from scipy.sparse import csr_matrix
+from sklearn import datasets
+from sklearn.cluster import KMeans as SKMeans
+from sklearn.metrics import r2_score
+from sklearn.datasets import make_classification
+from sklearn.datasets import make_blobs, load_iris
+
+import dislib as ds
+from dislib.cluster import KMeans
+from dislib.cluster import GaussianMixture
+from dislib.classification import CascadeSVM
+from dislib.classification import RandomForestClassifier
+from dislib.regression import Lasso
+from dislib.regression import LinearRegression
+from dislib.recommendation import ALS
+from dislib.utils import save_model, load_model
+
+from pycompss.api.api import compss_wait_on
+
+
+class KMeansSavingTestCBOR(unittest.TestCase):
+    filepath = "tests/files/saving/kmeans.cbor"
+
+    def test_init_params_kmeans(self):
+        """Tests that KMeans correctly sets the initialization
+        parameters."""
+        n_clusters = 2
+        max_iter = 1
+        tol = 1e-4
+        seed = 666
+        arity = 2
+        init = "random"
+
+        km = KMeans(
+            n_clusters=n_clusters,
+            max_iter=max_iter,
+            tol=tol,
+            arity=arity,
+            random_state=seed,
+        )
+        save_model(km, self.filepath, save_format="cbor")
+        km2 = load_model(self.filepath, load_format="cbor")
+
+        expected = (n_clusters, init, max_iter, tol, arity)
+        real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity)
+        real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, km2.arity)
+        self.assertEqual(expected, real)
+        self.assertEqual(expected, real2)
+
+    def test_fit_kmeans(self):
+        """Tests that the fit method returns the expected centers using toy
+        data.
+ """ + arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) + x = ds.array(arr, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666, verbose=False) + km.fit(x) + + expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue((km.centers == expected_centers).all()) + self.assertTrue((km2.centers == expected_centers).all()) + + def test_predict_kmeans(self): + """Tests that labels are correctly predicted using toy data.""" + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + arr1 = np.array([p1, p2, p3, p4]) + x = ds.array(arr1, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666) + km.fit(x) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + p5, p6 = [10, 10], [-10, -10] + + arr2 = np.array([p1, p2, p3, p4, p5, p6]) + x_test = ds.array(arr2, block_size=(2, 2)) + + labels = km.predict(x_test).collect() + labels2 = km2.predict(x_test).collect() + expected_labels = np.array([0, 0, 1, 1, 0, 1]) + + self.assertTrue(np.array_equal(labels, expected_labels)) + self.assertTrue(np.array_equal(labels2, expected_labels)) + + def test_fit_predict_kmeans(self): + """Tests fit_predict.""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + + x_train = ds.array(x_filtered, block_size=(300, 2)) + + kmeans = KMeans(n_clusters=3, random_state=170) + labels = kmeans.fit_predict(x_train).collect() + + save_model(kmeans, self.filepath, save_format="cbor") + kmeans = load_model(self.filepath, load_format="cbor") + + skmeans = SKMeans(n_clusters=3, random_state=170) + sklabels = skmeans.fit_predict(x_filtered) + + centers = np.array( + [ + [-8.941375656533449, -5.481371322614891], + [-4.524023204953875, 0.06235042593214654], + [2.332994701667008, 0.37681003933082696], + ] + ) + + self.assertTrue(np.allclose(centers, kmeans.centers)) + self.assertTrue(np.allclose(labels, sklabels)) + + def test_sparse_kmeans(self): + """Tests K-means produces the same results using dense and sparse + data structures.""" + file_ = "tests/files/libsvm/2" + + x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) + x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) + + kmeans = KMeans(random_state=170) + kmeans.fit(x_sp) + + save_model(kmeans, self.filepath, save_format="cbor") + kmeans2 = load_model(self.filepath, load_format="cbor") + + y_sparse = kmeans.predict(x_sp).collect() + y_sparse2 = kmeans2.predict(x_sp).collect() + + sparse_c = kmeans.centers.toarray() + sparse_c2 = kmeans2.centers.toarray() + + kmeans = KMeans(random_state=170) + + y_dense = kmeans.fit_predict(x_ds).collect() + dense_c = kmeans.centers + + self.assertTrue(np.allclose(sparse_c, dense_c)) + self.assertTrue(np.allclose(sparse_c2, dense_c)) + self.assertTrue(np.array_equal(y_sparse, y_dense)) + self.assertTrue(np.array_equal(y_sparse2, y_dense)) + + def test_init_kmeans(self): + # With dense data + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + x_train = ds.array(x_filtered, block_size=(300, 2)) + + init = np.random.random((5, 2)) + km = KMeans(n_clusters=5, init=init) + km.fit(x_train) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(np.array_equal(km.init, init)) + 
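# the loaded model must keep the exact init matrix the user passed
+        # in, while the fitted centers are expected to differ from it
+        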
self.assertTrue(np.array_equal(km2.init, init)) + self.assertFalse(np.array_equal(km.centers, init)) + self.assertFalse(np.array_equal(km2.centers, init)) + + # With sparse data + x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) + init = csr_matrix(np.random.random((5, 2))) + + km = KMeans(n_clusters=5, init=init) + km.fit(x_sp) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) + self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) + + +class GaussianMixtureSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/gm.cbor" + + def test_init_params(self): + """Tests that GaussianMixture params are set""" + n_components = 2 + covariance_type = "diag" + tol = 1e-4 + reg_covar = 1e-5 + max_iter = 3 + init_params = "random" + weights_init = np.array([0.4, 0.6]) + means_init = np.array([[0, 0], [2, 3]]) + precisions_init = "todo" + random_state = RandomState(666) + gm = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + init_params=init_params, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + random_state=random_state, + ) + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + real = ( + gm.n_components, + gm.covariance_type, + gm.tol, + gm.reg_covar, + gm.max_iter, + gm.init_params, + gm.weights_init.tolist(), + gm.means_init.tolist(), + gm.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm.random_state.get_state() + ], + ) + real2 = ( + gm2.n_components, + gm2.covariance_type, + gm2.tol, + gm2.reg_covar, + gm2.max_iter, + gm2.init_params, + gm2.weights_init.tolist(), + gm2.means_init.tolist(), + gm2.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm2.random_state.get_state() + ], + ) + + self.assertEqual(real, real2) + + def test_fit(self): + """Tests GaussianMixture.fit()""" + + x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) + ds_x = ds.array(x, block_size=(3, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x) + + expected_weights = np.array([0.5, 0.5]) + expected_means = np.array([[-2, -2], [2, 2]]) + expected_cov = np.array( + [ + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + ] + ) + expected_pc = np.array( + [ + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + ] + ) + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + gm.weights_ = compss_wait_on(gm.weights_) + gm.means_ = compss_wait_on(gm.means_) + gm.covariances_ = compss_wait_on(gm.covariances_) + gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) + + gm2.weights_ = compss_wait_on(gm2.weights_) + gm2.means_ = compss_wait_on(gm2.means_) + gm2.covariances_ = compss_wait_on(gm2.covariances_) + gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) + + self.assertTrue((np.allclose(gm.weights_, expected_weights))) + self.assertTrue((np.allclose(gm.means_, expected_means))) + self.assertTrue((np.allclose(gm.covariances_, expected_cov))) + 
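# precisions_cholesky_ stores the Cholesky factors of the precision
+        # matrices (the inverses of the covariance matrices)
+        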
self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) + + self.assertTrue((np.allclose(gm2.weights_, expected_weights))) + self.assertTrue((np.allclose(gm2.means_, expected_means))) + self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) + self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) + + def test_predict(self): + """Tests GaussianMixture.predict()""" + x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) + ds_x_train = ds.array(x_train, block_size=(2, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x_train) + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) + ds_x_test = ds.array(x_test, block_size=(2, 2)) + pred = gm.predict(ds_x_test).collect() + pred2 = gm2.predict(ds_x_test).collect() + + self.assertTrue(pred[0] != pred[1]) + self.assertTrue(pred[0] == pred[2] == pred[4]) + self.assertTrue(pred[1] == pred[3] == pred[5]) + + self.assertTrue(pred2[0] != pred2[1]) + self.assertTrue(pred2[0] == pred2[2] == pred2[4]) + self.assertTrue(pred2[1] == pred2[3] == pred2[5]) + + def test_fit_predict(self): + """Tests GaussianMixture.fit_predict()""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) + + ds_x = ds.array(x_filtered, block_size=(300, 2)) + + gm = GaussianMixture(n_components=3, random_state=170) + pred = gm.fit_predict(ds_x).collect() + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + pred2 = gm2.predict(ds_x).collect() + + self.assertEqual(len(pred), 610) + accuracy = np.count_nonzero(pred == y_real) / len(pred) + self.assertGreater(accuracy, 0.99) + + self.assertEqual(len(pred2), 610) + accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) + self.assertGreater(accuracy2, 0.99) + + def test_sparse(self): + """Tests GaussianMixture produces the same results using dense and + sparse data structures""" + file_ = "tests/files/libsvm/2" + + x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) + x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) + + covariance_types = "full", "tied", "diag", "spherical" + + for cov_type in covariance_types: + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_sparse) + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + labels_sparse = gm.predict(x_sparse).collect() + labels_sparse2 = gm2.predict(x_sparse).collect() + + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_dense) + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + labels_dense = gm.predict(x_dense).collect() + labels_dense2 = gm2.predict(x_dense).collect() + + self.assertTrue(np.array_equal(labels_sparse, labels_sparse2)) + self.assertTrue(np.array_equal(labels_sparse, labels_dense)) + self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) + + def test_init_random(self): + """Tests GaussianMixture random initialization""" + x = ds.random_array((50, 3), (10, 3), random_state=0) + gm = GaussianMixture( + init_params="random", n_components=4, arity=2, random_state=170 + ) + gm.fit(x) + save_model(gm, self.filepath, save_format="cbor") + gm2 = 
load_model(self.filepath, load_format="cbor") + self.assertGreater(gm.n_iter, 5) + self.assertGreater(gm2.n_iter, 5) + + def test_means_init_and_weights_init(self): + """Tests GaussianMixture means_init and weights_init parameters""" + x, _ = load_iris(return_X_y=True) + x_ds = ds.array(x, (75, 4)) + weights_init = [1 / 3, 1 / 3, 1 / 3] + means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) + gm = GaussianMixture( + random_state=0, + n_components=3, + weights_init=weights_init, + means_init=means_init, + ) + gm.fit(x_ds) + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + self.assertTrue(gm.converged_) + self.assertTrue(gm2.converged_) + + +class CSVMSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/csvm.cbor" + + def test_init_params(self): + """Test constructor parameters""" + cascade_arity = 3 + max_iter = 1 + tol = 1e-4 + kernel = "rbf" + c = 2 + gamma = 0.1 + check_convergence = True + seed = 666 + verbose = False + + csvm = CascadeSVM( + cascade_arity=cascade_arity, + max_iter=max_iter, + tol=tol, + kernel=kernel, + c=c, + gamma=gamma, + check_convergence=check_convergence, + random_state=seed, + verbose=verbose, + ) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + self.assertEqual(csvm.cascade_arity, cascade_arity) + self.assertEqual(csvm.max_iter, max_iter) + self.assertEqual(csvm.tol, tol) + self.assertEqual(csvm.kernel, kernel) + self.assertEqual(csvm.c, c) + self.assertEqual(csvm.gamma, gamma) + self.assertEqual(csvm.check_convergence, check_convergence) + self.assertEqual(csvm.random_state, seed) + self.assertEqual(csvm.verbose, verbose) + + self.assertEqual(csvm2.cascade_arity, cascade_arity) + self.assertEqual(csvm2.max_iter, max_iter) + self.assertEqual(csvm2.tol, tol) + self.assertEqual(csvm2.kernel, kernel) + self.assertEqual(csvm2.c, c) + self.assertEqual(csvm2.gamma, gamma) + self.assertEqual(csvm2.check_convergence, check_convergence) + self.assertEqual(csvm2.random_state, seed) + self.assertEqual(csvm2.verbose, verbose) + + def test_fit_private_params(self): + kernel = "rbf" + c = 2 + gamma = 0.1 + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm._clf_params["gamma"], gamma) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["gamma"], gamma) + + kernel, c = "linear", 0.3 + csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + + # # check for exception when incorrect kernel is passed + # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) + + def test_fit(self): + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + + csvm = CascadeSVM( + cascade_arity=3, + 
max_iter=5, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(csvm.converged) + self.assertTrue(csvm2.converged) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=1, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + self.assertFalse(csvm.converged) + self.assertEqual(csvm.iterations, 1) + self.assertFalse(csvm2.converged) + self.assertEqual(csvm2.iterations, 1) + + def test_predict(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + # p5 should belong to class 0, p6 to class 1 + p5, p6 = np.array([1, 1]), np.array([-1, -1]) + + x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) + + y_pred = csvm.predict(x_test) + y_pred2 = csvm2.predict(x_test) + + l1, l2, l3, l4, l5, l6 = y_pred.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + l1, l2, l3, l4, l5, l6 = y_pred2.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + def test_score(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + # points are separable, scoring the training dataset should have 100% + # accuracy + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) + + accuracy = compss_wait_on(csvm.score(x_test, y_test)) + accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) + + self.assertEqual(accuracy, 1.0) + self.assertEqual(accuracy2, 1.0) + + def test_decision_func(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + # all points are in the x-axis + p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + # p1 should be equidistant to p3, and p2 to p4 + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + + y_pred = 
csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d1, d2, d3, d4 = y_pred.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + d1, d2, d3, d4 = y_pred2.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + + # p5 and p6 should be in the decision function (distance=0) + p5, p6 = np.array([1, 0]), np.array([-1, 0]) + + x_test = ds.array(np.array([p5, p6]), (1, 2)) + + y_pred = csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d5, d6 = y_pred.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + d5, d6 = y_pred2.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + + def test_sparse(self): + """Tests that C-SVM produces the same results with sparse and dense + data""" + seed = 666 + train = "tests/files/libsvm/3" + + x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) + x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) + + csvm_sp = CascadeSVM(random_state=seed) + csvm_sp.fit(x_sp, y_sp) + save_model(csvm_sp, self.filepath, save_format="cbor") + csvm_sp2 = load_model(self.filepath, load_format="cbor") + + csvm_d = CascadeSVM(random_state=seed) + csvm_d.fit(x_d, y_d) + save_model(csvm_d, self.filepath, save_format="cbor") + csvm_d2 = load_model(self.filepath, load_format="cbor") + + sv_d = csvm_d._clf.support_vectors_ + sv_sp = csvm_sp._clf.support_vectors_.toarray() + sv_d2 = csvm_d2._clf.support_vectors_ + sv_sp2 = csvm_sp2._clf.support_vectors_.toarray() + + self.assertTrue(np.array_equal(sv_d, sv_sp)) + self.assertTrue(np.array_equal(sv_d2, sv_sp2)) + self.assertTrue(np.array_equal(sv_d, sv_d2)) + + coef_d = csvm_d._clf.dual_coef_ + coef_sp = csvm_sp._clf.dual_coef_.toarray() + coef_d2 = csvm_d2._clf.dual_coef_ + coef_sp2 = csvm_sp2._clf.dual_coef_.toarray() + + self.assertTrue(np.array_equal(coef_d, coef_sp)) + self.assertTrue(np.array_equal(coef_d2, coef_sp2)) + self.assertTrue(np.array_equal(coef_d, coef_d2)) + + def test_duplicates(self): + """Tests that C-SVM does not generate duplicate support vectors""" + x = ds.array( + np.array( + [ + [0, 1], + [1, 1], + [0, 1], + [1, 2], + [0, 0], + [2, 2], + [2, 1], + [1, 0], + ] + ), + (2, 2), + ) + + y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + csvm._collect_clf() + csvm2._collect_clf() + self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) + self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) + + +class RFSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/rf.cbor" + + def test_make_classification_score(self): + """Tests RandomForestClassifier fit and score with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = 
load_model(self.filepath, load_format="cbor") + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_predict_and_distr_depth(self): + """Tests RandomForestClassifier fit and predict with a distr_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(distr_depth=2, random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_fit_predict(self): + """Tests RandomForestClassifier fit_predict with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_train).collect() + y_pred2 = rf2.predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict(self): + """Tests RandomForestClassifier predict with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict_proba(self): + """Tests RandomForestClassifier predict_proba with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = 
ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + probabilities = rf.predict_proba(x_test).collect() + probabilities2 = rf2.predict_proba(x_test).collect() + rf.classes = compss_wait_on(rf.classes) + rf2.classes = compss_wait_on(rf2.classes) + y_pred = rf.classes[np.argmax(probabilities, axis=1)] + y_pred2 = rf2.classes[np.argmax(probabilities2, axis=1)] + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_predict(self): + """Tests RandomForestClassifier predict with hard_vote.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier( + random_state=0, sklearn_max=10, hard_vote=True + ) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_score_mix(self): + """Tests RandomForestClassifier score with hard_vote, sklearn_max, + distr_depth and max_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier( + random_state=0, + sklearn_max=100, + distr_depth=2, + max_depth=12, + hard_vote=True, + ) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_iris(self): + """Tests RandomForestClassifier with a minimal example.""" + x, y = datasets.load_iris(return_X_y=True) + ds_fit = ds.array(x[::2], block_size=(30, 2)) + fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) + ds_validate = ds.array(x[1::2], block_size=(30, 2)) + validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) + + rf = RandomForestClassifier( + n_estimators=1, max_depth=1, random_state=0 + ) + rf.fit(ds_fit, fit_y) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) + accuracy2 = 
compss_wait_on(rf2.score(ds_validate, validate_y)) + + # Accuracy should be <= 2/3 for any seed, often exactly equal. + self.assertAlmostEqual(accuracy, 2 / 3) + self.assertAlmostEqual(accuracy2, 2 / 3) + + +class LassoSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/lasso.cbor" + + def test_fit_predict(self): + """Tests fit and predicts methods""" + + np.random.seed(42) + + n_samples, n_features = 50, 100 + X = np.random.randn(n_samples, n_features) + + # Decreasing coef w. alternated signs for visualization + idx = np.arange(n_features) + coef = (-1) ** idx * np.exp(-idx / 10) + coef[10:] = 0 # sparsify coef + y = np.dot(X, coef) + + # Add noise + y += 0.01 * np.random.normal(size=n_samples) + + n_samples = X.shape[0] + X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] + X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] + + lasso = Lasso(lmbd=0.1, max_iter=50) + + lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) + save_model(lasso, self.filepath, save_format="cbor") + lasso2 = load_model(self.filepath, load_format="cbor") + + y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) + r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) + y_pred_lasso2 = lasso2.predict(ds.array(X_test, (25, 100))) + r2_score_lasso2 = r2_score(y_test, y_pred_lasso2.collect()) + + self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124) + self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) + + +class LinearRegressionSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/linear_regression.cbor" + + def test_univariate(self): + """Tests fit() and predict(), univariate.""" + x_data = np.array([1, 2, 3, 4, 5]) + y_data = np.array([2, 1, 1, 2, 4.5]) + + bn, bm = 2, 1 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, bm)) + + reg = LinearRegression() + reg.fit(x, y) + save_model(reg, self.filepath, save_format="cbor") + reg2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) + self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) + self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6)) + self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3)) + + # Predict one sample + x_test = np.array([3]) + test_data = ds.array(x=x_test, block_size=(1, 1)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, 2.1)) + self.assertTrue(np.allclose(pred2, 2.1)) + + # Predict multiple samples + x_test = np.array([3, 5, 6]) + test_data = ds.array(x=x_test, block_size=(bn, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9])) + self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9])) + + def test_univariate_no_intercept(self): + """Tests fit() and predict(), univariate, fit_intercept=False.""" + x_data = np.array([1, 2, 3, 4, 5]) + y_data = np.array([2, 1, 1, 2, 4.5]) + + bn, bm = 2, 1 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, bm)) + + reg = LinearRegression(fit_intercept=False) + reg.fit(x, y) + save_model(reg, self.filepath, save_format="cbor") + reg2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) + self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) + self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) + self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) + + 
# Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.04545455))
+        self.assertTrue(np.allclose(pred2, 2.04545455))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091]))
+        self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 4.0909091]))
+
+    def test_multivariate(self):
+        """Tests fit() and predict(), multivariate."""
+        x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 2
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, 1))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875]))
+        self.assertTrue(
+            np.allclose(reg2.coef_.collect(), [0.421875, 0.296875])
+        )
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.240625))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.240625))
+
+        # Predict one sample
+        x_test = np.array([3, 2])
+        test_data = ds.array(x=x_test, block_size=(1, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.1))
+        self.assertTrue(np.allclose(pred2, 2.1))
+
+        # Predict multiple samples
+        x_test = np.array([[3, 2], [4, 4], [1, 3]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125]))
+        self.assertTrue(np.allclose(pred2, [2.1, 3.115625, 1.553125]))
+
+    def test_multivariate_no_intercept(self):
+        """Tests fit() and predict(), multivariate, fit_intercept=False."""
+        x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 2
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, 1))
+
+        reg = LinearRegression(fit_intercept=False)
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertTrue(
+            np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232])
+        )
+        self.assertTrue(
+            np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232])
+        )
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0))
+
+        # Predict one sample
+        x_test = np.array([3, 2])
+        test_data = ds.array(x=x_test, block_size=(1, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.05649718]))
+        self.assertTrue(np.allclose(pred2, [2.05649718]))
+
+        # Predict multiple samples
+        x_test = np.array([[3, 2], [4, 4], [1, 3]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678]))
+        self.assertTrue(
+            np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678])
+        )
+
+    def test_multivariate_multiobjective(self):
+        """Tests fit() and predict(), multivariate, multiobjective."""
+        x_data = np.array(
+            [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]]
+        )
+        
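# six samples with three features each; together with the three
+        # target columns below, the fitted coef_ is a 3x3 matrix
+        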
y_data = np.array(
+            [
+                [2, 0, 3],
+                [1, 5, 2],
+                [1, 3, 4],
+                [2, 7, 9],
+                [4.5, -1, 4],
+                [0, 0, 0],
+            ]
+        )
+
+        bn, bm = 2, 2
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        # Predict one sample
+        x_test = np.array([3, 2, 1])
+        test_data = ds.array(x=x_test, block_size=(1, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906]))
+        self.assertTrue(
+            np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906])
+        )
+
+        # Predict multiple samples
+        x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(
+            np.allclose(
+                pred,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                pred2,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+
+        # Check attribute values
+        self.assertTrue(
+            np.allclose(
+                reg.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+
+
+def load_movielens(train_ratio=0.9):
+    file = "tests/files/sample_movielens_ratings.csv"
+
+    # 'user_id', 'movie_id', 'rating', 'timestamp'
+
+    data = np.genfromtxt(file, dtype="int", delimiter=",", usecols=range(3))
+
+    # just in case there are movies/users without ratings
+    # movie_id
+    n_m = max(len(np.unique(data[:, 1])), max(data[:, 1]) + 1)
+    # user_id
+    n_u = max(len(np.unique(data[:, 0])), max(data[:, 0]) + 1)
+
+    idx = int(data.shape[0] * train_ratio)
+
+    train_data = data[:idx]
+    test_data = data[idx:]
+
+    train = csr_matrix(
+        (train_data[:, 2], (train_data[:, 0], train_data[:, 1])),
+        shape=(n_u, n_m),
+    )
+
+    test = csr_matrix((test_data[:, 2], (test_data[:, 0], test_data[:, 1])))
+
+    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
+    train_arr = ds.array(train, block_size=(x_size, y_size))
+
+    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
+    test_arr = ds.array(test, block_size=(x_size, y_size))
+
+    return train_arr, test_arr
+
+
+class ALSSavingTestCBOR(unittest.TestCase):
+    filepath = "tests/files/saving/als.cbor"
+
+    def test_init_params(self):
+        # Test all parameters
+        seed = 666
+        n_f = 100
+        lambda_ = 0.001
+        convergence_threshold = 0.1
+        max_iter = 10
+        verbose = True
+        arity = 12
+
+        als = ALS(
+            random_state=seed,
+            n_f=n_f,
+            lambda_=lambda_,
+            tol=convergence_threshold,
+            max_iter=max_iter,
+            verbose=verbose,
+            arity=arity,
+        )
+        save_model(als, self.filepath, save_format="cbor")
+        als2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertEqual(als.random_state, seed)
+        self.assertEqual(als.n_f, n_f)
+        
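# lambda_ is the regularization strength used by the ALS updates
+        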
self.assertEqual(als.lambda_, lambda_) + self.assertEqual(als.tol, convergence_threshold) + self.assertEqual(als.max_iter, max_iter) + self.assertEqual(als.verbose, verbose) + self.assertEqual(als.arity, arity) + self.assertEqual(als2.random_state, seed) + self.assertEqual(als2.n_f, n_f) + self.assertEqual(als2.lambda_, lambda_) + self.assertEqual(als2.tol, convergence_threshold) + self.assertEqual(als2.max_iter, max_iter) + self.assertEqual(als2.verbose, verbose) + self.assertEqual(als2.arity, arity) + + def test_fit(self): + train, test = load_movielens() + + als = ALS( + tol=0.01, + random_state=666, + n_f=100, + verbose=False, + check_convergence=True, + ) + + als.fit(train, test) + self.assertTrue(als.converged) + + als.fit(train) + save_model(als, self.filepath, save_format="cbor") + als2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(als.converged) + self.assertTrue(als2.converged) + + def test_predict(self): + data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) + ratings = csr_matrix(data) + train = ds.array(x=ratings, block_size=(1, 1)) + als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) + als.fit(train) + save_model(als, self.filepath, save_format="cbor") + als2 = load_model(self.filepath, load_format="cbor") + + predictions = als.predict_user(user_id=0) + predictions2 = als2.predict_user(user_id=0) + + # Check that the ratings for user 0 are similar to user 1 because they + # share preferences (third movie), thus it is expected that user 0 + # will rate movie 1 similarly to user 1. + self.assertTrue( + 2.75 < predictions[0] < 3.25 + and predictions[1] < 1 + and predictions[2] > 4.5 + ) + self.assertTrue( + 2.75 < predictions2[0] < 3.25 + and predictions2[1] < 1 + and predictions2[2] > 4.5 + ) + self.assertTrue( + np.array_equal(predictions, predictions2, equal_nan=True) + ) + + +def main(): + unittest.main() + + +if __name__ == "__main__": + main() diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py new file mode 100644 index 00000000..be18474d --- /dev/null +++ b/tests/test_saving_json.py @@ -0,0 +1,1403 @@ +import unittest + +import numpy as np +from numpy.random.mtrand import RandomState +from scipy.sparse import csr_matrix +from sklearn import datasets +from sklearn.cluster import KMeans as SKMeans +from sklearn.metrics import r2_score +from sklearn.datasets import make_classification +from sklearn.datasets import make_blobs, load_iris + +import dislib as ds +from dislib.cluster import KMeans +from dislib.cluster import GaussianMixture +from dislib.classification import CascadeSVM +from dislib.classification import RandomForestClassifier +from dislib.regression import Lasso +from dislib.regression import LinearRegression +from dislib.recommendation import ALS +from dislib.utils import save_model, load_model + +from pycompss.api.api import compss_wait_on + + +class KMeansSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/kmeans.json" + + def test_init_params_kmeans(self): + """Tests that saved and loaded KMeans object correctly sets the initialization + parameters""" + n_clusters = 2 + max_iter = 1 + tol = 1e-4 + seed = 666 + arity = 2 + init = "random" + + km = KMeans( + n_clusters=n_clusters, + max_iter=max_iter, + tol=tol, + arity=arity, + random_state=seed, + ) + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + expected = (n_clusters, init, max_iter, tol, arity) + real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity) + real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, 
km2.arity) + self.assertEqual(expected, real) + self.assertEqual(expected, real2) + + def test_fit_kmeans(self): + """Tests that the fit method returns the expected centers using toy + data. + """ + arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) + x = ds.array(arr, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666, verbose=False) + km.fit(x) + + expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + self.assertTrue((km.centers == expected_centers).all()) + self.assertTrue((km2.centers == expected_centers).all()) + + def test_predict_kmeans(self): + """Tests that labels are correctly predicted using toy data.""" + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + arr1 = np.array([p1, p2, p3, p4]) + x = ds.array(arr1, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666) + km.fit(x) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + p5, p6 = [10, 10], [-10, -10] + + arr2 = np.array([p1, p2, p3, p4, p5, p6]) + x_test = ds.array(arr2, block_size=(2, 2)) + + labels = km.predict(x_test).collect() + labels2 = km2.predict(x_test).collect() + expected_labels = np.array([0, 0, 1, 1, 0, 1]) + + self.assertTrue(np.array_equal(labels, expected_labels)) + self.assertTrue(np.array_equal(labels2, expected_labels)) + + def test_fit_predict_kmeans(self): + """Tests fit_predict.""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + + x_train = ds.array(x_filtered, block_size=(300, 2)) + + kmeans = KMeans(n_clusters=3, random_state=170) + labels = kmeans.fit_predict(x_train).collect() + + save_model(kmeans, self.filepath) + kmeans = load_model(self.filepath) + + skmeans = SKMeans(n_clusters=3, random_state=170) + sklabels = skmeans.fit_predict(x_filtered) + + centers = np.array( + [ + [-8.941375656533449, -5.481371322614891], + [-4.524023204953875, 0.06235042593214654], + [2.332994701667008, 0.37681003933082696], + ] + ) + + self.assertTrue(np.allclose(centers, kmeans.centers)) + self.assertTrue(np.allclose(labels, sklabels)) + + def test_sparse_kmeans(self): + """Tests K-means produces the same results using dense and sparse + data structures.""" + file_ = "tests/files/libsvm/2" + + x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) + x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) + + kmeans = KMeans(random_state=170) + kmeans.fit(x_sp) + + save_model(kmeans, self.filepath) + kmeans2 = load_model(self.filepath) + + y_sparse = kmeans.predict(x_sp).collect() + y_sparse2 = kmeans2.predict(x_sp).collect() + + sparse_c = kmeans.centers.toarray() + sparse_c2 = kmeans2.centers.toarray() + + kmeans = KMeans(random_state=170) + + y_dense = kmeans.fit_predict(x_ds).collect() + dense_c = kmeans.centers + + self.assertTrue(np.allclose(sparse_c, dense_c)) + self.assertTrue(np.allclose(sparse_c2, dense_c)) + self.assertTrue(np.array_equal(y_sparse, y_dense)) + self.assertTrue(np.array_equal(y_sparse2, y_dense)) + + def test_init_kmeans(self): + # With dense data + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + x_train = ds.array(x_filtered, block_size=(300, 2)) + + init = np.random.random((5, 2)) + km = KMeans(n_clusters=5, init=init) + km.fit(x_train) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + self.assertTrue(np.array_equal(km.init, init)) + 
self.assertTrue(np.array_equal(km2.init, init)) + self.assertFalse(np.array_equal(km.centers, init)) + self.assertFalse(np.array_equal(km2.centers, init)) + + # With sparse data + x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) + init = csr_matrix(np.random.random((5, 2))) + + km = KMeans(n_clusters=5, init=init) + km.fit(x_sp) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) + self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) + + +class GaussianMixtureSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/gm.json" + + def test_init_params(self): + """Tests that GaussianMixture params are set""" + n_components = 2 + covariance_type = "diag" + tol = 1e-4 + reg_covar = 1e-5 + max_iter = 3 + init_params = "random" + weights_init = np.array([0.4, 0.6]) + means_init = np.array([[0, 0], [2, 3]]) + precisions_init = "todo" + random_state = RandomState(666) + gm = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + init_params=init_params, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + random_state=random_state, + ) + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + real = ( + gm.n_components, + gm.covariance_type, + gm.tol, + gm.reg_covar, + gm.max_iter, + gm.init_params, + gm.weights_init.tolist(), + gm.means_init.tolist(), + gm.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm.random_state.get_state() + ], + ) + real2 = ( + gm2.n_components, + gm2.covariance_type, + gm2.tol, + gm2.reg_covar, + gm2.max_iter, + gm2.init_params, + gm2.weights_init.tolist(), + gm2.means_init.tolist(), + gm2.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm2.random_state.get_state() + ], + ) + + self.assertEqual(real, real2) + + def test_fit(self): + """Tests GaussianMixture.fit()""" + + x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) + ds_x = ds.array(x, block_size=(3, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x) + + expected_weights = np.array([0.5, 0.5]) + expected_means = np.array([[-2, -2], [2, 2]]) + expected_cov = np.array( + [ + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + ] + ) + expected_pc = np.array( + [ + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + ] + ) + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + gm.weights_ = compss_wait_on(gm.weights_) + gm.means_ = compss_wait_on(gm.means_) + gm.covariances_ = compss_wait_on(gm.covariances_) + gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) + + gm2.weights_ = compss_wait_on(gm2.weights_) + gm2.means_ = compss_wait_on(gm2.means_) + gm2.covariances_ = compss_wait_on(gm2.covariances_) + gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) + + self.assertTrue((np.allclose(gm.weights_, expected_weights))) + self.assertTrue((np.allclose(gm.means_, expected_means))) + self.assertTrue((np.allclose(gm.covariances_, expected_cov))) + self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) + + self.assertTrue((np.allclose(gm2.weights_, 
expected_weights))) + self.assertTrue((np.allclose(gm2.means_, expected_means))) + self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) + self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) + + def test_predict(self): + """Tests GaussianMixture.predict()""" + x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) + ds_x_train = ds.array(x_train, block_size=(2, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x_train) + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) + ds_x_test = ds.array(x_test, block_size=(2, 2)) + pred = gm.predict(ds_x_test).collect() + pred2 = gm2.predict(ds_x_test).collect() + + self.assertTrue(pred[0] != pred[1]) + self.assertTrue(pred[0] == pred[2] == pred[4]) + self.assertTrue(pred[1] == pred[3] == pred[5]) + + self.assertTrue(pred2[0] != pred2[1]) + self.assertTrue(pred2[0] == pred2[2] == pred2[4]) + self.assertTrue(pred2[1] == pred2[3] == pred2[5]) + + def test_fit_predict(self): + """Tests GaussianMixture.fit_predict()""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) + + ds_x = ds.array(x_filtered, block_size=(300, 2)) + + gm = GaussianMixture(n_components=3, random_state=170) + pred = gm.fit_predict(ds_x).collect() + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + pred2 = gm2.predict(ds_x).collect() + + self.assertEqual(len(pred), 610) + accuracy = np.count_nonzero(pred == y_real) / len(pred) + self.assertGreater(accuracy, 0.99) + + self.assertEqual(len(pred2), 610) + accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) + self.assertGreater(accuracy2, 0.99) + + def test_sparse(self): + """Tests GaussianMixture produces the same results using dense and + sparse data structures""" + file_ = "tests/files/libsvm/2" + + x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) + x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) + + covariance_types = "full", "tied", "diag", "spherical" + + for cov_type in covariance_types: + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_sparse) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + labels_sparse = gm.predict(x_sparse).collect() + labels_sparse2 = gm2.predict(x_sparse).collect() + + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_dense) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + labels_dense = gm.predict(x_dense).collect() + labels_dense2 = gm2.predict(x_dense).collect() + + self.assertTrue(np.array_equal(labels_sparse, labels_sparse2)) + self.assertTrue(np.array_equal(labels_sparse, labels_dense)) + self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) + + def test_init_random(self): + """Tests GaussianMixture random initialization""" + x = ds.random_array((50, 3), (10, 3), random_state=0) + gm = GaussianMixture( + init_params="random", n_components=4, arity=2, random_state=170 + ) + gm.fit(x) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + self.assertGreater(gm.n_iter, 5) + self.assertGreater(gm2.n_iter, 5) + + def test_means_init_and_weights_init(self): + """Tests GaussianMixture means_init and weights_init parameters""" + x, _ = load_iris(return_X_y=True) + x_ds = ds.array(x, (75, 4)) + weights_init = [1 / 3, 
1 / 3, 1 / 3] + means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) + gm = GaussianMixture( + random_state=0, + n_components=3, + weights_init=weights_init, + means_init=means_init, + ) + gm.fit(x_ds) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + self.assertTrue(gm.converged_) + self.assertTrue(gm2.converged_) + + +class CSVMSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/csvm.json" + + def test_init_params(self): + """Test constructor parameters""" + cascade_arity = 3 + max_iter = 1 + tol = 1e-4 + kernel = "rbf" + c = 2 + gamma = 0.1 + check_convergence = True + seed = 666 + verbose = False + + csvm = CascadeSVM( + cascade_arity=cascade_arity, + max_iter=max_iter, + tol=tol, + kernel=kernel, + c=c, + gamma=gamma, + check_convergence=check_convergence, + random_state=seed, + verbose=verbose, + ) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + self.assertEqual(csvm.cascade_arity, cascade_arity) + self.assertEqual(csvm.max_iter, max_iter) + self.assertEqual(csvm.tol, tol) + self.assertEqual(csvm.kernel, kernel) + self.assertEqual(csvm.c, c) + self.assertEqual(csvm.gamma, gamma) + self.assertEqual(csvm.check_convergence, check_convergence) + self.assertEqual(csvm.random_state, seed) + self.assertEqual(csvm.verbose, verbose) + + self.assertEqual(csvm2.cascade_arity, cascade_arity) + self.assertEqual(csvm2.max_iter, max_iter) + self.assertEqual(csvm2.tol, tol) + self.assertEqual(csvm2.kernel, kernel) + self.assertEqual(csvm2.c, c) + self.assertEqual(csvm2.gamma, gamma) + self.assertEqual(csvm2.check_convergence, check_convergence) + self.assertEqual(csvm2.random_state, seed) + self.assertEqual(csvm2.verbose, verbose) + + def test_fit_private_params(self): + kernel = "rbf" + c = 2 + gamma = 0.1 + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm._clf_params["gamma"], gamma) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["gamma"], gamma) + + kernel, c = "linear", 0.3 + csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + + # # check for exception when incorrect kernel is passed + # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) + + def test_fit(self): + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=5, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + self.assertTrue(csvm.converged) + self.assertTrue(csvm2.converged) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=1, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + 
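# save_model() synchronizes the model before writing it out, so the
+        # copy reloaded below restores the exact state reached by fit():
+        #  +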
save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + self.assertFalse(csvm.converged) + self.assertEqual(csvm.iterations, 1) + self.assertFalse(csvm2.converged) + self.assertEqual(csvm2.iterations, 1) + + def test_predict(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + # p5 should belong to class 0, p6 to class 1 + p5, p6 = np.array([1, 1]), np.array([-1, -1]) + + x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) + + y_pred = csvm.predict(x_test) + y_pred2 = csvm2.predict(x_test) + + l1, l2, l3, l4, l5, l6 = y_pred.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + l1, l2, l3, l4, l5, l6 = y_pred2.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + def test_score(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + # points are separable, scoring the training dataset should have 100% + # accuracy + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) + + accuracy = compss_wait_on(csvm.score(x_test, y_test)) + accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) + + self.assertEqual(accuracy, 1.0) + self.assertEqual(accuracy2, 1.0) + + def test_decision_func(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + # all points are in the x-axis + p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + # p1 should be equidistant to p3, and p2 to p4 + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + + y_pred = csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d1, d2, d3, d4 = y_pred.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + d1, d2, d3, d4 = y_pred2.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + + # p5 and p6 should be in the decision function (distance=0) + p5, p6 = np.array([1, 0]), np.array([-1, 0]) + + x_test = ds.array(np.array([p5, p6]), (1, 2)) + + y_pred = csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d5, d6 = y_pred.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + 
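# The reloaded model must place the same points on the decision
+        # boundary (distance 0):
+        #  +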
d5, d6 = y_pred2.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + + def test_sparse(self): + """Tests that C-SVM produces the same results with sparse and dense + data""" + seed = 666 + train = "tests/files/libsvm/3" + + x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) + x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) + + csvm_sp = CascadeSVM(random_state=seed) + csvm_sp.fit(x_sp, y_sp) + save_model(csvm_sp, self.filepath) + csvm_sp2 = load_model(self.filepath) + + csvm_d = CascadeSVM(random_state=seed) + csvm_d.fit(x_d, y_d) + save_model(csvm_d, self.filepath) + csvm_d2 = load_model(self.filepath) + + sv_d = csvm_d._clf.support_vectors_ + sv_sp = csvm_sp._clf.support_vectors_.toarray() + sv_d2 = csvm_d2._clf.support_vectors_ + sv_sp2 = csvm_sp2._clf.support_vectors_.toarray() + + self.assertTrue(np.array_equal(sv_d, sv_sp)) + self.assertTrue(np.array_equal(sv_d2, sv_sp2)) + self.assertTrue(np.array_equal(sv_d, sv_d2)) + + coef_d = csvm_d._clf.dual_coef_ + coef_sp = csvm_sp._clf.dual_coef_.toarray() + coef_d2 = csvm_d2._clf.dual_coef_ + coef_sp2 = csvm_sp2._clf.dual_coef_.toarray() + + self.assertTrue(np.array_equal(coef_d, coef_sp)) + self.assertTrue(np.array_equal(coef_d2, coef_sp2)) + self.assertTrue(np.array_equal(coef_d, coef_d2)) + + def test_duplicates(self): + """Tests that C-SVM does not generate duplicate support vectors""" + x = ds.array( + np.array( + [ + [0, 1], + [1, 1], + [0, 1], + [1, 2], + [0, 0], + [2, 2], + [2, 1], + [1, 0], + ] + ), + (2, 2), + ) + + y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + csvm._collect_clf() + csvm2._collect_clf() + self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) + self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) + + +class RFSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/rf.json" + + def test_make_classification_score(self): + """Tests RandomForestClassifier fit and score with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_predict_and_distr_depth(self): + """Tests RandomForestClassifier fit and predict with a distr_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(distr_depth=2, random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 
= load_model(self.filepath) + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_fit_predict(self): + """Tests RandomForestClassifier fit_predict with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + y_pred = rf.predict(x_train).collect() + y_pred2 = rf2.predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict(self): + """Tests RandomForestClassifier predict with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict_proba(self): + """Tests RandomForestClassifier predict_proba with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + probabilities = rf.predict_proba(x_test).collect() + probabilities2 = rf2.predict_proba(x_test).collect() + rf.classes = compss_wait_on(rf.classes) + rf2.classes = compss_wait_on(rf2.classes) + y_pred = rf.classes[np.argmax(probabilities, axis=1)] + y_pred2 = rf2.classes[np.argmax(probabilities2, axis=1)] + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_predict(self): + """Tests RandomForestClassifier predict with hard_vote.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + 
n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier( + random_state=0, sklearn_max=10, hard_vote=True + ) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_score_mix(self): + """Tests RandomForestClassifier score with hard_vote, sklearn_max, + distr_depth and max_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier( + random_state=0, + sklearn_max=100, + distr_depth=2, + max_depth=12, + hard_vote=True, + ) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_iris(self): + """Tests RandomForestClassifier with a minimal example.""" + x, y = datasets.load_iris(return_X_y=True) + ds_fit = ds.array(x[::2], block_size=(30, 2)) + fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) + ds_validate = ds.array(x[1::2], block_size=(30, 2)) + validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) + + rf = RandomForestClassifier( + n_estimators=1, max_depth=1, random_state=0 + ) + rf.fit(ds_fit, fit_y) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) + accuracy2 = compss_wait_on(rf2.score(ds_validate, validate_y)) + + # Accuracy should be <= 2/3 for any seed, often exactly equal. + self.assertAlmostEqual(accuracy, 2 / 3) + self.assertAlmostEqual(accuracy2, 2 / 3) + + +class LassoSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/lasso.json" + + def test_fit_predict(self): + """Tests fit and predicts methods""" + + np.random.seed(42) + + n_samples, n_features = 50, 100 + X = np.random.randn(n_samples, n_features) + + # Decreasing coef w. 
alternated signs for visualization
+        idx = np.arange(n_features)
+        coef = (-1) ** idx * np.exp(-idx / 10)
+        coef[10:] = 0  # sparsify coef
+        y = np.dot(X, coef)
+
+        # Add noise
+        y += 0.01 * np.random.normal(size=n_samples)
+
+        n_samples = X.shape[0]
+        X_train, y_train = X[: n_samples // 2], y[: n_samples // 2]
+        X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
+
+        lasso = Lasso(lmbd=0.1, max_iter=50)
+
+        lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1)))
+        save_model(lasso, self.filepath)
+        lasso2 = load_model(self.filepath)
+
+        y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100)))
+        r2_score_lasso = r2_score(y_test, y_pred_lasso.collect())
+        y_pred_lasso2 = lasso2.predict(ds.array(X_test, (25, 100)))
+        r2_score_lasso2 = r2_score(y_test, y_pred_lasso2.collect())
+
+        self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124)
+        self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124)
+
+
+class LinearRegressionSavingTestJSON(unittest.TestCase):
+    filepath = "tests/files/saving/linear_regression.json"
+
+    def test_univariate(self):
+        """Tests fit() and predict(), univariate."""
+        x_data = np.array([1, 2, 3, 4, 5])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 1
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath)
+        reg2 = load_model(self.filepath)
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), 0.6))
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3))
+        self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3))
+
+        # Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.1))
+        self.assertTrue(np.allclose(pred2, 2.1))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9]))
+        self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9]))
+
+    def test_univariate_no_intercept(self):
+        """Tests fit() and predict(), univariate, fit_intercept=False."""
+        x_data = np.array([1, 2, 3, 4, 5])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 1
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression(fit_intercept=False)
+        reg.fit(x, y)
+        save_model(reg, self.filepath)
+        reg2 = load_model(self.filepath)
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818))
+        self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818))
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0))
+
+        # Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.04545455))
+        self.assertTrue(np.allclose(pred2, 2.04545455))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091]))
+        self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909,
4.0909091])) + + def test_multivariate(self): + """Tests fit() and predict(), multivariate.""" + x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) + y_data = np.array([2, 1, 1, 2, 4.5]) + + bn, bm = 2, 2 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, 1)) + + reg = LinearRegression() + reg.fit(x, y) + save_model(reg, self.filepath) + reg2 = load_model(self.filepath) + + self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) + self.assertTrue( + np.allclose(reg2.coef_.collect(), [0.421875, 0.296875]) + ) + self.assertTrue(np.allclose(reg.intercept_.collect(), 0.240625)) + self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.240625)) + + # Predict one sample + x_test = np.array([3, 2]) + test_data = ds.array(x=x_test, block_size=(1, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, 2.1)) + self.assertTrue(np.allclose(pred2, 2.1)) + + # Predict multiple samples + x_test = np.array([[3, 2], [4, 4], [1, 3]]) + test_data = ds.array(x=x_test, block_size=(bn, bm)) + pred = reg.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) + + def test_multivariate_no_intercept(self): + """Tests fit() and predict(), multivariate, fit_intercept=False.""" + x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) + y_data = np.array([2, 1, 1, 2, 4.5]) + + bn, bm = 2, 2 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, 1)) + + reg = LinearRegression(fit_intercept=False) + reg.fit(x, y) + save_model(reg, self.filepath) + reg2 = load_model(self.filepath) + + self.assertTrue( + np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) + ) + self.assertTrue( + np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232]) + ) + self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) + self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) + + # Predict one sample + x_test = np.array([3, 2]) + test_data = ds.array(x=x_test, block_size=(1, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.05649718])) + self.assertTrue(np.allclose(pred2, [2.05649718])) + + # Predict multiple samples + x_test = np.array([[3, 2], [4, 4], [1, 3]]) + test_data = ds.array(x=x_test, block_size=(bn, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678])) + self.assertTrue( + np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) + ) + + def test_multivariate_multiobjective(self): + """Tests fit() and predict(), multivariate, multiobjective.""" + x_data = np.array( + [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]] + ) + y_data = np.array( + [ + [2, 0, 3], + [1, 5, 2], + [1, 3, 4], + [2, 7, 9], + [4.5, -1, 4], + [0, 0, 0], + ] + ) + + bn, bm = 2, 2 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, bm)) + + reg = LinearRegression() + reg.fit(x, y) + save_model(reg, self.filepath) + reg2 = load_model(self.filepath) + + # Predict one sample + x_test = np.array([3, 2, 1]) + test_data = ds.array(x=x_test, block_size=(1, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906])) + self.assertTrue( + np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906]) + ) + + # Predict 
multiple samples
+        x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(
+            np.allclose(
+                pred,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                pred2,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+
+        # Check attributes values
+        self.assertTrue(
+            np.allclose(
+                reg.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+
+
+def load_movielens(train_ratio=0.9):
+    file = "tests/files/sample_movielens_ratings.csv"
+
+    # 'user_id', 'movie_id', 'rating', 'timestamp'
+
+    data = np.genfromtxt(file, dtype="int", delimiter=",", usecols=range(3))
+
+    # just in case there are movies/users without rating
+    # movie_id
+    n_m = max(len(np.unique(data[:, 1])), max(data[:, 1]) + 1)
+    # user_id
+    n_u = max(len(np.unique(data[:, 0])), max(data[:, 0]) + 1)
+
+    idx = int(data.shape[0] * train_ratio)
+
+    train_data = data[:idx]
+    test_data = data[idx:]
+
+    train = csr_matrix(
+        (train_data[:, 2], (train_data[:, 0], train_data[:, 1])),
+        shape=(n_u, n_m),
+    )
+
+    test = csr_matrix((test_data[:, 2], (test_data[:, 0], test_data[:, 1])))
+
+    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
+    train_arr = ds.array(train, block_size=(x_size, y_size))
+
+    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
+    test_arr = ds.array(test, block_size=(x_size, y_size))
+
+    return train_arr, test_arr
+
+
+class ALSSavingTestJSON(unittest.TestCase):
+    filepath = "tests/files/saving/als.json"
+
+    def test_init_params(self):
+        # Test all parameters
+        seed = 666
+        n_f = 100
+        lambda_ = 0.001
+        convergence_threshold = 0.1
+        max_iter = 10
+        verbose = True
+        arity = 12
+
+        als = ALS(
+            random_state=seed,
+            n_f=n_f,
+            lambda_=lambda_,
+            tol=convergence_threshold,
+            max_iter=max_iter,
+            verbose=verbose,
+            arity=arity,
+        )
+        save_model(als, self.filepath)
+        als2 = load_model(self.filepath)
+
+        self.assertEqual(als.random_state, seed)
+        self.assertEqual(als.n_f, n_f)
+        self.assertEqual(als.lambda_, lambda_)
+        self.assertEqual(als.tol, convergence_threshold)
+        self.assertEqual(als.max_iter, max_iter)
+        self.assertEqual(als.verbose, verbose)
+        self.assertEqual(als.arity, arity)
+        self.assertEqual(als2.random_state, seed)
+        self.assertEqual(als2.n_f, n_f)
+        self.assertEqual(als2.lambda_, lambda_)
+        self.assertEqual(als2.tol, convergence_threshold)
+        self.assertEqual(als2.max_iter, max_iter)
+        self.assertEqual(als2.verbose, verbose)
+        self.assertEqual(als2.arity, arity)
+
+    def test_fit(self):
+        train, test = load_movielens()
+
+        als = ALS(
+            tol=0.01,
+            random_state=666,
+            n_f=100,
+            verbose=False,
+            check_convergence=True,
+        )
+
+        als.fit(train, test)
+        self.assertTrue(als.converged)
+
+        als.fit(train)
+        save_model(als, self.filepath)
+        als2 =
load_model(self.filepath)
+
+        self.assertTrue(als.converged)
+        self.assertTrue(als2.converged)
+
+    def test_predict(self):
+        data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]])
+        ratings = csr_matrix(data)
+        train = ds.array(x=ratings, block_size=(1, 1))
+        als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False)
+        als.fit(train)
+        save_model(als, self.filepath)
+        als2 = load_model(self.filepath)
+
+        predictions = als.predict_user(user_id=0)
+        predictions2 = als2.predict_user(user_id=0)
+
+        # Check that the ratings for user 0 are similar to user 1 because they
+        # share preferences (third movie), thus it is expected that user 0
+        # will rate movie 1 similarly to user 1.
+        self.assertTrue(
+            2.75 < predictions[0] < 3.25
+            and predictions[1] < 1
+            and predictions[2] > 4.5
+        )
+        self.assertTrue(
+            2.75 < predictions2[0] < 3.25
+            and predictions2[1] < 1
+            and predictions2[2] > 4.5
+        )
+        self.assertTrue(
+            np.array_equal(predictions, predictions2, equal_nan=True)
+        )
+
+
+def main():
+    unittest.main()
+
+
+if __name__ == "__main__":
+    main()

From f68c8472f96a69031fb29f2d4456285d1c594a25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Mon, 19 Jul 2021 10:38:34 +0200
Subject: [PATCH 02/46] Format and doc changes

---
 .gitignore             |   3 ++
 dislib/utils/saving.py | 107 +++++++++++++++++++++--------------
 requirements.txt       |   1 +
 3 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 66a5171a..ac75ae24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -112,6 +112,9 @@ target/
 *compss*.out
 *compss*.err

+# Saving
+**/saving/*
+
 # ========== C & C++ ignores =================
 # Prerequisites
 *.d
diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index 31159a7f..30b015d8 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -1,6 +1,7 @@
 import json
 import os
 import numpy as np
+import cbor2

 from pycompss.runtime.management.classes import Future
 from pycompss.api.api import compss_wait_on
@@ -24,11 +25,6 @@
     _SkTreeWrapper,
 )

-try:
-    import cbor2
-except ImportError:
-    cbor2 = None
-
 # Dislib models with saving tested (model: str -> module: str)
 _implemented_models = {
     "KMeans": "cluster",
@@ -56,9 +52,27 @@
 }


-def save_model(model, filepath, overwrite=True, save_format=None):
-    """Saves a model to a file.
-    Usage:
+def save_model(model, filepath, overwrite=True, save_format="json"):
+    """ Saves a model to a file.
+
+    The model is synchronized before saving and can be reinstantiated in the
+    exact same state, without any of the code used for model definition or
+    fitting.
+
+    Parameters
+    ----------
+    model : dislib model
+        Dislib model to serialize and save.
+    filepath : str
+        Path where to save the model.
+    overwrite : bool, optional (default=True)
+        Whether any existing model at the target
+        location should be overwritten.
+    save_format : str, optional (default='json')
+        Format used to save the model.
+
+    Examples
+    --------
     >>> from dislib.cluster import KMeans
     >>> from dislib.utils import save_model, load_model
     >>> import numpy as np
     >>> import dislib as ds
     >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
     >>> x_train = ds.array(x, (2, 2))
     >>> model = KMeans(n_clusters=2, random_state=0)
     >>> model.fit(x_train)
     >>> save_model(model, '/tmp/model')
     >>> loaded_model = load_model('/tmp/model')
     >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
     >>> model_pred = model.predict(x_test)
     >>> loaded_model_pred = loaded_model.predict(x_test)
     >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
-
-    The file contains:
-    - the model's class
-    - the model's attributes
-    The model is synchronized before saving and can be reinstantiated in the
-    exact same state, without any of the code used for model definition or
-    fitting.
-    Args:
-    model: `dislib` model instance to be saved.
-    filepath: String path where to save the model
-    overwrite: Whether we should overwrite any existing model at the target
-        location, or instead ask the user with a manual prompt.
-    save_format: Format used to save the model. Defaults to `json`.
     """
+    # Check overwrite
     if not overwrite and os.path.isfile(filepath):
         return
@@ -108,23 +110,31 @@ def save_model(model, filepath, overwrite=True, save_format=None):
     model_metadata["model_name"] = model_name

     # Save model
-    default_format = "json"
-    save_format = save_format or default_format
     if save_format == "json":
         with open(filepath, "w") as f:
             json.dump(model_metadata, f, default=_encode_helper)
     elif save_format == "cbor":
-        if cbor2 is None:
-            raise ModuleNotFoundError("No module named 'cbor2'")
         with open(filepath, "wb") as f:
             cbor2.dump(model_metadata, f, default=_encode_helper_cbor)
     else:
-        raise ValueError("Save format must be either json or h5.")
+        raise ValueError("Wrong save format.")
+

-def load_model(filepath, load_format=None):
-    """Loads a model from a file.
-    Usage:
+def load_model(filepath, load_format="json"):
+    """ Loads a model from a file.
+
+    The model is reinstantiated in the exact same state in which it was saved,
+    without any of the code used for model definition or fitting.
+
+    Parameters
+    ----------
+    filepath : str
+        Path of the saved model.
+    load_format : str, optional (default='json')
+        Format used to load the model.
+
+    Examples
+    --------
     >>> from dislib.cluster import KMeans
     >>> from dislib.utils import save_model, load_model
     >>> import numpy as np
     >>> import dislib as ds
     >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
     >>> x_train = ds.array(x, (2, 2))
     >>> model = KMeans(n_clusters=2, random_state=0)
     >>> model.fit(x_train)
     >>> save_model(model, '/tmp/model')
     >>> loaded_model = load_model('/tmp/model')
     >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
     >>> model_pred = model.predict(x_test)
     >>> loaded_model_pred = loaded_model.predict(x_test)
     >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
-
-    The file must contain:
-    - the model's class
-    - the model's attributes
-    The model is reinstantiated in the exact same state in which it was saved,
-    without any of the code used for model definition or fitting.
-    Args:
-    filepath: String path where to save the model
-    load_format: Format used to load the model. Defaults to 'json'.
""" # Load model - default_format = "json" - load_format = load_format or default_format - if load_format == "json": with open(filepath, "r") as f: model_metadata = json.load(f, object_hook=_decode_helper) elif load_format == "cbor": - if cbor2 is None: - raise ModuleNotFoundError("No module named 'cbor2'") with open(filepath, "rb") as f: model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor) else: - raise ValueError("Load format must be either json or h5.") + raise ValueError("Wrong load format.") # Check for dislib model model_name = model_metadata["model_name"] if model_name not in _implemented_models.keys(): raise NotImplementedError( - "Loading has only been implemented for the following models:\n%s" + "Saving has only been implemented for the following models:\n%s" % _implemented_models.keys() ) del model_metadata["model_name"] @@ -179,7 +175,7 @@ def load_model(filepath, load_format=None): model = model_class() model.__dict__.update(model_metadata) - # Set class methodss + # Set class methods if model_name == "CascadeSVM" and "kernel" in model_metadata: try: model._kernel_f = getattr( @@ -192,17 +188,19 @@ def load_model(filepath, load_format=None): def _encode_helper_cbor(encoder, obj): - """Special encoder wrapper for dislib using cbor""" + """ Special encoder wrapper for dislib using cbor2""" encoder.encode(_encode_helper(obj)) def _decode_helper_cbor(decoder, obj): - """Special decoder wrapper for dislib using cbor""" + """ Special decoder wrapper for dislib using cbor2""" return _decode_helper(obj) def _encode_helper(obj): - """Special encoder for dislib""" + """ Special encoder for dislib that serializes the different objectes + and stores their state for future loading. + """ if isinstance(obj, np.generic): return obj.item() elif isinstance(obj, range): @@ -254,7 +252,9 @@ def _encode_helper(obj): def _decode_helper(obj): - """Special decoder for dislib""" + """ Special decoder for dislib that instantiates the different objects + and updates their attributes to recover the saved state. + """ if isinstance(obj, dict) and "class_name" in obj: class_name = obj["class_name"] @@ -331,7 +331,9 @@ def _decode_helper(obj): def _sync_obj(obj): - """Recursively synchronizes the Future objects of a list or dictionary.""" + """ Recursively synchronizes the Future objects of a list or dictionary + by using `compss_wait_on(obj)`. + """ if isinstance(obj, dict): iterator = iter(obj.items()) elif isinstance(obj, list): @@ -353,7 +355,8 @@ def _sync_obj(obj): def _sync_rf(rf): - """Sync the `try_features` and 'n_classes' attribute of the different trees + """ Sync the `try_features` and `n_classes` attribute of the different trees + since they cannot be synced recursively. 
""" if isinstance(rf.trees[0].try_features, Future): try_features = compss_wait_on(rf.trees[0].try_features) diff --git a/requirements.txt b/requirements.txt index 4100177f..3fc50ee3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ scipy>=1.3.0 numpy>=1.18.1, <=1.19.5 numpydoc>=0.8.0 cvxpy>=1.1.5 +cbor2>=5.4.0 \ No newline at end of file From 2b4e09702119a9011b9ad2e80979f2aea9331faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 19 Jul 2021 16:43:00 +0200 Subject: [PATCH 03/46] cbor2 not always required --- dislib/utils/saving.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index 30b015d8..65065eab 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -1,7 +1,6 @@ import json import os import numpy as np -import cbor2 from pycompss.runtime.management.classes import Future from pycompss.api.api import compss_wait_on @@ -25,6 +24,11 @@ _SkTreeWrapper, ) +try: + import cbor2 +except ImportError: + cbor2 = None + # Dislib models with saving tested (model: str -> module: str) _implemented_models = { "KMeans": "cluster", @@ -114,6 +118,8 @@ def save_model(model, filepath, overwrite=True, save_format="json"): with open(filepath, "w") as f: json.dump(model_metadata, f, default=_encode_helper) elif save_format == "cbor": + if cbor2 is None: + raise ModuleNotFoundError("No module named 'cbor2'") with open(filepath, "wb") as f: cbor2.dump(model_metadata, f, default=_encode_helper_cbor) else: @@ -155,6 +161,8 @@ def load_model(filepath, load_format="json"): with open(filepath, "r") as f: model_metadata = json.load(f, object_hook=_decode_helper) elif load_format == "cbor": + if cbor2 is None: + raise ModuleNotFoundError("No module named 'cbor2'") with open(filepath, "rb") as f: model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor) else: @@ -188,12 +196,12 @@ def load_model(filepath, load_format="json"): def _encode_helper_cbor(encoder, obj): - """ Special encoder wrapper for dislib using cbor2""" + """ Special encoder wrapper for dislib using cbor2.""" encoder.encode(_encode_helper(obj)) def _decode_helper_cbor(decoder, obj): - """ Special decoder wrapper for dislib using cbor2""" + """ Special decoder wrapper for dislib using cbor2.""" return _decode_helper(obj) From d2b036d7397806109ec9b304bf2cb8ab78881e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 12:12:32 +0200 Subject: [PATCH 04/46] Install dislib requirements --- Dockerfile | 2 ++ dislib/utils/saving.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index e8a72019..75fabe37 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ COPY . dislib/ ENV PYTHONPATH=$PYTHONPATH:/dislib +RUN python -m pip install -r /dislib/requirements.txt + # Expose SSH port and run SSHD EXPOSE 22 CMD ["/usr/sbin/sshd","-D"] diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index 65065eab..a6e4e0fd 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -28,7 +28,7 @@ import cbor2 except ImportError: cbor2 = None - + # Dislib models with saving tested (model: str -> module: str) _implemented_models = { "KMeans": "cluster", @@ -57,7 +57,7 @@ def save_model(model, filepath, overwrite=True, save_format="json"): - """ Saves a model to a file. + """Saves a model to a file. 
The model is synchronized before saving and can be reinstantiated in the
     exact same state, without any of the code used for model definition or
     fitting.
@@ -127,7 +127,7 @@ def save_model(model, filepath, overwrite=True, save_format="json"):


 def load_model(filepath, load_format="json"):
-    """ Loads a model from a file.
+    """Loads a model from a file.

     The model is reinstantiated in the exact same state in which it was saved,
     without any of the code used for model definition or fitting.
@@ -196,17 +196,17 @@ def load_model(filepath, load_format="json"):


 def _encode_helper_cbor(encoder, obj):
-    """ Special encoder wrapper for dislib using cbor2."""
+    """Special encoder wrapper for dislib using cbor2."""
     encoder.encode(_encode_helper(obj))


 def _decode_helper_cbor(decoder, obj):
-    """ Special decoder wrapper for dislib using cbor2."""
+    """Special decoder wrapper for dislib using cbor2."""
     return _decode_helper(obj)


 def _encode_helper(obj):
-    """ Special encoder for dislib that serializes the different objects
+    """Special encoder for dislib that serializes the different objects
     and stores their state for future loading.
     """
     if isinstance(obj, np.generic):
@@ -260,7 +260,7 @@ def _encode_helper(obj):


 def _decode_helper(obj):
-    """ Special decoder for dislib that instantiates the different objects
+    """Special decoder for dislib that instantiates the different objects
     and updates their attributes to recover the saved state.
     """
     if isinstance(obj, dict) and "class_name" in obj:
@@ -339,7 +339,7 @@ def _decode_helper(obj):


 def _sync_obj(obj):
-    """ Recursively synchronizes the Future objects of a list or dictionary
+    """Recursively synchronizes the Future objects of a list or dictionary
     by using `compss_wait_on(obj)`.
     """
     if isinstance(obj, dict):
@@ -363,7 +363,7 @@ def _sync_obj(obj):


 def _sync_rf(rf):
-    """ Sync the `try_features` and `n_classes` attribute of the different trees
+    """Sync the `try_features` and `n_classes` attribute of the different trees
     since they cannot be synced recursively.
     """
     if isinstance(rf.trees[0].try_features, Future):

From 30da4675cf42b3793fad83aa2a38e2c33ac9b2d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Wed, 21 Jul 2021 12:19:38 +0200
Subject: [PATCH 05/46] Added directory to save models during testing.

---
 .gitignore                    | 3 ++-
 tests/files/saving/saving.txt | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)
 create mode 100644 tests/files/saving/saving.txt

diff --git a/.gitignore b/.gitignore
index ac75ae24..ad8ef5a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -113,7 +113,8 @@ target/
 *compss*.err

 # Saving
-**/saving/*
+tests/files/saving/*
+!tests/files/saving/*.txt

 # ========== C & C++ ignores =================
 # Prerequisites
diff --git a/tests/files/saving/saving.txt b/tests/files/saving/saving.txt
new file mode 100644
index 00000000..d7d8541b
--- /dev/null
+++ b/tests/files/saving/saving.txt
@@ -0,0 +1 @@
+Directory where the models generated by the saving-related tests should be located.
\ No newline at end of file

From 86e94920140bad6ec23ce5fcaf33444ccd01850b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Wed, 21 Jul 2021 12:21:54 +0200
Subject: [PATCH 06/46] Install requirements using pip3.

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 75fabe37..2fa20a5d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ COPY .
dislib/

 ENV PYTHONPATH=$PYTHONPATH:/dislib

-RUN python -m pip install -r /dislib/requirements.txt
+RUN pip3 install -r /dislib/requirements.txt

 # Expose SSH port and run SSHD
 EXPOSE 22
 CMD ["/usr/sbin/sshd","-D"]

From 4e8ad6ebf03ff63d6a65f6548e21ef4861066296 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Wed, 21 Jul 2021 13:05:17 +0200
Subject: [PATCH 07/46] Changed environment language.

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 2fa20a5d..7b1ed215 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,7 @@ MAINTAINER COMPSs Support
 COPY . dislib/

 ENV PYTHONPATH=$PYTHONPATH:/dislib
-
+ENV LC_ALL=C.UTF-8
 RUN pip3 install -r /dislib/requirements.txt

 # Expose SSH port and run SSHD
 EXPOSE 22
 CMD ["/usr/sbin/sshd","-D"]

From 7760772c987583c4fac8b7c83b67d2ec92e12cc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Wed, 21 Jul 2021 17:02:51 +0200
Subject: [PATCH 08/46] Changed Jenkins timeout from 2h to 3h

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 396b9912..eaf042e9 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,7 +11,7 @@ def setGithubCommitStatus(state, description) {

 pipeline {
     options {
-        timeout(time: 2, unit: 'HOURS')
+        timeout(time: 3, unit: 'HOURS')
     }
     agent {
         node {

From 55385a735415428e1e1c5676cf14d970b27b89cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Fri, 23 Jul 2021 12:53:22 +0200
Subject: [PATCH 09/46] Changed names of constant variables

---
 dislib/utils/saving.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index a6e4e0fd..f0b8313c 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -30,7 +30,7 @@
     cbor2 = None

 # Dislib models with saving tested (model: str -> module: str)
-_implemented_models = {
+IMPLEMENTED_MODELS = {
     "KMeans": "cluster",
     "GaussianMixture": "cluster",
     "CascadeSVM": "classification",
@@ -41,7 +41,7 @@
 }

 # Classes used by models
-_dislib_classes = {
+DISLIB_CLASSES = {
     "KMeans": dislib.cluster.KMeans,
     "DecisionTreeClassifier": DecisionTreeClassifier,
     "_Node": _Node,
@@ -50,7 +50,7 @@
     "_SkTreeWrapper": _SkTreeWrapper,
 }

-_sklearn_classes = {
+SKLEARN_CLASSES = {
     "SVC": SklearnSVC,
     "DecisionTreeClassifier": SklearnDTClassifier,
 }
@@ -99,10 +99,10 @@ def save_model(model, filepath, overwrite=True, save_format="json"):

     # Check for dislib model
     model_name = model.__class__.__name__
-    if model_name not in _implemented_models.keys():
+    if model_name not in IMPLEMENTED_MODELS.keys():
         raise NotImplementedError(
             "Saving has only been implemented for the following models:\n%s"
-            % _implemented_models.keys()
+            % IMPLEMENTED_MODELS.keys()
         )

     # Synchronize model
@@ -170,15 +170,15 @@ def load_model(filepath, load_format="json"):

     # Check for dislib model
     model_name = model_metadata["model_name"]
-    if model_name not in _implemented_models.keys():
+    if model_name not in IMPLEMENTED_MODELS.keys():
         raise NotImplementedError(
             "Loading has only been implemented for the following models:\n%s"
-            % _implemented_models.keys()
+            % IMPLEMENTED_MODELS.keys()
         )
     del model_metadata["model_name"]

     # Create model
-    model_module = getattr(ds, _implemented_models[model_name])
+    model_module = getattr(ds, IMPLEMENTED_MODELS[model_name])
     model_class = getattr(model_module, model_name)
     model = model_class()
     model.__dict__.update(model_metadata)
@@ -249,7 +249,7 @@ def _encode_helper(obj):
         "items":
obj.__getstate__(), } elif isinstance( - obj, tuple(_dislib_classes.values()) + tuple(_sklearn_classes.values()) + obj, tuple(DISLIB_CLASSES.values()) + tuple(SKLEARN_CLASSES.values()) ): return { "class_name": obj.__class__.__name__, @@ -302,12 +302,12 @@ def _decode_helper(obj): model.__setstate__(dict_) return model elif ( - class_name in _dislib_classes.keys() + class_name in DISLIB_CLASSES.keys() and "dislib" in obj["module_name"] ): dict_ = _decode_helper(obj["items"]) if class_name == "DecisionTreeClassifier": - model = _dislib_classes[obj["class_name"]]( + model = DISLIB_CLASSES[obj["class_name"]]( try_features=dict_.pop("try_features"), max_depth=dict_.pop("max_depth"), distr_depth=dict_.pop("distr_depth"), @@ -317,17 +317,17 @@ def _decode_helper(obj): ) elif class_name == "_SkTreeWrapper": sk_tree = _decode_helper(dict_.pop("sk_tree")) - model = _dislib_classes[obj["class_name"]](sk_tree) + model = DISLIB_CLASSES[obj["class_name"]](sk_tree) else: - model = _dislib_classes[obj["class_name"]]() + model = DISLIB_CLASSES[obj["class_name"]]() model.__dict__.update(dict_) return model elif ( - class_name in _sklearn_classes.keys() + class_name in SKLEARN_CLASSES.keys() and "sklearn" in obj["module_name"] ): dict_ = _decode_helper(obj["items"]) - model = _sklearn_classes[obj["class_name"]]() + model = SKLEARN_CLASSES[obj["class_name"]]() model.__dict__.update(dict_) return model elif class_name == "callable": From 8b22122ecc834492c308fdb60622ffbae3c0870e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 17:02:05 +0200 Subject: [PATCH 10/46] Add RF Classifier and started modifying _data.py --- dislib/regression/rf/__init__.py | 0 dislib/regression/rf/_data.py | 279 ++++++++++++++ dislib/regression/rf/decision_tree.py | 520 ++++++++++++++++++++++++++ dislib/regression/rf/forest.py | 306 +++++++++++++++ dislib/regression/rf/test_split.py | 50 +++ 5 files changed, 1155 insertions(+) create mode 100644 dislib/regression/rf/__init__.py create mode 100644 dislib/regression/rf/_data.py create mode 100644 dislib/regression/rf/decision_tree.py create mode 100644 dislib/regression/rf/forest.py create mode 100644 dislib/regression/rf/test_split.py diff --git a/dislib/regression/rf/__init__.py b/dislib/regression/rf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dislib/regression/rf/_data.py b/dislib/regression/rf/_data.py new file mode 100644 index 00000000..360f705d --- /dev/null +++ b/dislib/regression/rf/_data.py @@ -0,0 +1,279 @@ +import tempfile + +import numpy as np +from numpy.lib import format +from pycompss.api.parameter import ( + FILE_IN, + FILE_INOUT, + COLLECTION_IN, + Depth, + Type, +) +from pycompss.api.task import task + +from dislib.data.array import Array + + +class RfDataset(object): + """Dataset format used by the fit() of the RandomForestRegressor. + + The RfDataset contains a file path for the samples and another one for the + targets. Optionally, a path can be provided for a transposed version of the + samples matrix, i.e., the features. + + Note: For a representation of a dataset distributed in multiple files, use + dislib.data.Dataset instead. + + Parameters + ---------- + samples_path : str + Path of the .npy file containing the 2-d array of samples. It can be a + pycompss.runtime.Future object. If so, self.n_samples and + self.n_features must be set manually (they can also be + pycompss.runtime.Future objects). + targets_path : str + Path of the .dat file containing the 1-d array of targets. 
It can be a + pycompss.runtime.Future object. + features_path : str, optional (default=None) + Path of the .npy file containing the 2-d array of samples transposed. + The array must be C-ordered. Providing this array may improve the + performance as it allows sequential access to the features. + + Attributes + ---------- + n_samples : int + The number of samples of the dataset. It can be a + pycompss.runtime.Future object. + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + y_targets : ndarray + The array of targets for this RfDataset. It can be a + pycompss.runtime.Future object. + + """ + + def __init__(self, samples_path, targets_path, features_path=None): + self.samples_path = samples_path + self.targets_path = targets_path + self.features_path = features_path + self.n_samples = None + self.n_features = None + + self.y_targets = None + + def get_n_samples(self): + """Gets the number of samples obtained from the samples file. + + Returns + ------- + n_samples : int + + Raises + ------ + AssertionError + If self.n_samples is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. + + """ + if self.n_samples is None: + assert isinstance(self.samples_path, str), ( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_samples + + def get_n_features(self): + """Gets the number of features obtained from the samples file. + + Returns + ------- + n_features : int + + Raises + ------ + AssertionError + If self.n_features is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. + + """ + if self.n_features is None: + assert isinstance(self.samples_path, str), ( + "self.n_features must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_features + + def get_y_targets(self): + """Obtains the array of targets. + + Returns + ------- + y_targets : ndarray + + """ + if self.y_targets is None: + targets = _get_targets(self.targets_path) + self.y_targets = targets + return self.y_targets + + def validate_features_file(self): + """Validates the features file header information. + + Raises + ------ + ValueError + If the shape of the array in the features_file doesn't match this + class n_samples and n_features or if the array is in fortran order. + + """ + features_npy_file = _NpyFile(self.features_path) + shape = features_npy_file.get_shape() + fortran_order = features_npy_file.get_fortran_order() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from features_file.") + if (self.get_n_features(), self.get_n_samples()) != shape: + raise ValueError("Invalid dimensions for the features_file.") + if fortran_order: + raise ValueError("Fortran order not supported for features array.") + + +def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: + """Creates a RfDataset object from samples x and targets y. + + This function creates a dislib.regression.rf.data.RfDataset by saving + x and y in files. 
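+    The samples are written to a temporary binary .npy file and the targets
+    to a temporary text file; the paths of both are wrapped in the returned
+    RfDataset.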
+ + Parameters + ---------- + x : ds-array, shape = (n_samples, n_features) + The training input samples. + y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) + The target values. + + Returns + ------- + rf_dataset : dislib.regression.rf._data.RfDataset + + """ + n_samples = x.shape[0] + n_features = x.shape[1] + + samples_file = tempfile.NamedTemporaryFile( + mode="wb", prefix="tmp_rf_samples_", delete=False + ) + samples_path = samples_file.name + samples_file.close() + _allocate_samples_file(samples_path, n_samples, n_features) + + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_samples_file(samples_path, top_row._blocks, start_idx) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: + _fill_samples_file(samples_path, x_row._blocks, start_idx) + start_idx += x._reg_shape[0] + + targets_file = tempfile.NamedTemporaryFile( + mode="w", prefix="tmp_rf_targets_", delete=False + ) + targets_path = targets_file.name + targets_file.close() + for y_row in y._iterator(axis=0): + _fill_targets_file(targets_path, y_row._blocks) + + rf_dataset = RfDataset(samples_path, targets_path) + rf_dataset.n_samples = n_samples + rf_dataset.n_features = n_features + return rf_dataset + + +class _NpyFile(object): + def __init__(self, path): + self.path = path + + self.shape = None + self.fortran_order = None + self.dtype = None + + def get_shape(self): + if self.shape is None: + self._read_header() + return self.shape + + def get_fortran_order(self): + if self.fortran_order is None: + self._read_header() + return self.fortran_order + + def get_dtype(self): + if self.dtype is None: + self._read_header() + return self.dtype + + def _read_header(self): + with open(self.path, "rb") as fp: + version = format.read_magic(fp) + try: + format._check_version(version) + except ValueError: + raise ValueError("Invalid file format.") + header_data = format._read_array_header(fp, version) + self.shape, self.fortran_order, self.dtype = header_data + + +@task(targets_path=FILE_IN, returns=1) +def _get_targets(targets_path): + y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8") + return y + + +@task(returns=1) +def _get_samples_shape(subset): + return subset.samples.shape + + +@task(returns=3) +def _merge_shapes(*samples_shapes): + n_samples = 0 + n_features = samples_shapes[0][1] + for shape in samples_shapes: + n_samples += shape[0] + assert shape[1] == n_features, "Subsamples with different n_features." 
+    return samples_shapes, n_samples, n_features
+
+
+@task(samples_path=FILE_INOUT)
+def _allocate_samples_file(samples_path, n_samples, n_features):
+    np.lib.format.open_memmap(
+        samples_path,
+        mode="w+",
+        dtype="float32",
+        shape=(int(n_samples), int(n_features)),
+    )
+
+
+@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
+def _fill_samples_file(samples_path, row_blocks, start_idx):
+    rows_samples = Array._merge_blocks(row_blocks)
+    rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
+    samples = np.lib.format.open_memmap(samples_path, mode="r+")
+    samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples
+
+
+@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
+def _fill_targets_file(targets_path, row_blocks):
+    rows_targets = Array._merge_blocks(row_blocks)
+    with open(targets_path, "at") as f:
+        np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8")
diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py
new file mode 100644
index 00000000..0725fcfa
--- /dev/null
+++ b/dislib/regression/rf/decision_tree.py
@@ -0,0 +1,520 @@
+from sys import float_info
+
+import numpy as np
+from numpy.random.mtrand import RandomState
+from pycompss.api.api import compss_delete_object
+from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth
+from pycompss.api.task import task
+from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
+
+from dislib.classification.rf.test_split import test_split
+from dislib.data.array import Array
+
+
+class DecisionTreeClassifier:
+    """A distributed decision tree classifier.
+
+    Parameters
+    ----------
+    try_features : int
+        The number of features to consider when looking for the best split.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    bootstrap : bool
+        Randomly select n_instances samples with repetition (used in random
+        forests).
+    random_state : RandomState instance
+        The random number generator.
+
+    Attributes
+    ----------
+    n_features : int
+        The number of features of the dataset. It can be a
+        pycompss.runtime.Future object.
+    n_classes : int
+        The number of classes of this RfDataset. It can be a
+        pycompss.runtime.Future object.
+    tree : None or _Node
+        The root node of the tree after the tree is fitted.
+    nodes_info : None or list of _InnerNodeInfo and _LeafInfo
+        List of the node information for the nodes of the tree in the same
+        order as obtained in the fit() method, up to ``distr_depth`` depth.
+        After fit(), it is a pycompss.runtime.Future object.
+    subtrees : None or list of _Node
+        List of subtrees of the tree at ``distr_depth`` depth obtained in the
+        fit() method. After fit(), it is a list of pycompss.runtime.Future
+        objects.
+
+    Methods
+    -------
+    fit(dataset)
+        Fits the DecisionTreeClassifier.
+    predict(x_row)
+        Predicts classes for the given samples using a fitted tree.
+    predict_proba(x_row)
+        Predicts class probabilities for the given samples using a fitted tree.
+ + """ + + def __init__(self, try_features, max_depth, distr_depth, sklearn_max, + bootstrap, random_state): + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.bootstrap = bootstrap + self.random_state = random_state + + self.n_features = None + self.n_classes = None + + self.tree = None + self.nodes_info = None + self.subtrees = None + + def fit(self, dataset): + """Fits the DecisionTreeClassifier. + + Parameters + ---------- + dataset : dislib.classification.rf._data.RfDataset + + """ + + self.n_features = dataset.get_n_features() + self.n_classes = dataset.get_n_classes() + samples_path = dataset.samples_path + features_path = dataset.features_path + n_samples = dataset.get_n_samples() + y_codes = dataset.get_y_codes() + + seed = self.random_state.randint(np.iinfo(np.int32).max) + + sample, y_s = _sample_selection(n_samples, y_codes, self.bootstrap, + seed) + + self.tree = _Node() + self.nodes_info = [] + self.subtrees = [] + tree_traversal = [(self.tree, sample, y_s, 0)] + while tree_traversal: + node, sample, y_s, depth = tree_traversal.pop() + if depth < self.distr_depth: + split = _split_node_wrapper(sample, self.n_features, y_s, + self.n_classes, self.try_features, + self.random_state, + samples_file=samples_path, + features_file=features_path) + node_info, left_group, y_l, right_group, y_r = split + compss_delete_object(sample) + compss_delete_object(y_s) + node.content = len(self.nodes_info) + self.nodes_info.append(node_info) + node.left = _Node() + node.right = _Node() + depth = depth + 1 + tree_traversal.append((node.right, right_group, y_r, depth)) + tree_traversal.append((node.left, left_group, y_l, depth)) + else: + subtree = _build_subtree_wrapper(sample, y_s, self.n_features, + self.max_depth - depth, + self.n_classes, + self.try_features, + self.sklearn_max, + self.random_state, + samples_path, features_path) + node.content = len(self.subtrees) + self.subtrees.append(subtree) + compss_delete_object(sample) + compss_delete_object(y_s) + self.nodes_info = _merge(*self.nodes_info) + + def predict(self, x_row): + """Predicts classes for the given samples using a fitted tree. + + Parameters + ---------- + x_row : ds-array + A row block of samples. + + Returns + ------- + predicted : ndarray + An array with the predicted classes for the given samples. The + values are codes of the fitted + dislib.classification.rf.data.RfDataset. The returned object can + be a pycompss.runtime.Future object. + + """ + + assert self.tree is not None, 'The decision tree is not fitted.' + + branch_predictions = [] + for i, subtree in enumerate(self.subtrees): + pred = _predict_branch(x_row._blocks, self.tree, self.nodes_info, + i, subtree, self.distr_depth) + branch_predictions.append(pred) + return _merge_branches(None, *branch_predictions) + + def predict_proba(self, x_row): + """Predicts class probabilities for a row block using a fitted tree. + + Parameters + ---------- + x_row : ds-array + A row block of samples. + + Returns + ------- + predicted_proba : ndarray + An array with the predicted probabilities for the given samples. + The shape is (len(subset.samples), self.n_classes), with the index + of the column being codes of the fitted + dislib.classification.rf.data.RfDataset. The returned object can be + a pycompss.runtime.Future object. + + """ + + assert self.tree is not None, 'The decision tree is not fitted.' 
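+        # Each subtree fitted at distr_depth covers one branch of the
+        # distributed upper tree; the (mask, prediction) pair produced per
+        # branch below is recombined by _merge_branches().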
+ + branch_predictions = [] + for i, subtree in enumerate(self.subtrees): + pred = _predict_branch_proba(x_row._blocks, self.tree, + self.nodes_info, i, subtree, + self.distr_depth, self.n_classes) + branch_predictions.append(pred) + return _merge_branches(self.n_classes, *branch_predictions) + + +class _Node: + + def __init__(self): + self.content = None + self.left = None + self.right = None + + def predict(self, sample): + node_content = self.content + if isinstance(node_content, _LeafInfo): + return np.full((len(sample),), node_content.mode) + if isinstance(node_content, _SkTreeWrapper): + if len(sample) > 0: + return node_content.sk_tree.predict(sample) + if isinstance(node_content, _InnerNodeInfo): + pred = np.empty((len(sample),), dtype=np.int64) + left_mask = sample[:, node_content.index] <= node_content.value + pred[left_mask] = self.left.predict(sample[left_mask]) + pred[~left_mask] = self.right.predict(sample[~left_mask]) + return pred + assert len(sample) == 0, 'Type not supported' + return np.empty((0,), dtype=np.int64) + + def predict_proba(self, sample, n_classes): + node_content = self.content + if isinstance(node_content, _LeafInfo): + single_pred = node_content.frequencies / node_content.size + return np.tile(single_pred, (len(sample), 1)) + if isinstance(node_content, _SkTreeWrapper): + if len(sample) > 0: + sk_tree_pred = node_content.sk_tree.predict_proba(sample) + pred = np.zeros((len(sample), n_classes), dtype=np.float64) + pred[:, node_content.sk_tree.classes_] = sk_tree_pred + return pred + if isinstance(node_content, _InnerNodeInfo): + pred = np.empty((len(sample), n_classes), dtype=np.float64) + l_msk = sample[:, node_content.index] <= node_content.value + pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) + pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) + return pred + assert len(sample) == 0, 'Type not supported' + return np.empty((0, n_classes), dtype=np.float64) + + +class _InnerNodeInfo: + def __init__(self, index=None, value=None): + self.index = index + self.value = value + + +class _LeafInfo: + def __init__(self, size=None, frequencies=None, mode=None): + self.size = size + self.frequencies = frequencies + self.mode = mode + + +class _SkTreeWrapper: + def __init__(self, tree): + self.sk_tree = tree + self.classes = tree.classes_ + + +def _get_sample_attributes(samples_file, indices): + samples_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False) + x = samples_mmap[indices] + return x + + +def _get_feature_mmap(features_file, i): + return _get_features_mmap(features_file)[i] + + +def _get_features_mmap(features_file): + return np.load(features_file, mmap_mode='r', allow_pickle=False) + + +@task(priority=True, returns=2) +def _sample_selection(n_samples, y_codes, bootstrap, seed): + if bootstrap: + random_state = RandomState(seed) + selection = random_state.choice(n_samples, size=n_samples, + replace=True) + selection.sort() + return selection, y_codes[selection] + else: + return np.arange(n_samples), y_codes + + +def _feature_selection(untried_indices, m_try, random_state): + selection_len = min(m_try, len(untried_indices)) + return random_state.choice(untried_indices, size=selection_len, + replace=False) + + +def _get_groups(sample, y_s, features_mmap, index, value): + if index is None: + empty_sample = np.array([], dtype=np.int64) + empty_labels = np.array([], dtype=np.int8) + return sample, y_s, empty_sample, empty_labels + feature = features_mmap[index][sample] + mask = feature < value + left = sample[mask] + 
right = sample[~mask] + y_l = y_s[mask] + y_r = y_s[~mask] + return left, y_l, right, y_r + + +def _compute_leaf_info(y_s, n_classes): + frequencies = np.bincount(y_s, minlength=n_classes) + mode = np.argmax(frequencies) + return _LeafInfo(len(y_s), frequencies, mode) + + +def _split_node_wrapper(sample, n_features, y_s, n_classes, m_try, + random_state, samples_file=None, features_file=None): + seed = random_state.randint(np.iinfo(np.int32).max) + + if features_file is not None: + return _split_node_using_features(sample, n_features, y_s, n_classes, + m_try, features_file, seed) + elif samples_file is not None: + return _split_node(sample, n_features, y_s, n_classes, m_try, + samples_file, seed) + else: + raise ValueError('Invalid combination of arguments. samples_file is ' + 'None and features_file is None.') + + +@task(features_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node_using_features(sample, n_features, y_s, n_classes, m_try, + features_file, seed): + features_mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) + random_state = RandomState(seed) + return _compute_split(sample, n_features, y_s, n_classes, m_try, + features_mmap, random_state) + + +@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): + features_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T + random_state = RandomState(seed) + return _compute_split(sample, n_features, y_s, n_classes, m_try, + features_mmap, random_state) + + +def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, + random_state): + node_info = left_group = y_l = right_group = y_r = None + split_ended = False + tried_indices = [] + while not split_ended: + untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) + index_selection = _feature_selection(untried_indices, m_try, + random_state) + b_score = float_info.max + b_index = None + b_value = None + for index in index_selection: + feature = features_mmap[index] + score, value = test_split(sample, y_s, feature, n_classes) + if score < b_score: + b_score, b_value, b_index = score, value, index + groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) + left_group, y_l, right_group, y_r = groups + if left_group.size and right_group.size: + split_ended = True + node_info = _InnerNodeInfo(b_index, b_value) + else: + tried_indices.extend(list(index_selection)) + if len(tried_indices) == n_features: + split_ended = True + node_info = _compute_leaf_info(y_s, n_classes) + left_group = sample + y_l = y_s + right_group = np.array([], dtype=np.int64) + y_r = np.array([], dtype=np.int8) + + return node_info, left_group, y_l, right_group, y_r + + +def _build_subtree_wrapper(sample, y_s, n_features, max_depth, n_classes, + m_try, sklearn_max, random_state, samples_file, + features_file): + seed = random_state.randint(np.iinfo(np.int32).max) + if features_file is not None: + return _build_subtree_using_features(sample, y_s, n_features, + max_depth, n_classes, m_try, + sklearn_max, seed, samples_file, + features_file) + else: + return _build_subtree(sample, y_s, n_features, max_depth, n_classes, + m_try, sklearn_max, seed, samples_file) + + +@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) +def _build_subtree_using_features(sample, y_s, n_features, max_depth, + n_classes, m_try, sklearn_max, seed, + samples_file, features_file): + random_state = RandomState(seed) + return _compute_build_subtree(sample, 
y_s, n_features, max_depth, + n_classes, m_try, sklearn_max, random_state, + samples_file, features_file=features_file) + + +@task(samples_file=FILE_IN, returns=_Node) +def _build_subtree(sample, y_s, n_features, max_depth, n_classes, m_try, + sklearn_max, seed, samples_file): + random_state = RandomState(seed) + return _compute_build_subtree(sample, y_s, n_features, max_depth, + n_classes, m_try, sklearn_max, random_state, + samples_file) + + +def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, + m_try, sklearn_max, random_state, samples_file, + features_file=None, use_sklearn=True): + if not sample.size: + return _Node() + if features_file is not None: + mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) + else: + mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T + subtree = _Node() + tree_traversal = [(subtree, sample, y_s, 0)] + while tree_traversal: + node, sample, y_s, depth = tree_traversal.pop() + if depth < max_depth: + if use_sklearn and n_features * len(sample) <= sklearn_max: + if max_depth == np.inf: + sklearn_max_depth = None + else: + sklearn_max_depth = max_depth - depth + dt = SklearnDTClassifier(max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state) + unique = np.unique(sample, return_index=True, + return_counts=True) + sample, new_indices, sample_weight = unique + x = _get_sample_attributes(samples_file, sample) + y_s = y_s[new_indices] + dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) + node.content = _SkTreeWrapper(dt) + else: + split = _compute_split(sample, n_features, y_s, n_classes, + m_try, mmap, random_state) + node_info, left_group, y_l, right_group, y_r = split + node.content = node_info + if isinstance(node_info, _InnerNodeInfo): + node.left = _Node() + node.right = _Node() + tree_traversal.append((node.right, right_group, y_r, + depth + 1)) + tree_traversal.append((node.left, left_group, y_l, + depth + 1)) + else: + node.content = _compute_leaf_info(y_s, n_classes) + return subtree + + +@task(returns=list) +def _merge(*object_list): + return object_list + + +def _get_subtree_path(subtree_index, distr_depth): + if distr_depth == 0: + return '' + return bin(subtree_index)[2:].zfill(distr_depth) + + +def _get_predicted_indices(samples, tree, nodes_info, path): + idx_mask = np.full((len(samples),), True) + for direction in path: + node_info = nodes_info[tree.content] + if isinstance(node_info, _LeafInfo): + if direction == '1': + idx_mask[:] = 0 + else: + col = node_info.index + value = node_info.value + if direction == '0': + idx_mask[idx_mask] = samples[idx_mask, col] <= value + tree = tree.left + else: + idx_mask[idx_mask] = samples[idx_mask, col] > value + tree = tree.right + return idx_mask + + +@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, + distr_depth): + samples = Array._merge_blocks(row_blocks) + path = _get_subtree_path(subtree_index, distr_depth) + indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) + prediction = subtree.predict(samples[indices_mask]) + return indices_mask, prediction + + +@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _predict_branch_proba(row_blocks, tree, nodes_info, subtree_index, subtree, + distr_depth, n_classes): + samples = Array._merge_blocks(row_blocks) + path = _get_subtree_path(subtree_index, distr_depth) + indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) + prediction = 
subtree.predict_proba(samples[indices_mask], n_classes)
+    return indices_mask, prediction
+
+
+@task(returns=list)
+def _merge_branches(n_classes, *predictions):
+    samples_len = len(predictions[0][0])
+    if n_classes is not None:  # predict_proba
+        shape = (samples_len, n_classes)
+        dtype = np.float64
+    else:  # predict
+        shape = (samples_len,)
+        dtype = np.int64
+    merged_prediction = np.empty(shape, dtype=dtype)
+    for selected, prediction in predictions:
+        merged_prediction[selected] = prediction
+    return merged_prediction
diff --git a/dislib/regression/rf/forest.py b/dislib/regression/rf/forest.py
new file mode 100644
index 00000000..8f6c0f2a
--- /dev/null
+++ b/dislib/regression/rf/forest.py
@@ -0,0 +1,306 @@
+import math
+from collections import Counter
+
+import numpy as np
+from pycompss.api.api import compss_wait_on
+from pycompss.api.parameter import Type, COLLECTION_IN, Depth
+from pycompss.api.task import task
+from sklearn.base import BaseEstimator
+from sklearn.utils import check_random_state
+
+from dislib.classification.rf.decision_tree import DecisionTreeClassifier
+from dislib.data.array import Array
+from dislib.utils.base import _paired_partition
+from dislib.classification.rf._data import transform_to_rf_dataset
+
+
+class RandomForestClassifier(BaseEstimator):
+    """A distributed random forest classifier.
+
+    Parameters
+    ----------
+    n_estimators : int, optional (default=10)
+        Number of trees to fit.
+    try_features : int, str or None, optional (default='sqrt')
+        The number of features to consider when looking for the best split:
+
+        - If "sqrt", then `try_features=sqrt(n_features)`.
+        - If "third", then `try_features=n_features // 3`.
+        - If None, then `try_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int or np.inf, optional (default=np.inf)
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int or str, optional (default='auto')
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max: int or float, optional (default=1e8)
+        Maximum size (len(subsample)*n_features) of the arrays passed to
+        sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees
+        (subsamples) of our DecisionTreeClassifier. sklearn fit() is used
+        because it's faster, but requires loading the data to memory, which can
+        cause memory problems for large datasets. This parameter can be
+        adjusted to fit the hardware capabilities.
+    hard_vote : bool, optional (default=False)
+        If True, it uses majority voting over the predict() result of the
+        decision tree predictions. If False, it takes the class with the highest
+        probability given by predict_proba(), which is an average of the
+        probabilities given by the decision trees.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    Attributes
+    ----------
+    classes : None or ndarray
+        Array of distinct classes, set at fit().
+    trees : list of DecisionTreeClassifier
+        List of the tree classifiers of this forest, populated at fit().
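+
+    Examples
+    --------
+    A minimal usage sketch with made-up data (a running COMPSs environment
+    is assumed; the import path is the one this patch adds the file under):
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> from dislib.regression.rf.forest import RandomForestClassifier
+    >>> x = ds.array(np.array([[0, 0], [1, 1], [0, 1], [1, 0]]), (2, 2))
+    >>> y = ds.array(np.array([[0], [1], [1], [0]]), (2, 1))
+    >>> forest = RandomForestClassifier(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> y_pred = forest.predict(x).collect()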
+ """ + + def __init__(self, + n_estimators=10, + try_features='sqrt', + max_depth=np.inf, + distr_depth='auto', + sklearn_max=1e8, + hard_vote=False, + random_state=None): + self.n_estimators = n_estimators + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.hard_vote = hard_vote + self.random_state = random_state + + def fit(self, x, y): + """Fits the RandomForestClassifier. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. Internally, its dtype will be converted + to ``dtype=np.float32``. + y : ds-array, shape=(n_samples, 1) + The target values. + + Returns + ------- + self : RandomForestClassifier + + """ + self.classes = None + self.trees = [] + + dataset = transform_to_rf_dataset(x, y) + + n_features = dataset.get_n_features() + try_features = _resolve_try_features(self.try_features, n_features) + random_state = check_random_state(self.random_state) + + self.classes = dataset.get_classes() + + if self.distr_depth == 'auto': + dataset.n_samples = compss_wait_on(dataset.get_n_samples()) + distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) + distr_depth = min(distr_depth, self.max_depth) + else: + distr_depth = self.distr_depth + + for i in range(self.n_estimators): + tree = DecisionTreeClassifier(try_features, self.max_depth, + distr_depth, self.sklearn_max, + bootstrap=True, + random_state=random_state) + self.trees.append(tree) + + for tree in self.trees: + tree.fit(dataset) + + return self + + def predict_proba(self, x): + """Predicts class probabilities using a fitted forest. + + The probabilities are obtained as an average of the probabilities of + each decision tree. + + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + probabilities : ds-array, shape=(n_samples, n_classes) + Predicted probabilities for the samples to belong to each class. + The columns of the array correspond to the classes given at + self.classes. + + """ + assert self.trees is not None, 'The random forest is not fitted.' + prob_blocks = [] + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + prob_blocks.append([_join_predictions(*tree_predictions)]) + self.classes = compss_wait_on(self.classes) + n_classes = len(self.classes) + + probabilities = Array(blocks=prob_blocks, + top_left_shape=(x._top_left_shape[0], n_classes), + reg_shape=(x._reg_shape[0], n_classes), + shape=(x.shape[0], n_classes), sparse=False) + return probabilities + + def predict(self, x): + """Predicts classes using a fitted forest. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y_pred : ds-array, shape=(n_samples, 1) + Predicted class labels for x. + + """ + assert self.trees is not None, 'The random forest is not fitted.' 
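+        # With hard_vote, every tree casts a label and the per-sample mode
+        # wins; otherwise the trees' class probabilities are summed and the
+        # argmax class is taken (soft voting).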
+ pred_blocks = [] + if self.hard_vote: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + pred_blocks.append(_hard_vote(self.classes, *tree_predictions)) + else: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + pred_blocks.append(_soft_vote(self.classes, *tree_predictions)) + + y_pred = Array(blocks=[pred_blocks], + top_left_shape=(x._top_left_shape[0], 1), + reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1), + sparse=False) + + return y_pred + + def score(self, x, y): + """Accuracy classification score. + + Returns the mean accuracy on the given test data. + + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. + y : ds-array, shape (n_samples, 1) + The true labels. + + Returns + ------- + score : float (as future object) + Fraction of correctly classified samples. + + """ + assert self.trees is not None, 'The random forest is not fitted.' + partial_scores = [] + if self.hard_vote: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _hard_vote_score(y_row._blocks, self.classes, + *tree_predictions) + partial_scores.append(subset_score) + else: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + subset_score = _soft_vote_score(y_row._blocks, self.classes, + *tree_predictions) + partial_scores.append(subset_score) + + return _merge_scores(*partial_scores) + + +@task(returns=1) +def _resolve_try_features(try_features, n_features): + if try_features is None: + return n_features + elif try_features == 'sqrt': + return int(math.sqrt(n_features)) + elif try_features == 'third': + return max(1, n_features // 3) + else: + return int(try_features) + + +@task(returns=1) +def _join_predictions(*predictions): + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + labels = aggregate / len(predictions) + return labels + + +@task(returns=1) +def _soft_vote(classes, *predictions): + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + labels = classes[np.argmax(aggregate, axis=1)] + return labels + + +@task(returns=1) +def _hard_vote(classes, *predictions): + mode = np.empty((len(predictions[0]),), dtype=int) + for sample_i, votes in enumerate(zip(*predictions)): + mode[sample_i] = Counter(votes).most_common(1)[0][0] + labels = classes[mode] + return labels + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _soft_vote_score(y_blocks, classes, *predictions): + real_labels = Array._merge_blocks(y_blocks).flatten() + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + predicted_labels = classes[np.argmax(aggregate, axis=1)] + correct = np.count_nonzero(predicted_labels == real_labels) + return correct, len(real_labels) + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _hard_vote_score(y_blocks, classes, *predictions): + real_labels = Array._merge_blocks(y_blocks).flatten() + mode = np.empty((len(predictions[0]),), dtype=int) + for sample_i, votes in enumerate(zip(*predictions)): + mode[sample_i] = Counter(votes).most_common(1)[0][0] + predicted_labels = classes[mode] + correct = np.count_nonzero(predicted_labels == real_labels) + return correct, len(real_labels) + + 
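+# score() emits one (correct, total) pair per row block; _merge_scores
+# reduces those pairs into a single accuracy fraction.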
+@task(returns=1) +def _merge_scores(*partial_scores): + correct = sum(subset_score[0] for subset_score in partial_scores) + total = sum(subset_score[1] for subset_score in partial_scores) + return correct / total diff --git a/dislib/regression/rf/test_split.py b/dislib/regression/rf/test_split.py new file mode 100644 index 00000000..70922783 --- /dev/null +++ b/dislib/regression/rf/test_split.py @@ -0,0 +1,50 @@ +from sys import float_info + +import numpy as np + + +def gini_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): + """ + Maximizing the Gini gain is equivalent to minimizing this proxy function. + + """ + return -(l_weight / l_length + r_weight / r_length) * not_repeated + + +def test_split(sample, y_s, feature, n_classes): + size = y_s.shape[0] + if size == 0: + return float_info.max, np.float64(np.inf) + + f = feature[sample] + sort_indices = np.argsort(f) + y_sorted = y_s[sort_indices] + f_sorted = f[sort_indices] + + not_repeated = np.empty(size, dtype=np.bool_) + not_repeated[0: size - 1] = (f_sorted[1:] != f_sorted[:-1]) + not_repeated[size - 1] = True + + l_freq = np.zeros((n_classes, size), dtype=np.int64) + l_freq[y_sorted, np.arange(size)] = 1 + + r_freq = np.zeros((n_classes, size), dtype=np.int64) + r_freq[:, 1:] = l_freq[:, :0:-1] + + l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0) + r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1] + + l_length = np.arange(1, size + 1, dtype=np.int32) + r_length = np.arange(size - 1, -1, -1, dtype=np.int32) + r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways + + scores = gini_criteria_proxy(l_weight, l_length, r_weight, r_length, + not_repeated) + + min_index = size - np.argmin(scores[::-1]) - 1 + + if min_index + 1 == size: + b_value = np.float64(np.inf) + else: + b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2 + return scores[min_index], b_value From 5487ab0829258139ba32d2da88f8b420931f8786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Thu, 22 Jul 2021 15:09:27 +0200 Subject: [PATCH 11/46] Added DecisionTreeRegressor with MSE criterion --- dislib/regression/rf/decision_tree.py | 397 ++++++++++++++------------ dislib/regression/rf/test_split.py | 28 +- 2 files changed, 234 insertions(+), 191 deletions(-) diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py index 0725fcfa..43ecaf79 100644 --- a/dislib/regression/rf/decision_tree.py +++ b/dislib/regression/rf/decision_tree.py @@ -5,14 +5,14 @@ from pycompss.api.api import compss_delete_object from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth from pycompss.api.task import task -from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor -from dislib.classification.rf.test_split import test_split +from dislib.regression.rf.test_split import test_split from dislib.data.array import Array -class DecisionTreeClassifier: - """A distributed decision tree classifier. +class DecisionTreeRegressor: + """A distributed decision tree regressor. Parameters ---------- @@ -39,9 +39,6 @@ class DecisionTreeClassifier: n_features : int The number of features of the dataset. It can be a pycompss.runtime.Future object. - n_classes : int - The number of classes of this RfDataset. It can be a - pycompss.runtime.Future object. tree : None or _Node The root node of the tree after the tree is fitted. 
nodes_info : None or list of _InnerNodeInfo and _LeafInfo @@ -56,7 +53,7 @@ class DecisionTreeClassifier: Methods ------- fit(dataset) - Fits the DecisionTreeClassifier. + Fits the DecisionTreeRegressor. predict(x_row) Predicts classes for the given samples using a fitted tree. predict_proba(x_row) @@ -64,8 +61,15 @@ class DecisionTreeClassifier: """ - def __init__(self, try_features, max_depth, distr_depth, sklearn_max, - bootstrap, random_state): + def __init__( + self, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + ): self.try_features = try_features self.max_depth = max_depth self.distr_depth = distr_depth @@ -74,14 +78,13 @@ def __init__(self, try_features, max_depth, distr_depth, sklearn_max, self.random_state = random_state self.n_features = None - self.n_classes = None self.tree = None self.nodes_info = None self.subtrees = None def fit(self, dataset): - """Fits the DecisionTreeClassifier. + """Fits the DecisionTreeRegressor. Parameters ---------- @@ -90,16 +93,16 @@ def fit(self, dataset): """ self.n_features = dataset.get_n_features() - self.n_classes = dataset.get_n_classes() samples_path = dataset.samples_path features_path = dataset.features_path n_samples = dataset.get_n_samples() - y_codes = dataset.get_y_codes() + y_targets = dataset.get_y_targets() seed = self.random_state.randint(np.iinfo(np.int32).max) - sample, y_s = _sample_selection(n_samples, y_codes, self.bootstrap, - seed) + sample, y_s = _sample_selection( + n_samples, y_targets, self.bootstrap, seed + ) self.tree = _Node() self.nodes_info = [] @@ -108,11 +111,15 @@ def fit(self, dataset): while tree_traversal: node, sample, y_s, depth = tree_traversal.pop() if depth < self.distr_depth: - split = _split_node_wrapper(sample, self.n_features, y_s, - self.n_classes, self.try_features, - self.random_state, - samples_file=samples_path, - features_file=features_path) + split = _split_node_wrapper( + sample, + self.n_features, + y_s, + self.try_features, + self.random_state, + samples_file=samples_path, + features_file=features_path, + ) node_info, left_group, y_l, right_group, y_r = split compss_delete_object(sample) compss_delete_object(y_s) @@ -124,13 +131,17 @@ def fit(self, dataset): tree_traversal.append((node.right, right_group, y_r, depth)) tree_traversal.append((node.left, left_group, y_l, depth)) else: - subtree = _build_subtree_wrapper(sample, y_s, self.n_features, - self.max_depth - depth, - self.n_classes, - self.try_features, - self.sklearn_max, - self.random_state, - samples_path, features_path) + subtree = _build_subtree_wrapper( + sample, + y_s, + self.n_features, + self.max_depth - depth, + self.try_features, + self.sklearn_max, + self.random_state, + samples_path, + features_path, + ) node.content = len(self.subtrees) self.subtrees.append(subtree) compss_delete_object(sample) @@ -155,47 +166,23 @@ def predict(self, x_row): """ - assert self.tree is not None, 'The decision tree is not fitted.' + assert self.tree is not None, "The decision tree is not fitted." branch_predictions = [] for i, subtree in enumerate(self.subtrees): - pred = _predict_branch(x_row._blocks, self.tree, self.nodes_info, - i, subtree, self.distr_depth) + pred = _predict_branch( + x_row._blocks, + self.tree, + self.nodes_info, + i, + subtree, + self.distr_depth, + ) branch_predictions.append(pred) return _merge_branches(None, *branch_predictions) - def predict_proba(self, x_row): - """Predicts class probabilities for a row block using a fitted tree. 
- - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted_proba : ndarray - An array with the predicted probabilities for the given samples. - The shape is (len(subset.samples), self.n_classes), with the index - of the column being codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can be - a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, 'The decision tree is not fitted.' - - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch_proba(x_row._blocks, self.tree, - self.nodes_info, i, subtree, - self.distr_depth, self.n_classes) - branch_predictions.append(pred) - return _merge_branches(self.n_classes, *branch_predictions) - class _Node: - def __init__(self): self.content = None self.left = None @@ -204,7 +191,7 @@ def __init__(self): def predict(self, sample): node_content = self.content if isinstance(node_content, _LeafInfo): - return np.full((len(sample),), node_content.mode) + return np.full((len(sample),), node_content.mean) if isinstance(node_content, _SkTreeWrapper): if len(sample) > 0: return node_content.sk_tree.predict(sample) @@ -214,29 +201,9 @@ def predict(self, sample): pred[left_mask] = self.left.predict(sample[left_mask]) pred[~left_mask] = self.right.predict(sample[~left_mask]) return pred - assert len(sample) == 0, 'Type not supported' + assert len(sample) == 0, "Type not supported" return np.empty((0,), dtype=np.int64) - def predict_proba(self, sample, n_classes): - node_content = self.content - if isinstance(node_content, _LeafInfo): - single_pred = node_content.frequencies / node_content.size - return np.tile(single_pred, (len(sample), 1)) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - sk_tree_pred = node_content.sk_tree.predict_proba(sample) - pred = np.zeros((len(sample), n_classes), dtype=np.float64) - pred[:, node_content.sk_tree.classes_] = sk_tree_pred - return pred - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample), n_classes), dtype=np.float64) - l_msk = sample[:, node_content.index] <= node_content.value - pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) - pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) - return pred - assert len(sample) == 0, 'Type not supported' - return np.empty((0, n_classes), dtype=np.float64) - class _InnerNodeInfo: def __init__(self, index=None, value=None): @@ -245,10 +212,9 @@ def __init__(self, index=None, value=None): class _LeafInfo: - def __init__(self, size=None, frequencies=None, mode=None): + def __init__(self, size=None, mean=None): self.size = size - self.frequencies = frequencies - self.mode = mode + self.mean = mean class _SkTreeWrapper: @@ -258,7 +224,7 @@ def __init__(self, tree): def _get_sample_attributes(samples_file, indices): - samples_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False) + samples_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False) x = samples_mmap[indices] return x @@ -268,25 +234,27 @@ def _get_feature_mmap(features_file, i): def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode='r', allow_pickle=False) + return np.load(features_file, mmap_mode="r", allow_pickle=False) @task(priority=True, returns=2) -def _sample_selection(n_samples, y_codes, bootstrap, seed): +def _sample_selection(n_samples, y_targets, bootstrap, seed): if bootstrap: random_state = RandomState(seed) - selection = random_state.choice(n_samples, 
size=n_samples, - replace=True) + selection = random_state.choice( + n_samples, size=n_samples, replace=True + ) selection.sort() - return selection, y_codes[selection] + return selection, y_targets[selection] else: - return np.arange(n_samples), y_codes + return np.arange(n_samples), y_targets def _feature_selection(untried_indices, m_try, random_state): selection_len = min(m_try, len(untried_indices)) - return random_state.choice(untried_indices, size=selection_len, - replace=False) + return random_state.choice( + untried_indices, size=selection_len, replace=False + ) def _get_groups(sample, y_s, features_mmap, index, value): @@ -303,59 +271,71 @@ def _get_groups(sample, y_s, features_mmap, index, value): return left, y_l, right, y_r -def _compute_leaf_info(y_s, n_classes): - frequencies = np.bincount(y_s, minlength=n_classes) - mode = np.argmax(frequencies) - return _LeafInfo(len(y_s), frequencies, mode) +def _compute_leaf_info(y_s): + return _LeafInfo(len(y_s), np.mean(y_s)) -def _split_node_wrapper(sample, n_features, y_s, n_classes, m_try, - random_state, samples_file=None, features_file=None): +def _split_node_wrapper( + sample, + n_features, + y_s, + m_try, + random_state, + samples_file=None, + features_file=None, +): seed = random_state.randint(np.iinfo(np.int32).max) if features_file is not None: - return _split_node_using_features(sample, n_features, y_s, n_classes, - m_try, features_file, seed) + return _split_node_using_features( + sample, n_features, y_s, m_try, features_file, seed + ) elif samples_file is not None: - return _split_node(sample, n_features, y_s, n_classes, m_try, - samples_file, seed) + return _split_node(sample, n_features, y_s, m_try, samples_file, seed) else: - raise ValueError('Invalid combination of arguments. samples_file is ' - 'None and features_file is None.') + raise ValueError( + "Invalid combination of arguments. samples_file is " + "None and features_file is None." 
+ ) @task(features_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node_using_features(sample, n_features, y_s, n_classes, m_try, - features_file, seed): - features_mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) +def _split_node_using_features( + sample, n_features, y_s, m_try, features_file, seed +): + features_mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) + return _compute_split( + sample, n_features, y_s, m_try, features_mmap, random_state + ) @task(samples_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): - features_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T +def _split_node(sample, n_features, y_s, m_try, samples_file, seed): + features_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) + return _compute_split( + sample, n_features, y_s, m_try, features_mmap, random_state + ) -def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, - random_state): +def _compute_split( + sample, n_features, y_s, m_try, features_mmap, random_state +): node_info = left_group = y_l = right_group = y_r = None split_ended = False tried_indices = [] while not split_ended: untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) - index_selection = _feature_selection(untried_indices, m_try, - random_state) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) b_score = float_info.max b_index = None b_value = None for index in index_selection: feature = features_mmap[index] - score, value = test_split(sample, y_s, feature, n_classes) + score, value = test_split(sample, y_s, feature) if score < b_score: b_score, b_value, b_index = score, value, index groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) @@ -367,7 +347,7 @@ def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, tried_indices.extend(list(index_selection)) if len(tried_indices) == n_features: split_ended = True - node_info = _compute_leaf_info(y_s, n_classes) + node_info = _compute_leaf_info(y_s) left_group = sample y_l = y_s right_group = np.array([], dtype=np.int64) @@ -376,48 +356,111 @@ def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, return node_info, left_group, y_l, right_group, y_r -def _build_subtree_wrapper(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file): +def _build_subtree_wrapper( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + features_file, +): seed = random_state.randint(np.iinfo(np.int32).max) if features_file is not None: - return _build_subtree_using_features(sample, y_s, n_features, - max_depth, n_classes, m_try, - sklearn_max, seed, samples_file, - features_file) + return _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, + features_file, + ) else: - return _build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, seed, samples_file) + return _build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, + ) 
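+
+# Each of the tasks below builds a whole subtree sequentially, delegating to
+# sklearn's DecisionTreeRegressor whenever the subsample is small enough
+# (n_features * len(sample) <= sklearn_max).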
@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) -def _build_subtree_using_features(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, seed, - samples_file, features_file): +def _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, + features_file, +): random_state = RandomState(seed) - return _compute_build_subtree(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file, features_file=features_file) + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=features_file, + ) @task(samples_file=FILE_IN, returns=_Node) -def _build_subtree(sample, y_s, n_features, max_depth, n_classes, m_try, - sklearn_max, seed, samples_file): +def _build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, +): random_state = RandomState(seed) - return _compute_build_subtree(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file) - - -def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file=None, use_sklearn=True): + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + ) + + +def _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=None, + use_sklearn=True, +): if not sample.size: return _Node() if features_file is not None: - mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) + mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) else: - mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T + mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T subtree = _Node() tree_traversal = [(subtree, sample, y_s, 0)] while tree_traversal: @@ -428,30 +471,41 @@ def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, sklearn_max_depth = None else: sklearn_max_depth = max_depth - depth - dt = SklearnDTClassifier(max_features=m_try, - max_depth=sklearn_max_depth, - random_state=random_state) - unique = np.unique(sample, return_index=True, - return_counts=True) + dt = SklearnDTRegressor( + max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state, + ) + unique = np.unique( + sample, return_index=True, return_counts=True + ) sample, new_indices, sample_weight = unique x = _get_sample_attributes(samples_file, sample) y_s = y_s[new_indices] dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) node.content = _SkTreeWrapper(dt) else: - split = _compute_split(sample, n_features, y_s, n_classes, - m_try, mmap, random_state) + split = _compute_split( + sample, + n_features, + y_s, + m_try, + mmap, + random_state, + ) node_info, left_group, y_l, right_group, y_r = split node.content = node_info if isinstance(node_info, _InnerNodeInfo): node.left = _Node() node.right = _Node() - tree_traversal.append((node.right, right_group, y_r, - depth + 1)) - tree_traversal.append((node.left, left_group, y_l, - depth + 1)) + tree_traversal.append( + (node.right, right_group, y_r, depth + 1) + ) + tree_traversal.append( + (node.left, left_group, y_l, depth + 1) + ) else: - node.content = _compute_leaf_info(y_s, n_classes) + node.content = _compute_leaf_info(y_s) return subtree @@ 
-462,7 +516,7 @@ def _merge(*object_list): def _get_subtree_path(subtree_index, distr_depth): if distr_depth == 0: - return '' + return "" return bin(subtree_index)[2:].zfill(distr_depth) @@ -471,12 +525,12 @@ def _get_predicted_indices(samples, tree, nodes_info, path): for direction in path: node_info = nodes_info[tree.content] if isinstance(node_info, _LeafInfo): - if direction == '1': + if direction == "1": idx_mask[:] = 0 else: col = node_info.index value = node_info.value - if direction == '0': + if direction == "0": idx_mask[idx_mask] = samples[idx_mask, col] <= value tree = tree.left else: @@ -486,8 +540,9 @@ def _get_predicted_indices(samples, tree, nodes_info, path): @task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth): +def _predict_branch( + row_blocks, tree, nodes_info, subtree_index, subtree, distr_depth +): samples = Array._merge_blocks(row_blocks) path = _get_subtree_path(subtree_index, distr_depth) indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) @@ -495,16 +550,6 @@ def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, return indices_mask, prediction -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch_proba(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth, n_classes): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = subtree.predict_proba(samples[indices_mask], n_classes) - return indices_mask, prediction - - @task(returns=list) def _merge_branches(n_classes, *predictions): samples_len = len(predictions[0][0]) diff --git a/dislib/regression/rf/test_split.py b/dislib/regression/rf/test_split.py index 70922783..aa482b3c 100644 --- a/dislib/regression/rf/test_split.py +++ b/dislib/regression/rf/test_split.py @@ -3,15 +3,15 @@ import numpy as np -def gini_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): +def mse_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): """ - Maximizing the Gini gain is equivalent to minimizing this proxy function. + Maximizing the MSE gain is equivalent to minimizing this proxy function. 
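+
+    For a split of a node into branches l and r, the summed squared error is
+    SSE = sum(y^2) - ((sum_l y)^2 / n_l + (sum_r y)^2 / n_r), and the
+    sum(y^2) term is constant within the node, so minimizing the SSE is the
+    same as minimizing -(l_weight / l_length + r_weight / r_length), with
+    l_weight = (sum_l y)^2 and l_length = n_l (and likewise for r).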
""" return -(l_weight / l_length + r_weight / r_length) * not_repeated -def test_split(sample, y_s, feature, n_classes): +def test_split(sample, y_s, feature): size = y_s.shape[0] if size == 0: return float_info.max, np.float64(np.inf) @@ -21,28 +21,26 @@ def test_split(sample, y_s, feature, n_classes): y_sorted = y_s[sort_indices] f_sorted = f[sort_indices] + # Threshold value must not be that value of a sample not_repeated = np.empty(size, dtype=np.bool_) - not_repeated[0: size - 1] = (f_sorted[1:] != f_sorted[:-1]) + not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1] not_repeated[size - 1] = True - l_freq = np.zeros((n_classes, size), dtype=np.int64) - l_freq[y_sorted, np.arange(size)] = 1 - - r_freq = np.zeros((n_classes, size), dtype=np.int64) - r_freq[:, 1:] = l_freq[:, :0:-1] - - l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0) - r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1] + # Square of the sum of the y values of each branch + r_weight = np.zeros(size) + l_weight = np.square(np.cumsum(y_sorted, axis=-1)) + r_weight[:-1] = np.square(np.cumsum(y_sorted[::-1], axis=-1)[-2::-1]) + # Number of samples of each branch l_length = np.arange(1, size + 1, dtype=np.int32) r_length = np.arange(size - 1, -1, -1, dtype=np.int32) r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways - scores = gini_criteria_proxy(l_weight, l_length, r_weight, r_length, - not_repeated) + scores = mse_criteria_proxy( + l_weight, l_length, r_weight, r_length, not_repeated + ) min_index = size - np.argmin(scores[::-1]) - 1 - if min_index + 1 == size: b_value = np.float64(np.inf) else: From 606de7cdffaad9b1e97c26885043068a4650a914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 23 Jul 2021 12:51:08 +0200 Subject: [PATCH 12/46] Added RandomForestRegressor --- dislib/regression/__init__.py | 3 +- dislib/regression/rf/decision_tree.py | 13 +- dislib/regression/rf/forest.py | 242 +++++++++----------------- tests/test_rf_regressor.py | 105 +++++++++++ 4 files changed, 199 insertions(+), 164 deletions(-) create mode 100644 tests/test_rf_regressor.py diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py index e3287a0b..4a222968 100644 --- a/dislib/regression/__init__.py +++ b/dislib/regression/__init__.py @@ -1,4 +1,5 @@ from dislib.regression.linear.base import LinearRegression from dislib.regression.lasso.base import Lasso +from dislib.regression.rf.forest import RandomForestRegressor -__all__ = ['LinearRegression', 'Lasso'] +__all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"] diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py index 43ecaf79..82730a5d 100644 --- a/dislib/regression/rf/decision_tree.py +++ b/dislib/regression/rf/decision_tree.py @@ -196,13 +196,13 @@ def predict(self, sample): if len(sample) > 0: return node_content.sk_tree.predict(sample) if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample),), dtype=np.int64) + pred = np.empty((len(sample),), dtype=np.float64) left_mask = sample[:, node_content.index] <= node_content.value pred[left_mask] = self.left.predict(sample[left_mask]) pred[~left_mask] = self.right.predict(sample[~left_mask]) return pred assert len(sample) == 0, "Type not supported" - return np.empty((0,), dtype=np.int64) + return np.empty((0,), dtype=np.float64) class _InnerNodeInfo: @@ -220,7 +220,6 @@ def __init__(self, size=None, mean=None): class _SkTreeWrapper: def __init__(self, tree): 
self.sk_tree = tree - self.classes = tree.classes_ def _get_sample_attributes(samples_file, indices): @@ -260,8 +259,8 @@ def _feature_selection(untried_indices, m_try, random_state): def _get_groups(sample, y_s, features_mmap, index, value): if index is None: empty_sample = np.array([], dtype=np.int64) - empty_labels = np.array([], dtype=np.int8) - return sample, y_s, empty_sample, empty_labels + empty_target = np.array([], dtype=np.float64) + return sample, y_s, empty_sample, empty_target feature = features_mmap[index][sample] mask = feature < value left = sample[mask] @@ -351,7 +350,7 @@ def _compute_split( left_group = sample y_l = y_s right_group = np.array([], dtype=np.int64) - y_r = np.array([], dtype=np.int8) + y_r = np.array([], dtype=np.float64) return node_info, left_group, y_l, right_group, y_r @@ -558,7 +557,7 @@ def _merge_branches(n_classes, *predictions): dtype = np.float64 else: # predict_proba shape = (samples_len,) - dtype = np.int64 + dtype = np.float64 merged_prediction = np.empty(shape, dtype=dtype) for selected, prediction in predictions: merged_prediction[selected] = prediction diff --git a/dislib/regression/rf/forest.py b/dislib/regression/rf/forest.py index 8f6c0f2a..faae07c8 100644 --- a/dislib/regression/rf/forest.py +++ b/dislib/regression/rf/forest.py @@ -8,14 +8,14 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_random_state -from dislib.classification.rf.decision_tree import DecisionTreeClassifier +from dislib.regression.rf.decision_tree import DecisionTreeRegressor from dislib.data.array import Array from dislib.utils.base import _paired_partition -from dislib.classification.rf._data import transform_to_rf_dataset +from dislib.regression.rf._data import transform_to_rf_dataset -class RandomForestClassifier(BaseEstimator): - """A distributed random forest classifier. +class RandomForestRegressor(BaseEstimator): + """A distributed random forest regressor. Parameters ---------- @@ -39,16 +39,11 @@ class RandomForestClassifier(BaseEstimator): distributed way. sklearn_max: int or float, optional (default=1e8) Maximum size (len(subsample)*n_features) of the arrays passed to - sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees - (subsamples) of our DecisionTreeClassifier. sklearn fit() is used + sklearn's DecisionTreeRegressor.fit(), which is called to fit subtrees + (subsamples) of our DecisionTreeRegressor. sklearn fit() is used because it's faster, but requires loading the data to memory, which can cause memory problems for large datasets. This parameter can be adjusted to fit the hardware capabilities. - hard_vote : bool, optional (default=False) - If True, it uses majority voting over the predict() result of the - decision tree predictions. If False, it takes the class with the higher - probability given by predict_proba(), which is an average of the - probabilities given by the decision trees. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -57,30 +52,28 @@ class RandomForestClassifier(BaseEstimator): Attributes ---------- - classes : None or ndarray - Array of distinct classes, set at fit(). - trees : list of DecisionTreeClassifier - List of the tree classifiers of this forest, populated at fit(). + trees : list of DecisionTreeRegressor + List of the tree regressors of this forest, populated at fit(). 
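+
+    Usage (a minimal sketch; the random data and block sizes are only
+    illustrative):
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> from dislib.regression import RandomForestRegressor
+    >>> x = ds.array(np.random.rand(100, 4), (50, 4))
+    >>> y = ds.array(np.random.rand(100, 1), (50, 1))
+    >>> forest = RandomForestRegressor(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> y_pred = forest.predict(x).collect()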
""" - def __init__(self, - n_estimators=10, - try_features='sqrt', - max_depth=np.inf, - distr_depth='auto', - sklearn_max=1e8, - hard_vote=False, - random_state=None): + def __init__( + self, + n_estimators=10, + try_features="sqrt", + max_depth=np.inf, + distr_depth="auto", + sklearn_max=1e8, + random_state=None, + ): self.n_estimators = n_estimators self.try_features = try_features self.max_depth = max_depth self.distr_depth = distr_depth self.sklearn_max = sklearn_max - self.hard_vote = hard_vote self.random_state = random_state def fit(self, x, y): - """Fits the RandomForestClassifier. + """Fits the RandomForestRegressor. Parameters ---------- @@ -92,10 +85,9 @@ def fit(self, x, y): Returns ------- - self : RandomForestClassifier + self : RandomForestRegressor """ - self.classes = None self.trees = [] dataset = transform_to_rf_dataset(x, y) @@ -104,20 +96,22 @@ def fit(self, x, y): try_features = _resolve_try_features(self.try_features, n_features) random_state = check_random_state(self.random_state) - self.classes = dataset.get_classes() - - if self.distr_depth == 'auto': + if self.distr_depth == "auto": dataset.n_samples = compss_wait_on(dataset.get_n_samples()) distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) distr_depth = min(distr_depth, self.max_depth) else: distr_depth = self.distr_depth - for i in range(self.n_estimators): - tree = DecisionTreeClassifier(try_features, self.max_depth, - distr_depth, self.sklearn_max, - bootstrap=True, - random_state=random_state) + for _ in range(self.n_estimators): + tree = DecisionTreeRegressor( + try_features, + self.max_depth, + distr_depth, + self.sklearn_max, + bootstrap=True, + random_state=random_state, + ) self.trees.append(tree) for tree in self.trees: @@ -125,44 +119,8 @@ def fit(self, x, y): return self - def predict_proba(self, x): - """Predicts class probabilities using a fitted forest. - - The probabilities are obtained as an average of the probabilities of - each decision tree. - - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - probabilities : ds-array, shape=(n_samples, n_classes) - Predicted probabilities for the samples to belong to each class. - The columns of the array correspond to the classes given at - self.classes. - - """ - assert self.trees is not None, 'The random forest is not fitted.' - prob_blocks = [] - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - prob_blocks.append([_join_predictions(*tree_predictions)]) - self.classes = compss_wait_on(self.classes) - n_classes = len(self.classes) - - probabilities = Array(blocks=prob_blocks, - top_left_shape=(x._top_left_shape[0], n_classes), - reg_shape=(x._reg_shape[0], n_classes), - shape=(x.shape[0], n_classes), sparse=False) - return probabilities - def predict(self, x): - """Predicts classes using a fitted forest. + """Predicts target values using a fitted forest. Parameters ---------- @@ -172,36 +130,40 @@ def predict(self, x): Returns ------- y_pred : ds-array, shape=(n_samples, 1) - Predicted class labels for x. + Predicted target values for x. """ - assert self.trees is not None, 'The random forest is not fitted.' + assert self.trees is not None, "The random forest is not fitted." 
pred_blocks = []
-        if self.hard_vote:
-            for x_row in x._iterator(axis=0):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict(x_row))
-                pred_blocks.append(_hard_vote(self.classes, *tree_predictions))
-        else:
-            for x_row in x._iterator(axis=0):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict_proba(x_row))
-                pred_blocks.append(_soft_vote(self.classes, *tree_predictions))
+        for x_row in x._iterator(axis=0):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict(x_row))
+            pred_blocks.append(_join_predictions(*tree_predictions))
 
-        y_pred = Array(blocks=[pred_blocks],
-                       top_left_shape=(x._top_left_shape[0], 1),
-                       reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1),
-                       sparse=False)
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
+        )
 
         return y_pred
 
     def score(self, x, y):
-        """Accuracy classification score.
-
-        Returns the mean accuracy on the given test data.
-
+        """Coefficient of determination $R^2$ regression score.
+
+        Returns the coefficient of determination $R^2$ of
+        the prediction.
+        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
+        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
+        $v$ is the total sum of squares
+        `((y_true - y_true.mean()) ** 2).sum()`.
+        The best possible score is 1.0 and it can be negative
+        (because the model can be arbitrarily worse).
+        A constant model that always predicts the expected value of y,
+        disregarding the input features, would get a $R^2$ score of 0.0.
 
         Parameters
         ----------
@@ -213,27 +175,17 @@ def score(self, x, y):
 
         Returns
         -------
         score : float (as future object)
-            Fraction of correctly classified samples.
+            Coefficient of determination $R^2$.
 
         """
-        assert self.trees is not None, 'The random forest is not fitted.'
+        assert self.trees is not None, "The random forest is not fitted."
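+        # Each partial score holds (u, v, block mean, block size) for one
+        # block of rows; _merge_scores combines them into a single R^2.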
partial_scores = [] - if self.hard_vote: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - subset_score = _hard_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) - else: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - subset_score = _soft_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _partial_score(y_row._blocks, *tree_predictions) + partial_scores.append(subset_score) return _merge_scores(*partial_scores) @@ -242,9 +194,9 @@ def score(self, x, y): def _resolve_try_features(try_features, n_features): if try_features is None: return n_features - elif try_features == 'sqrt': + elif try_features == "sqrt": return int(math.sqrt(n_features)) - elif try_features == 'third': + elif try_features == "third": return max(1, n_features // 3) else: return int(try_features) @@ -255,52 +207,30 @@ def _join_predictions(*predictions): aggregate = predictions[0] for p in predictions[1:]: aggregate += p - labels = aggregate / len(predictions) - return labels - - -@task(returns=1) -def _soft_vote(classes, *predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - labels = classes[np.argmax(aggregate, axis=1)] - return labels - - -@task(returns=1) -def _hard_vote(classes, *predictions): - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - labels = classes[mode] - return labels + target = aggregate / len(predictions) + return target @task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _soft_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - predicted_labels = classes[np.argmax(aggregate, axis=1)] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _hard_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - predicted_labels = classes[mode] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) +def _partial_score(y_blocks, *predictions): + y_true = Array._merge_blocks(y_blocks).flatten() + y_pred = np.mean(predictions, axis=0) + n_samples = y_true.shape[0] + y_avg = np.mean(y_true) + u_partial = np.sum(np.square(y_true - y_pred), axis=0) + v_partial = np.sum(np.square(y_true - y_avg), axis=0) + return u_partial, v_partial, y_avg, n_samples @task(returns=1) def _merge_scores(*partial_scores): - correct = sum(subset_score[0] for subset_score in partial_scores) - total = sum(subset_score[1] for subset_score in partial_scores) - return correct / total + u = v = avg = n = 0 + for u_p, v_p, avg_p, n_p in partial_scores: + u += u_p + + delta = avg_p - avg + avg += delta * n_p / (n + n_p) + v += v_p + delta ** 2 * n * n_p / (n + 
n_p) + n += n_p + + return 1 - u / v diff --git a/tests/test_rf_regressor.py b/tests/test_rf_regressor.py new file mode 100644 index 00000000..2d82dbeb --- /dev/null +++ b/tests/test_rf_regressor.py @@ -0,0 +1,105 @@ +import unittest + +import numpy as np +from pycompss.api.api import compss_wait_on +from sklearn.datasets import make_regression + +import dislib as ds +from dislib.regression import RandomForestRegressor + + +def _determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v + + +class RandomForestRegressorTest(unittest.TestCase): + def test_make_regression(self): + """Tests RandomForestRegressor fit and score with default params.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + + rf = RandomForestRegressor(random_state=0) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2 :] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_make_regression_predict_and_distr_depth(self): + """Tests RandomForestRegressor fit and predict with a distr_depth.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + + rf = RandomForestRegressor(distr_depth=2, random_state=0) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2 :] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_make_regression_sklearn_max_predict(self): + """Tests RandomForestRegressor predict with sklearn_max.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + + rf = RandomForestRegressor(random_state=0, sklearn_max=10) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2 :] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + + +def main(): + unittest.main() + + +if __name__ == "__main__": + main() From c9250a3a967dfc83bb11b03baa16fcb83e80a6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 26 Jul 2021 13:01:47 +0200 Subject: [PATCH 13/46] Added RF to a new 'commons' module --- dislib/classification/__init__.py | 4 +- 
dislib/classification/rf/_data.py | 71 +-- dislib/commons/rf/__init__.py | 0 dislib/commons/rf/_data.py | 404 ++++++++++++++ dislib/commons/rf/_decision_tree.py | 784 ++++++++++++++++++++++++++++ dislib/commons/rf/_forest.py | 486 +++++++++++++++++ dislib/commons/rf/_test_split.py | 59 +++ dislib/regression/__init__.py | 2 +- 8 files changed, 1777 insertions(+), 33 deletions(-) create mode 100644 dislib/commons/rf/__init__.py create mode 100644 dislib/commons/rf/_data.py create mode 100644 dislib/commons/rf/_decision_tree.py create mode 100644 dislib/commons/rf/_forest.py create mode 100644 dislib/commons/rf/_test_split.py diff --git a/dislib/classification/__init__.py b/dislib/classification/__init__.py index 55bc2877..f4a90db6 100644 --- a/dislib/classification/__init__.py +++ b/dislib/classification/__init__.py @@ -1,4 +1,4 @@ from dislib.classification.csvm.base import CascadeSVM -from dislib.classification.rf.forest import RandomForestClassifier +from dislib.commons.rf._forest import RandomForestClassifier -__all__ = ['CascadeSVM', 'RandomForestClassifier'] +__all__ = ["CascadeSVM", "RandomForestClassifier"] diff --git a/dislib/classification/rf/_data.py b/dislib/classification/rf/_data.py index 9bd178b5..1a8da41f 100644 --- a/dislib/classification/rf/_data.py +++ b/dislib/classification/rf/_data.py @@ -2,8 +2,13 @@ import numpy as np from numpy.lib import format -from pycompss.api.parameter import FILE_IN, FILE_INOUT, COLLECTION_IN, Depth, \ - Type +from pycompss.api.parameter import ( + FILE_IN, + FILE_INOUT, + COLLECTION_IN, + Depth, + Type, +) from pycompss.api.task import task from dislib.data.array import Array @@ -82,12 +87,13 @@ def get_n_samples(self): """ if self.n_samples is None: - assert isinstance(self.samples_path, str), \ - 'self.n_samples must be set manually if self.samples_path ' \ - 'is a pycompss.runtime.Future object' + assert isinstance(self.samples_path, str), ( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() if len(shape) != 2: - raise ValueError('Cannot read 2D array from the samples file.') + raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_samples @@ -107,12 +113,13 @@ def get_n_features(self): """ if self.n_features is None: - assert isinstance(self.samples_path, str), \ - 'self.n_features must be set manually if self.samples_path ' \ - 'is a pycompss.runtime.Future object' + assert isinstance(self.samples_path, str), ( + "self.n_features must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() if len(shape) != 2: - raise ValueError('Cannot read 2D array from the samples file.') + raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_features @@ -169,11 +176,11 @@ class n_samples and n_features or if the array is in fortran order. 
shape = features_npy_file.get_shape() fortran_order = features_npy_file.get_fortran_order() if len(shape) != 2: - raise ValueError('Cannot read 2D array from features_file.') + raise ValueError("Cannot read 2D array from features_file.") if (self.get_n_features(), self.get_n_samples()) != shape: - raise ValueError('Invalid dimensions for the features_file.') + raise ValueError("Invalid dimensions for the features_file.") if fortran_order: - raise ValueError('Fortran order not supported for features array.') + raise ValueError("Fortran order not supported for features array.") def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: @@ -197,9 +204,9 @@ def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: n_samples = x.shape[0] n_features = x.shape[1] - samples_file = tempfile.NamedTemporaryFile(mode='wb', - prefix='tmp_rf_samples_', - delete=False) + samples_file = tempfile.NamedTemporaryFile( + mode="wb", prefix="tmp_rf_samples_", delete=False + ) samples_path = samples_file.name samples_file.close() _allocate_samples_file(samples_path, n_samples, n_features) @@ -213,9 +220,9 @@ def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: _fill_samples_file(samples_path, x_row._blocks, start_idx) start_idx += x._reg_shape[0] - labels_file = tempfile.NamedTemporaryFile(mode='w', - prefix='tmp_rf_labels_', - delete=False) + labels_file = tempfile.NamedTemporaryFile( + mode="w", prefix="tmp_rf_labels_", delete=False + ) labels_path = labels_file.name labels_file.close() for y_row in y._iterator(axis=0): @@ -251,19 +258,19 @@ def get_dtype(self): return self.dtype def _read_header(self): - with open(self.path, 'rb') as fp: + with open(self.path, "rb") as fp: version = format.read_magic(fp) try: format._check_version(version) except ValueError: - raise ValueError('Invalid file format.') + raise ValueError("Invalid file format.") header_data = format._read_array_header(fp, version) self.shape, self.fortran_order, self.dtype = header_data @task(labels_path=FILE_IN, returns=3) def _get_labels(labels_path): - y = np.genfromtxt(labels_path, dtype=None, encoding='utf-8') + y = np.genfromtxt(labels_path, dtype=None, encoding="utf-8") categories, codes = np.unique(y, return_inverse=True) return codes.astype(np.int8), categories, len(categories) @@ -279,26 +286,30 @@ def _merge_shapes(*samples_shapes): n_features = samples_shapes[0][1] for shape in samples_shapes: n_samples += shape[0] - assert shape[1] == n_features, 'Subsamples with different n_features.' + assert shape[1] == n_features, "Subsamples with different n_features." 
return samples_shapes, n_samples, n_features @task(samples_path=FILE_INOUT) def _allocate_samples_file(samples_path, n_samples, n_features): - np.lib.format.open_memmap(samples_path, mode='w+', dtype='float32', - shape=(int(n_samples), int(n_features))) + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + shape=(int(n_samples), int(n_features)), + ) @task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) def _fill_samples_file(samples_path, row_blocks, start_idx): rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype(dtype='float32', casting='same_kind') - samples = np.lib.format.open_memmap(samples_path, mode='r+') - samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") + samples = np.lib.format.open_memmap(samples_path, mode="r+") + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples @task(labels_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) def _fill_labels_file(labels_path, row_blocks): rows_labels = Array._merge_blocks(row_blocks) - with open(labels_path, 'at') as f: - np.savetxt(f, rows_labels, fmt='%s', encoding='utf-8') + with open(labels_path, "at") as f: + np.savetxt(f, rows_labels, fmt="%s", encoding="utf-8") diff --git a/dislib/commons/rf/__init__.py b/dislib/commons/rf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dislib/commons/rf/_data.py b/dislib/commons/rf/_data.py new file mode 100644 index 00000000..de692182 --- /dev/null +++ b/dislib/commons/rf/_data.py @@ -0,0 +1,404 @@ +import tempfile + +import numpy as np +from numpy.lib import format +from pycompss.api.parameter import ( + FILE_IN, + FILE_INOUT, + COLLECTION_IN, + Depth, + Type, +) +from pycompss.api.task import task + +from dislib.data.array import Array + + +class RfBaseDataset: + """Base class for Dataset format used by the fit() of the + RandomForestRegressor and RandomForestClassifier. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + def __init__(self, samples_path, targets_path, features_path=None): + self.samples_path = samples_path + self.targets_path = targets_path + self.features_path = features_path + self.n_samples = None + self.n_features = None + self.y_targets = None + + def get_n_samples(self): + """Gets the number of samples obtained from the samples file. + + Returns + ------- + n_samples : int + + Raises + ------ + AssertionError + If self.n_samples is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. + + """ + if self.n_samples is None: + assert isinstance(self.samples_path, str), ( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_samples + + def get_n_features(self): + """Gets the number of features obtained from the samples file. + + Returns + ------- + n_features : int + + Raises + ------ + AssertionError + If self.n_features is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. 
+ + """ + if self.n_features is None: + assert isinstance(self.samples_path, str), ( + "self.n_features must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_features + + def validate_features_file(self): + """Validates the features file header information. + + Raises + ------ + ValueError + If the shape of the array in the features_file doesn't match this + class n_samples and n_features or if the array is in fortran order. + + """ + features_npy_file = _NpyFile(self.features_path) + shape = features_npy_file.get_shape() + fortran_order = features_npy_file.get_fortran_order() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from features_file.") + if (self.get_n_features(), self.get_n_samples()) != shape: + raise ValueError("Invalid dimensions for the features_file.") + if fortran_order: + raise ValueError("Fortran order not supported for features array.") + + +class RfClassifierDataset(RfBaseDataset): + """Dataset format used by the fit() of the RandomForestClassifier. + + The RfDataset contains a file path for the samples and another one for the + labels. Optionally, a path can be provided for a transposed version of the + samples matrix, i.e., the features. + + Note: For a representation of a dataset distributed in multiple files, use + dislib.data.Dataset instead. + + Parameters + ---------- + samples_path : str + Path of the .npy file containing the 2-d array of samples. It can be a + pycompss.runtime.Future object. If so, self.n_samples and + self.n_features must be set manually (they can also be + pycompss.runtime.Future objects). + targets_path : str + Path of the .dat file containing the 1-d array of target labels. + It can be a pycompss.runtime.Future object. + features_path : str, optional (default=None) + Path of the .npy file containing the 2-d array of samples transposed. + The array must be C-ordered. Providing this array may improve the + performance as it allows sequential access to the features. + + Attributes + ---------- + n_samples : int + The number of samples of the dataset. It can be a + pycompss.runtime.Future object. + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + y_targets : ndarray + The codified array of labels for this RfDataset. The values are indices + of the array of classes, which contains the corresponding labels. The + dtype is np.int8. It can be a pycompss.runtime.Future object. + y_categories : ndarray + The array of classes for this RfDataset. The values are unique. It can + be a pycompss.runtime.Future object. + n_classes : int + The number of classes of this RfDataset. It can be a + pycompss.runtime.Future object. + + """ + + def __init__(self, samples_path, targets_path, features_path=None): + super().__init__(samples_path, targets_path, features_path) + self.y_categories = None + self.n_classes = None + + def get_y_targets(self): + """Obtains the codified array of target labels. + + Returns + ------- + y_targets : ndarray + + """ + if self.y_targets is None: + labels = _get_labels(self.targets_path) + self.y_targets, self.y_categories, self.n_classes = labels + return self.y_targets + + def get_classes(self): + """Obtains the array of label categories. 
+ + Returns + ------- + y_categories : ndarray + + """ + if self.y_categories is None: + labels = _get_labels(self.targets_path) + self.y_targets, self.y_categories, self.n_classes = labels + return self.y_categories + + def get_n_classes(self): + """Obtains the number of classes. + + Returns + ------- + n_classes : int + + """ + if self.n_classes is None: + labels = _get_labels(self.targets_path) + self.y_targets, self.y_categories, self.n_classes = labels + return self.n_classes + + +class RfRegressorDataset(RfBaseDataset): + """Dataset format used by the fit() of the RandomForestRegressor. + + The RfDataset contains a file path for the samples and another one for the + targets. Optionally, a path can be provided for a transposed version of the + samples matrix, i.e., the features. + + Note: For a representation of a dataset distributed in multiple files, use + dislib.data.Dataset instead. + + Parameters + ---------- + samples_path : str + Path of the .npy file containing the 2-d array of samples. It can be a + pycompss.runtime.Future object. If so, self.n_samples and + self.n_features must be set manually (they can also be + pycompss.runtime.Future objects). + targets_path : str + Path of the .dat file containing the 1-d array of target values. + It can be a pycompss.runtime.Future object. + features_path : str, optional (default=None) + Path of the .npy file containing the 2-d array of samples transposed. + The array must be C-ordered. Providing this array may improve the + performance as it allows sequential access to the features. + + Attributes + ---------- + n_samples : int + The number of samples of the dataset. It can be a + pycompss.runtime.Future object. + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + y_targets : ndarray + The array of targets for this RfDataset. It can be a + pycompss.runtime.Future object. + + """ + + def __init__(self, samples_path, targets_path, features_path=None): + super().__init__(samples_path, targets_path, features_path) + + def get_y_targets(self): + """Obtains the array of target values. + + Returns + ------- + y_targets : ndarray + + """ + if self.y_targets is None: + targets = _get_values(self.targets_path) + self.y_targets = targets + return self.y_targets + + def get_n_classes(self): + return None + + def get_classes(self): + return None + +def transform_to_rf_dataset( + x: Array, y: Array, task: str +) -> RfRegressorDataset or RfClassifierDataset: + """Creates a RfDataset object from samples x and targets y. + + This function creates a dislib.commons.rf.data.RfDataset by saving + x and y in files. + + Parameters + ---------- + x : ds-array, shape = (n_samples, n_features) + The training input samples. + y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) + The target values. + task : {"classification", "regression"} + Task of the Random Forest. 
+
+    Returns
+    -------
+    rf_dataset : RfClassifierDataset or RfRegressorDataset
+
+    """
+    n_samples = x.shape[0]
+    n_features = x.shape[1]
+
+    samples_file = tempfile.NamedTemporaryFile(
+        mode="wb", prefix="tmp_rf_samples_", delete=False
+    )
+    samples_path = samples_file.name
+    samples_file.close()
+    _allocate_samples_file(samples_path, n_samples, n_features)
+
+    start_idx = 0
+    row_blocks_iterator = x._iterator(axis=0)
+    top_row = next(row_blocks_iterator)
+    _fill_samples_file(samples_path, top_row._blocks, start_idx)
+    start_idx += x._top_left_shape[0]
+    for x_row in row_blocks_iterator:
+        _fill_samples_file(samples_path, x_row._blocks, start_idx)
+        start_idx += x._reg_shape[0]
+
+    targets_file = tempfile.NamedTemporaryFile(
+        mode="w", prefix="tmp_rf_targets_", delete=False
+    )
+    targets_path = targets_file.name
+    targets_file.close()
+    for y_row in y._iterator(axis=0):
+        _fill_targets_file(targets_path, y_row._blocks)
+
+    if task == "classification":
+        rf_dataset = RfClassifierDataset(samples_path, targets_path)
+    elif task == "regression":
+        rf_dataset = RfRegressorDataset(samples_path, targets_path)
+    else:
+        raise ValueError("task must be either classification or regression.")
+    rf_dataset.n_samples = n_samples
+    rf_dataset.n_features = n_features
+    return rf_dataset
+
+
+class _NpyFile(object):
+    def __init__(self, path):
+        self.path = path
+
+        self.shape = None
+        self.fortran_order = None
+        self.dtype = None
+
+    def get_shape(self):
+        if self.shape is None:
+            self._read_header()
+        return self.shape
+
+    def get_fortran_order(self):
+        if self.fortran_order is None:
+            self._read_header()
+        return self.fortran_order
+
+    def get_dtype(self):
+        if self.dtype is None:
+            self._read_header()
+        return self.dtype
+
+    def _read_header(self):
+        with open(self.path, "rb") as fp:
+            version = format.read_magic(fp)
+            try:
+                format._check_version(version)
+            except ValueError:
+                raise ValueError("Invalid file format.")
+            header_data = format._read_array_header(fp, version)
+            self.shape, self.fortran_order, self.dtype = header_data
+
+
+@task(targets_path=FILE_IN, returns=3)
+def _get_labels(targets_path):
+    # Classification
+    y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8")
+    categories, codes = np.unique(y, return_inverse=True)
+    return codes.astype(np.int8), categories, len(categories)
+
+
+@task(targets_path=FILE_IN, returns=1)
+def _get_values(targets_path):
+    # Regression
+    y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8")
+    return y.astype(np.float64)
+
+
+@task(returns=1)
+def _get_samples_shape(subset):
+    return subset.samples.shape
+
+
+@task(returns=3)
+def _merge_shapes(*samples_shapes):
+    n_samples = 0
+    n_features = samples_shapes[0][1]
+    for shape in samples_shapes:
+        n_samples += shape[0]
+        assert shape[1] == n_features, "Subsamples with different n_features."
+ return samples_shapes, n_samples, n_features + + +@task(samples_path=FILE_INOUT) +def _allocate_samples_file(samples_path, n_samples, n_features): + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + shape=(int(n_samples), int(n_features)), + ) + + +@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_samples_file(samples_path, row_blocks, start_idx): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") + samples = np.lib.format.open_memmap(samples_path, mode="r+") + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples + + +@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_targets_file(targets_path, row_blocks): + rows_targets = Array._merge_blocks(row_blocks) + with open(targets_path, "at") as f: + np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8") diff --git a/dislib/commons/rf/_decision_tree.py b/dislib/commons/rf/_decision_tree.py new file mode 100644 index 00000000..07297a8d --- /dev/null +++ b/dislib/commons/rf/_decision_tree.py @@ -0,0 +1,784 @@ +from sys import float_info + +import numpy as np +from numpy.random.mtrand import RandomState +from pycompss.api.api import compss_delete_object +from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth +from pycompss.api.task import task +from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor + +from ._test_split import test_split +from dislib.data.array import Array + + +class BaseDecisionTree: + """Base class for distributed decision trees. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def __init__( + self, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + ): + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.bootstrap = bootstrap + self.random_state = random_state + + self.n_features = None + self.n_classes = None + + self.tree = None + self.nodes_info = None + self.subtrees = None + + def fit(self, dataset): + """Fits the DecisionTree. 
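+
+        The first ``distr_depth`` levels of the tree are split through
+        distributed tasks; each remaining subtree is then built within a
+        single task, using a sklearn tree when ``len(subsample) * n_features``
+        is at most ``sklearn_max``.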
+
+        Parameters
+        ----------
+        dataset : dislib.commons.rf._data.RfBaseDataset
+
+        """
+
+        self.n_features = dataset.get_n_features()
+        self.n_classes = dataset.get_n_classes()
+        samples_path = dataset.samples_path
+        features_path = dataset.features_path
+        n_samples = dataset.get_n_samples()
+        y_targets = dataset.get_y_targets()
+
+        seed = self.random_state.randint(np.iinfo(np.int32).max)
+
+        sample, y_s = _sample_selection(
+            n_samples, y_targets, self.bootstrap, seed
+        )
+        Node = _ClassificationNode if self.n_classes else _RegressionNode
+
+        self.tree = Node()
+        self.nodes_info = []
+        self.subtrees = []
+        tree_traversal = [(self.tree, sample, y_s, 0)]
+        while tree_traversal:
+            node, sample, y_s, depth = tree_traversal.pop()
+            if depth < self.distr_depth:
+                split = _split_node_wrapper(
+                    sample,
+                    self.n_features,
+                    y_s,
+                    self.n_classes,
+                    self.try_features,
+                    self.random_state,
+                    samples_file=samples_path,
+                    features_file=features_path,
+                )
+                node_info, left_group, y_l, right_group, y_r = split
+                compss_delete_object(sample)
+                compss_delete_object(y_s)
+                node.content = len(self.nodes_info)
+                self.nodes_info.append(node_info)
+                node.left = Node()
+                node.right = Node()
+                depth = depth + 1
+                tree_traversal.append((node.right, right_group, y_r, depth))
+                tree_traversal.append((node.left, left_group, y_l, depth))
+            else:
+                subtree = _build_subtree_wrapper(
+                    sample,
+                    y_s,
+                    self.n_features,
+                    self.max_depth - depth,
+                    self.n_classes,
+                    self.try_features,
+                    self.sklearn_max,
+                    self.random_state,
+                    samples_path,
+                    features_path,
+                )
+                node.content = len(self.subtrees)
+                self.subtrees.append(subtree)
+                compss_delete_object(sample)
+                compss_delete_object(y_s)
+        self.nodes_info = _merge(*self.nodes_info)
+
+    def predict(self, x_row):
+        """Predicts target values or classes for the given samples using
+        a fitted tree.
+
+        Parameters
+        ----------
+        x_row : ds-array
+            A row block of samples.
+
+        Returns
+        -------
+        predicted : ndarray
+            An array with the predicted classes or values for the given
+            samples. For classification, the values are class codes of the
+            fitted dislib.commons.rf._data.RfClassifierDataset. The returned
+            object can be a pycompss.runtime.Future object.
+
+        """
+
+        assert self.tree is not None, "The decision tree is not fitted."
+
+        branch_predictions = []
+        for i, subtree in enumerate(self.subtrees):
+            pred = _predict_branch(
+                x_row._blocks,
+                self.tree,
+                self.nodes_info,
+                i,
+                subtree,
+                self.distr_depth,
+            )
+            branch_predictions.append(pred)
+        return _merge_branches(
+            None, *branch_predictions, classification=self.n_classes is not None
+        )
+
+
+class DecisionTreeClassifier(BaseDecisionTree):
+    """A distributed decision tree classifier.
+
+    Parameters
+    ----------
+    try_features : int
+        The number of features to consider when looking for the best split.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max : int or float
+        Maximum size (len(subsample) * n_features) of the arrays passed to
+        sklearn's fit(), which is used to build the subtrees.
+    bootstrap : bool
+        Randomly select n_instances samples with repetition (used in random
+        forests).
+    random_state : RandomState instance
+        The random number generator.
+
+    Attributes
+    ----------
+    n_features : int
+        The number of features of the dataset. It can be a
+        pycompss.runtime.Future object.
+    n_classes : int
+        The number of classes of this RfDataset. It can be a
+        pycompss.runtime.Future object.
+    tree : None or _Node
+        The root node of the tree after the tree is fitted.
+    nodes_info : None or list of _InnerNodeInfo and _LeafInfo
+        List of the node information for the nodes of the tree in the same
+        order as obtained in the fit() method, up to ``distr_depth`` depth.
+        After fit(), it is a pycompss.runtime.Future object.
+    subtrees : None or list of _Node
+        List of subtrees of the tree at ``distr_depth`` depth obtained in the
+        fit() method. After fit(), it is a list of pycompss.runtime.Future
+        objects.
+
+    Methods
+    -------
+    fit(dataset)
+        Fits the DecisionTreeClassifier.
+    predict(x_row)
+        Predicts classes for the given samples using a fitted tree.
+    predict_proba(x_row)
+        Predicts class probabilities for the given samples using a fitted
+        tree.
+
+    """
+
+    def __init__(
+        self,
+        try_features,
+        max_depth,
+        distr_depth,
+        sklearn_max,
+        bootstrap,
+        random_state,
+    ):
+        super().__init__(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+        )
+
+    def predict_proba(self, x_row):
+        """Predicts class probabilities for a row block using a fitted tree.
+
+        Parameters
+        ----------
+        x_row : ds-array
+            A row block of samples.
+
+        Returns
+        -------
+        predicted_proba : ndarray
+            An array with the predicted probabilities for the given samples.
+            The shape is (number of samples in x_row, self.n_classes), with
+            the column index being the class codes of the fitted
+            dislib.commons.rf._data.RfClassifierDataset. The returned object
+            can be a pycompss.runtime.Future object.
+
+        """
+
+        assert self.tree is not None, "The decision tree is not fitted."
+
+        branch_predictions = []
+        for i, subtree in enumerate(self.subtrees):
+            pred = _predict_branch_proba(
+                x_row._blocks,
+                self.tree,
+                self.nodes_info,
+                i,
+                subtree,
+                self.distr_depth,
+                self.n_classes,
+            )
+            branch_predictions.append(pred)
+        return _merge_branches(
+            self.n_classes, *branch_predictions, classification=True
+        )
+
+
+class DecisionTreeRegressor(BaseDecisionTree):
+    """A distributed decision tree regressor.
+
+    Parameters
+    ----------
+    try_features : int
+        The number of features to consider when looking for the best split.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max : int or float
+        Maximum size (len(subsample) * n_features) of the arrays passed to
+        sklearn's fit(), which is used to build the subtrees.
+    bootstrap : bool
+        Randomly select n_instances samples with repetition (used in random
+        forests).
+    random_state : RandomState instance
+        The random number generator.
+
+    Attributes
+    ----------
+    n_features : int
+        The number of features of the dataset. It can be a
+        pycompss.runtime.Future object.
+    tree : None or _Node
+        The root node of the tree after the tree is fitted.
+    nodes_info : None or list of _InnerNodeInfo and _LeafInfo
+        List of the node information for the nodes of the tree in the same
+        order as obtained in the fit() method, up to ``distr_depth`` depth.
+        After fit(), it is a pycompss.runtime.Future object.
+    subtrees : None or list of _Node
+        List of subtrees of the tree at ``distr_depth`` depth obtained in the
+        fit() method. After fit(), it is a list of pycompss.runtime.Future
+        objects.
+
+    Methods
+    -------
+    fit(dataset)
+        Fits the DecisionTreeRegressor.
+    predict(x_row)
+        Predicts target values for the given samples using a fitted tree.
+
+    """
+
+    def __init__(
+        self,
+        try_features,
+        max_depth,
+        distr_depth,
+        sklearn_max,
+        bootstrap,
+        random_state,
+    ):
+        super().__init__(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+        )
+
+
+class _Node:
+    """Base class for tree nodes"""
+
+    def __init__(self, is_classifier):
+        self.content = None
+        self.left = None
+        self.right = None
+        self.is_classifier = is_classifier
+        self.predict_dtype = np.int64 if is_classifier else np.float64
+
+    def predict(self, sample):
+        node_content = self.content
+        if isinstance(node_content, _LeafInfo):
+            return np.full((len(sample),), node_content.target)
+        if isinstance(node_content, _SkTreeWrapper):
+            if len(sample) > 0:
+                return node_content.sk_tree.predict(sample)
+        if isinstance(node_content, _InnerNodeInfo):
+            pred = np.empty((len(sample),), dtype=self.predict_dtype)
+            left_mask = sample[:, node_content.index] <= node_content.value
+            pred[left_mask] = self.left.predict(sample[left_mask])
+            pred[~left_mask] = self.right.predict(sample[~left_mask])
+            return pred
+        assert len(sample) == 0, "Type not supported"
+        return np.empty((0,), dtype=self.predict_dtype)
+
+
+class _ClassificationNode(_Node):
+    def __init__(self):
+        super().__init__(is_classifier=True)
+
+    def predict_proba(self, sample, n_classes):
+        node_content = self.content
+        if isinstance(node_content, _LeafInfo):
+            single_pred = node_content.frequencies / node_content.size
+            return np.tile(single_pred, (len(sample), 1))
+        if isinstance(node_content, _SkTreeWrapper):
+            if len(sample) > 0:
+                sk_tree_pred = node_content.sk_tree.predict_proba(sample)
+                pred = np.zeros((len(sample), n_classes), dtype=np.float64)
+                pred[:, node_content.sk_tree.classes_] = sk_tree_pred
+                return pred
+        if isinstance(node_content, _InnerNodeInfo):
+            pred = np.empty((len(sample), n_classes), dtype=np.float64)
+            l_msk = sample[:, node_content.index] <= node_content.value
+            pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes)
+            pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes)
+            return pred
+        assert len(sample) == 0, "Type not supported"
+        return np.empty((0, n_classes), dtype=np.float64)
+
+
+class _RegressionNode(_Node):
+    def __init__(self):
+        super().__init__(is_classifier=False)
+
+
+class _InnerNodeInfo:
+    def __init__(self, index=None, value=None):
+        self.index = index
+        self.value = value
+
+
+class _LeafInfo:
+    def __init__(self, size=None, frequencies=None, target=None):
+        self.size = size
+        self.frequencies = frequencies
+        self.target = target
+
+
+class _SkTreeWrapper:
+    def __init__(self, tree):
+        self.sk_tree = tree
+
+
+def _get_sample_attributes(samples_file, indices):
+    samples_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False)
+    x = samples_mmap[indices]
+    return x
+
+
+def _get_feature_mmap(features_file, i):
+    return _get_features_mmap(features_file)[i]
+
+
+def _get_features_mmap(features_file):
+    return np.load(features_file, mmap_mode="r", allow_pickle=False)
+
+
+@task(priority=True, returns=2)
+def _sample_selection(n_samples, y_targets, bootstrap, seed):
+    if bootstrap:
+        random_state = RandomState(seed)
+        selection = random_state.choice(
+            n_samples, size=n_samples, replace=True
+        )
+        selection.sort()
+        return selection, y_targets[selection]
+    else:
+        return 
np.arange(n_samples), y_targets + + +def _feature_selection(untried_indices, m_try, random_state): + selection_len = min(m_try, len(untried_indices)) + return random_state.choice( + untried_indices, size=selection_len, replace=False + ) + + +def _get_groups(sample, y_s, features_mmap, index, value): + if index is None: + empty_sample = np.array([], dtype=np.int64) + empty_target = np.array([], dtype=y_s.dtype) + return sample, y_s, empty_sample, empty_target + feature = features_mmap[index][sample] + mask = feature < value + left = sample[mask] + right = sample[~mask] + y_l = y_s[mask] + y_r = y_s[~mask] + return left, y_l, right, y_r + + +def _compute_leaf_info(y_s, n_classes): + if n_classes is not None: + frequencies = np.bincount(y_s, minlength=n_classes) + mode = np.argmax(frequencies) + return _LeafInfo(len(y_s), frequencies, mode) + else: + return _LeafInfo(len(y_s), None, np.mean(y_s)) + + +def _split_node_wrapper( + sample, + n_features, + y_s, + n_classes, + m_try, + random_state, + samples_file=None, + features_file=None, +): + seed = random_state.randint(np.iinfo(np.int32).max) + + if features_file is not None: + return _split_node_using_features( + sample, n_features, y_s, n_classes, m_try, features_file, seed + ) + elif samples_file is not None: + return _split_node( + sample, n_features, y_s, n_classes, m_try, samples_file, seed + ) + else: + raise ValueError( + "Invalid combination of arguments. samples_file is " + "None and features_file is None." + ) + + +@task(features_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node_using_features( + sample, n_features, y_s, n_classes, m_try, features_file, seed +): + features_mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) + random_state = RandomState(seed) + return _compute_split( + sample, n_features, y_s, n_classes, m_try, features_mmap, random_state + ) + + +@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): + features_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T + random_state = RandomState(seed) + return _compute_split( + sample, n_features, y_s, n_classes, m_try, features_mmap, random_state + ) + + +def _compute_split( + sample, n_features, y_s, n_classes, m_try, features_mmap, random_state +): + node_info = left_group = y_l = right_group = y_r = None + split_ended = False + tried_indices = [] + while not split_ended: + untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) + b_score = float_info.max + b_index = None + b_value = None + for index in index_selection: + feature = features_mmap[index] + score, value = test_split(sample, y_s, feature, n_classes) + if score < b_score: + b_score, b_value, b_index = score, value, index + groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) + left_group, y_l, right_group, y_r = groups + if left_group.size and right_group.size: + split_ended = True + node_info = _InnerNodeInfo(b_index, b_value) + else: + tried_indices.extend(list(index_selection)) + if len(tried_indices) == n_features: + split_ended = True + node_info = _compute_leaf_info(y_s, n_classes) + left_group = sample + y_l = y_s + right_group = np.array([], dtype=np.int64) + y_r = np.array([], dtype=y_s.dtype) + + return node_info, left_group, y_l, right_group, y_r + + +def _build_subtree_wrapper( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, 
+ sklearn_max, + random_state, + samples_file, + features_file, +): + seed = random_state.randint(np.iinfo(np.int32).max) + if features_file is not None: + return _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, + features_file, + ) + else: + return _build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, + ) + + +@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) +def _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, + features_file, +): + random_state = RandomState(seed) + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=features_file, + ) + + +@task(samples_file=FILE_IN, returns=_Node) +def _build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, +): + random_state = RandomState(seed) + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + random_state, + samples_file, + ) + + +def _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=None, + use_sklearn=True, +): + Node = _ClassificationNode if n_classes else _RegressionNode + SklearnDT = SklearnDTClassifier if n_classes else SklearnDTRegressor + if not sample.size: + return Node() + if features_file is not None: + mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) + else: + mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T + subtree = Node() + tree_traversal = [(subtree, sample, y_s, 0)] + while tree_traversal: + node, sample, y_s, depth = tree_traversal.pop() + if depth < max_depth: + if use_sklearn and n_features * len(sample) <= sklearn_max: + if max_depth == np.inf: + sklearn_max_depth = None + else: + sklearn_max_depth = max_depth - depth + dt = SklearnDT( + max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state, + ) + unique = np.unique( + sample, return_index=True, return_counts=True + ) + sample, new_indices, sample_weight = unique + x = _get_sample_attributes(samples_file, sample) + y_s = y_s[new_indices] + dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) + node.content = _SkTreeWrapper(dt) + else: + split = _compute_split( + sample, + n_features, + y_s, + n_classes, + m_try, + mmap, + random_state, + ) + node_info, left_group, y_l, right_group, y_r = split + node.content = node_info + if isinstance(node_info, _InnerNodeInfo): + node.left = Node() + node.right = Node() + tree_traversal.append( + (node.right, right_group, y_r, depth + 1) + ) + tree_traversal.append( + (node.left, left_group, y_l, depth + 1) + ) + else: + node.content = _compute_leaf_info(y_s, n_classes) + return subtree + + +@task(returns=list) +def _merge(*object_list): + return object_list + + +def _get_subtree_path(subtree_index, distr_depth): + if distr_depth == 0: + return "" + return bin(subtree_index)[2:].zfill(distr_depth) + + +def _get_predicted_indices(samples, tree, nodes_info, path): + idx_mask = np.full((len(samples),), True) + for direction in path: + node_info = nodes_info[tree.content] + if isinstance(node_info, _LeafInfo): + if direction == "1": + idx_mask[:] = 0 + else: + col = node_info.index + value 
= node_info.value + if direction == "0": + idx_mask[idx_mask] = samples[idx_mask, col] <= value + tree = tree.left + else: + idx_mask[idx_mask] = samples[idx_mask, col] > value + tree = tree.right + return idx_mask + + +@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _predict_branch( + row_blocks, tree, nodes_info, subtree_index, subtree, distr_depth +): + samples = Array._merge_blocks(row_blocks) + path = _get_subtree_path(subtree_index, distr_depth) + indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) + prediction = subtree.predict(samples[indices_mask]) + return indices_mask, prediction + + +@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _predict_branch_proba( + row_blocks, + tree, + nodes_info, + subtree_index, + subtree, + distr_depth, + n_classes, +): + samples = Array._merge_blocks(row_blocks) + path = _get_subtree_path(subtree_index, distr_depth) + indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) + prediction = subtree.predict_proba(samples[indices_mask], n_classes) + return indices_mask, prediction + + +@task(returns=list) +def _merge_branches(n_classes, *predictions, classification): + samples_len = len(predictions[0][0]) + if classification: + if n_classes is not None: # predict class + shape = (samples_len, n_classes) + dtype = np.float64 + else: # predict_proba + shape = (samples_len,) + dtype = np.int64 + else: # predict value + shape = (samples_len,) + dtype = np.float64 + + merged_prediction = np.empty(shape, dtype=dtype) + for selected, prediction in predictions: + merged_prediction[selected] = prediction + return merged_prediction diff --git a/dislib/commons/rf/_forest.py b/dislib/commons/rf/_forest.py new file mode 100644 index 00000000..e0f4561d --- /dev/null +++ b/dislib/commons/rf/_forest.py @@ -0,0 +1,486 @@ +import math +from collections import Counter + +import numpy as np +from pycompss.api.api import compss_wait_on +from pycompss.api.parameter import Type, COLLECTION_IN, Depth +from pycompss.api.task import task +from sklearn.base import BaseEstimator +from sklearn.utils import check_random_state + +from dislib.commons.rf._decision_tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, +) +from dislib.data.array import Array +from dislib.utils.base import _paired_partition +from ._data import transform_to_rf_dataset + + +class BaseRandomForest(BaseEstimator): + """Base class for distributed random forests. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def __init__( + self, + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + ): + self.n_estimators = n_estimators + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.hard_vote = hard_vote + self.random_state = random_state + + def fit(self, x, y): + """Fits the RandomForest. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. Internally, its dtype will be converted + to ``dtype=np.float32``. + y : ds-array, shape=(n_samples, 1) + The target values. 
+
+        Returns
+        -------
+        self : RandomForest
+
+        """
+        self.classes = None
+        self.trees = []
+
+        if self.hard_vote is not None:
+            # Classification
+            task = "classification"
+            Tree = DecisionTreeClassifier
+        else:
+            # Regression
+            task = "regression"
+            Tree = DecisionTreeRegressor
+
+        dataset = transform_to_rf_dataset(x, y, task)
+
+        n_features = dataset.get_n_features()
+        try_features = _resolve_try_features(self.try_features, n_features)
+        random_state = check_random_state(self.random_state)
+
+        self.classes = dataset.get_classes()
+
+        if self.distr_depth == "auto":
+            dataset.n_samples = compss_wait_on(dataset.get_n_samples())
+            distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4)
+            distr_depth = min(distr_depth, self.max_depth)
+        else:
+            distr_depth = self.distr_depth
+
+        for i in range(self.n_estimators):
+            tree = Tree(
+                try_features,
+                self.max_depth,
+                distr_depth,
+                self.sklearn_max,
+                bootstrap=True,
+                random_state=random_state,
+            )
+            self.trees.append(tree)
+
+        for tree in self.trees:
+            tree.fit(dataset)
+
+        return self
+
+    def predict(self, x):
+        """Predicts target classes or values using a fitted forest.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ds-array, shape=(n_samples, 1)
+            Predicted class labels or values for x.
+
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+        pred_blocks = []
+        if self.hard_vote is not None:
+            # Classification
+            if self.hard_vote:
+                for x_row in x._iterator(axis=0):
+                    tree_predictions = []
+                    for tree in self.trees:
+                        tree_predictions.append(tree.predict(x_row))
+                    pred_blocks.append(
+                        _hard_vote(self.classes, *tree_predictions)
+                    )
+            else:
+                for x_row in x._iterator(axis=0):
+                    tree_predictions = []
+                    for tree in self.trees:
+                        tree_predictions.append(tree.predict_proba(x_row))
+                    pred_blocks.append(
+                        _soft_vote(self.classes, *tree_predictions)
+                    )
+        else:
+            # Regression
+            for x_row in x._iterator(axis=0):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict(x_row))
+                pred_blocks.append(_join_predictions(*tree_predictions))
+
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
+        )
+
+        return y_pred
+
+    def score(self, x, y):
+        """Accuracy score for classification or $R^2$ score for regression.
+
+        For classification, returns the mean accuracy on the given test
+        data.
+
+        For regression, returns the coefficient of determination $R^2$ of
+        the prediction.
+        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
+        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
+        $v$ is the total sum of squares
+        `((y_true - y_true.mean()) ** 2).sum()`.
+        The best possible score is 1.0 and it can be negative
+        if the model is arbitrarily worse.
+        A constant model that always predicts the expected value of y,
+        disregarding the input features, would get an $R^2$ score of 0.0.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The test input samples.
+        y : ds-array, shape (n_samples, 1)
+            The true labels.
+
+        Returns
+        -------
+        score : float (as future object)
+            Fraction of correctly classified samples for classification
+            or coefficient of determination $R^2$ for regression.
+
+        """
+        assert self.trees is not None, "The random forest is not fitted."
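+        # The loops below only submit tasks: each partial-score task
+        # returns (correct, total) for classification or
+        # (u_partial, v_partial, y_avg, n_samples) for regression, and a
+        # single merge task reduces the tuples into the final score.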
+        partial_scores = []
+        if self.hard_vote is not None:
+            # Classification
+            if self.hard_vote:
+                for x_row, y_row in _paired_partition(x, y):
+                    tree_predictions = []
+                    for tree in self.trees:
+                        tree_predictions.append(tree.predict(x_row))
+                    subset_score = _hard_vote_score(
+                        y_row._blocks, self.classes, *tree_predictions
+                    )
+                    partial_scores.append(subset_score)
+            else:
+                for x_row, y_row in _paired_partition(x, y):
+                    tree_predictions = []
+                    for tree in self.trees:
+                        tree_predictions.append(tree.predict_proba(x_row))
+                    subset_score = _soft_vote_score(
+                        y_row._blocks, self.classes, *tree_predictions
+                    )
+                    partial_scores.append(subset_score)
+            score = _merge_classification_scores(*partial_scores)
+        else:
+            # Regression
+            for x_row, y_row in _paired_partition(x, y):
+                tree_predictions = []
+                for tree in self.trees:
+                    tree_predictions.append(tree.predict(x_row))
+                subset_score = _regression_score(
+                    y_row._blocks, *tree_predictions
+                )
+                partial_scores.append(subset_score)
+            score = _merge_regression_scores(*partial_scores)
+
+        return score
+
+
+class RandomForestClassifier(BaseRandomForest):
+    """A distributed random forest classifier.
+
+    Parameters
+    ----------
+    n_estimators : int, optional (default=10)
+        Number of trees to fit.
+    try_features : int, str or None, optional (default='sqrt')
+        The number of features to consider when looking for the best split:
+
+        - If "sqrt", then `try_features=sqrt(n_features)`.
+        - If "third", then `try_features=n_features // 3`.
+        - If None, then `try_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        effectively inspecting more than ``try_features`` features.
+    max_depth : int or np.inf, optional (default=np.inf)
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int or str, optional (default='auto')
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max : int or float, optional (default=1e8)
+        Maximum size (len(subsample)*n_features) of the arrays passed to
+        sklearn's DecisionTreeClassifier.fit(), which is called to fit
+        subtrees (subsamples) of our DecisionTreeClassifier. sklearn fit()
+        is used because it is faster, but it requires loading the data into
+        memory, which can cause memory problems for large datasets. This
+        parameter can be adjusted to fit the hardware capabilities.
+    hard_vote : bool, optional (default=False)
+        If True, it uses majority voting over the predict() result of the
+        decision tree predictions. If False, it takes the class with the
+        highest probability given by predict_proba(), which is an average
+        of the probabilities given by the decision trees.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    Attributes
+    ----------
+    classes : None or ndarray
+        Array of distinct classes, set at fit().
+    trees : list of DecisionTreeClassifier
+        List of the tree classifiers of this forest, populated at fit().
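+
+    Examples
+    --------
+    A minimal usage sketch (the values are illustrative only, and the
+    public import path ``dislib.classification`` is assumed):
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> from dislib.classification import RandomForestClassifier
+    >>> x = ds.array(np.array([[1, 2], [1, 4], [4, 2], [4, 4]]), (2, 2))
+    >>> y = ds.array(np.array([[0], [0], [1], [1]]), (2, 1))
+    >>> forest = RandomForestClassifier(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> y_pred = forest.predict(x).collect()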
+    """
+
+    def __init__(
+        self,
+        n_estimators=10,
+        try_features="sqrt",
+        max_depth=np.inf,
+        distr_depth="auto",
+        sklearn_max=1e8,
+        hard_vote=False,
+        random_state=None,
+    ):
+        super().__init__(
+            n_estimators,
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            hard_vote,
+            random_state,
+        )
+
+    def predict_proba(self, x):
+        """Predicts class probabilities using a fitted forest.
+
+        The probabilities are obtained as an average of the probabilities
+        of each decision tree.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        probabilities : ds-array, shape=(n_samples, n_classes)
+            Predicted probabilities for the samples to belong to each class.
+            The columns of the array correspond to the classes given at
+            self.classes.
+
+        """
+        assert self.trees is not None, "The random forest is not fitted."
+        prob_blocks = []
+        for x_row in x._iterator(axis=0):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict_proba(x_row))
+            prob_blocks.append([_join_predictions(*tree_predictions)])
+        self.classes = compss_wait_on(self.classes)
+        n_classes = len(self.classes)
+
+        probabilities = Array(
+            blocks=prob_blocks,
+            top_left_shape=(x._top_left_shape[0], n_classes),
+            reg_shape=(x._reg_shape[0], n_classes),
+            shape=(x.shape[0], n_classes),
+            sparse=False,
+        )
+        return probabilities
+
+
+class RandomForestRegressor(BaseRandomForest):
+    """A distributed random forest regressor.
+
+    Parameters
+    ----------
+    n_estimators : int, optional (default=10)
+        Number of trees to fit.
+    try_features : int, str or None, optional (default='sqrt')
+        The number of features to consider when looking for the best split:
+
+        - If "sqrt", then `try_features=sqrt(n_features)`.
+        - If "third", then `try_features=n_features // 3`.
+        - If None, then `try_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        effectively inspecting more than ``try_features`` features.
+    max_depth : int or np.inf, optional (default=np.inf)
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int or str, optional (default='auto')
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max : int or float, optional (default=1e8)
+        Maximum size (len(subsample)*n_features) of the arrays passed to
+        sklearn's DecisionTreeRegressor.fit(), which is called to fit
+        subtrees (subsamples) of our DecisionTreeRegressor. sklearn fit()
+        is used because it is faster, but it requires loading the data into
+        memory, which can cause memory problems for large datasets. This
+        parameter can be adjusted to fit the hardware capabilities.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    Attributes
+    ----------
+    trees : list of DecisionTreeRegressor
+        List of the tree regressors of this forest, populated at fit().
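+
+    Examples
+    --------
+    A minimal usage sketch (the values are illustrative only); score()
+    returns a future object that is resolved with compss_wait_on:
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> from dislib.regression import RandomForestRegressor
+    >>> from pycompss.api.api import compss_wait_on
+    >>> x = ds.array(np.array([[1.0], [2.0], [3.0], [4.0]]), (2, 1))
+    >>> y = ds.array(np.array([[2.0], [4.0], [6.0], [8.0]]), (2, 1))
+    >>> forest = RandomForestRegressor(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> r2 = compss_wait_on(forest.score(x, y))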
+    """
+
+    def __init__(
+        self,
+        n_estimators=10,
+        try_features="sqrt",
+        max_depth=np.inf,
+        distr_depth="auto",
+        sklearn_max=1e8,
+        random_state=None,
+    ):
+        hard_vote = None
+        super().__init__(
+            n_estimators,
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            hard_vote,
+            random_state,
+        )
+
+
+@task(returns=1)
+def _resolve_try_features(try_features, n_features):
+    if try_features is None:
+        return n_features
+    elif try_features == "sqrt":
+        return int(math.sqrt(n_features))
+    elif try_features == "third":
+        return max(1, n_features // 3)
+    else:
+        return int(try_features)
+
+
+@task(returns=1)
+def _join_predictions(*predictions):
+    aggregate = predictions[0]
+    for p in predictions[1:]:
+        aggregate += p
+    labels = aggregate / len(predictions)
+    return labels
+
+
+@task(returns=1)
+def _soft_vote(classes, *predictions):
+    aggregate = predictions[0]
+    for p in predictions[1:]:
+        aggregate += p
+    labels = classes[np.argmax(aggregate, axis=1)]
+    return labels
+
+
+@task(returns=1)
+def _hard_vote(classes, *predictions):
+    mode = np.empty((len(predictions[0]),), dtype=int)
+    for sample_i, votes in enumerate(zip(*predictions)):
+        mode[sample_i] = Counter(votes).most_common(1)[0][0]
+    labels = classes[mode]
+    return labels
+
+
+@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
+def _soft_vote_score(y_blocks, classes, *predictions):
+    real_labels = Array._merge_blocks(y_blocks).flatten()
+    aggregate = predictions[0]
+    for p in predictions[1:]:
+        aggregate += p
+    predicted_labels = classes[np.argmax(aggregate, axis=1)]
+    correct = np.count_nonzero(predicted_labels == real_labels)
+    return correct, len(real_labels)
+
+
+@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
+def _hard_vote_score(y_blocks, classes, *predictions):
+    real_labels = Array._merge_blocks(y_blocks).flatten()
+    mode = np.empty((len(predictions[0]),), dtype=int)
+    for sample_i, votes in enumerate(zip(*predictions)):
+        mode[sample_i] = Counter(votes).most_common(1)[0][0]
+    predicted_labels = classes[mode]
+    correct = np.count_nonzero(predicted_labels == real_labels)
+    return correct, len(real_labels)
+
+
+@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
+def _regression_score(y_blocks, *predictions):
+    y_true = Array._merge_blocks(y_blocks).flatten()
+    y_pred = np.mean(predictions, axis=0)
+    n_samples = y_true.shape[0]
+    y_avg = np.mean(y_true)
+    u_partial = np.sum(np.square(y_true - y_pred), axis=0)
+    v_partial = np.sum(np.square(y_true - y_avg), axis=0)
+    return u_partial, v_partial, y_avg, n_samples
+
+
+@task(returns=1)
+def _merge_classification_scores(*partial_scores):
+    correct = sum(subset_score[0] for subset_score in partial_scores)
+    total = sum(subset_score[1] for subset_score in partial_scores)
+    return correct / total
+
+
+@task(returns=1)
+def _merge_regression_scores(*partial_scores):
+    # u (the residual sum of squares) can be added across blocks directly.
+    # v (the total sum of squares) is computed per block around the block
+    # mean, so the blocks are pooled with the pairwise update of Chan et
+    # al.: the running mean is updated and v is corrected by the squared
+    # difference between the running mean and the block mean.
+    u = v = avg = n = 0
+    for u_p, v_p, avg_p, n_p in partial_scores:
+        u += u_p
+        delta = avg_p - avg
+        avg += delta * n_p / (n + n_p)
+        v += v_p + delta ** 2 * n * n_p / (n + n_p)
+        n += n_p
+
+    return 1 - u / v
diff --git a/dislib/commons/rf/_test_split.py b/dislib/commons/rf/_test_split.py
new file mode 100644
index 00000000..38b9015f
--- /dev/null
+++ b/dislib/commons/rf/_test_split.py
@@ -0,0 +1,59 @@
+from sys import float_info
+
+import numpy as np
+
+
+def criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated):
+    """
+    Maximizing the MSE or Gini gain is equivalent to minimizing
+    this proxy function.
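+
+    For classification, ``l_weight`` is the sum over classes of the squared
+    class counts of the left branch, which equals
+    ``l_length**2 * (1 - Gini(left))``, so ``l_weight / l_length`` is the
+    left term of the weighted Gini gain up to split-independent constants;
+    the same holds for the right branch. For regression,
+    ``l_weight / l_length`` is ``(sum of left targets)**2 / l_length``,
+    which is the split-dependent part of the variance reduction. In both
+    cases, dropping split-independent terms preserves the ordering of the
+    scores, which is all that is needed to select the best threshold.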
+    """
+    return -(l_weight / l_length + r_weight / r_length) * not_repeated
+
+
+def test_split(sample, y_s, feature, n_classes):
+    size = y_s.shape[0]
+    if size == 0:
+        return float_info.max, np.float64(np.inf)
+
+    f = feature[sample]
+    sort_indices = np.argsort(f)
+    y_sorted = y_s[sort_indices]
+    f_sorted = f[sort_indices]
+
+    # The threshold value must not be the value of a sample
+    not_repeated = np.empty(size, dtype=np.bool_)
+    not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1]
+    not_repeated[size - 1] = True
+
+    if n_classes is not None:  # Classification
+        l_freq = np.zeros((n_classes, size), dtype=np.int64)
+        l_freq[y_sorted, np.arange(size)] = 1
+
+        r_freq = np.zeros((n_classes, size), dtype=np.int64)
+        r_freq[:, 1:] = l_freq[:, :0:-1]
+
+        l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0)
+        r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1]
+
+    else:  # Regression
+        # Square of the sum of the y values of each branch
+        r_weight = np.zeros(size)
+        l_weight = np.square(np.cumsum(y_sorted, axis=-1))
+        r_weight[:-1] = np.square(np.cumsum(y_sorted[::-1], axis=-1)[-2::-1])
+
+    # Number of samples of each branch
+    l_length = np.arange(1, size + 1, dtype=np.int32)
+    r_length = np.arange(size - 1, -1, -1, dtype=np.int32)
+    r_length[size - 1] = 1  # Avoid div by zero, the right score is 0
+
+    scores = criteria_proxy(
+        l_weight, l_length, r_weight, r_length, not_repeated
+    )
+
+    min_index = size - np.argmin(scores[::-1]) - 1
+    if min_index + 1 == size:
+        b_value = np.float64(np.inf)
+    else:
+        b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2
+    return scores[min_index], b_value
diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py
index 4a222968..ecde22d8 100644
--- a/dislib/regression/__init__.py
+++ b/dislib/regression/__init__.py
@@ -1,5 +1,5 @@
 from dislib.regression.linear.base import LinearRegression
 from dislib.regression.lasso.base import Lasso
-from dislib.regression.rf.forest import RandomForestRegressor
+from dislib.commons.rf._forest import RandomForestRegressor

 __all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]

From 8c738dceafe86a2194f1c15cb0f1796870b668a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Mon, 26 Jul 2021 13:02:51 +0200
Subject: [PATCH 14/46] Removed RF from 'classification' and 'regression'

---
 dislib/classification/rf/__init__.py      |   0
 dislib/classification/rf/_data.py         | 315 ------------
 dislib/classification/rf/decision_tree.py | 520 --------------------
 dislib/classification/rf/forest.py        | 306 ------------
 dislib/classification/rf/test_split.py    |  50 --
 dislib/regression/rf/__init__.py          |   0
 dislib/regression/rf/_data.py             | 279 -----------
 dislib/regression/rf/decision_tree.py     | 564 ----------------------
 dislib/regression/rf/forest.py            | 236 ---------
 dislib/regression/rf/test_split.py        |  48 --
 10 files changed, 2318 deletions(-)
 delete mode 100644 dislib/classification/rf/__init__.py
 delete mode 100644 dislib/classification/rf/_data.py
 delete mode 100644 dislib/classification/rf/decision_tree.py
 delete mode 100644 dislib/classification/rf/forest.py
 delete mode 100644 dislib/classification/rf/test_split.py
 delete mode 100644 dislib/regression/rf/__init__.py
 delete mode 100644 dislib/regression/rf/_data.py
 delete mode 100644 dislib/regression/rf/decision_tree.py
 delete mode 100644 dislib/regression/rf/forest.py
 delete mode 100644 dislib/regression/rf/test_split.py

diff --git a/dislib/classification/rf/__init__.py b/dislib/classification/rf/__init__.py
deleted file mode 100644 index e69de29b..00000000 diff --git a/dislib/classification/rf/_data.py b/dislib/classification/rf/_data.py deleted file mode 100644 index 1a8da41f..00000000 --- a/dislib/classification/rf/_data.py +++ /dev/null @@ -1,315 +0,0 @@ -import tempfile - -import numpy as np -from numpy.lib import format -from pycompss.api.parameter import ( - FILE_IN, - FILE_INOUT, - COLLECTION_IN, - Depth, - Type, -) -from pycompss.api.task import task - -from dislib.data.array import Array - - -class RfDataset(object): - """Dataset format used by the fit() of the RandomForestClassifier. - - The RfDataset contains a file path for the samples and another one for the - labels. Optionally, a path can be provided for a transposed version of the - samples matrix, i.e., the features. - - Note: For a representation of a dataset distributed in multiple files, use - dislib.data.Dataset instead. - - Parameters - ---------- - samples_path : str - Path of the .npy file containing the 2-d array of samples. It can be a - pycompss.runtime.Future object. If so, self.n_samples and - self.n_features must be set manually (they can also be - pycompss.runtime.Future objects). - labels_path : str - Path of the .dat file containing the 1-d array of labels. It can be a - pycompss.runtime.Future object. - features_path : str, optional (default=None) - Path of the .npy file containing the 2-d array of samples transposed. - The array must be C-ordered. Providing this array may improve the - performance as it allows sequential access to the features. - - Attributes - ---------- - n_samples : int - The number of samples of the dataset. It can be a - pycompss.runtime.Future object. - n_features : int - The number of features of the dataset. It can be a - pycompss.runtime.Future object. - y_codes : ndarray - The codified array of labels for this RfDataset. The values are indices - of the array of classes, which contains the corresponding labels. The - dtype is np.int8. It can be a pycompss.runtime.Future object. - y_categories : ndarray - The array of classes for this RfDataset. The values are unique. It can - be a pycompss.runtime.Future object. - n_classes : int - The number of classes of this RfDataset. It can be a - pycompss.runtime.Future object. - - """ - - def __init__(self, samples_path, labels_path, features_path=None): - self.samples_path = samples_path - self.labels_path = labels_path - self.features_path = features_path - self.n_samples = None - self.n_features = None - - self.y_codes = None - self.y_categories = None - self.n_classes = None - - def get_n_samples(self): - """Gets the number of samples obtained from the samples file. - - Returns - ------- - n_samples : int - - Raises - ------ - AssertionError - If self.n_samples is None and self.samples_path is not a string. - ValueError - If invalid content is encountered in the samples file. - - """ - if self.n_samples is None: - assert isinstance(self.samples_path, str), ( - "self.n_samples must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) - shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") - self.n_samples, self.n_features = shape - return self.n_samples - - def get_n_features(self): - """Gets the number of features obtained from the samples file. - - Returns - ------- - n_features : int - - Raises - ------ - AssertionError - If self.n_features is None and self.samples_path is not a string. 
- ValueError - If invalid content is encountered in the samples file. - - """ - if self.n_features is None: - assert isinstance(self.samples_path, str), ( - "self.n_features must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) - shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") - self.n_samples, self.n_features = shape - return self.n_features - - def get_y_codes(self): - """Obtains the codified array of labels. - - Returns - ------- - y_codes : ndarray - - """ - if self.y_codes is None: - labels = _get_labels(self.labels_path) - self.y_codes, self.y_categories, self.n_classes = labels - return self.y_codes - - def get_classes(self): - """Obtains the array of label categories. - - Returns - ------- - y_categories : ndarray - - """ - if self.y_categories is None: - labels = _get_labels(self.labels_path) - self.y_codes, self.y_categories, self.n_classes = labels - return self.y_categories - - def get_n_classes(self): - """Obtains the number of classes. - - Returns - ------- - n_classes : int - - """ - if self.n_classes is None: - labels = _get_labels(self.labels_path) - self.y_codes, self.y_categories, self.n_classes = labels - return self.n_classes - - def validate_features_file(self): - """Validates the features file header information. - - Raises - ------ - ValueError - If the shape of the array in the features_file doesn't match this - class n_samples and n_features or if the array is in fortran order. - - """ - features_npy_file = _NpyFile(self.features_path) - shape = features_npy_file.get_shape() - fortran_order = features_npy_file.get_fortran_order() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from features_file.") - if (self.get_n_features(), self.get_n_samples()) != shape: - raise ValueError("Invalid dimensions for the features_file.") - if fortran_order: - raise ValueError("Fortran order not supported for features array.") - - -def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: - """Creates a RfDataset object from samples x and labels y. - - This function creates a dislib.classification.rf.data.RfDataset by saving - x and y in files. - - Parameters - ---------- - x : ds-array, shape = (n_samples, n_features) - The training input samples. - y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) - The target values. 
- - Returns - ------- - rf_dataset : dislib.classification.rf._data.RfDataset - - """ - n_samples = x.shape[0] - n_features = x.shape[1] - - samples_file = tempfile.NamedTemporaryFile( - mode="wb", prefix="tmp_rf_samples_", delete=False - ) - samples_path = samples_file.name - samples_file.close() - _allocate_samples_file(samples_path, n_samples, n_features) - - start_idx = 0 - row_blocks_iterator = x._iterator(axis=0) - top_row = next(row_blocks_iterator) - _fill_samples_file(samples_path, top_row._blocks, start_idx) - start_idx += x._top_left_shape[0] - for x_row in row_blocks_iterator: - _fill_samples_file(samples_path, x_row._blocks, start_idx) - start_idx += x._reg_shape[0] - - labels_file = tempfile.NamedTemporaryFile( - mode="w", prefix="tmp_rf_labels_", delete=False - ) - labels_path = labels_file.name - labels_file.close() - for y_row in y._iterator(axis=0): - _fill_labels_file(labels_path, y_row._blocks) - - rf_dataset = RfDataset(samples_path, labels_path) - rf_dataset.n_samples = n_samples - rf_dataset.n_features = n_features - return rf_dataset - - -class _NpyFile(object): - def __init__(self, path): - self.path = path - - self.shape = None - self.fortran_order = None - self.dtype = None - - def get_shape(self): - if self.shape is None: - self._read_header() - return self.shape - - def get_fortran_order(self): - if self.fortran_order is None: - self._read_header() - return self.fortran_order - - def get_dtype(self): - if self.dtype is None: - self._read_header() - return self.dtype - - def _read_header(self): - with open(self.path, "rb") as fp: - version = format.read_magic(fp) - try: - format._check_version(version) - except ValueError: - raise ValueError("Invalid file format.") - header_data = format._read_array_header(fp, version) - self.shape, self.fortran_order, self.dtype = header_data - - -@task(labels_path=FILE_IN, returns=3) -def _get_labels(labels_path): - y = np.genfromtxt(labels_path, dtype=None, encoding="utf-8") - categories, codes = np.unique(y, return_inverse=True) - return codes.astype(np.int8), categories, len(categories) - - -@task(returns=1) -def _get_samples_shape(subset): - return subset.samples.shape - - -@task(returns=3) -def _merge_shapes(*samples_shapes): - n_samples = 0 - n_features = samples_shapes[0][1] - for shape in samples_shapes: - n_samples += shape[0] - assert shape[1] == n_features, "Subsamples with different n_features." 
- return samples_shapes, n_samples, n_features - - -@task(samples_path=FILE_INOUT) -def _allocate_samples_file(samples_path, n_samples, n_features): - np.lib.format.open_memmap( - samples_path, - mode="w+", - dtype="float32", - shape=(int(n_samples), int(n_features)), - ) - - -@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) -def _fill_samples_file(samples_path, row_blocks, start_idx): - rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") - samples = np.lib.format.open_memmap(samples_path, mode="r+") - samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples - - -@task(labels_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) -def _fill_labels_file(labels_path, row_blocks): - rows_labels = Array._merge_blocks(row_blocks) - with open(labels_path, "at") as f: - np.savetxt(f, rows_labels, fmt="%s", encoding="utf-8") diff --git a/dislib/classification/rf/decision_tree.py b/dislib/classification/rf/decision_tree.py deleted file mode 100644 index 0725fcfa..00000000 --- a/dislib/classification/rf/decision_tree.py +++ /dev/null @@ -1,520 +0,0 @@ -from sys import float_info - -import numpy as np -from numpy.random.mtrand import RandomState -from pycompss.api.api import compss_delete_object -from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth -from pycompss.api.task import task -from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier - -from dislib.classification.rf.test_split import test_split -from dislib.data.array import Array - - -class DecisionTreeClassifier: - """A distributed decision tree classifier. - - Parameters - ---------- - try_features : int - The number of features to consider when looking for the best split. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires - to effectively inspect more than ``try_features`` features. - max_depth : int - The maximum depth of the tree. If np.inf, then nodes are expanded - until all leaves are pure. - distr_depth : int - Number of levels of the tree in which the nodes are split in a - distributed way. - bootstrap : bool - Randomly select n_instances samples with repetition (used in random - forests). - random_state : RandomState instance - The random number generator. - - Attributes - ---------- - n_features : int - The number of features of the dataset. It can be a - pycompss.runtime.Future object. - n_classes : int - The number of classes of this RfDataset. It can be a - pycompss.runtime.Future object. - tree : None or _Node - The root node of the tree after the tree is fitted. - nodes_info : None or list of _InnerNodeInfo and _LeafInfo - List of the node information for the nodes of the tree in the same - order as obtained in the fit() method, up to ``distr_depth`` depth. - After fit(), it is a pycompss.runtime.Future object. - subtrees : None or list of _Node - List of subtrees of the tree at ``distr_depth`` depth obtained in the - fit() method. After fit(), it is a list of pycompss.runtime.Future - objects. - - Methods - ------- - fit(dataset) - Fits the DecisionTreeClassifier. - predict(x_row) - Predicts classes for the given samples using a fitted tree. - predict_proba(x_row) - Predicts class probabilities for the given smaples using a fitted tree. 
- - """ - - def __init__(self, try_features, max_depth, distr_depth, sklearn_max, - bootstrap, random_state): - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.bootstrap = bootstrap - self.random_state = random_state - - self.n_features = None - self.n_classes = None - - self.tree = None - self.nodes_info = None - self.subtrees = None - - def fit(self, dataset): - """Fits the DecisionTreeClassifier. - - Parameters - ---------- - dataset : dislib.classification.rf._data.RfDataset - - """ - - self.n_features = dataset.get_n_features() - self.n_classes = dataset.get_n_classes() - samples_path = dataset.samples_path - features_path = dataset.features_path - n_samples = dataset.get_n_samples() - y_codes = dataset.get_y_codes() - - seed = self.random_state.randint(np.iinfo(np.int32).max) - - sample, y_s = _sample_selection(n_samples, y_codes, self.bootstrap, - seed) - - self.tree = _Node() - self.nodes_info = [] - self.subtrees = [] - tree_traversal = [(self.tree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < self.distr_depth: - split = _split_node_wrapper(sample, self.n_features, y_s, - self.n_classes, self.try_features, - self.random_state, - samples_file=samples_path, - features_file=features_path) - node_info, left_group, y_l, right_group, y_r = split - compss_delete_object(sample) - compss_delete_object(y_s) - node.content = len(self.nodes_info) - self.nodes_info.append(node_info) - node.left = _Node() - node.right = _Node() - depth = depth + 1 - tree_traversal.append((node.right, right_group, y_r, depth)) - tree_traversal.append((node.left, left_group, y_l, depth)) - else: - subtree = _build_subtree_wrapper(sample, y_s, self.n_features, - self.max_depth - depth, - self.n_classes, - self.try_features, - self.sklearn_max, - self.random_state, - samples_path, features_path) - node.content = len(self.subtrees) - self.subtrees.append(subtree) - compss_delete_object(sample) - compss_delete_object(y_s) - self.nodes_info = _merge(*self.nodes_info) - - def predict(self, x_row): - """Predicts classes for the given samples using a fitted tree. - - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted : ndarray - An array with the predicted classes for the given samples. The - values are codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can - be a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, 'The decision tree is not fitted.' - - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch(x_row._blocks, self.tree, self.nodes_info, - i, subtree, self.distr_depth) - branch_predictions.append(pred) - return _merge_branches(None, *branch_predictions) - - def predict_proba(self, x_row): - """Predicts class probabilities for a row block using a fitted tree. - - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted_proba : ndarray - An array with the predicted probabilities for the given samples. - The shape is (len(subset.samples), self.n_classes), with the index - of the column being codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can be - a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, 'The decision tree is not fitted.' 
- - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch_proba(x_row._blocks, self.tree, - self.nodes_info, i, subtree, - self.distr_depth, self.n_classes) - branch_predictions.append(pred) - return _merge_branches(self.n_classes, *branch_predictions) - - -class _Node: - - def __init__(self): - self.content = None - self.left = None - self.right = None - - def predict(self, sample): - node_content = self.content - if isinstance(node_content, _LeafInfo): - return np.full((len(sample),), node_content.mode) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - return node_content.sk_tree.predict(sample) - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample),), dtype=np.int64) - left_mask = sample[:, node_content.index] <= node_content.value - pred[left_mask] = self.left.predict(sample[left_mask]) - pred[~left_mask] = self.right.predict(sample[~left_mask]) - return pred - assert len(sample) == 0, 'Type not supported' - return np.empty((0,), dtype=np.int64) - - def predict_proba(self, sample, n_classes): - node_content = self.content - if isinstance(node_content, _LeafInfo): - single_pred = node_content.frequencies / node_content.size - return np.tile(single_pred, (len(sample), 1)) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - sk_tree_pred = node_content.sk_tree.predict_proba(sample) - pred = np.zeros((len(sample), n_classes), dtype=np.float64) - pred[:, node_content.sk_tree.classes_] = sk_tree_pred - return pred - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample), n_classes), dtype=np.float64) - l_msk = sample[:, node_content.index] <= node_content.value - pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) - pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) - return pred - assert len(sample) == 0, 'Type not supported' - return np.empty((0, n_classes), dtype=np.float64) - - -class _InnerNodeInfo: - def __init__(self, index=None, value=None): - self.index = index - self.value = value - - -class _LeafInfo: - def __init__(self, size=None, frequencies=None, mode=None): - self.size = size - self.frequencies = frequencies - self.mode = mode - - -class _SkTreeWrapper: - def __init__(self, tree): - self.sk_tree = tree - self.classes = tree.classes_ - - -def _get_sample_attributes(samples_file, indices): - samples_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False) - x = samples_mmap[indices] - return x - - -def _get_feature_mmap(features_file, i): - return _get_features_mmap(features_file)[i] - - -def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode='r', allow_pickle=False) - - -@task(priority=True, returns=2) -def _sample_selection(n_samples, y_codes, bootstrap, seed): - if bootstrap: - random_state = RandomState(seed) - selection = random_state.choice(n_samples, size=n_samples, - replace=True) - selection.sort() - return selection, y_codes[selection] - else: - return np.arange(n_samples), y_codes - - -def _feature_selection(untried_indices, m_try, random_state): - selection_len = min(m_try, len(untried_indices)) - return random_state.choice(untried_indices, size=selection_len, - replace=False) - - -def _get_groups(sample, y_s, features_mmap, index, value): - if index is None: - empty_sample = np.array([], dtype=np.int64) - empty_labels = np.array([], dtype=np.int8) - return sample, y_s, empty_sample, empty_labels - feature = features_mmap[index][sample] - mask = feature < value - left = sample[mask] - 
right = sample[~mask] - y_l = y_s[mask] - y_r = y_s[~mask] - return left, y_l, right, y_r - - -def _compute_leaf_info(y_s, n_classes): - frequencies = np.bincount(y_s, minlength=n_classes) - mode = np.argmax(frequencies) - return _LeafInfo(len(y_s), frequencies, mode) - - -def _split_node_wrapper(sample, n_features, y_s, n_classes, m_try, - random_state, samples_file=None, features_file=None): - seed = random_state.randint(np.iinfo(np.int32).max) - - if features_file is not None: - return _split_node_using_features(sample, n_features, y_s, n_classes, - m_try, features_file, seed) - elif samples_file is not None: - return _split_node(sample, n_features, y_s, n_classes, m_try, - samples_file, seed) - else: - raise ValueError('Invalid combination of arguments. samples_file is ' - 'None and features_file is None.') - - -@task(features_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node_using_features(sample, n_features, y_s, n_classes, m_try, - features_file, seed): - features_mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) - random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) - - -@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): - features_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T - random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) - - -def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, - random_state): - node_info = left_group = y_l = right_group = y_r = None - split_ended = False - tried_indices = [] - while not split_ended: - untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) - index_selection = _feature_selection(untried_indices, m_try, - random_state) - b_score = float_info.max - b_index = None - b_value = None - for index in index_selection: - feature = features_mmap[index] - score, value = test_split(sample, y_s, feature, n_classes) - if score < b_score: - b_score, b_value, b_index = score, value, index - groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) - left_group, y_l, right_group, y_r = groups - if left_group.size and right_group.size: - split_ended = True - node_info = _InnerNodeInfo(b_index, b_value) - else: - tried_indices.extend(list(index_selection)) - if len(tried_indices) == n_features: - split_ended = True - node_info = _compute_leaf_info(y_s, n_classes) - left_group = sample - y_l = y_s - right_group = np.array([], dtype=np.int64) - y_r = np.array([], dtype=np.int8) - - return node_info, left_group, y_l, right_group, y_r - - -def _build_subtree_wrapper(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file): - seed = random_state.randint(np.iinfo(np.int32).max) - if features_file is not None: - return _build_subtree_using_features(sample, y_s, n_features, - max_depth, n_classes, m_try, - sklearn_max, seed, samples_file, - features_file) - else: - return _build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, seed, samples_file) - - -@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) -def _build_subtree_using_features(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, seed, - samples_file, features_file): - random_state = RandomState(seed) - return _compute_build_subtree(sample, 
y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file, features_file=features_file) - - -@task(samples_file=FILE_IN, returns=_Node) -def _build_subtree(sample, y_s, n_features, max_depth, n_classes, m_try, - sklearn_max, seed, samples_file): - random_state = RandomState(seed) - return _compute_build_subtree(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file) - - -def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file=None, use_sklearn=True): - if not sample.size: - return _Node() - if features_file is not None: - mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) - else: - mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T - subtree = _Node() - tree_traversal = [(subtree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < max_depth: - if use_sklearn and n_features * len(sample) <= sklearn_max: - if max_depth == np.inf: - sklearn_max_depth = None - else: - sklearn_max_depth = max_depth - depth - dt = SklearnDTClassifier(max_features=m_try, - max_depth=sklearn_max_depth, - random_state=random_state) - unique = np.unique(sample, return_index=True, - return_counts=True) - sample, new_indices, sample_weight = unique - x = _get_sample_attributes(samples_file, sample) - y_s = y_s[new_indices] - dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) - node.content = _SkTreeWrapper(dt) - else: - split = _compute_split(sample, n_features, y_s, n_classes, - m_try, mmap, random_state) - node_info, left_group, y_l, right_group, y_r = split - node.content = node_info - if isinstance(node_info, _InnerNodeInfo): - node.left = _Node() - node.right = _Node() - tree_traversal.append((node.right, right_group, y_r, - depth + 1)) - tree_traversal.append((node.left, left_group, y_l, - depth + 1)) - else: - node.content = _compute_leaf_info(y_s, n_classes) - return subtree - - -@task(returns=list) -def _merge(*object_list): - return object_list - - -def _get_subtree_path(subtree_index, distr_depth): - if distr_depth == 0: - return '' - return bin(subtree_index)[2:].zfill(distr_depth) - - -def _get_predicted_indices(samples, tree, nodes_info, path): - idx_mask = np.full((len(samples),), True) - for direction in path: - node_info = nodes_info[tree.content] - if isinstance(node_info, _LeafInfo): - if direction == '1': - idx_mask[:] = 0 - else: - col = node_info.index - value = node_info.value - if direction == '0': - idx_mask[idx_mask] = samples[idx_mask, col] <= value - tree = tree.left - else: - idx_mask[idx_mask] = samples[idx_mask, col] > value - tree = tree.right - return idx_mask - - -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = subtree.predict(samples[indices_mask]) - return indices_mask, prediction - - -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch_proba(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth, n_classes): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = 
subtree.predict_proba(samples[indices_mask], n_classes) - return indices_mask, prediction - - -@task(returns=list) -def _merge_branches(n_classes, *predictions): - samples_len = len(predictions[0][0]) - if n_classes is not None: # predict - shape = (samples_len, n_classes) - dtype = np.float64 - else: # predict_proba - shape = (samples_len,) - dtype = np.int64 - merged_prediction = np.empty(shape, dtype=dtype) - for selected, prediction in predictions: - merged_prediction[selected] = prediction - return merged_prediction diff --git a/dislib/classification/rf/forest.py b/dislib/classification/rf/forest.py deleted file mode 100644 index 8f6c0f2a..00000000 --- a/dislib/classification/rf/forest.py +++ /dev/null @@ -1,306 +0,0 @@ -import math -from collections import Counter - -import numpy as np -from pycompss.api.api import compss_wait_on -from pycompss.api.parameter import Type, COLLECTION_IN, Depth -from pycompss.api.task import task -from sklearn.base import BaseEstimator -from sklearn.utils import check_random_state - -from dislib.classification.rf.decision_tree import DecisionTreeClassifier -from dislib.data.array import Array -from dislib.utils.base import _paired_partition -from dislib.classification.rf._data import transform_to_rf_dataset - - -class RandomForestClassifier(BaseEstimator): - """A distributed random forest classifier. - - Parameters - ---------- - n_estimators : int, optional (default=10) - Number of trees to fit. - try_features : int, str or None, optional (default='sqrt') - The number of features to consider when looking for the best split: - - - If "sqrt", then `try_features=sqrt(n_features)`. - - If "third", then `try_features=n_features // 3`. - - If None, then `try_features=n_features`. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires - to effectively inspect more than ``try_features`` features. - max_depth : int or np.inf, optional (default=np.inf) - The maximum depth of the tree. If np.inf, then nodes are expanded - until all leaves are pure. - distr_depth : int or str, optional (default='auto') - Number of levels of the tree in which the nodes are split in a - distributed way. - sklearn_max: int or float, optional (default=1e8) - Maximum size (len(subsample)*n_features) of the arrays passed to - sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees - (subsamples) of our DecisionTreeClassifier. sklearn fit() is used - because it's faster, but requires loading the data to memory, which can - cause memory problems for large datasets. This parameter can be - adjusted to fit the hardware capabilities. - hard_vote : bool, optional (default=False) - If True, it uses majority voting over the predict() result of the - decision tree predictions. If False, it takes the class with the higher - probability given by predict_proba(), which is an average of the - probabilities given by the decision trees. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - classes : None or ndarray - Array of distinct classes, set at fit(). - trees : list of DecisionTreeClassifier - List of the tree classifiers of this forest, populated at fit(). 
- """ - - def __init__(self, - n_estimators=10, - try_features='sqrt', - max_depth=np.inf, - distr_depth='auto', - sklearn_max=1e8, - hard_vote=False, - random_state=None): - self.n_estimators = n_estimators - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.hard_vote = hard_vote - self.random_state = random_state - - def fit(self, x, y): - """Fits the RandomForestClassifier. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The training input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. - y : ds-array, shape=(n_samples, 1) - The target values. - - Returns - ------- - self : RandomForestClassifier - - """ - self.classes = None - self.trees = [] - - dataset = transform_to_rf_dataset(x, y) - - n_features = dataset.get_n_features() - try_features = _resolve_try_features(self.try_features, n_features) - random_state = check_random_state(self.random_state) - - self.classes = dataset.get_classes() - - if self.distr_depth == 'auto': - dataset.n_samples = compss_wait_on(dataset.get_n_samples()) - distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) - distr_depth = min(distr_depth, self.max_depth) - else: - distr_depth = self.distr_depth - - for i in range(self.n_estimators): - tree = DecisionTreeClassifier(try_features, self.max_depth, - distr_depth, self.sklearn_max, - bootstrap=True, - random_state=random_state) - self.trees.append(tree) - - for tree in self.trees: - tree.fit(dataset) - - return self - - def predict_proba(self, x): - """Predicts class probabilities using a fitted forest. - - The probabilities are obtained as an average of the probabilities of - each decision tree. - - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - probabilities : ds-array, shape=(n_samples, n_classes) - Predicted probabilities for the samples to belong to each class. - The columns of the array correspond to the classes given at - self.classes. - - """ - assert self.trees is not None, 'The random forest is not fitted.' - prob_blocks = [] - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - prob_blocks.append([_join_predictions(*tree_predictions)]) - self.classes = compss_wait_on(self.classes) - n_classes = len(self.classes) - - probabilities = Array(blocks=prob_blocks, - top_left_shape=(x._top_left_shape[0], n_classes), - reg_shape=(x._reg_shape[0], n_classes), - shape=(x.shape[0], n_classes), sparse=False) - return probabilities - - def predict(self, x): - """Predicts classes using a fitted forest. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - y_pred : ds-array, shape=(n_samples, 1) - Predicted class labels for x. - - """ - assert self.trees is not None, 'The random forest is not fitted.' 
- pred_blocks = [] - if self.hard_vote: - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - pred_blocks.append(_hard_vote(self.classes, *tree_predictions)) - else: - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - pred_blocks.append(_soft_vote(self.classes, *tree_predictions)) - - y_pred = Array(blocks=[pred_blocks], - top_left_shape=(x._top_left_shape[0], 1), - reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1), - sparse=False) - - return y_pred - - def score(self, x, y): - """Accuracy classification score. - - Returns the mean accuracy on the given test data. - - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The training input samples. - y : ds-array, shape (n_samples, 1) - The true labels. - - Returns - ------- - score : float (as future object) - Fraction of correctly classified samples. - - """ - assert self.trees is not None, 'The random forest is not fitted.' - partial_scores = [] - if self.hard_vote: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - subset_score = _hard_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) - else: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - subset_score = _soft_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) - - return _merge_scores(*partial_scores) - - -@task(returns=1) -def _resolve_try_features(try_features, n_features): - if try_features is None: - return n_features - elif try_features == 'sqrt': - return int(math.sqrt(n_features)) - elif try_features == 'third': - return max(1, n_features // 3) - else: - return int(try_features) - - -@task(returns=1) -def _join_predictions(*predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - labels = aggregate / len(predictions) - return labels - - -@task(returns=1) -def _soft_vote(classes, *predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - labels = classes[np.argmax(aggregate, axis=1)] - return labels - - -@task(returns=1) -def _hard_vote(classes, *predictions): - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - labels = classes[mode] - return labels - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _soft_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - predicted_labels = classes[np.argmax(aggregate, axis=1)] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _hard_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - predicted_labels = classes[mode] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) - - 
-@task(returns=1) -def _merge_scores(*partial_scores): - correct = sum(subset_score[0] for subset_score in partial_scores) - total = sum(subset_score[1] for subset_score in partial_scores) - return correct / total diff --git a/dislib/classification/rf/test_split.py b/dislib/classification/rf/test_split.py deleted file mode 100644 index 70922783..00000000 --- a/dislib/classification/rf/test_split.py +++ /dev/null @@ -1,50 +0,0 @@ -from sys import float_info - -import numpy as np - - -def gini_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): - """ - Maximizing the Gini gain is equivalent to minimizing this proxy function. - - """ - return -(l_weight / l_length + r_weight / r_length) * not_repeated - - -def test_split(sample, y_s, feature, n_classes): - size = y_s.shape[0] - if size == 0: - return float_info.max, np.float64(np.inf) - - f = feature[sample] - sort_indices = np.argsort(f) - y_sorted = y_s[sort_indices] - f_sorted = f[sort_indices] - - not_repeated = np.empty(size, dtype=np.bool_) - not_repeated[0: size - 1] = (f_sorted[1:] != f_sorted[:-1]) - not_repeated[size - 1] = True - - l_freq = np.zeros((n_classes, size), dtype=np.int64) - l_freq[y_sorted, np.arange(size)] = 1 - - r_freq = np.zeros((n_classes, size), dtype=np.int64) - r_freq[:, 1:] = l_freq[:, :0:-1] - - l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0) - r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1] - - l_length = np.arange(1, size + 1, dtype=np.int32) - r_length = np.arange(size - 1, -1, -1, dtype=np.int32) - r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways - - scores = gini_criteria_proxy(l_weight, l_length, r_weight, r_length, - not_repeated) - - min_index = size - np.argmin(scores[::-1]) - 1 - - if min_index + 1 == size: - b_value = np.float64(np.inf) - else: - b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2 - return scores[min_index], b_value diff --git a/dislib/regression/rf/__init__.py b/dislib/regression/rf/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dislib/regression/rf/_data.py b/dislib/regression/rf/_data.py deleted file mode 100644 index 360f705d..00000000 --- a/dislib/regression/rf/_data.py +++ /dev/null @@ -1,279 +0,0 @@ -import tempfile - -import numpy as np -from numpy.lib import format -from pycompss.api.parameter import ( - FILE_IN, - FILE_INOUT, - COLLECTION_IN, - Depth, - Type, -) -from pycompss.api.task import task - -from dislib.data.array import Array - - -class RfDataset(object): - """Dataset format used by the fit() of the RandomForestRegressor. - - The RfDataset contains a file path for the samples and another one for the - targets. Optionally, a path can be provided for a transposed version of the - samples matrix, i.e., the features. - - Note: For a representation of a dataset distributed in multiple files, use - dislib.data.Dataset instead. - - Parameters - ---------- - samples_path : str - Path of the .npy file containing the 2-d array of samples. It can be a - pycompss.runtime.Future object. If so, self.n_samples and - self.n_features must be set manually (they can also be - pycompss.runtime.Future objects). - targets_path : str - Path of the .dat file containing the 1-d array of targets. It can be a - pycompss.runtime.Future object. - features_path : str, optional (default=None) - Path of the .npy file containing the 2-d array of samples transposed. - The array must be C-ordered. 
Providing this array may improve the - performance as it allows sequential access to the features. - - Attributes - ---------- - n_samples : int - The number of samples of the dataset. It can be a - pycompss.runtime.Future object. - n_features : int - The number of features of the dataset. It can be a - pycompss.runtime.Future object. - y_targets : ndarray - The array of targets for this RfDataset. It can be a - pycompss.runtime.Future object. - - """ - - def __init__(self, samples_path, targets_path, features_path=None): - self.samples_path = samples_path - self.targets_path = targets_path - self.features_path = features_path - self.n_samples = None - self.n_features = None - - self.y_targets = None - - def get_n_samples(self): - """Gets the number of samples obtained from the samples file. - - Returns - ------- - n_samples : int - - Raises - ------ - AssertionError - If self.n_samples is None and self.samples_path is not a string. - ValueError - If invalid content is encountered in the samples file. - - """ - if self.n_samples is None: - assert isinstance(self.samples_path, str), ( - "self.n_samples must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) - shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") - self.n_samples, self.n_features = shape - return self.n_samples - - def get_n_features(self): - """Gets the number of features obtained from the samples file. - - Returns - ------- - n_features : int - - Raises - ------ - AssertionError - If self.n_features is None and self.samples_path is not a string. - ValueError - If invalid content is encountered in the samples file. - - """ - if self.n_features is None: - assert isinstance(self.samples_path, str), ( - "self.n_features must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) - shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") - self.n_samples, self.n_features = shape - return self.n_features - - def get_y_targets(self): - """Obtains the array of targets. - - Returns - ------- - y_targets : ndarray - - """ - if self.y_targets is None: - targets = _get_targets(self.targets_path) - self.y_targets = targets - return self.y_targets - - def validate_features_file(self): - """Validates the features file header information. - - Raises - ------ - ValueError - If the shape of the array in the features_file doesn't match this - class n_samples and n_features or if the array is in fortran order. - - """ - features_npy_file = _NpyFile(self.features_path) - shape = features_npy_file.get_shape() - fortran_order = features_npy_file.get_fortran_order() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from features_file.") - if (self.get_n_features(), self.get_n_samples()) != shape: - raise ValueError("Invalid dimensions for the features_file.") - if fortran_order: - raise ValueError("Fortran order not supported for features array.") - - -def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: - """Creates a RfDataset object from samples x and targets y. - - This function creates a dislib.regression.rf.data.RfDataset by saving - x and y in files. - - Parameters - ---------- - x : ds-array, shape = (n_samples, n_features) - The training input samples. - y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) - The target values. 
- - Returns - ------- - rf_dataset : dislib.regression.rf._data.RfDataset - - """ - n_samples = x.shape[0] - n_features = x.shape[1] - - samples_file = tempfile.NamedTemporaryFile( - mode="wb", prefix="tmp_rf_samples_", delete=False - ) - samples_path = samples_file.name - samples_file.close() - _allocate_samples_file(samples_path, n_samples, n_features) - - start_idx = 0 - row_blocks_iterator = x._iterator(axis=0) - top_row = next(row_blocks_iterator) - _fill_samples_file(samples_path, top_row._blocks, start_idx) - start_idx += x._top_left_shape[0] - for x_row in row_blocks_iterator: - _fill_samples_file(samples_path, x_row._blocks, start_idx) - start_idx += x._reg_shape[0] - - targets_file = tempfile.NamedTemporaryFile( - mode="w", prefix="tmp_rf_targets_", delete=False - ) - targets_path = targets_file.name - targets_file.close() - for y_row in y._iterator(axis=0): - _fill_targets_file(targets_path, y_row._blocks) - - rf_dataset = RfDataset(samples_path, targets_path) - rf_dataset.n_samples = n_samples - rf_dataset.n_features = n_features - return rf_dataset - - -class _NpyFile(object): - def __init__(self, path): - self.path = path - - self.shape = None - self.fortran_order = None - self.dtype = None - - def get_shape(self): - if self.shape is None: - self._read_header() - return self.shape - - def get_fortran_order(self): - if self.fortran_order is None: - self._read_header() - return self.fortran_order - - def get_dtype(self): - if self.dtype is None: - self._read_header() - return self.dtype - - def _read_header(self): - with open(self.path, "rb") as fp: - version = format.read_magic(fp) - try: - format._check_version(version) - except ValueError: - raise ValueError("Invalid file format.") - header_data = format._read_array_header(fp, version) - self.shape, self.fortran_order, self.dtype = header_data - - -@task(targets_path=FILE_IN, returns=1) -def _get_targets(targets_path): - y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8") - return y - - -@task(returns=1) -def _get_samples_shape(subset): - return subset.samples.shape - - -@task(returns=3) -def _merge_shapes(*samples_shapes): - n_samples = 0 - n_features = samples_shapes[0][1] - for shape in samples_shapes: - n_samples += shape[0] - assert shape[1] == n_features, "Subsamples with different n_features." 
- return samples_shapes, n_samples, n_features - - -@task(samples_path=FILE_INOUT) -def _allocate_samples_file(samples_path, n_samples, n_features): - np.lib.format.open_memmap( - samples_path, - mode="w+", - dtype="float32", - shape=(int(n_samples), int(n_features)), - ) - - -@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) -def _fill_samples_file(samples_path, row_blocks, start_idx): - rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") - samples = np.lib.format.open_memmap(samples_path, mode="r+") - samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples - - -@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) -def _fill_targets_file(targets_path, row_blocks): - rows_targets = Array._merge_blocks(row_blocks) - with open(targets_path, "at") as f: - np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8") diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py deleted file mode 100644 index 82730a5d..00000000 --- a/dislib/regression/rf/decision_tree.py +++ /dev/null @@ -1,564 +0,0 @@ -from sys import float_info - -import numpy as np -from numpy.random.mtrand import RandomState -from pycompss.api.api import compss_delete_object -from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth -from pycompss.api.task import task -from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor - -from dislib.regression.rf.test_split import test_split -from dislib.data.array import Array - - -class DecisionTreeRegressor: - """A distributed decision tree regressor. - - Parameters - ---------- - try_features : int - The number of features to consider when looking for the best split. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires - to effectively inspect more than ``try_features`` features. - max_depth : int - The maximum depth of the tree. If np.inf, then nodes are expanded - until all leaves are pure. - distr_depth : int - Number of levels of the tree in which the nodes are split in a - distributed way. - bootstrap : bool - Randomly select n_instances samples with repetition (used in random - forests). - random_state : RandomState instance - The random number generator. - - Attributes - ---------- - n_features : int - The number of features of the dataset. It can be a - pycompss.runtime.Future object. - tree : None or _Node - The root node of the tree after the tree is fitted. - nodes_info : None or list of _InnerNodeInfo and _LeafInfo - List of the node information for the nodes of the tree in the same - order as obtained in the fit() method, up to ``distr_depth`` depth. - After fit(), it is a pycompss.runtime.Future object. - subtrees : None or list of _Node - List of subtrees of the tree at ``distr_depth`` depth obtained in the - fit() method. After fit(), it is a list of pycompss.runtime.Future - objects. - - Methods - ------- - fit(dataset) - Fits the DecisionTreeRegressor. - predict(x_row) - Predicts classes for the given samples using a fitted tree. - predict_proba(x_row) - Predicts class probabilities for the given smaples using a fitted tree. 
- - """ - - def __init__( - self, - try_features, - max_depth, - distr_depth, - sklearn_max, - bootstrap, - random_state, - ): - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.bootstrap = bootstrap - self.random_state = random_state - - self.n_features = None - - self.tree = None - self.nodes_info = None - self.subtrees = None - - def fit(self, dataset): - """Fits the DecisionTreeRegressor. - - Parameters - ---------- - dataset : dislib.classification.rf._data.RfDataset - - """ - - self.n_features = dataset.get_n_features() - samples_path = dataset.samples_path - features_path = dataset.features_path - n_samples = dataset.get_n_samples() - y_targets = dataset.get_y_targets() - - seed = self.random_state.randint(np.iinfo(np.int32).max) - - sample, y_s = _sample_selection( - n_samples, y_targets, self.bootstrap, seed - ) - - self.tree = _Node() - self.nodes_info = [] - self.subtrees = [] - tree_traversal = [(self.tree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < self.distr_depth: - split = _split_node_wrapper( - sample, - self.n_features, - y_s, - self.try_features, - self.random_state, - samples_file=samples_path, - features_file=features_path, - ) - node_info, left_group, y_l, right_group, y_r = split - compss_delete_object(sample) - compss_delete_object(y_s) - node.content = len(self.nodes_info) - self.nodes_info.append(node_info) - node.left = _Node() - node.right = _Node() - depth = depth + 1 - tree_traversal.append((node.right, right_group, y_r, depth)) - tree_traversal.append((node.left, left_group, y_l, depth)) - else: - subtree = _build_subtree_wrapper( - sample, - y_s, - self.n_features, - self.max_depth - depth, - self.try_features, - self.sklearn_max, - self.random_state, - samples_path, - features_path, - ) - node.content = len(self.subtrees) - self.subtrees.append(subtree) - compss_delete_object(sample) - compss_delete_object(y_s) - self.nodes_info = _merge(*self.nodes_info) - - def predict(self, x_row): - """Predicts classes for the given samples using a fitted tree. - - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted : ndarray - An array with the predicted classes for the given samples. The - values are codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can - be a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, "The decision tree is not fitted." 
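The loop that follows gathers one partial prediction per subtree; each subtree fitted at depth ``distr_depth`` is addressed by a binary path string from the root (see the _get_subtree_path helper further down). For clarity, a self-contained sketch of that encoding, mirroring the helper:

    # Same encoding as _get_subtree_path below:
    # "0" = go left, "1" = go right, read from the root downwards.
    def subtree_path(subtree_index, distr_depth):
        if distr_depth == 0:
            return ""
        return bin(subtree_index)[2:].zfill(distr_depth)

    print(subtree_path(5, 3))  # "101": right, left, right
    print(subtree_path(0, 2))  # "00": the leftmost subtree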
- - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch( - x_row._blocks, - self.tree, - self.nodes_info, - i, - subtree, - self.distr_depth, - ) - branch_predictions.append(pred) - return _merge_branches(None, *branch_predictions) - - -class _Node: - def __init__(self): - self.content = None - self.left = None - self.right = None - - def predict(self, sample): - node_content = self.content - if isinstance(node_content, _LeafInfo): - return np.full((len(sample),), node_content.mean) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - return node_content.sk_tree.predict(sample) - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample),), dtype=np.float64) - left_mask = sample[:, node_content.index] <= node_content.value - pred[left_mask] = self.left.predict(sample[left_mask]) - pred[~left_mask] = self.right.predict(sample[~left_mask]) - return pred - assert len(sample) == 0, "Type not supported" - return np.empty((0,), dtype=np.float64) - - -class _InnerNodeInfo: - def __init__(self, index=None, value=None): - self.index = index - self.value = value - - -class _LeafInfo: - def __init__(self, size=None, mean=None): - self.size = size - self.mean = mean - - -class _SkTreeWrapper: - def __init__(self, tree): - self.sk_tree = tree - - -def _get_sample_attributes(samples_file, indices): - samples_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False) - x = samples_mmap[indices] - return x - - -def _get_feature_mmap(features_file, i): - return _get_features_mmap(features_file)[i] - - -def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode="r", allow_pickle=False) - - -@task(priority=True, returns=2) -def _sample_selection(n_samples, y_targets, bootstrap, seed): - if bootstrap: - random_state = RandomState(seed) - selection = random_state.choice( - n_samples, size=n_samples, replace=True - ) - selection.sort() - return selection, y_targets[selection] - else: - return np.arange(n_samples), y_targets - - -def _feature_selection(untried_indices, m_try, random_state): - selection_len = min(m_try, len(untried_indices)) - return random_state.choice( - untried_indices, size=selection_len, replace=False - ) - - -def _get_groups(sample, y_s, features_mmap, index, value): - if index is None: - empty_sample = np.array([], dtype=np.int64) - empty_target = np.array([], dtype=np.float64) - return sample, y_s, empty_sample, empty_target - feature = features_mmap[index][sample] - mask = feature < value - left = sample[mask] - right = sample[~mask] - y_l = y_s[mask] - y_r = y_s[~mask] - return left, y_l, right, y_r - - -def _compute_leaf_info(y_s): - return _LeafInfo(len(y_s), np.mean(y_s)) - - -def _split_node_wrapper( - sample, - n_features, - y_s, - m_try, - random_state, - samples_file=None, - features_file=None, -): - seed = random_state.randint(np.iinfo(np.int32).max) - - if features_file is not None: - return _split_node_using_features( - sample, n_features, y_s, m_try, features_file, seed - ) - elif samples_file is not None: - return _split_node(sample, n_features, y_s, m_try, samples_file, seed) - else: - raise ValueError( - "Invalid combination of arguments. samples_file is " - "None and features_file is None." 
- ) - - -@task(features_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node_using_features( - sample, n_features, y_s, m_try, features_file, seed -): - features_mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) - random_state = RandomState(seed) - return _compute_split( - sample, n_features, y_s, m_try, features_mmap, random_state - ) - - -@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node(sample, n_features, y_s, m_try, samples_file, seed): - features_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T - random_state = RandomState(seed) - return _compute_split( - sample, n_features, y_s, m_try, features_mmap, random_state - ) - - -def _compute_split( - sample, n_features, y_s, m_try, features_mmap, random_state -): - node_info = left_group = y_l = right_group = y_r = None - split_ended = False - tried_indices = [] - while not split_ended: - untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) - index_selection = _feature_selection( - untried_indices, m_try, random_state - ) - b_score = float_info.max - b_index = None - b_value = None - for index in index_selection: - feature = features_mmap[index] - score, value = test_split(sample, y_s, feature) - if score < b_score: - b_score, b_value, b_index = score, value, index - groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) - left_group, y_l, right_group, y_r = groups - if left_group.size and right_group.size: - split_ended = True - node_info = _InnerNodeInfo(b_index, b_value) - else: - tried_indices.extend(list(index_selection)) - if len(tried_indices) == n_features: - split_ended = True - node_info = _compute_leaf_info(y_s) - left_group = sample - y_l = y_s - right_group = np.array([], dtype=np.int64) - y_r = np.array([], dtype=np.float64) - - return node_info, left_group, y_l, right_group, y_r - - -def _build_subtree_wrapper( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - features_file, -): - seed = random_state.randint(np.iinfo(np.int32).max) - if features_file is not None: - return _build_subtree_using_features( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, - features_file, - ) - else: - return _build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, - ) - - -@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) -def _build_subtree_using_features( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, - features_file, -): - random_state = RandomState(seed) - return _compute_build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - features_file=features_file, - ) - - -@task(samples_file=FILE_IN, returns=_Node) -def _build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, -): - random_state = RandomState(seed) - return _compute_build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - ) - - -def _compute_build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - features_file=None, - use_sklearn=True, -): - if not sample.size: - return _Node() - if features_file is not None: - mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) - else: - mmap = np.load(samples_file, mmap_mode="r", 
allow_pickle=False).T - subtree = _Node() - tree_traversal = [(subtree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < max_depth: - if use_sklearn and n_features * len(sample) <= sklearn_max: - if max_depth == np.inf: - sklearn_max_depth = None - else: - sklearn_max_depth = max_depth - depth - dt = SklearnDTRegressor( - max_features=m_try, - max_depth=sklearn_max_depth, - random_state=random_state, - ) - unique = np.unique( - sample, return_index=True, return_counts=True - ) - sample, new_indices, sample_weight = unique - x = _get_sample_attributes(samples_file, sample) - y_s = y_s[new_indices] - dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) - node.content = _SkTreeWrapper(dt) - else: - split = _compute_split( - sample, - n_features, - y_s, - m_try, - mmap, - random_state, - ) - node_info, left_group, y_l, right_group, y_r = split - node.content = node_info - if isinstance(node_info, _InnerNodeInfo): - node.left = _Node() - node.right = _Node() - tree_traversal.append( - (node.right, right_group, y_r, depth + 1) - ) - tree_traversal.append( - (node.left, left_group, y_l, depth + 1) - ) - else: - node.content = _compute_leaf_info(y_s) - return subtree - - -@task(returns=list) -def _merge(*object_list): - return object_list - - -def _get_subtree_path(subtree_index, distr_depth): - if distr_depth == 0: - return "" - return bin(subtree_index)[2:].zfill(distr_depth) - - -def _get_predicted_indices(samples, tree, nodes_info, path): - idx_mask = np.full((len(samples),), True) - for direction in path: - node_info = nodes_info[tree.content] - if isinstance(node_info, _LeafInfo): - if direction == "1": - idx_mask[:] = 0 - else: - col = node_info.index - value = node_info.value - if direction == "0": - idx_mask[idx_mask] = samples[idx_mask, col] <= value - tree = tree.left - else: - idx_mask[idx_mask] = samples[idx_mask, col] > value - tree = tree.right - return idx_mask - - -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch( - row_blocks, tree, nodes_info, subtree_index, subtree, distr_depth -): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = subtree.predict(samples[indices_mask]) - return indices_mask, prediction - - -@task(returns=list) -def _merge_branches(n_classes, *predictions): - samples_len = len(predictions[0][0]) - if n_classes is not None: # predict - shape = (samples_len, n_classes) - dtype = np.float64 - else: # predict_proba - shape = (samples_len,) - dtype = np.float64 - merged_prediction = np.empty(shape, dtype=dtype) - for selected, prediction in predictions: - merged_prediction[selected] = prediction - return merged_prediction diff --git a/dislib/regression/rf/forest.py b/dislib/regression/rf/forest.py deleted file mode 100644 index faae07c8..00000000 --- a/dislib/regression/rf/forest.py +++ /dev/null @@ -1,236 +0,0 @@ -import math -from collections import Counter - -import numpy as np -from pycompss.api.api import compss_wait_on -from pycompss.api.parameter import Type, COLLECTION_IN, Depth -from pycompss.api.task import task -from sklearn.base import BaseEstimator -from sklearn.utils import check_random_state - -from dislib.regression.rf.decision_tree import DecisionTreeRegressor -from dislib.data.array import Array -from dislib.utils.base import _paired_partition -from dislib.regression.rf._data import transform_to_rf_dataset - - -class 
RandomForestRegressor(BaseEstimator): - """A distributed random forest regressor. - - Parameters - ---------- - n_estimators : int, optional (default=10) - Number of trees to fit. - try_features : int, str or None, optional (default='sqrt') - The number of features to consider when looking for the best split: - - - If "sqrt", then `try_features=sqrt(n_features)`. - - If "third", then `try_features=n_features // 3`. - - If None, then `try_features=n_features`. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires - to effectively inspect more than ``try_features`` features. - max_depth : int or np.inf, optional (default=np.inf) - The maximum depth of the tree. If np.inf, then nodes are expanded - until all leaves are pure. - distr_depth : int or str, optional (default='auto') - Number of levels of the tree in which the nodes are split in a - distributed way. - sklearn_max: int or float, optional (default=1e8) - Maximum size (len(subsample)*n_features) of the arrays passed to - sklearn's DecisionTreeRegressor.fit(), which is called to fit subtrees - (subsamples) of our DecisionTreeRegressor. sklearn fit() is used - because it's faster, but requires loading the data to memory, which can - cause memory problems for large datasets. This parameter can be - adjusted to fit the hardware capabilities. - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - trees : list of DecisionTreeRegressor - List of the tree regressors of this forest, populated at fit(). - """ - - def __init__( - self, - n_estimators=10, - try_features="sqrt", - max_depth=np.inf, - distr_depth="auto", - sklearn_max=1e8, - random_state=None, - ): - self.n_estimators = n_estimators - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.random_state = random_state - - def fit(self, x, y): - """Fits the RandomForestRegressor. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The training input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. - y : ds-array, shape=(n_samples, 1) - The target values. - - Returns - ------- - self : RandomForestRegressor - - """ - self.trees = [] - - dataset = transform_to_rf_dataset(x, y) - - n_features = dataset.get_n_features() - try_features = _resolve_try_features(self.try_features, n_features) - random_state = check_random_state(self.random_state) - - if self.distr_depth == "auto": - dataset.n_samples = compss_wait_on(dataset.get_n_samples()) - distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) - distr_depth = min(distr_depth, self.max_depth) - else: - distr_depth = self.distr_depth - - for _ in range(self.n_estimators): - tree = DecisionTreeRegressor( - try_features, - self.max_depth, - distr_depth, - self.sklearn_max, - bootstrap=True, - random_state=random_state, - ) - self.trees.append(tree) - - for tree in self.trees: - tree.fit(dataset) - - return self - - def predict(self, x): - """Predicts target values using a fitted forest. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. 
- - Returns - ------- - y_pred : ds-array, shape=(n_samples, 1) - Predicted target values for x. - - """ - assert self.trees is not None, "The random forest is not fitted." - pred_blocks = [] - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - pred_blocks.append(_join_predictions(*tree_predictions)) - - y_pred = Array( - blocks=[pred_blocks], - top_left_shape=(x._top_left_shape[0], 1), - reg_shape=(x._reg_shape[0], 1), - shape=(x.shape[0], 1), - sparse=False, - ) - - return y_pred - - def score(self, x, y): - """Accuracy regression score. - - Return the coefficient of determination $R^2$ of - the prediction. - The coefficient $R^2$ is defined as $(1-u/v)$, where $u$ - is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and - $v$ is the total sum of squares - `((y_true - y_true.mean()) ** 2).sum()`. - The best possible score is 1.0 and it can be negative - (because the model can be arbitrarily worse). - A constant model that always predicts the expected value of y, - disregarding the input features, would get a $R^2$ score of 0.0. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The training input samples. - y : ds-array, shape (n_samples, 1) - The true labels. - - Returns - ------- - score : float (as future object) - Coefficient of determination $R^2$. - - """ - assert self.trees is not None, "The random forest is not fitted." - partial_scores = [] - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - subset_score = _partial_score(y_row._blocks, *tree_predictions) - partial_scores.append(subset_score) - - return _merge_scores(*partial_scores) - - -@task(returns=1) -def _resolve_try_features(try_features, n_features): - if try_features is None: - return n_features - elif try_features == "sqrt": - return int(math.sqrt(n_features)) - elif try_features == "third": - return max(1, n_features // 3) - else: - return int(try_features) - - -@task(returns=1) -def _join_predictions(*predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - target = aggregate / len(predictions) - return target - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _partial_score(y_blocks, *predictions): - y_true = Array._merge_blocks(y_blocks).flatten() - y_pred = np.mean(predictions, axis=0) - n_samples = y_true.shape[0] - y_avg = np.mean(y_true) - u_partial = np.sum(np.square(y_true - y_pred), axis=0) - v_partial = np.sum(np.square(y_true - y_avg), axis=0) - return u_partial, v_partial, y_avg, n_samples - - -@task(returns=1) -def _merge_scores(*partial_scores): - u = v = avg = n = 0 - for u_p, v_p, avg_p, n_p in partial_scores: - u += u_p - - delta = avg_p - avg - avg += delta * n_p / (n + n_p) - v += v_p + delta ** 2 * n * n_p / (n + n_p) - n += n_p - - return 1 - u / v diff --git a/dislib/regression/rf/test_split.py b/dislib/regression/rf/test_split.py deleted file mode 100644 index aa482b3c..00000000 --- a/dislib/regression/rf/test_split.py +++ /dev/null @@ -1,48 +0,0 @@ -from sys import float_info - -import numpy as np - - -def mse_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): - """ - Maximizing the MSE gain is equivalent to minimizing this proxy function. 
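The equivalence stated in this docstring follows from a short calculation, assuming unit sample weights as in the code. For a candidate split with left/right target sums $S_L = \sum_{i \in L} y_i$, $S_R = \sum_{i \in R} y_i$ and branch sizes $n_L$, $n_R$, the within-branch sum of squared errors is

$$\mathrm{SSE} = \sum_i y_i^2 - \left(\frac{S_L^2}{n_L} + \frac{S_R^2}{n_R}\right),$$

and since $\sum_i y_i^2$ is the same for every split of the node, minimizing SSE is equivalent to maximizing $S_L^2/n_L + S_R^2/n_R$, i.e. to minimizing the negated proxy above. In the test_split() that follows, l_weight and r_weight are exactly these squared prefix/suffix sums of the sorted targets, computed via np.square(np.cumsum(...)).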
- - """ - return -(l_weight / l_length + r_weight / r_length) * not_repeated - - -def test_split(sample, y_s, feature): - size = y_s.shape[0] - if size == 0: - return float_info.max, np.float64(np.inf) - - f = feature[sample] - sort_indices = np.argsort(f) - y_sorted = y_s[sort_indices] - f_sorted = f[sort_indices] - - # Threshold value must not be that value of a sample - not_repeated = np.empty(size, dtype=np.bool_) - not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1] - not_repeated[size - 1] = True - - # Square of the sum of the y values of each branch - r_weight = np.zeros(size) - l_weight = np.square(np.cumsum(y_sorted, axis=-1)) - r_weight[:-1] = np.square(np.cumsum(y_sorted[::-1], axis=-1)[-2::-1]) - - # Number of samples of each branch - l_length = np.arange(1, size + 1, dtype=np.int32) - r_length = np.arange(size - 1, -1, -1, dtype=np.int32) - r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways - - scores = mse_criteria_proxy( - l_weight, l_length, r_weight, r_length, not_repeated - ) - - min_index = size - np.argmin(scores[::-1]) - 1 - if min_index + 1 == size: - b_value = np.float64(np.inf) - else: - b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2 - return scores[min_index], b_value From 79981dbbac11f870506e79fbd06c491f03b489e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 26 Jul 2021 18:15:57 +0200 Subject: [PATCH 15/46] Edited tests. --- dislib/utils/saving.py | 11 +- tests/test_saving.py | 31 ++ tests/test_saving_cbor.py | 555 +------------------------------ tests/test_saving_json.py | 667 ++++---------------------------------- 4 files changed, 122 insertions(+), 1142 deletions(-) create mode 100644 tests/test_saving.py diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index f0b8313c..620cc90a 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -16,9 +16,12 @@ import dislib.recommendation import dislib.regression from dislib.data.array import Array -from dislib.classification.rf.decision_tree import ( +from dislib.commons.rf._decision_tree import ( DecisionTreeClassifier, + DecisionTreeRegressor, _Node, + _ClassificationNode, + _RegressionNode, _InnerNodeInfo, _LeafInfo, _SkTreeWrapper, @@ -44,7 +47,10 @@ DISLIB_CLASSES = { "KMeans": dislib.cluster.KMeans, "DecisionTreeClassifier": DecisionTreeClassifier, + "DecisionTreeRegressor": DecisionTreeRegressor, "_Node": _Node, + "_ClassificationNode": _ClassificationNode, + "_RegressionNode": _RegressionNode, "_InnerNodeInfo": _InnerNodeInfo, "_LeafInfo": _LeafInfo, "_SkTreeWrapper": _SkTreeWrapper, @@ -347,6 +353,7 @@ def _sync_obj(obj): elif isinstance(obj, list): iterator = iter(enumerate(obj)) else: + print(obj) raise ValueError("Expected dict or list and received %s." % type(obj)) for key, val in iterator: @@ -358,7 +365,7 @@ def _sync_obj(obj): raise TypeError( "Could not synchronize Future (%s, %s)." 
                % (key, val)
             )
-        if hasattr(obj[key], "__dict__"):
+        if isinstance(getattr(obj[key], "__dict__", None), dict):
             _sync_obj(obj[key].__dict__)
diff --git a/tests/test_saving.py b/tests/test_saving.py
new file mode 100644
index 00000000..d1a8bb92
--- /dev/null
+++ b/tests/test_saving.py
@@ -0,0 +1,31 @@
+import unittest
+from unittest.mock import patch
+
+from dislib.cluster import KMeans
+from dislib.utils import save_model
+
+
+class SavingTest(unittest.TestCase):
+    filepath = "tests/files/saving/kmeans.cbor"
+
+    def test_errors(self):
+        """Tests that saving without cbor2 raises an error."""
+        km = KMeans(n_clusters=2, verbose=False)
+
+        # Simulate a missing cbor2 module in dislib.utils.saving.
+        with patch("dislib.utils.saving.cbor2", None):
+            self.assertRaises(
+                ModuleNotFoundError,
+                save_model,
+                km,
+                self.filepath,
+                save_format="cbor",
+            )
+
+
+def main():
+    unittest.main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py
index 64cd534a..c8efd336 100644
--- a/tests/test_saving_cbor.py
+++ b/tests/test_saving_cbor.py
@@ -25,32 +25,6 @@
 class KMeansSavingTestCBOR(unittest.TestCase):
     filepath = "tests/files/saving/kmeans.cbor"
 
-    def test_init_params_kmeans(self):
-        """Tests that KMeans correctly sets the initialization
-        parameters"""
-        n_clusters = 2
-        max_iter = 1
-        tol = 1e-4
-        seed = 666
-        arity = 2
-        init = "random"
-
-        km = KMeans(
-            n_clusters=n_clusters,
-            max_iter=max_iter,
-            tol=tol,
-            arity=arity,
-            random_state=seed,
-        )
-        save_model(km, self.filepath, save_format="cbor")
-        km2 = load_model(self.filepath, load_format="cbor")
-
-        expected = (n_clusters, init, max_iter, tol, arity)
-        real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity)
-        real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, km2.arity)
-        self.assertEqual(expected, real)
-        self.assertEqual(expected, real2)
-
     def test_fit_kmeans(self):
        """Tests that the fit method returns the expected centers
        using toy data.
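Returning briefly to the _sync_obj change in dislib/utils/saving.py above: _sync_obj itself accepts only dicts and lists, so the previous hasattr check would recurse into any attribute exposing a __dict__ and hit the ValueError branch whenever that __dict__ was not a plain dict (a class object's mappingproxy, for instance). A minimal illustration of what the stricter isinstance check accepts and skips:

    import numpy as np

    class Model:
        pass

    m = Model()
    m.centers = np.zeros(2)

    # Instance __dict__ is a plain dict: _sync_obj can recurse into it.
    print(isinstance(getattr(m, "__dict__", None), dict))          # True
    # Class __dict__ is a mappingproxy, not a dict: now skipped.
    print(isinstance(getattr(Model, "__dict__", None), dict))      # False
    # ndarrays have no __dict__ at all: getattr falls back to None.
    print(isinstance(getattr(m.centers, "__dict__", None), dict))  # False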
@@ -94,35 +68,6 @@ def test_predict_kmeans(self): self.assertTrue(np.array_equal(labels, expected_labels)) self.assertTrue(np.array_equal(labels2, expected_labels)) - def test_fit_predict_kmeans(self): - """Tests fit_predict.""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - - x_train = ds.array(x_filtered, block_size=(300, 2)) - - kmeans = KMeans(n_clusters=3, random_state=170) - labels = kmeans.fit_predict(x_train).collect() - - save_model(kmeans, self.filepath, save_format="cbor") - kmeans = load_model(self.filepath, load_format="cbor") - - skmeans = SKMeans(n_clusters=3, random_state=170) - sklabels = skmeans.fit_predict(x_filtered) - - centers = np.array( - [ - [-8.941375656533449, -5.481371322614891], - [-4.524023204953875, 0.06235042593214654], - [2.332994701667008, 0.37681003933082696], - ] - ) - - self.assertTrue(np.allclose(centers, kmeans.centers)) - self.assertTrue(np.allclose(labels, sklabels)) - def test_sparse_kmeans(self): """Tests K-means produces the same results using dense and sparse data structures.""" @@ -192,67 +137,6 @@ def test_init_kmeans(self): class GaussianMixtureSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/gm.cbor" - def test_init_params(self): - """Tests that GaussianMixture params are set""" - n_components = 2 - covariance_type = "diag" - tol = 1e-4 - reg_covar = 1e-5 - max_iter = 3 - init_params = "random" - weights_init = np.array([0.4, 0.6]) - means_init = np.array([[0, 0], [2, 3]]) - precisions_init = "todo" - random_state = RandomState(666) - gm = GaussianMixture( - n_components=n_components, - covariance_type=covariance_type, - tol=tol, - reg_covar=reg_covar, - max_iter=max_iter, - init_params=init_params, - weights_init=weights_init, - means_init=means_init, - precisions_init=precisions_init, - random_state=random_state, - ) - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - - real = ( - gm.n_components, - gm.covariance_type, - gm.tol, - gm.reg_covar, - gm.max_iter, - gm.init_params, - gm.weights_init.tolist(), - gm.means_init.tolist(), - gm.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm.random_state.get_state() - ], - ) - real2 = ( - gm2.n_components, - gm2.covariance_type, - gm2.tol, - gm2.reg_covar, - gm2.max_iter, - gm2.init_params, - gm2.weights_init.tolist(), - gm2.means_init.tolist(), - gm2.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm2.random_state.get_state() - ], - ) - - self.assertEqual(real, real2) - def test_fit(self): """Tests GaussianMixture.fit()""" @@ -324,32 +208,6 @@ def test_predict(self): self.assertTrue(pred2[0] == pred2[2] == pred2[4]) self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - def test_fit_predict(self): - """Tests GaussianMixture.fit_predict()""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) - - ds_x = ds.array(x_filtered, block_size=(300, 2)) - - gm = GaussianMixture(n_components=3, random_state=170) - pred = gm.fit_predict(ds_x).collect() - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - - pred2 = gm2.predict(ds_x).collect() - - self.assertEqual(len(pred), 610) - accuracy = np.count_nonzero(pred == y_real) / len(pred) - self.assertGreater(accuracy, 0.99) - 
- self.assertEqual(len(pred2), 610) - accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) - self.assertGreater(accuracy2, 0.99) - def test_sparse(self): """Tests GaussianMixture produces the same results using dense and sparse data structures""" @@ -383,86 +241,10 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - def test_init_random(self): - """Tests GaussianMixture random initialization""" - x = ds.random_array((50, 3), (10, 3), random_state=0) - gm = GaussianMixture( - init_params="random", n_components=4, arity=2, random_state=170 - ) - gm.fit(x) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - self.assertGreater(gm.n_iter, 5) - self.assertGreater(gm2.n_iter, 5) - - def test_means_init_and_weights_init(self): - """Tests GaussianMixture means_init and weights_init parameters""" - x, _ = load_iris(return_X_y=True) - x_ds = ds.array(x, (75, 4)) - weights_init = [1 / 3, 1 / 3, 1 / 3] - means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) - gm = GaussianMixture( - random_state=0, - n_components=3, - weights_init=weights_init, - means_init=means_init, - ) - gm.fit(x_ds) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - self.assertTrue(gm.converged_) - self.assertTrue(gm2.converged_) - class CSVMSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/csvm.cbor" - def test_init_params(self): - """Test constructor parameters""" - cascade_arity = 3 - max_iter = 1 - tol = 1e-4 - kernel = "rbf" - c = 2 - gamma = 0.1 - check_convergence = True - seed = 666 - verbose = False - - csvm = CascadeSVM( - cascade_arity=cascade_arity, - max_iter=max_iter, - tol=tol, - kernel=kernel, - c=c, - gamma=gamma, - check_convergence=check_convergence, - random_state=seed, - verbose=verbose, - ) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - self.assertEqual(csvm.cascade_arity, cascade_arity) - self.assertEqual(csvm.max_iter, max_iter) - self.assertEqual(csvm.tol, tol) - self.assertEqual(csvm.kernel, kernel) - self.assertEqual(csvm.c, c) - self.assertEqual(csvm.gamma, gamma) - self.assertEqual(csvm.check_convergence, check_convergence) - self.assertEqual(csvm.random_state, seed) - self.assertEqual(csvm.verbose, verbose) - - self.assertEqual(csvm2.cascade_arity, cascade_arity) - self.assertEqual(csvm2.max_iter, max_iter) - self.assertEqual(csvm2.tol, tol) - self.assertEqual(csvm2.kernel, kernel) - self.assertEqual(csvm2.c, c) - self.assertEqual(csvm2.gamma, gamma) - self.assertEqual(csvm2.check_convergence, check_convergence) - self.assertEqual(csvm2.random_state, seed) - self.assertEqual(csvm2.verbose, verbose) - def test_fit_private_params(self): kernel = "rbf" c = 2 @@ -495,51 +277,6 @@ def test_fit_private_params(self): # # check for exception when incorrect kernel is passed # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - def test_fit(self): - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=5, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - 
self.assertTrue(csvm.converged) - self.assertTrue(csvm2.converged) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=1, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - self.assertFalse(csvm.converged) - self.assertEqual(csvm.iterations, 1) - self.assertFalse(csvm2.converged) - self.assertEqual(csvm2.iterations, 1) - def test_predict(self): seed = 666 @@ -617,60 +354,6 @@ def test_score(self): self.assertEqual(accuracy, 1.0) self.assertEqual(accuracy2, 1.0) - def test_decision_func(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - # all points are in the x-axis - p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - # p1 should be equidistant to p3, and p2 to p4 - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d1, d2, d3, d4 = y_pred.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - d1, d2, d3, d4 = y_pred2.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - - # p5 and p6 should be in the decision function (distance=0) - p5, p6 = np.array([1, 0]), np.array([-1, 0]) - - x_test = ds.array(np.array([p5, p6]), (1, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d5, d6 = y_pred.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - d5, d6 = y_pred2.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - def test_sparse(self): """Tests that C-SVM produces the same results with sparse and dense data""" @@ -708,36 +391,6 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - def test_duplicates(self): - """Tests that C-SVM does not generate duplicate support vectors""" - x = ds.array( - np.array( - [ - [0, 1], - [1, 1], - [0, 1], - [1, 2], - [0, 0], - [2, 2], - [2, 1], - [1, 0], - ] - ), - (2, 2), - ) - - y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - csvm._collect_clf() - csvm2._collect_clf() - self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) - self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) - class RFSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/rf.cbor" @@ -757,8 +410,8 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = 
ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) @@ -785,8 +438,8 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) @@ -800,35 +453,6 @@ def test_make_classification_predict_and_distr_depth(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_fit_predict(self): - """Tests RandomForestClassifier fit_predict with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_train).collect() - y_pred2 = rf2.predict(x_train).collect() - y_train = y_train.collect() - accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) - accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - def test_make_classification_sklearn_max_predict(self): """Tests RandomForestClassifier predict with sklearn_max.""" x, y = make_classification( @@ -844,8 +468,8 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -874,8 +498,8 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -908,8 +532,8 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True @@ -941,8 +565,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -960,28 +584,6 @@ def 
test_make_classification_hard_vote_score_mix(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_iris(self): - """Tests RandomForestClassifier with a minimal example.""" - x, y = datasets.load_iris(return_X_y=True) - ds_fit = ds.array(x[::2], block_size=(30, 2)) - fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) - ds_validate = ds.array(x[1::2], block_size=(30, 2)) - validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) - - rf = RandomForestClassifier( - n_estimators=1, max_depth=1, random_state=0 - ) - rf.fit(ds_fit, fit_y) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) - accuracy2 = compss_wait_on(rf2.score(ds_validate, validate_y)) - - # Accuracy should be <= 2/3 for any seed, often exactly equal. - self.assertAlmostEqual(accuracy, 2 / 3) - self.assertAlmostEqual(accuracy2, 2 / 3) - class LassoSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/lasso.cbor" @@ -1005,7 +607,7 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] + X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] lasso = Lasso(lmbd=0.1, max_iter=50) @@ -1175,100 +777,6 @@ def test_multivariate_no_intercept(self): np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) ) - def test_multivariate_multiobjective(self): - """Tests fit() and predict(), multivariate, multiobjective.""" - x_data = np.array( - [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]] - ) - y_data = np.array( - [ - [2, 0, 3], - [1, 5, 2], - [1, 3, 4], - [2, 7, 9], - [4.5, -1, 4], - [0, 0, 0], - ] - ) - - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") - - # Predict one sample - x_test = np.array([3, 2, 1]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906])) - self.assertTrue( - np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906]) - ) - - # Predict multiple samples - x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue( - np.allclose( - pred, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - self.assertTrue( - np.allclose( - pred2, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - - # Check attributes values - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - self.assertTrue( - np.allclose( 
- reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - def load_movielens(train_ratio=0.9): file = "tests/files/sample_movielens_ratings.csv" @@ -1307,43 +815,6 @@ def load_movielens(train_ratio=0.9): class ALSSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/als.cbor" - def test_init_params(self): - # Test all parameters - seed = 666 - n_f = 100 - lambda_ = 0.001 - convergence_threshold = 0.1 - max_iter = 10 - verbose = True - arity = 12 - - als = ALS( - random_state=seed, - n_f=n_f, - lambda_=lambda_, - tol=convergence_threshold, - max_iter=max_iter, - verbose=verbose, - arity=arity, - ) - save_model(als, self.filepath, save_format="cbor") - als2 = load_model(self.filepath, load_format="cbor") - - self.assertEqual(als.random_state, seed) - self.assertEqual(als.n_f, n_f) - self.assertEqual(als.lambda_, lambda_) - self.assertEqual(als.tol, convergence_threshold) - self.assertEqual(als.max_iter, max_iter) - self.assertEqual(als.verbose, verbose) - self.assertEqual(als.arity, arity) - self.assertEqual(als2.random_state, seed) - self.assertEqual(als2.n_f, n_f) - self.assertEqual(als2.lambda_, lambda_) - self.assertEqual(als2.tol, convergence_threshold) - self.assertEqual(als2.max_iter, max_iter) - self.assertEqual(als2.verbose, verbose) - self.assertEqual(als2.arity, arity) - def test_fit(self): train, test = load_movielens() diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index be18474d..1488d83c 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -25,32 +25,6 @@ class KMeansSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/kmeans.json" - def test_init_params_kmeans(self): - """Tests that saved and loaded KMeans object correctly sets the initialization - parameters""" - n_clusters = 2 - max_iter = 1 - tol = 1e-4 - seed = 666 - arity = 2 - init = "random" - - km = KMeans( - n_clusters=n_clusters, - max_iter=max_iter, - tol=tol, - arity=arity, - random_state=seed, - ) - save_model(km, self.filepath) - km2 = load_model(self.filepath) - - expected = (n_clusters, init, max_iter, tol, arity) - real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity) - real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, km2.arity) - self.assertEqual(expected, real) - self.assertEqual(expected, real2) - def test_fit_kmeans(self): """Tests that the fit method returns the expected centers using toy data. 
@@ -63,8 +37,8 @@ def test_fit_kmeans(self): expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") self.assertTrue((km.centers == expected_centers).all()) self.assertTrue((km2.centers == expected_centers).all()) @@ -79,8 +53,8 @@ def test_predict_kmeans(self): km = KMeans(n_clusters=2, random_state=666) km.fit(x) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") p5, p6 = [10, 10], [-10, -10] @@ -94,35 +68,6 @@ def test_predict_kmeans(self): self.assertTrue(np.array_equal(labels, expected_labels)) self.assertTrue(np.array_equal(labels2, expected_labels)) - def test_fit_predict_kmeans(self): - """Tests fit_predict.""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - - x_train = ds.array(x_filtered, block_size=(300, 2)) - - kmeans = KMeans(n_clusters=3, random_state=170) - labels = kmeans.fit_predict(x_train).collect() - - save_model(kmeans, self.filepath) - kmeans = load_model(self.filepath) - - skmeans = SKMeans(n_clusters=3, random_state=170) - sklabels = skmeans.fit_predict(x_filtered) - - centers = np.array( - [ - [-8.941375656533449, -5.481371322614891], - [-4.524023204953875, 0.06235042593214654], - [2.332994701667008, 0.37681003933082696], - ] - ) - - self.assertTrue(np.allclose(centers, kmeans.centers)) - self.assertTrue(np.allclose(labels, sklabels)) - def test_sparse_kmeans(self): """Tests K-means produces the same results using dense and sparse data structures.""" @@ -134,8 +79,8 @@ def test_sparse_kmeans(self): kmeans = KMeans(random_state=170) kmeans.fit(x_sp) - save_model(kmeans, self.filepath) - kmeans2 = load_model(self.filepath) + save_model(kmeans, self.filepath, save_format="json") + kmeans2 = load_model(self.filepath, load_format="json") y_sparse = kmeans.predict(x_sp).collect() y_sparse2 = kmeans2.predict(x_sp).collect() @@ -165,8 +110,8 @@ def test_init_kmeans(self): km = KMeans(n_clusters=5, init=init) km.fit(x_train) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") self.assertTrue(np.array_equal(km.init, init)) self.assertTrue(np.array_equal(km2.init, init)) @@ -180,8 +125,8 @@ def test_init_kmeans(self): km = KMeans(n_clusters=5, init=init) km.fit(x_sp) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) @@ -192,67 +137,6 @@ def test_init_kmeans(self): class GaussianMixtureSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/gm.json" - def test_init_params(self): - """Tests that GaussianMixture params are set""" - n_components = 2 - covariance_type = "diag" - tol = 1e-4 - reg_covar = 1e-5 - max_iter = 3 - init_params = "random" - weights_init = np.array([0.4, 0.6]) - means_init = np.array([[0, 0], [2, 3]]) - precisions_init = "todo" - random_state = RandomState(666) - gm = GaussianMixture( - n_components=n_components, - covariance_type=covariance_type, - tol=tol, - reg_covar=reg_covar, - 
max_iter=max_iter, - init_params=init_params, - weights_init=weights_init, - means_init=means_init, - precisions_init=precisions_init, - random_state=random_state, - ) - - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - - real = ( - gm.n_components, - gm.covariance_type, - gm.tol, - gm.reg_covar, - gm.max_iter, - gm.init_params, - gm.weights_init.tolist(), - gm.means_init.tolist(), - gm.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm.random_state.get_state() - ], - ) - real2 = ( - gm2.n_components, - gm2.covariance_type, - gm2.tol, - gm2.reg_covar, - gm2.max_iter, - gm2.init_params, - gm2.weights_init.tolist(), - gm2.means_init.tolist(), - gm2.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm2.random_state.get_state() - ], - ) - - self.assertEqual(real, real2) - def test_fit(self): """Tests GaussianMixture.fit()""" @@ -277,8 +161,8 @@ def test_fit(self): ] ) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") gm.weights_ = compss_wait_on(gm.weights_) gm.means_ = compss_wait_on(gm.means_) @@ -308,8 +192,8 @@ def test_predict(self): gm = GaussianMixture(n_components=2, random_state=666) gm.fit(ds_x_train) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) ds_x_test = ds.array(x_test, block_size=(2, 2)) @@ -324,32 +208,6 @@ def test_predict(self): self.assertTrue(pred2[0] == pred2[2] == pred2[4]) self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - def test_fit_predict(self): - """Tests GaussianMixture.fit_predict()""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) - - ds_x = ds.array(x_filtered, block_size=(300, 2)) - - gm = GaussianMixture(n_components=3, random_state=170) - pred = gm.fit_predict(ds_x).collect() - - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - - pred2 = gm2.predict(ds_x).collect() - - self.assertEqual(len(pred), 610) - accuracy = np.count_nonzero(pred == y_real) / len(pred) - self.assertGreater(accuracy, 0.99) - - self.assertEqual(len(pred2), 610) - accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) - self.assertGreater(accuracy2, 0.99) - def test_sparse(self): """Tests GaussianMixture produces the same results using dense and sparse data structures""" @@ -365,8 +223,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_sparse) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") labels_sparse = gm.predict(x_sparse).collect() labels_sparse2 = gm2.predict(x_sparse).collect() @@ -374,8 +232,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_dense) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") labels_dense = gm.predict(x_dense).collect() labels_dense2 = gm2.predict(x_dense).collect() @@ -383,86 +241,10 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) 
self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - def test_init_random(self): - """Tests GaussianMixture random initialization""" - x = ds.random_array((50, 3), (10, 3), random_state=0) - gm = GaussianMixture( - init_params="random", n_components=4, arity=2, random_state=170 - ) - gm.fit(x) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - self.assertGreater(gm.n_iter, 5) - self.assertGreater(gm2.n_iter, 5) - - def test_means_init_and_weights_init(self): - """Tests GaussianMixture means_init and weights_init parameters""" - x, _ = load_iris(return_X_y=True) - x_ds = ds.array(x, (75, 4)) - weights_init = [1 / 3, 1 / 3, 1 / 3] - means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) - gm = GaussianMixture( - random_state=0, - n_components=3, - weights_init=weights_init, - means_init=means_init, - ) - gm.fit(x_ds) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - self.assertTrue(gm.converged_) - self.assertTrue(gm2.converged_) - class CSVMSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/csvm.json" - def test_init_params(self): - """Test constructor parameters""" - cascade_arity = 3 - max_iter = 1 - tol = 1e-4 - kernel = "rbf" - c = 2 - gamma = 0.1 - check_convergence = True - seed = 666 - verbose = False - - csvm = CascadeSVM( - cascade_arity=cascade_arity, - max_iter=max_iter, - tol=tol, - kernel=kernel, - c=c, - gamma=gamma, - check_convergence=check_convergence, - random_state=seed, - verbose=verbose, - ) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - self.assertEqual(csvm.cascade_arity, cascade_arity) - self.assertEqual(csvm.max_iter, max_iter) - self.assertEqual(csvm.tol, tol) - self.assertEqual(csvm.kernel, kernel) - self.assertEqual(csvm.c, c) - self.assertEqual(csvm.gamma, gamma) - self.assertEqual(csvm.check_convergence, check_convergence) - self.assertEqual(csvm.random_state, seed) - self.assertEqual(csvm.verbose, verbose) - - self.assertEqual(csvm2.cascade_arity, cascade_arity) - self.assertEqual(csvm2.max_iter, max_iter) - self.assertEqual(csvm2.tol, tol) - self.assertEqual(csvm2.kernel, kernel) - self.assertEqual(csvm2.c, c) - self.assertEqual(csvm2.gamma, gamma) - self.assertEqual(csvm2.check_convergence, check_convergence) - self.assertEqual(csvm2.random_state, seed) - self.assertEqual(csvm2.verbose, verbose) - def test_fit_private_params(self): kernel = "rbf" c = 2 @@ -473,8 +255,8 @@ def test_fit_private_params(self): x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") self.assertEqual(csvm._clf_params["kernel"], kernel) self.assertEqual(csvm._clf_params["C"], c) self.assertEqual(csvm._clf_params["gamma"], gamma) @@ -485,8 +267,8 @@ def test_fit_private_params(self): kernel, c = "linear", 0.3 csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") self.assertEqual(csvm._clf_params["kernel"], kernel) self.assertEqual(csvm._clf_params["C"], c) self.assertEqual(csvm2._clf_params["kernel"], kernel) @@ -495,51 +277,6 @@ def test_fit_private_params(self): # # check for exception when incorrect kernel is passed # 
self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - def test_fit(self): - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=5, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - self.assertTrue(csvm.converged) - self.assertTrue(csvm2.converged) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=1, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - self.assertFalse(csvm.converged) - self.assertEqual(csvm.iterations, 1) - self.assertFalse(csvm2.converged) - self.assertEqual(csvm2.iterations, 1) - def test_predict(self): seed = 666 @@ -562,8 +299,8 @@ def test_predict(self): ) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") # p5 should belong to class 0, p6 to class 1 p5, p6 = np.array([1, 1]), np.array([-1, -1]) @@ -603,8 +340,8 @@ def test_score(self): ) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") # points are separable, scoring the training dataset should have 100% # accuracy @@ -617,60 +354,6 @@ def test_score(self): self.assertEqual(accuracy, 1.0) self.assertEqual(accuracy2, 1.0) - def test_decision_func(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - # all points are in the x-axis - p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - # p1 should be equidistant to p3, and p2 to p4 - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d1, d2, d3, d4 = y_pred.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - d1, d2, d3, d4 = y_pred2.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - - # p5 and p6 should be in the decision function (distance=0) - p5, p6 = np.array([1, 0]), np.array([-1, 0]) - - x_test = ds.array(np.array([p5, p6]), (1, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d5, d6 = y_pred.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - d5, d6 = y_pred2.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - def test_sparse(self): """Tests that C-SVM produces the same results with sparse and dense data""" @@ -682,13 +365,13 @@ def test_sparse(self): csvm_sp = CascadeSVM(random_state=seed) csvm_sp.fit(x_sp, y_sp) - save_model(csvm_sp, self.filepath) - csvm_sp2 = load_model(self.filepath) + save_model(csvm_sp, 
self.filepath, save_format="json") + csvm_sp2 = load_model(self.filepath, load_format="json") csvm_d = CascadeSVM(random_state=seed) csvm_d.fit(x_d, y_d) - save_model(csvm_d, self.filepath) - csvm_d2 = load_model(self.filepath) + save_model(csvm_d, self.filepath, save_format="json") + csvm_d2 = load_model(self.filepath, load_format="json") sv_d = csvm_d._clf.support_vectors_ sv_sp = csvm_sp._clf.support_vectors_.toarray() @@ -708,36 +391,6 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - def test_duplicates(self): - """Tests that C-SVM does not generate duplicate support vectors""" - x = ds.array( - np.array( - [ - [0, 1], - [1, 1], - [0, 1], - [1, 2], - [0, 0], - [2, 2], - [2, 1], - [1, 0], - ] - ), - (2, 2), - ) - - y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - csvm._collect_clf() - csvm2._collect_clf() - self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) - self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) - class RFSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/rf.json" @@ -757,13 +410,13 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") accuracy = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) @@ -785,13 +438,13 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") y_pred = rf.predict(x_test).collect() y_pred2 = rf2.predict(x_test).collect() @@ -800,35 +453,6 @@ def test_make_classification_predict_and_distr_depth(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_fit_predict(self): - """Tests RandomForestClassifier fit_predict with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) - - y_pred = rf.predict(x_train).collect() - y_pred2 = rf2.predict(x_train).collect() - y_train = y_train.collect() - accuracy 
= np.count_nonzero(y_pred == y_train) / len(y_train) - accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - def test_make_classification_sklearn_max_predict(self): """Tests RandomForestClassifier predict with sklearn_max.""" x, y = make_classification( @@ -844,13 +468,13 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") y_pred = rf.predict(x_test).collect() y_pred2 = rf2.predict(x_test).collect() @@ -874,13 +498,13 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") probabilities = rf.predict_proba(x_test).collect() probabilities2 = rf2.predict_proba(x_test).collect() @@ -908,15 +532,15 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True ) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") y_pred = rf.predict(x_test).collect() y_pred2 = rf2.predict(x_test).collect() @@ -941,8 +565,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -952,36 +576,14 @@ def test_make_classification_hard_vote_score_mix(self): hard_vote=True, ) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") accuracy = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_iris(self): - """Tests RandomForestClassifier with a minimal example.""" - x, y = datasets.load_iris(return_X_y=True) - ds_fit = ds.array(x[::2], block_size=(30, 2)) - fit_y = 
ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) - ds_validate = ds.array(x[1::2], block_size=(30, 2)) - validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) - - rf = RandomForestClassifier( - n_estimators=1, max_depth=1, random_state=0 - ) - rf.fit(ds_fit, fit_y) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) - - accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) - accuracy2 = compss_wait_on(rf2.score(ds_validate, validate_y)) - - # Accuracy should be <= 2/3 for any seed, often exactly equal. - self.assertAlmostEqual(accuracy, 2 / 3) - self.assertAlmostEqual(accuracy2, 2 / 3) - class LassoSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/lasso.json" @@ -1005,13 +607,13 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] + X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] lasso = Lasso(lmbd=0.1, max_iter=50) lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) - save_model(lasso, self.filepath) - lasso2 = load_model(self.filepath) + save_model(lasso, self.filepath, save_format="json") + lasso2 = load_model(self.filepath, load_format="json") y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) @@ -1037,8 +639,8 @@ def test_univariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) @@ -1073,8 +675,8 @@ def test_univariate_no_intercept(self): reg = LinearRegression(fit_intercept=False) reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) @@ -1109,8 +711,8 @@ def test_multivariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) self.assertTrue( @@ -1145,8 +747,8 @@ def test_multivariate_no_intercept(self): reg = LinearRegression(fit_intercept=False) reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue( np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) @@ -1175,100 +777,6 @@ def test_multivariate_no_intercept(self): np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) ) - def test_multivariate_multiobjective(self): - """Tests fit() and predict(), multivariate, multiobjective.""" - x_data = np.array( - [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]] - ) - y_data = np.array( - [ - [2, 0, 3], - [1, 5, 2], - [1, 3, 4], - [2, 7, 9], - [4.5, -1, 4], - [0, 0, 0], - ] - ) - - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = 
load_model(self.filepath) - - # Predict one sample - x_test = np.array([3, 2, 1]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906])) - self.assertTrue( - np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906]) - ) - - # Predict multiple samples - x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue( - np.allclose( - pred, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - self.assertTrue( - np.allclose( - pred2, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - - # Check attributes values - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - self.assertTrue( - np.allclose( - reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - def load_movielens(train_ratio=0.9): file = "tests/files/sample_movielens_ratings.csv" @@ -1307,43 +815,6 @@ def load_movielens(train_ratio=0.9): class ALSSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/als.json" - def test_init_params(self): - # Test all parameters - seed = 666 - n_f = 100 - lambda_ = 0.001 - convergence_threshold = 0.1 - max_iter = 10 - verbose = True - arity = 12 - - als = ALS( - random_state=seed, - n_f=n_f, - lambda_=lambda_, - tol=convergence_threshold, - max_iter=max_iter, - verbose=verbose, - arity=arity, - ) - save_model(als, self.filepath) - als2 = load_model(self.filepath) - - self.assertEqual(als.random_state, seed) - self.assertEqual(als.n_f, n_f) - self.assertEqual(als.lambda_, lambda_) - self.assertEqual(als.tol, convergence_threshold) - self.assertEqual(als.max_iter, max_iter) - self.assertEqual(als.verbose, verbose) - self.assertEqual(als.arity, arity) - self.assertEqual(als2.random_state, seed) - self.assertEqual(als2.n_f, n_f) - self.assertEqual(als2.lambda_, lambda_) - self.assertEqual(als2.tol, convergence_threshold) - self.assertEqual(als2.max_iter, max_iter) - self.assertEqual(als2.verbose, verbose) - self.assertEqual(als2.arity, arity) - def test_fit(self): train, test = load_movielens() @@ -1359,8 +830,8 @@ def test_fit(self): self.assertTrue(als.converged) als.fit(train) - save_model(als, self.filepath) - als2 = load_model(self.filepath) + save_model(als, self.filepath, save_format="json") + als2 = load_model(self.filepath, load_format="json") self.assertTrue(als.converged) self.assertTrue(als2.converged) @@ -1371,8 +842,8 @@ def test_predict(self): train = ds.array(x=ratings, block_size=(1, 1)) als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) als.fit(train) - save_model(als, self.filepath) - als2 = load_model(self.filepath) + save_model(als, self.filepath, save_format="json") + als2 = load_model(self.filepath, load_format="json") predictions = 
als.predict_user(user_id=0) predictions2 = als2.predict_user(user_id=0) From 6eaac6cba26490efb51cea2abb83bf4276769620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Tue, 27 Jul 2021 11:32:34 +0200 Subject: [PATCH 16/46] Changed tests and file names in commons/rf --- dislib/classification/__init__.py | 2 +- dislib/commons/rf/{_data.py => data.py} | 51 +++++------ .../{_decision_tree.py => decision_tree.py} | 2 +- dislib/commons/rf/{_forest.py => forest.py} | 4 +- .../rf/{_test_split.py => test_split.py} | 2 +- dislib/regression/__init__.py | 2 +- dislib/utils/saving.py | 30 ++----- tests/{test_rf.py => test_rf_classifier.py} | 0 tests/test_rf_regressor.py | 18 ++-- tests/test_saving.py | 89 ++++++++++++++++--- tests/test_saving_cbor.py | 31 +++---- tests/test_saving_json.py | 31 +++---- 12 files changed, 151 insertions(+), 111 deletions(-) rename dislib/commons/rf/{_data.py => data.py} (93%) rename dislib/commons/rf/{_decision_tree.py => decision_tree.py} (99%) rename dislib/commons/rf/{_forest.py => forest.py} (99%) rename dislib/commons/rf/{_test_split.py => test_split.py} (96%) rename tests/{test_rf.py => test_rf_classifier.py} (100%) diff --git a/dislib/classification/__init__.py b/dislib/classification/__init__.py index f4a90db6..695dd571 100644 --- a/dislib/classification/__init__.py +++ b/dislib/classification/__init__.py @@ -1,4 +1,4 @@ from dislib.classification.csvm.base import CascadeSVM -from dislib.commons.rf._forest import RandomForestClassifier +from dislib.commons.rf.forest import RandomForestClassifier __all__ = ["CascadeSVM", "RandomForestClassifier"] diff --git a/dislib/commons/rf/_data.py b/dislib/commons/rf/data.py similarity index 93% rename from dislib/commons/rf/_data.py rename to dislib/commons/rf/data.py index de692182..a762e5b6 100644 --- a/dislib/commons/rf/_data.py +++ b/dislib/commons/rf/data.py @@ -35,7 +35,7 @@ def get_n_samples(self): Returns ------- - n_samples : int + n_samples: int Raises ------ @@ -61,7 +61,7 @@ def get_n_features(self): Returns ------- - n_features : int + n_features: int Raises ------ @@ -115,35 +115,35 @@ class RfClassifierDataset(RfBaseDataset): Parameters ---------- - samples_path : str + samples_path: str Path of the .npy file containing the 2-d array of samples. It can be a pycompss.runtime.Future object. If so, self.n_samples and self.n_features must be set manually (they can also be pycompss.runtime.Future objects). - targets_path : str + targets_path: str Path of the .dat file containing the 1-d array of target labels. It can be a pycompss.runtime.Future object. - features_path : str, optional (default=None) + features_path: str, optional (default=None) Path of the .npy file containing the 2-d array of samples transposed. The array must be C-ordered. Providing this array may improve the performance as it allows sequential access to the features. Attributes ---------- - n_samples : int + n_samples: int The number of samples of the dataset. It can be a pycompss.runtime.Future object. - n_features : int + n_features: int The number of features of the dataset. It can be a pycompss.runtime.Future object. - y_targets : ndarray + y_targets: ndarray The codified array of labels for this RfDataset. The values are indices of the array of classes, which contains the corresponding labels. The dtype is np.int8. It can be a pycompss.runtime.Future object. - y_categories : ndarray + y_categories: ndarray The array of classes for this RfDataset. The values are unique. 
It can be a pycompss.runtime.Future object. - n_classes : int + n_classes: int The number of classes of this RfDataset. It can be a pycompss.runtime.Future object. @@ -159,7 +159,7 @@ def get_y_targets(self): Returns ------- - y_targets : ndarray + y_targets: ndarray """ if self.y_targets is None: @@ -172,7 +172,7 @@ def get_classes(self): Returns ------- - y_categories : ndarray + y_categories: ndarray """ if self.y_categories is None: @@ -185,7 +185,7 @@ def get_n_classes(self): Returns ------- - n_classes : int + n_classes: int """ if self.n_classes is None: @@ -206,28 +206,28 @@ class RfRegressorDataset(RfBaseDataset): Parameters ---------- - samples_path : str + samples_path: str Path of the .npy file containing the 2-d array of samples. It can be a pycompss.runtime.Future object. If so, self.n_samples and self.n_features must be set manually (they can also be pycompss.runtime.Future objects). - targets_path : str + targets_path: str Path of the .dat file containing the 1-d array of target values. It can be a pycompss.runtime.Future object. - features_path : str, optional (default=None) + features_path: str, optional (default=None) Path of the .npy file containing the 2-d array of samples transposed. The array must be C-ordered. Providing this array may improve the performance as it allows sequential access to the features. Attributes ---------- - n_samples : int + n_samples: int The number of samples of the dataset. It can be a pycompss.runtime.Future object. - n_features : int + n_features: int The number of features of the dataset. It can be a pycompss.runtime.Future object. - y_targets : ndarray + y_targets: ndarray The array of targets for this RfDataset. It can be a pycompss.runtime.Future object. @@ -241,7 +241,7 @@ def get_y_targets(self): Returns ------- - y_targets : ndarray + y_targets: ndarray """ if self.y_targets is None: @@ -255,6 +255,7 @@ def get_n_classes(self): def get_classes(self): return None + def transform_to_rf_dataset( x: Array, y: Array, task: str ) -> RfRegressorDataset or RfClassifierDataset: @@ -265,16 +266,16 @@ def transform_to_rf_dataset( Parameters ---------- - x : ds-array, shape = (n_samples, n_features) + x: ds-array, shape = (n_samples, n_features) The training input samples. - y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) + y: ds-array, shape = (n_samples,) or (n_samples, n_outputs) The target values. - task : {"classification", "regression"} + task: {"classification", "regression"} Task of the Random Forest. 
    Returns
     -------
-    rf_dataset : dislib.regression.rf._data.RfDataset
+    rf_dataset: dislib.commons.rf.data.RfDataset
 
     """
     n_samples = x.shape[0]
@@ -394,7 +395,7 @@ def _fill_samples_file(samples_path, row_blocks, start_idx):
     rows_samples = Array._merge_blocks(row_blocks)
     rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
     samples = np.lib.format.open_memmap(samples_path, mode="r+")
-    samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples
+    samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples
 
 
 @task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
diff --git a/dislib/commons/rf/_decision_tree.py b/dislib/commons/rf/decision_tree.py
similarity index 99%
rename from dislib/commons/rf/_decision_tree.py
rename to dislib/commons/rf/decision_tree.py
index 07297a8d..5a501240 100644
--- a/dislib/commons/rf/_decision_tree.py
+++ b/dislib/commons/rf/decision_tree.py
@@ -8,7 +8,7 @@
 from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
 from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor
 
-from ._test_split import test_split
+from dislib.commons.rf.test_split import test_split
 from dislib.data.array import Array
 
 
diff --git a/dislib/commons/rf/_forest.py b/dislib/commons/rf/forest.py
similarity index 99%
rename from dislib/commons/rf/_forest.py
rename to dislib/commons/rf/forest.py
index e0f4561d..bf121124 100644
--- a/dislib/commons/rf/_forest.py
+++ b/dislib/commons/rf/forest.py
@@ -8,13 +8,13 @@
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_random_state
 
-from dislib.commons.rf._decision_tree import (
+from dislib.commons.rf.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
 )
 from dislib.data.array import Array
 from dislib.utils.base import _paired_partition
-from ._data import transform_to_rf_dataset
+from dislib.commons.rf.data import transform_to_rf_dataset
 
 
 class BaseRandomForest(BaseEstimator):
diff --git a/dislib/commons/rf/_test_split.py b/dislib/commons/rf/test_split.py
similarity index 96%
rename from dislib/commons/rf/_test_split.py
rename to dislib/commons/rf/test_split.py
index 38b9015f..428fbc88 100644
--- a/dislib/commons/rf/_test_split.py
+++ b/dislib/commons/rf/test_split.py
@@ -23,7 +23,7 @@ def test_split(sample, y_s, feature, n_classes):
 
     # Threshold value must not be that value of a sample
     not_repeated = np.empty(size, dtype=np.bool_)
-    not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1]
+    not_repeated[0: size - 1] = f_sorted[1:] != f_sorted[:-1]
     not_repeated[size - 1] = True
 
     if n_classes is not None:  # Classification
diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py
index ecde22d8..a47cd17d 100644
--- a/dislib/regression/__init__.py
+++ b/dislib/regression/__init__.py
@@ -1,5 +1,5 @@
 from dislib.regression.linear.base import LinearRegression
 from dislib.regression.lasso.base import Lasso
-from dislib.commons.rf._forest import RandomForestRegressor
+from dislib.commons.rf.forest import RandomForestRegressor
 
 __all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]
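The renames above drop the leading underscores, turning the RF helpers into publicly importable modules. A minimal sketch of the resulting import surface, assembled from the diffs in this patch (illustrative only, not itself part of the patch):

    # Import paths after this patch (all taken from the diffs above):
    from dislib.commons.rf.forest import RandomForestClassifier, RandomForestRegressor
    from dislib.commons.rf.data import transform_to_rf_dataset
    from dislib.commons.rf.decision_tree import DecisionTreeClassifier, DecisionTreeRegressor
    from dislib.commons.rf.test_split import test_split

diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index 620cc90a..53de386e 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -16,7 +16,7 @@
 import dislib.recommendation
 import dislib.regression
 from dislib.data.array import Array
-from dislib.commons.rf._decision_tree import (
+from dislib.commons.rf.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
     _Node,
@@ -187,16 +187,8 @@ def 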
load_model(filepath, load_format="json"): model_module = getattr(ds, IMPLEMENTED_MODELS[model_name]) model_class = getattr(model_module, model_name) model = model_class() - model.__dict__.update(model_metadata) - - # Set class methods - if model_name == "CascadeSVM" and "kernel" in model_metadata: - try: - model._kernel_f = getattr( - model, model._name_to_kernel[model_metadata["kernel"]] - ) - except AttributeError: - model._kernel_f = getattr(model, "_rbf_kernel") + for key, val in model_metadata.items(): + setattr(model, key, val) return model @@ -217,13 +209,6 @@ def _encode_helper(obj): """ if isinstance(obj, np.generic): return obj.item() - elif isinstance(obj, range): - return { - "class_name": "range", - "start": obj.start, - "stop": obj.stop, - "step": obj.step, - } elif isinstance(obj, csr_matrix): return { "class_name": "csr_matrix", @@ -272,11 +257,7 @@ def _decode_helper(obj): if isinstance(obj, dict) and "class_name" in obj: class_name = obj["class_name"] - if class_name == "range": - return range(obj["start"], obj["stop"], obj["step"]) - elif class_name == "tuple": - return tuple(obj["items"]) - elif class_name == "ndarray": + if class_name == "ndarray": if obj["dtype_list"]: items = list(map(tuple, obj["items"])) return np.rec.fromrecords(items, dtype=eval(obj["dtype"])) @@ -353,8 +334,7 @@ def _sync_obj(obj): elif isinstance(obj, list): iterator = iter(enumerate(obj)) else: - print(obj) - raise ValueError("Expected dict or list and received %s." % type(obj)) + raise TypeError("Expected dict or list and received %s." % type(obj)) for key, val in iterator: if isinstance(val, (dict, list)): diff --git a/tests/test_rf.py b/tests/test_rf_classifier.py similarity index 100% rename from tests/test_rf.py rename to tests/test_rf_classifier.py diff --git a/tests/test_rf_regressor.py b/tests/test_rf_regressor.py index 2d82dbeb..36da50f7 100644 --- a/tests/test_rf_regressor.py +++ b/tests/test_rf_regressor.py @@ -26,8 +26,8 @@ def test_make_regression(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestRegressor(random_state=0) @@ -35,7 +35,7 @@ def test_make_regression(self): accuracy1 = compss_wait_on(rf.score(x_test, y_test)) y_pred = rf.predict(x_test).collect() - y_true = y[len(y) // 2 :] + y_true = y[len(y) // 2:] accuracy2 = _determination_coefficient(y_true, y_pred) self.assertGreater(accuracy1, 0.85) @@ -53,8 +53,8 @@ def test_make_regression_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestRegressor(distr_depth=2, random_state=0) @@ -62,7 +62,7 @@ def test_make_regression_predict_and_distr_depth(self): accuracy1 = compss_wait_on(rf.score(x_test, y_test)) y_pred = rf.predict(x_test).collect() - y_true = y[len(y) // 2 :] + y_true = y[len(y) // 2:] accuracy2 = _determination_coefficient(y_true, y_pred) self.assertGreater(accuracy1, 0.85) @@ -80,8 +80,8 @@ def test_make_regression_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 
10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestRegressor(random_state=0, sklearn_max=10) @@ -89,7 +89,7 @@ def test_make_regression_sklearn_max_predict(self): accuracy1 = compss_wait_on(rf.score(x_test, y_test)) y_pred = rf.predict(x_test).collect() - y_true = y[len(y) // 2 :] + y_true = y[len(y) // 2:] accuracy2 = _determination_coefficient(y_true, y_pred) self.assertGreater(accuracy1, 0.85) diff --git a/tests/test_saving.py b/tests/test_saving.py index d1a8bb92..13445f1c 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,26 +1,91 @@ import unittest -from unittest.mock import patch +import sys +import json import numpy as np -import sys +# Workaround to mask cbor2 +if True: + sys.modules["cbor2"] = None from dislib.cluster import KMeans -from dislib.utils import save_model, load_model +from dislib.cluster import DBSCAN +from dislib.classification import RandomForestClassifier +from dislib.data import array +from dislib.utils.saving import save_model, load_model, _sync_obj + +from sklearn.datasets import make_classification +from pycompss.api.api import compss_wait_on class SavingTest(unittest.TestCase): - filepath = "tests/files/saving/kmeans.json" + filepath = "tests/files/saving/model.json" def test_errors(self): """Test that errors are raised""" - km = KMeans(n_clusters=2, verbose=False) - - with patch(sys.modules["cbor"]) as mock_cbor: - mock_cbor.return_value = None - self.assertRaises( - ModuleNotFoundError, - save_model(km, self.filepath, save_format="json"), - ) + + # Models + km = KMeans(n_clusters=2) + km2 = KMeans(n_clusters=10) + dbscan = DBSCAN() + rf = RandomForestClassifier() + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_clusters_per_class=2, + ) + x_train = array(x[: len(x) // 2], (300, 10)) + y_train = array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + rf.fit(x_train, y_train) + + # Import error + with self.assertRaises(ModuleNotFoundError): + save_model(km, self.filepath, save_format="cbor") + with self.assertRaises(ModuleNotFoundError): + load_model(self.filepath, load_format="cbor") + + # Saving model not implemented + with self.assertRaises(NotImplementedError): + save_model(dbscan, self.filepath) + + # Wrong save format + with self.assertRaises(ValueError): + save_model(km, self.filepath, save_format="xxxx") + + # Overwrite + save_model(km, self.filepath, save_format="json") + with open(self.filepath, "r") as f: + json_str = f.read() + save_model(km2, self.filepath, overwrite=False, save_format="json") + with open(self.filepath, "r") as f: + json_str2 = f.read() + self.assertEqual(json_str, json_str2) + + # Wrong load format + with self.assertRaises(ValueError): + load_model(self.filepath, load_format="xxxx") + + # Load model not implemented + model_data = {"model_name": "dbscan"} + with open(self.filepath, "w") as f: + json.dump(model_data, f) + with self.assertRaises(NotImplementedError): + load_model(self.filepath, load_format="json") + + # Not JSON serializable + setattr(km, "n_clusters", dbscan) + with self.assertRaises(TypeError): + save_model(km, self.filepath, save_format="json") + + # Not dict or list + with self.assertRaises(TypeError): + _sync_obj(km) + + # Future not synchronized + compss_wait_on(rf.trees[0].try_features) + with 
self.assertRaises(TypeError): + save_model(rf, self.filepath, save_format="json") def main(): diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py index c8efd336..3d10c1c6 100644 --- a/tests/test_saving_cbor.py +++ b/tests/test_saving_cbor.py @@ -1,13 +1,10 @@ import unittest import numpy as np -from numpy.random.mtrand import RandomState from scipy.sparse import csr_matrix -from sklearn import datasets -from sklearn.cluster import KMeans as SKMeans from sklearn.metrics import r2_score from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs, load_iris +from sklearn.datasets import make_blobs import dislib as ds from dislib.cluster import KMeans @@ -410,8 +407,8 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) @@ -438,8 +435,8 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) @@ -468,8 +465,8 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -498,8 +495,8 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -532,8 +529,8 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True @@ -565,8 +562,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -607,7 +604,7 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] + X_test, y_test = X[n_samples // 2:], 
y[n_samples // 2:] lasso = Lasso(lmbd=0.1, max_iter=50) diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index 1488d83c..0c694cf8 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -1,13 +1,10 @@ import unittest import numpy as np -from numpy.random.mtrand import RandomState from scipy.sparse import csr_matrix -from sklearn import datasets -from sklearn.cluster import KMeans as SKMeans from sklearn.metrics import r2_score from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs, load_iris +from sklearn.datasets import make_blobs import dislib as ds from dislib.cluster import KMeans @@ -410,8 +407,8 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) @@ -438,8 +435,8 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) @@ -468,8 +465,8 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -498,8 +495,8 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -532,8 +529,8 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True @@ -565,8 +562,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -607,7 +604,7 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] + X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] lasso = Lasso(lmbd=0.1, 
max_iter=50)

From 676530b11063fa5580460c65be5e4be999e5f629 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Tue, 27 Jul 2021 18:10:30 +0200
Subject: [PATCH 17/46] Modified test_saving.py to raise ModuleNotFoundError

---
 tests/test_saving.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/tests/test_saving.py b/tests/test_saving.py
index 13445f1c..0ca7deda 100644
--- a/tests/test_saving.py
+++ b/tests/test_saving.py
@@ -1,17 +1,12 @@
 import unittest
-import sys
 import json
-
 import numpy as np
 
-# Workaround to mask cbor2
-if True:
-    sys.modules["cbor2"] = None
 from dislib.cluster import KMeans
 from dislib.cluster import DBSCAN
 from dislib.classification import RandomForestClassifier
 from dislib.data import array
-from dislib.utils.saving import save_model, load_model, _sync_obj
+import dislib.utils.saving as saving
 
 from sklearn.datasets import make_classification
 from pycompss.api.api import compss_wait_on
@@ -22,7 +17,8 @@ class SavingTest(unittest.TestCase):
 
     def test_errors(self):
         """Test that errors are raised"""
-
+        cbor2_module = saving.cbor2
+        saving.cbor2 = None
         # Models
         km = KMeans(n_clusters=2)
         km2 = KMeans(n_clusters=10)
@@ -41,51 +37,55 @@ def test_errors(self):
 
         # Import error
         with self.assertRaises(ModuleNotFoundError):
-            save_model(km, self.filepath, save_format="cbor")
+            saving.save_model(km, self.filepath, save_format="cbor")
         with self.assertRaises(ModuleNotFoundError):
-            load_model(self.filepath, load_format="cbor")
+            saving.load_model(self.filepath, load_format="cbor")
 
         # Saving model not implemented
         with self.assertRaises(NotImplementedError):
-            save_model(dbscan, self.filepath)
+            saving.save_model(dbscan, self.filepath)
 
         # Wrong save format
         with self.assertRaises(ValueError):
-            save_model(km, self.filepath, save_format="xxxx")
+            saving.save_model(km, self.filepath, save_format="xxxx")
 
         # Overwrite
-        save_model(km, self.filepath, save_format="json")
+        saving.save_model(km, self.filepath, save_format="json")
         with open(self.filepath, "r") as f:
             json_str = f.read()
-        save_model(km2, self.filepath, overwrite=False, save_format="json")
+        saving.save_model(
+            km2, self.filepath, overwrite=False, save_format="json"
+        )
         with open(self.filepath, "r") as f:
             json_str2 = f.read()
         self.assertEqual(json_str, json_str2)
 
         # Wrong load format
         with self.assertRaises(ValueError):
-            load_model(self.filepath, load_format="xxxx")
+            saving.load_model(self.filepath, load_format="xxxx")
 
         # Load model not implemented
         model_data = {"model_name": "dbscan"}
         with open(self.filepath, "w") as f:
             json.dump(model_data, f)
         with self.assertRaises(NotImplementedError):
-            load_model(self.filepath, load_format="json")
+            saving.load_model(self.filepath, load_format="json")
 
         # Not JSON serializable
         setattr(km, "n_clusters", dbscan)
         with self.assertRaises(TypeError):
-            save_model(km, self.filepath, save_format="json")
+            saving.save_model(km, self.filepath, save_format="json")
 
         # Not dict or list
         with self.assertRaises(TypeError):
-            _sync_obj(km)
+            saving._sync_obj(km)
 
         # Future not synchronized
         compss_wait_on(rf.trees[0].try_features)
         with self.assertRaises(TypeError):
-            save_model(rf, self.filepath, save_format="json")
+            saving.save_model(rf, self.filepath, save_format="json")
+
+        saving.cbor2 = cbor2_module
 
 
 def main():
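The patch above swaps the earlier sys.modules hack for patching the attribute that saving.py actually consults: saving.py binds cbor2 = None itself when the import fails, so assigning None to saving.cbor2 reproduces a missing cbor2 without touching the import machinery. A minimal sketch of the pattern, assuming a dislib installation with these patches applied; the /tmp path, the KMeans instance, and the try/except/finally hardening are illustrative additions, not part of the patch:

    import dislib.utils.saving as saving
    from dislib.cluster import KMeans

    km = KMeans(n_clusters=2)
    cbor2_module = saving.cbor2  # keep a reference for restoring later
    saving.cbor2 = None          # simulate an environment without cbor2
    try:
        # With cbor2 masked, requesting the CBOR format raises:
        saving.save_model(km, "/tmp/model.cbor", save_format="cbor")
    except ModuleNotFoundError:
        pass  # expected: the cbor2 module is unavailable
    finally:
        saving.cbor2 = cbor2_module  # always restore the patched attribute

From 3476dbe80e7b60536e646abd88b19b5572720b17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Wed, 28 Jul 2021 17:44:19 +0200
Subject: [PATCH 18/46] Reduced saving tests and added tests for 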
RFRegr --- dislib/commons/rf/decision_tree.py | 18 +- dislib/utils/saving.py | 25 +- tests/test_saving.py | 56 +-- tests/test_saving_cbor.py | 688 ++++------------------------- tests/test_saving_json.py | 688 ++++------------------------- 5 files changed, 188 insertions(+), 1287 deletions(-) diff --git a/dislib/commons/rf/decision_tree.py b/dislib/commons/rf/decision_tree.py index 5a501240..751983d4 100644 --- a/dislib/commons/rf/decision_tree.py +++ b/dislib/commons/rf/decision_tree.py @@ -123,11 +123,10 @@ def predict(self, x_row): Returns ------- predicted : ndarray - An array with the predicted classes for the given samples. The - values are codes of the fitted + An array with the predicted classes or values for the given + samples. For classification, the values are codes of the fitted dislib.classification.rf.data.RfDataset. The returned object can be a pycompss.runtime.Future object. - """ assert self.tree is not None, "The decision tree is not fitted." @@ -301,10 +300,7 @@ class DecisionTreeRegressor(BaseDecisionTree): fit(dataset) Fits the DecisionTreeRegressor. predict(x_row) - Predicts classes for the given samples using a fitted tree. - predict_proba(x_row) - Predicts class probabilities for the given smaples using a fitted tree. - + Predicts target values for the given samples using a fitted tree. """ def __init__( @@ -407,14 +403,6 @@ def _get_sample_attributes(samples_file, indices): return x -def _get_feature_mmap(features_file, i): - return _get_features_mmap(features_file)[i] - - -def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode="r", allow_pickle=False) - - @task(priority=True, returns=2) def _sample_selection(n_samples, y_targets, bootstrap, seed): if bootstrap: diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index 53de386e..02ecfb8a 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -2,11 +2,11 @@ import os import numpy as np -from pycompss.runtime.management.classes import Future from pycompss.api.api import compss_wait_on from sklearn.svm import SVC as SklearnSVC from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor from sklearn.tree._tree import Tree as SklearnTree from scipy.sparse import csr_matrix @@ -38,6 +38,7 @@ "GaussianMixture": "cluster", "CascadeSVM": "classification", "RandomForestClassifier": "classification", + "RandomForestRegressor": "regression", "ALS": "recommendation", "LinearRegression": "regression", "Lasso": "regression", @@ -59,6 +60,7 @@ SKLEARN_CLASSES = { "SVC": SklearnSVC, "DecisionTreeClassifier": SklearnDTClassifier, + "DecisionTreeRegressor": SklearnDTRegressor, } @@ -112,7 +114,7 @@ def save_model(model, filepath, overwrite=True, save_format="json"): ) # Synchronize model - if model_name == "RandomForestClassifier": + if model_name in ("RandomForestClassifier", "RandomForestRegressor"): _sync_rf(model) _sync_obj(model.__dict__) @@ -293,7 +295,9 @@ def _decode_helper(obj): and "dislib" in obj["module_name"] ): dict_ = _decode_helper(obj["items"]) - if class_name == "DecisionTreeClassifier": + if class_name in ( + "DecisionTreeClassifier", "DecisionTreeRegressor" + ): model = DISLIB_CLASSES[obj["class_name"]]( try_features=dict_.pop("try_features"), max_depth=dict_.pop("max_depth"), @@ -341,10 +345,6 @@ def _sync_obj(obj): _sync_obj(obj[key]) else: obj[key] = compss_wait_on(val) - if isinstance(obj[key], Future): - raise TypeError( - "Could not synchronize Future (%s, %s)." 
% (key, val) - ) if isinstance(getattr(obj[key], "__dict__", None), dict): _sync_obj(obj[key].__dict__) @@ -353,9 +353,8 @@ def _sync_rf(rf): """Sync the `try_features` and `n_classes` attribute of the different trees since they cannot be synced recursively. """ - if isinstance(rf.trees[0].try_features, Future): - try_features = compss_wait_on(rf.trees[0].try_features) - n_classes = compss_wait_on(rf.trees[0].n_classes) - for tree in rf.trees: - tree.try_features = try_features - tree.n_classes = n_classes + try_features = compss_wait_on(rf.trees[0].try_features) + n_classes = compss_wait_on(rf.trees[0].n_classes) + for tree in rf.trees: + tree.try_features = try_features + tree.n_classes = n_classes diff --git a/tests/test_saving.py b/tests/test_saving.py index 0ca7deda..7545e9ad 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,92 +1,70 @@ import unittest import json -import numpy as np from dislib.cluster import KMeans from dislib.cluster import DBSCAN -from dislib.classification import RandomForestClassifier -from dislib.data import array import dislib.utils.saving as saving -from sklearn.datasets import make_classification -from pycompss.api.api import compss_wait_on - class SavingTest(unittest.TestCase): - filepath = "tests/files/saving/model.json" def test_errors(self): """Test that errors are raised""" - cbor2_module = saving.cbor2 - saving.cbor2 = None + filepath = "tests/files/saving/model.json" + # Models km = KMeans(n_clusters=2) km2 = KMeans(n_clusters=10) dbscan = DBSCAN() - rf = RandomForestClassifier() - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_clusters_per_class=2, - ) - x_train = array(x[: len(x) // 2], (300, 10)) - y_train = array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - rf.fit(x_train, y_train) # Import error + cbor2_module = saving.cbor2 + saving.cbor2 = None with self.assertRaises(ModuleNotFoundError): - saving.save_model(km, self.filepath, save_format="cbor") + saving.save_model(km, filepath, save_format="cbor") with self.assertRaises(ModuleNotFoundError): - saving.load_model(self.filepath, load_format="cbor") + saving.load_model(filepath, load_format="cbor") + saving.cbor2 = cbor2_module # Saving model not implemented with self.assertRaises(NotImplementedError): - saving.save_model(dbscan, self.filepath) + saving.save_model(dbscan, filepath) # Wrong save format with self.assertRaises(ValueError): - saving.save_model(km, self.filepath, save_format="xxxx") + saving.save_model(km, filepath, save_format="xxxx") # Overwrite - saving.save_model(km, self.filepath, save_format="json") - with open(self.filepath, "r") as f: + saving.save_model(km, filepath, save_format="json") + with open(filepath, "r") as f: json_str = f.read() saving.save_model( - km2, self.filepath, overwrite=False, save_format="json" + km2, filepath, overwrite=False, save_format="json" ) - with open(self.filepath, "r") as f: + with open(filepath, "r") as f: json_str2 = f.read() self.assertEqual(json_str, json_str2) # Wrong load format with self.assertRaises(ValueError): - saving.load_model(self.filepath, load_format="xxxx") + saving.load_model(filepath, load_format="xxxx") # Load model not implemented model_data = {"model_name": "dbscan"} - with open(self.filepath, "w") as f: + with open(filepath, "w") as f: json.dump(model_data, f) with self.assertRaises(NotImplementedError): - saving.load_model(self.filepath, load_format="json") + saving.load_model(filepath, load_format="json") # Not JSON serializable setattr(km, 
"n_clusters", dbscan) with self.assertRaises(TypeError): - saving.save_model(km, self.filepath, save_format="json") + saving.save_model(km, filepath, save_format="json") # Not dict or list with self.assertRaises(TypeError): saving._sync_obj(km) - # Future not synchronized - compss_wait_on(rf.trees[0].try_features) - with self.assertRaises(TypeError): - saving.save_model(rf, self.filepath, save_format="json") - - saving.cbor2 = cbor2_module - def main(): unittest.main() diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py index 3d10c1c6..a5c2f23c 100644 --- a/tests/test_saving_cbor.py +++ b/tests/test_saving_cbor.py @@ -3,14 +3,14 @@ import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score -from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs +from sklearn.datasets import make_classification, make_regression import dislib as ds from dislib.cluster import KMeans from dislib.cluster import GaussianMixture from dislib.classification import CascadeSVM from dislib.classification import RandomForestClassifier +from dislib.regression import RandomForestRegressor from dislib.regression import Lasso from dislib.regression import LinearRegression from dislib.recommendation import ALS @@ -19,56 +19,11 @@ from pycompss.api.api import compss_wait_on -class KMeansSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/kmeans.cbor" +class CBORSavingTest(unittest.TestCase): - def test_fit_kmeans(self): - """Tests that the fit method returns the expected centers using toy - data. - """ - arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) - x = ds.array(arr, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666, verbose=False) - km.fit(x) - - expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue((km.centers == expected_centers).all()) - self.assertTrue((km2.centers == expected_centers).all()) - - def test_predict_kmeans(self): - """Tests that labels are correctly predicted using toy data.""" - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - arr1 = np.array([p1, p2, p3, p4]) - x = ds.array(arr1, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666) - km.fit(x) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - p5, p6 = [10, 10], [-10, -10] - - arr2 = np.array([p1, p2, p3, p4, p5, p6]) - x_test = ds.array(arr2, block_size=(2, 2)) - - labels = km.predict(x_test).collect() - labels2 = km2.predict(x_test).collect() - expected_labels = np.array([0, 0, 1, 1, 0, 1]) - - self.assertTrue(np.array_equal(labels, expected_labels)) - self.assertTrue(np.array_equal(labels2, expected_labels)) - - def test_sparse_kmeans(self): - """Tests K-means produces the same results using dense and sparse - data structures.""" + def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/kmeans.cbor" x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -76,8 +31,8 @@ def test_sparse_kmeans(self): kmeans = KMeans(random_state=170) kmeans.fit(x_sp) - save_model(kmeans, self.filepath, save_format="cbor") - kmeans2 = load_model(self.filepath, load_format="cbor") + save_model(kmeans, filepath, save_format="cbor") + kmeans2 = load_model(filepath, load_format="cbor") y_sparse = kmeans.predict(x_sp).collect() y_sparse2 = 
kmeans2.predict(x_sp).collect() @@ -95,120 +50,9 @@ def test_sparse_kmeans(self): self.assertTrue(np.array_equal(y_sparse, y_dense)) self.assertTrue(np.array_equal(y_sparse2, y_dense)) - def test_init_kmeans(self): - # With dense data - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - x_train = ds.array(x_filtered, block_size=(300, 2)) - - init = np.random.random((5, 2)) - km = KMeans(n_clusters=5, init=init) - km.fit(x_train) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.array_equal(km.init, init)) - self.assertTrue(np.array_equal(km2.init, init)) - self.assertFalse(np.array_equal(km.centers, init)) - self.assertFalse(np.array_equal(km2.centers, init)) - - # With sparse data - x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) - init = csr_matrix(np.random.random((5, 2))) - - km = KMeans(n_clusters=5, init=init) - km.fit(x_sp) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) - self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) - - -class GaussianMixtureSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/gm.cbor" - - def test_fit(self): - """Tests GaussianMixture.fit()""" - - x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) - ds_x = ds.array(x, block_size=(3, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x) - - expected_weights = np.array([0.5, 0.5]) - expected_means = np.array([[-2, -2], [2, 2]]) - expected_cov = np.array( - [ - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - ] - ) - expected_pc = np.array( - [ - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - ] - ) - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - - gm.weights_ = compss_wait_on(gm.weights_) - gm.means_ = compss_wait_on(gm.means_) - gm.covariances_ = compss_wait_on(gm.covariances_) - gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) - - gm2.weights_ = compss_wait_on(gm2.weights_) - gm2.means_ = compss_wait_on(gm2.means_) - gm2.covariances_ = compss_wait_on(gm2.covariances_) - gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) - - self.assertTrue((np.allclose(gm.weights_, expected_weights))) - self.assertTrue((np.allclose(gm.means_, expected_means))) - self.assertTrue((np.allclose(gm.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) - - self.assertTrue((np.allclose(gm2.weights_, expected_weights))) - self.assertTrue((np.allclose(gm2.means_, expected_means))) - self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) - - def test_predict(self): - """Tests GaussianMixture.predict()""" - x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) - ds_x_train = ds.array(x_train, block_size=(2, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x_train) - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, 
load_format="cbor") - - x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) - ds_x_test = ds.array(x_test, block_size=(2, 2)) - pred = gm.predict(ds_x_test).collect() - pred2 = gm2.predict(ds_x_test).collect() - - self.assertTrue(pred[0] != pred[1]) - self.assertTrue(pred[0] == pred[2] == pred[4]) - self.assertTrue(pred[1] == pred[3] == pred[5]) - - self.assertTrue(pred2[0] != pred2[1]) - self.assertTrue(pred2[0] == pred2[2] == pred2[4]) - self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - - def test_sparse(self): - """Tests GaussianMixture produces the same results using dense and - sparse data structures""" + def test_saving_gm(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/gm.cbor" x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -220,8 +64,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_sparse) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") + save_model(gm, filepath, save_format="cbor") + gm2 = load_model(filepath, load_format="cbor") labels_sparse = gm.predict(x_sparse).collect() labels_sparse2 = gm2.predict(x_sparse).collect() @@ -229,8 +73,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_dense) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") + save_model(gm, filepath, save_format="cbor") + gm2 = load_model(filepath, load_format="cbor") labels_dense = gm.predict(x_dense).collect() labels_dense2 = gm2.predict(x_dense).collect() @@ -238,137 +82,23 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - -class CSVMSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/csvm.cbor" - - def test_fit_private_params(self): - kernel = "rbf" - c = 2 - gamma = 0.1 - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm._clf_params["gamma"], gamma) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["gamma"], gamma) - - kernel, c = "linear", 0.3 - csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - - # # check for exception when incorrect kernel is passed - # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - - def test_predict(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="linear", - c=2, - 
gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - # p5 should belong to class 0, p6 to class 1 - p5, p6 = np.array([1, 1]), np.array([-1, -1]) - - x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) - - y_pred = csvm.predict(x_test) - y_pred2 = csvm2.predict(x_test) - - l1, l2, l3, l4, l5, l6 = y_pred.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - l1, l2, l3, l4, l5, l6 = y_pred2.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - def test_score(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - # points are separable, scoring the training dataset should have 100% - # accuracy - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) - - accuracy = compss_wait_on(csvm.score(x_test, y_test)) - accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) - - self.assertEqual(accuracy, 1.0) - self.assertEqual(accuracy2, 1.0) - - def test_sparse(self): - """Tests that C-SVM produces the same results with sparse and dense - data""" + def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" + filepath = "tests/files/saving/csvm.cbor" x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) csvm_sp = CascadeSVM(random_state=seed) csvm_sp.fit(x_sp, y_sp) - save_model(csvm_sp, self.filepath, save_format="cbor") - csvm_sp2 = load_model(self.filepath, load_format="cbor") + save_model(csvm_sp, filepath, save_format="cbor") + csvm_sp2 = load_model(filepath, load_format="cbor") csvm_d = CascadeSVM(random_state=seed) csvm_d.fit(x_d, y_d) - save_model(csvm_d, self.filepath, save_format="cbor") - csvm_d2 = load_model(self.filepath, load_format="cbor") + save_model(csvm_d, filepath, save_format="cbor") + csvm_d2 = load_model(filepath, load_format="cbor") sv_d = csvm_d._clf.support_vectors_ sv_sp = csvm_sp._clf.support_vectors_.toarray() @@ -388,100 +118,8 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - -class RFSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/rf.cbor" - - def test_make_classification_score(self): - """Tests RandomForestClassifier fit and score with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 
= load_model(self.filepath, load_format="cbor") - - accuracy = compss_wait_on(rf.score(x_test, y_test)) - accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_predict_and_distr_depth(self): - """Tests RandomForestClassifier fit and predict with a distr_depth.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(distr_depth=2, random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict(self): - """Tests RandomForestClassifier predict with sklearn_max.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(random_state=0, sklearn_max=10) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict_proba(self): - """Tests RandomForestClassifier predict_proba with sklearn_max.""" + def test_saving_rf_class(self): + filepath = "tests/files/saving/rf_class.cbor" x, y = make_classification( n_samples=3000, n_features=10, @@ -500,8 +138,8 @@ def test_make_classification_sklearn_max_predict_proba(self): rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") + save_model(rf, filepath, save_format="cbor") + rf2 = load_model(filepath, load_format="cbor") probabilities = rf.predict_proba(x_test).collect() probabilities2 = rf2.predict_proba(x_test).collect() @@ -514,49 +152,18 @@ def test_make_classification_sklearn_max_predict_proba(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_hard_vote_predict(self): - """Tests RandomForestClassifier predict with hard_vote.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - 
x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + def test_saving_rf_regr(self): + filepath = "tests/files/saving/rf_regr.cbor" - rf = RandomForestClassifier( - random_state=0, sklearn_max=10, hard_vote=True - ) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) + def determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v - def test_make_classification_hard_vote_score_mix(self): - """Tests RandomForestClassifier score with hard_vote, sklearn_max, - distr_depth and max_depth.""" - x, y = make_classification( + x, y = make_regression( n_samples=3000, n_features=10, - n_classes=3, n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, shuffle=True, random_state=0, ) @@ -565,29 +172,30 @@ def test_make_classification_hard_vote_score_mix(self): x_test = ds.array(x[len(x) // 2:], (300, 10)) y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - rf = RandomForestClassifier( - random_state=0, - sklearn_max=100, - distr_depth=2, - max_depth=12, - hard_vote=True, - ) + rf = RandomForestRegressor(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") + save_model(rf, filepath, save_format="cbor") + rf2 = load_model(filepath, load_format="cbor") - accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - -class LassoSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/lasso.cbor" - - def test_fit_predict(self): - """Tests fit and predicts methods""" - + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2:] + y_pred2 = rf2.predict(x_test).collect() + y_true2 = y[len(y) // 2:] + coef1 = determination_coefficient(y_true, y_pred) + coef2 = determination_coefficient(y_true2, y_pred2) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertGreater(coef1, 0.85) + self.assertGreater(coef2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + self.assertAlmostEqual(coef1, coef2) + + def test_saving_lasso(self): + filepath = "tests/files/saving/lasso.cbor" np.random.seed(42) n_samples, n_features = 50, 100 @@ -609,8 +217,8 @@ def test_fit_predict(self): lasso = Lasso(lmbd=0.1, max_iter=50) lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) - save_model(lasso, self.filepath, save_format="cbor") - lasso2 = load_model(self.filepath, load_format="cbor") + save_model(lasso, filepath, save_format="cbor") + lasso2 = load_model(filepath, load_format="cbor") y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) @@ -620,84 +228,9 @@ def test_fit_predict(self): self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124) self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) + def test_saving_linear(self): + filepath = "tests/files/saving/linear_regression.cbor" -class 
LinearRegressionSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/linear_regression.cbor" - - def test_univariate(self): - """Tests fit() and predict(), univariate.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.1)) - self.assertTrue(np.allclose(pred2, 2.1)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9])) - self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9])) - - def test_univariate_no_intercept(self): - """Tests fit() and predict(), univariate, fit_intercept=False.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.04545455)) - self.assertTrue(np.allclose(pred2, 2.04545455)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091])) - self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 4.0909091])) - - def test_multivariate(self): - """Tests fit() and predict(), multivariate.""" x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -708,8 +241,8 @@ def test_multivariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") + save_model(reg, filepath, save_format="cbor") + reg2 = load_model(filepath, load_format="cbor") self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) self.assertTrue( @@ -732,46 +265,35 @@ def test_multivariate(self): pred = reg.predict(test_data).collect() self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) - def test_multivariate_no_intercept(self): - """Tests fit() and predict(), multivariate, fit_intercept=False.""" - x_data = 
np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) - y_data = np.array([2, 1, 1, 2, 4.5]) + def test_saving_als(self): + filepath = "tests/files/saving/als.cbor" - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, 1)) + data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) + ratings = csr_matrix(data) + train = ds.array(x=ratings, block_size=(1, 1)) + als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) + als.fit(train) + save_model(als, filepath, save_format="cbor") + als2 = load_model(filepath, load_format="cbor") - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") + predictions = als.predict_user(user_id=0) + predictions2 = als2.predict_user(user_id=0) + # Check that the ratings for user 0 are similar to user 1 because they + # share preferences (third movie), thus it is expected that user 0 + # will rate movie 1 similarly to user 1. self.assertTrue( - np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions[0] < 3.25 + and predictions[1] < 1 + and predictions[2] > 4.5 ) self.assertTrue( - np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions2[0] < 3.25 + and predictions2[1] < 1 + and predictions2[2] > 4.5 ) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3, 2]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718])) - self.assertTrue(np.allclose(pred2, [2.05649718])) - - # Predict multiple samples - x_test = np.array([[3, 2], [4, 4], [1, 3]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678])) self.assertTrue( - np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) + np.array_equal(predictions, predictions2, equal_nan=True) ) @@ -809,60 +331,6 @@ def load_movielens(train_ratio=0.9): return train_arr, test_arr -class ALSSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/als.cbor" - - def test_fit(self): - train, test = load_movielens() - - als = ALS( - tol=0.01, - random_state=666, - n_f=100, - verbose=False, - check_convergence=True, - ) - - als.fit(train, test) - self.assertTrue(als.converged) - - als.fit(train) - save_model(als, self.filepath, save_format="cbor") - als2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(als.converged) - self.assertTrue(als2.converged) - - def test_predict(self): - data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) - ratings = csr_matrix(data) - train = ds.array(x=ratings, block_size=(1, 1)) - als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) - als.fit(train) - save_model(als, self.filepath, save_format="cbor") - als2 = load_model(self.filepath, load_format="cbor") - - predictions = als.predict_user(user_id=0) - predictions2 = als2.predict_user(user_id=0) - - # Check that the ratings for user 0 are similar to user 1 because they - # share preferences (third movie), thus it is expected that user 0 - # will rate movie 1 similarly to user 1. 
- self.assertTrue( - 2.75 < predictions[0] < 3.25 - and predictions[1] < 1 - and predictions[2] > 4.5 - ) - self.assertTrue( - 2.75 < predictions2[0] < 3.25 - and predictions2[1] < 1 - and predictions2[2] > 4.5 - ) - self.assertTrue( - np.array_equal(predictions, predictions2, equal_nan=True) - ) - - def main(): unittest.main() diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index 0c694cf8..0a19429f 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -3,14 +3,14 @@ import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score -from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs +from sklearn.datasets import make_classification, make_regression import dislib as ds from dislib.cluster import KMeans from dislib.cluster import GaussianMixture from dislib.classification import CascadeSVM from dislib.classification import RandomForestClassifier +from dislib.regression import RandomForestRegressor from dislib.regression import Lasso from dislib.regression import LinearRegression from dislib.recommendation import ALS @@ -19,56 +19,11 @@ from pycompss.api.api import compss_wait_on -class KMeansSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/kmeans.json" +class JSONSavingTest(unittest.TestCase): - def test_fit_kmeans(self): - """Tests that the fit method returns the expected centers using toy - data. - """ - arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) - x = ds.array(arr, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666, verbose=False) - km.fit(x) - - expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - self.assertTrue((km.centers == expected_centers).all()) - self.assertTrue((km2.centers == expected_centers).all()) - - def test_predict_kmeans(self): - """Tests that labels are correctly predicted using toy data.""" - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - arr1 = np.array([p1, p2, p3, p4]) - x = ds.array(arr1, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666) - km.fit(x) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - p5, p6 = [10, 10], [-10, -10] - - arr2 = np.array([p1, p2, p3, p4, p5, p6]) - x_test = ds.array(arr2, block_size=(2, 2)) - - labels = km.predict(x_test).collect() - labels2 = km2.predict(x_test).collect() - expected_labels = np.array([0, 0, 1, 1, 0, 1]) - - self.assertTrue(np.array_equal(labels, expected_labels)) - self.assertTrue(np.array_equal(labels2, expected_labels)) - - def test_sparse_kmeans(self): - """Tests K-means produces the same results using dense and sparse - data structures.""" + def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/kmeans.json" x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -76,8 +31,8 @@ def test_sparse_kmeans(self): kmeans = KMeans(random_state=170) kmeans.fit(x_sp) - save_model(kmeans, self.filepath, save_format="json") - kmeans2 = load_model(self.filepath, load_format="json") + save_model(kmeans, filepath, save_format="json") + kmeans2 = load_model(filepath, load_format="json") y_sparse = kmeans.predict(x_sp).collect() y_sparse2 = kmeans2.predict(x_sp).collect() @@ -95,120 +50,9 @@ def test_sparse_kmeans(self): self.assertTrue(np.array_equal(y_sparse, y_dense)) 
self.assertTrue(np.array_equal(y_sparse2, y_dense)) - def test_init_kmeans(self): - # With dense data - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - x_train = ds.array(x_filtered, block_size=(300, 2)) - - init = np.random.random((5, 2)) - km = KMeans(n_clusters=5, init=init) - km.fit(x_train) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.array_equal(km.init, init)) - self.assertTrue(np.array_equal(km2.init, init)) - self.assertFalse(np.array_equal(km.centers, init)) - self.assertFalse(np.array_equal(km2.centers, init)) - - # With sparse data - x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) - init = csr_matrix(np.random.random((5, 2))) - - km = KMeans(n_clusters=5, init=init) - km.fit(x_sp) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) - self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) - - -class GaussianMixtureSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/gm.json" - - def test_fit(self): - """Tests GaussianMixture.fit()""" - - x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) - ds_x = ds.array(x, block_size=(3, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x) - - expected_weights = np.array([0.5, 0.5]) - expected_means = np.array([[-2, -2], [2, 2]]) - expected_cov = np.array( - [ - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - ] - ) - expected_pc = np.array( - [ - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - ] - ) - - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") - - gm.weights_ = compss_wait_on(gm.weights_) - gm.means_ = compss_wait_on(gm.means_) - gm.covariances_ = compss_wait_on(gm.covariances_) - gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) - - gm2.weights_ = compss_wait_on(gm2.weights_) - gm2.means_ = compss_wait_on(gm2.means_) - gm2.covariances_ = compss_wait_on(gm2.covariances_) - gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) - - self.assertTrue((np.allclose(gm.weights_, expected_weights))) - self.assertTrue((np.allclose(gm.means_, expected_means))) - self.assertTrue((np.allclose(gm.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) - - self.assertTrue((np.allclose(gm2.weights_, expected_weights))) - self.assertTrue((np.allclose(gm2.means_, expected_means))) - self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) - - def test_predict(self): - """Tests GaussianMixture.predict()""" - x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) - ds_x_train = ds.array(x_train, block_size=(2, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x_train) - - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") - - x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) - ds_x_test = ds.array(x_test, block_size=(2, 2)) - pred = 
gm.predict(ds_x_test).collect() - pred2 = gm2.predict(ds_x_test).collect() - - self.assertTrue(pred[0] != pred[1]) - self.assertTrue(pred[0] == pred[2] == pred[4]) - self.assertTrue(pred[1] == pred[3] == pred[5]) - - self.assertTrue(pred2[0] != pred2[1]) - self.assertTrue(pred2[0] == pred2[2] == pred2[4]) - self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - - def test_sparse(self): - """Tests GaussianMixture produces the same results using dense and - sparse data structures""" + def test_saving_gm(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/gm.json" x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -220,8 +64,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_sparse) - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") + save_model(gm, filepath, save_format="json") + gm2 = load_model(filepath, load_format="json") labels_sparse = gm.predict(x_sparse).collect() labels_sparse2 = gm2.predict(x_sparse).collect() @@ -229,8 +73,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_dense) - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") + save_model(gm, filepath, save_format="json") + gm2 = load_model(filepath, load_format="json") labels_dense = gm.predict(x_dense).collect() labels_dense2 = gm2.predict(x_dense).collect() @@ -238,137 +82,23 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - -class CSVMSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/csvm.json" - - def test_fit_private_params(self): - kernel = "rbf" - c = 2 - gamma = 0.1 - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm._clf_params["gamma"], gamma) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["gamma"], gamma) - - kernel, c = "linear", 0.3 - csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - - # # check for exception when incorrect kernel is passed - # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - - def test_predict(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, 
save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - - # p5 should belong to class 0, p6 to class 1 - p5, p6 = np.array([1, 1]), np.array([-1, -1]) - - x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) - - y_pred = csvm.predict(x_test) - y_pred2 = csvm2.predict(x_test) - - l1, l2, l3, l4, l5, l6 = y_pred.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - l1, l2, l3, l4, l5, l6 = y_pred2.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - def test_score(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - - # points are separable, scoring the training dataset should have 100% - # accuracy - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) - - accuracy = compss_wait_on(csvm.score(x_test, y_test)) - accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) - - self.assertEqual(accuracy, 1.0) - self.assertEqual(accuracy2, 1.0) - - def test_sparse(self): - """Tests that C-SVM produces the same results with sparse and dense - data""" + def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" + filepath = "tests/files/saving/csvm.json" x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) csvm_sp = CascadeSVM(random_state=seed) csvm_sp.fit(x_sp, y_sp) - save_model(csvm_sp, self.filepath, save_format="json") - csvm_sp2 = load_model(self.filepath, load_format="json") + save_model(csvm_sp, filepath, save_format="json") + csvm_sp2 = load_model(filepath, load_format="json") csvm_d = CascadeSVM(random_state=seed) csvm_d.fit(x_d, y_d) - save_model(csvm_d, self.filepath, save_format="json") - csvm_d2 = load_model(self.filepath, load_format="json") + save_model(csvm_d, filepath, save_format="json") + csvm_d2 = load_model(filepath, load_format="json") sv_d = csvm_d._clf.support_vectors_ sv_sp = csvm_sp._clf.support_vectors_.toarray() @@ -388,100 +118,8 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - -class RFSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/rf.json" - - def test_make_classification_score(self): - """Tests RandomForestClassifier fit and score with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - accuracy = compss_wait_on(rf.score(x_test, y_test)) - accuracy2 = 
compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_predict_and_distr_depth(self): - """Tests RandomForestClassifier fit and predict with a distr_depth.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(distr_depth=2, random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict(self): - """Tests RandomForestClassifier predict with sklearn_max.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(random_state=0, sklearn_max=10) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict_proba(self): - """Tests RandomForestClassifier predict_proba with sklearn_max.""" + def test_saving_rf_class(self): + filepath = "tests/files/saving/rf_class.json" x, y = make_classification( n_samples=3000, n_features=10, @@ -500,8 +138,8 @@ def test_make_classification_sklearn_max_predict_proba(self): rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") + save_model(rf, filepath, save_format="json") + rf2 = load_model(filepath, load_format="json") probabilities = rf.predict_proba(x_test).collect() probabilities2 = rf2.predict_proba(x_test).collect() @@ -514,49 +152,18 @@ def test_make_classification_sklearn_max_predict_proba(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_hard_vote_predict(self): - """Tests RandomForestClassifier predict with hard_vote.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + def test_saving_rf_regr(self): + filepath = 
"tests/files/saving/rf_regr.json" - rf = RandomForestClassifier( - random_state=0, sklearn_max=10, hard_vote=True - ) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) + def determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v - def test_make_classification_hard_vote_score_mix(self): - """Tests RandomForestClassifier score with hard_vote, sklearn_max, - distr_depth and max_depth.""" - x, y = make_classification( + x, y = make_regression( n_samples=3000, n_features=10, - n_classes=3, n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, shuffle=True, random_state=0, ) @@ -565,29 +172,30 @@ def test_make_classification_hard_vote_score_mix(self): x_test = ds.array(x[len(x) // 2:], (300, 10)) y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - rf = RandomForestClassifier( - random_state=0, - sklearn_max=100, - distr_depth=2, - max_depth=12, - hard_vote=True, - ) + rf = RandomForestRegressor(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") + save_model(rf, filepath, save_format="json") + rf2 = load_model(filepath, load_format="json") - accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - -class LassoSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/lasso.json" - - def test_fit_predict(self): - """Tests fit and predicts methods""" - + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2:] + y_pred2 = rf2.predict(x_test).collect() + y_true2 = y[len(y) // 2:] + coef1 = determination_coefficient(y_true, y_pred) + coef2 = determination_coefficient(y_true2, y_pred2) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertGreater(coef1, 0.85) + self.assertGreater(coef2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + self.assertAlmostEqual(coef1, coef2) + + def test_saving_lasso(self): + filepath = "tests/files/saving/lasso.json" np.random.seed(42) n_samples, n_features = 50, 100 @@ -609,8 +217,8 @@ def test_fit_predict(self): lasso = Lasso(lmbd=0.1, max_iter=50) lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) - save_model(lasso, self.filepath, save_format="json") - lasso2 = load_model(self.filepath, load_format="json") + save_model(lasso, filepath, save_format="json") + lasso2 = load_model(filepath, load_format="json") y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) @@ -620,84 +228,9 @@ def test_fit_predict(self): self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124) self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) + def test_saving_linear(self): + filepath = "tests/files/saving/linear_regression.json" -class LinearRegressionSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/linear_regression.json" - - def test_univariate(self): - 
"""Tests fit() and predict(), univariate.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.1)) - self.assertTrue(np.allclose(pred2, 2.1)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9])) - self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9])) - - def test_univariate_no_intercept(self): - """Tests fit() and predict(), univariate, fit_intercept=False.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.04545455)) - self.assertTrue(np.allclose(pred2, 2.04545455)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091])) - self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 4.0909091])) - - def test_multivariate(self): - """Tests fit() and predict(), multivariate.""" x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -708,8 +241,8 @@ def test_multivariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") + save_model(reg, filepath, save_format="json") + reg2 = load_model(filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) self.assertTrue( @@ -732,46 +265,35 @@ def test_multivariate(self): pred = reg.predict(test_data).collect() self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) - def test_multivariate_no_intercept(self): - """Tests fit() and predict(), multivariate, fit_intercept=False.""" - x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) - y_data = np.array([2, 1, 1, 2, 4.5]) + def test_saving_als(self): + filepath = 
"tests/files/saving/als.json" - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, 1)) + data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) + ratings = csr_matrix(data) + train = ds.array(x=ratings, block_size=(1, 1)) + als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) + als.fit(train) + save_model(als, filepath, save_format="json") + als2 = load_model(filepath, load_format="json") - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") + predictions = als.predict_user(user_id=0) + predictions2 = als2.predict_user(user_id=0) + # Check that the ratings for user 0 are similar to user 1 because they + # share preferences (third movie), thus it is expected that user 0 + # will rate movie 1 similarly to user 1. self.assertTrue( - np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions[0] < 3.25 + and predictions[1] < 1 + and predictions[2] > 4.5 ) self.assertTrue( - np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions2[0] < 3.25 + and predictions2[1] < 1 + and predictions2[2] > 4.5 ) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3, 2]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718])) - self.assertTrue(np.allclose(pred2, [2.05649718])) - - # Predict multiple samples - x_test = np.array([[3, 2], [4, 4], [1, 3]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678])) self.assertTrue( - np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) + np.array_equal(predictions, predictions2, equal_nan=True) ) @@ -809,60 +331,6 @@ def load_movielens(train_ratio=0.9): return train_arr, test_arr -class ALSSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/als.json" - - def test_fit(self): - train, test = load_movielens() - - als = ALS( - tol=0.01, - random_state=666, - n_f=100, - verbose=False, - check_convergence=True, - ) - - als.fit(train, test) - self.assertTrue(als.converged) - - als.fit(train) - save_model(als, self.filepath, save_format="json") - als2 = load_model(self.filepath, load_format="json") - - self.assertTrue(als.converged) - self.assertTrue(als2.converged) - - def test_predict(self): - data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) - ratings = csr_matrix(data) - train = ds.array(x=ratings, block_size=(1, 1)) - als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) - als.fit(train) - save_model(als, self.filepath, save_format="json") - als2 = load_model(self.filepath, load_format="json") - - predictions = als.predict_user(user_id=0) - predictions2 = als2.predict_user(user_id=0) - - # Check that the ratings for user 0 are similar to user 1 because they - # share preferences (third movie), thus it is expected that user 0 - # will rate movie 1 similarly to user 1. 
- self.assertTrue( - 2.75 < predictions[0] < 3.25 - and predictions[1] < 1 - and predictions[2] > 4.5 - ) - self.assertTrue( - 2.75 < predictions2[0] < 3.25 - and predictions2[1] < 1 - and predictions2[2] > 4.5 - ) - self.assertTrue( - np.array_equal(predictions, predictions2, equal_nan=True) - ) - - def main(): unittest.main() From efcfa1f9fa566fa51768a2c303820a84d5165eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 30 Jul 2021 10:15:16 +0200 Subject: [PATCH 19/46] Added tests for RF dataset --- dislib/commons/rf/data.py | 24 ++-- tests/test_rf_dataset.py | 241 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+), 14 deletions(-) create mode 100644 tests/test_rf_dataset.py diff --git a/dislib/commons/rf/data.py b/dislib/commons/rf/data.py index a762e5b6..af9fb066 100644 --- a/dislib/commons/rf/data.py +++ b/dislib/commons/rf/data.py @@ -46,13 +46,12 @@ def get_n_samples(self): """ if self.n_samples is None: - assert isinstance(self.samples_path, str), ( - "self.n_samples must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) + if not isinstance(self.samples_path, str): + raise TypeError( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_samples @@ -72,13 +71,12 @@ def get_n_features(self): """ if self.n_features is None: - assert isinstance(self.samples_path, str), ( - "self.n_features must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) + if not isinstance(self.samples_path, str): + raise TypeError( + "self.n_features must be set manually if self.samples_path" + " is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_features @@ -95,8 +93,6 @@ class n_samples and n_features or if the array is in fortran order. 
        features_npy_file = _NpyFile(self.features_path)
        shape = features_npy_file.get_shape()
        fortran_order = features_npy_file.get_fortran_order()
-        if len(shape) != 2:
-            raise ValueError("Cannot read 2D array from features_file.")
        if (self.get_n_features(), self.get_n_samples()) != shape:
            raise ValueError("Invalid dimensions for the features_file.")
        if fortran_order:
diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py
new file mode 100644
index 00000000..c70664e1
--- /dev/null
+++ b/tests/test_rf_dataset.py
@@ -0,0 +1,241 @@
+import unittest
+
+import os
+import shutil
+from sklearn.datasets import make_classification
+import dislib as ds
+from dislib.commons.rf import data
+from dislib.commons.rf import test_split
+from dislib.data.array import Array
+import numpy as np
+from sys import float_info
+from pycompss.api.api import compss_wait_on
+
+DIRPATH = "tests/files/saving"
+
+
+class RFDatasetTest(unittest.TestCase):
+    def setUp(self) -> None:
+        os.makedirs(DIRPATH, exist_ok=True)
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        shutil.rmtree(DIRPATH)
+        return super().tearDown()
+
+    def test_rf_dataset(self):
+        # Save samples and features
+        x, y = make_classification(
+            n_samples=900,
+            n_features=10,
+            n_classes=3,
+            n_informative=4,
+            random_state=0,
+        )
+        x_ds_1 = ds.array(x, (300, 10))
+        x_ds_2 = ds.array(x[:600], (300, 10))
+        y_ds_1 = ds.array(y[:, np.newaxis], (300, 1))
+        y_ds_2 = ds.array(y[:600][:, np.newaxis], (300, 1))
+        samples_path_1 = os.path.join(DIRPATH, "feats_1")
+        samples_path_2 = os.path.join(DIRPATH, "feats_2")
+        targets_path_1 = os.path.join(DIRPATH, "targets_1")
+        targets_path_2 = os.path.join(DIRPATH, "targets_2")
+        features_path_f = os.path.join(DIRPATH, "feats_f")
+        save_samples(x_ds_1, samples_path_1, False)
+        save_samples(x_ds_2, samples_path_2, False)
+        save_targets(y_ds_1, targets_path_1)
+        save_targets(y_ds_2, targets_path_2)
+        save_features(x_ds_2, features_path_f, True)
+
+        # Regression and classification datasets
+        rf_regr = data.RfRegressorDataset(samples_path_1, targets_path_1)
+        rf_class = data.RfClassifierDataset(samples_path_1, targets_path_1)
+
+        # Test get number of samples and features
+        self.assertEqual(rf_regr.get_n_samples(), 900)
+        self.assertEqual(rf_class.get_n_samples(), 900)
+        self.assertEqual(rf_regr.get_n_features(), 10)
+        self.assertEqual(rf_class.get_n_features(), 10)
+
+        # Test get y targets
+        y_regr = compss_wait_on(rf_regr.get_y_targets())
+        y_class = compss_wait_on(rf_class.get_y_targets())
+        self.assertTrue(np.all(y_regr == y_ds_1.collect()))
+        self.assertTrue(np.all(y_class == y_ds_1.collect()))
+
+        # Test get number of classes and classes
+        n_class = compss_wait_on(rf_regr.get_n_classes())
+        classes = compss_wait_on(rf_regr.get_classes())
+        self.assertTrue(n_class is None)
+        self.assertTrue(classes is None)
+
+        rf_class.n_classes = None
+        n_class = compss_wait_on(rf_class.get_n_classes())
+        rf_class.y_categories = None
+        classes = compss_wait_on(rf_class.get_classes())
+        self.assertEqual(n_class, 3)
+        self.assertTrue(np.all(classes == [0, 1, 2]))
+
+        # Sample and feature paths must be str
+        rf_dataset = data.RfBaseDataset(None, None)
+        with self.assertRaises(TypeError):
+            rf_dataset.get_n_samples()
+        with self.assertRaises(TypeError):
+            rf_dataset.get_n_features()
+
+        # Task must be classification or regression
+        with self.assertRaises(ValueError):
+            rf_dataset = data.transform_to_rf_dataset(x_ds_1, y_ds_1, "aaa")
+
+        # Validate dimension
+        rf_dataset = data.RfBaseDataset(
+            samples_path_1, targets_path_1,
features_path_f + ) + rf_dataset.samples_path = samples_path_2 + with self.assertRaises(ValueError): + rf_dataset.validate_features_file() + + # Validate Fortran order + rf_dataset = data.RfBaseDataset( + samples_path_1, targets_path_1, features_path_f + ) + with self.assertRaises(ValueError): + rf_dataset.validate_features_file() + + # Dataset creation + rf_regr = data.transform_to_rf_dataset( + x_ds_1, y_ds_1, "regression" + ) + rf_class = data.transform_to_rf_dataset( + x_ds_1, y_ds_1, "classification" + ) + self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900) + self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10) + self.assertEquals(compss_wait_on(rf_class.get_n_samples()), 900) + self.assertEquals(compss_wait_on(rf_class.get_n_features()), 10) + + # Npy files + file = data._NpyFile(features_path_f) + file.shape = None + self.assertEqual(file.get_shape(), (10, 600)) + file.fortran_order = None + self.assertTrue(file.get_fortran_order()) + file.dtype = None + self.assertEqual(file.get_dtype().name, "float32") + + file = data._NpyFile(samples_path_2) + file.shape = None + self.assertEqual(file.get_shape(), (600, 10)) + file.fortran_order = None + self.assertFalse(file.get_fortran_order()) + file.dtype = None + self.assertEqual(file.get_dtype().name, "float32") + + # Test returns for empty size + score, value = test_split.test_split(None, np.array([]), None, None) + self.assertEqual(score, float_info.max) + self.assertEqual(value, np.float64(np.inf)) + + +def _fill_samples_file( + samples_path, row_blocks, start_idx, fortran_order +): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype( + dtype="float32", casting="same_kind" + ) + samples = np.lib.format.open_memmap( + samples_path, mode="r+", fortran_order=fortran_order + ) + samples[start_idx: start_idx + rows_samples.shape[0]] = ( + rows_samples + ) + + +def _fill_features_file( + samples_path, row_blocks, start_idx, fortran_order +): + rows_samples = Array._merge_blocks(row_blocks).T + rows_samples = rows_samples.astype( + dtype="float32", casting="same_kind" + ) + samples = np.lib.format.open_memmap( + samples_path, mode="r+", fortran_order=fortran_order + ) + samples[start_idx: start_idx + rows_samples.shape[1]] = ( + rows_samples + ) + + +def _fill_targets_file(targets_path, row_blocks): + rows_targets = Array._merge_blocks(row_blocks) + with open(targets_path, "at") as f: + np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8") + + +def save_samples(x, samples_path, fortran_order): + n_samples = x.shape[0] + n_features = x.shape[1] + + open(samples_path, 'w').close() + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + fortran_order=fortran_order, + shape=(int(n_samples), int(n_features)), + ) + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_samples_file( + samples_path, top_row._blocks, start_idx, fortran_order + ) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: + _fill_samples_file( + samples_path, x_row._blocks, start_idx, fortran_order + ) + start_idx += x._reg_shape[0] + + +def save_targets(y, targets_path): + open(targets_path, 'w').close() + for y_row in y._iterator(axis=0): + _fill_targets_file(targets_path, y_row._blocks) + + +def save_features(x, features_path, fortran_order): + n_samples = x.shape[0] + n_features = x.shape[1] + + if features_path is not None: + np.lib.format.open_memmap( + features_path, + mode="w+", + dtype="float32", + 
fortran_order=fortran_order, + shape=(int(n_features), int(n_samples)), + ) + start_idx = 0 + col_blocks_iterator = x._iterator(axis=1) + left_col = next(col_blocks_iterator) + _fill_features_file( + features_path, left_col._blocks, + start_idx, fortran_order + ) + start_idx += x._top_left_shape[1] + for x_row in col_blocks_iterator: + _fill_features_file( + features_path, x_row._blocks, + start_idx, fortran_order + ) + start_idx += x._reg_shape[1] + + +def main(): + unittest.main() + + +if __name__ == '__main__': + main() From 95538b610d5e18cbda05ec9ee1a3a5ab1031f888 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 30 Jul 2021 10:17:45 +0200 Subject: [PATCH 20/46] Added setup and teardown for saving tests. --- .gitignore | 3 --- tests/files/saving/saving.txt | 1 - tests/test_saving.py | 15 ++++++++++++--- tests/test_saving_cbor.py | 28 +++++++++++++++++++--------- tests/test_saving_json.py | 28 +++++++++++++++++++--------- 5 files changed, 50 insertions(+), 25 deletions(-) delete mode 100644 tests/files/saving/saving.txt diff --git a/.gitignore b/.gitignore index ad8ef5a4..4b75fb4c 100644 --- a/.gitignore +++ b/.gitignore @@ -112,9 +112,6 @@ target/ *compss*.out *compss*.err -# Saving -tests/files/saving/* -!tests/files/saving/*.txt # ========== C & C++ ignores ================= # Prerequisites diff --git a/tests/files/saving/saving.txt b/tests/files/saving/saving.txt deleted file mode 100644 index d7d8541b..00000000 --- a/tests/files/saving/saving.txt +++ /dev/null @@ -1 +0,0 @@ -Directory where the models generated by the tests regarding saving functionalities should be located. \ No newline at end of file diff --git a/tests/test_saving.py b/tests/test_saving.py index 7545e9ad..523ed5cc 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,16 +1,25 @@ import unittest import json - +import os +import shutil from dislib.cluster import KMeans from dislib.cluster import DBSCAN import dislib.utils.saving as saving +DIRPATH = "tests/files/saving" + class SavingTest(unittest.TestCase): + def setUp(self) -> None: + os.makedirs(DIRPATH, exist_ok=True) + return super().setUp() + + def tearDown(self) -> None: + shutil.rmtree(DIRPATH) + return super().tearDown() def test_errors(self): - """Test that errors are raised""" - filepath = "tests/files/saving/model.json" + filepath = os.path.join(DIRPATH, "model.json") # Models km = KMeans(n_clusters=2) diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py index a5c2f23c..5a0ef438 100644 --- a/tests/test_saving_cbor.py +++ b/tests/test_saving_cbor.py @@ -1,5 +1,6 @@ import unittest - +import os +import shutil import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score @@ -18,12 +19,21 @@ from pycompss.api.api import compss_wait_on +DIRPATH = "tests/files/saving" + class CBORSavingTest(unittest.TestCase): + def setUp(self) -> None: + os.makedirs(DIRPATH, exist_ok=True) + return super().setUp() + + def tearDown(self) -> None: + shutil.rmtree(DIRPATH) + return super().tearDown() def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/kmeans.cbor" + filepath = os.path.join(DIRPATH, "kmeans.cbor") x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -52,7 +62,7 @@ def test_saving_kmeans(self): def test_saving_gm(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/gm.cbor" + filepath = os.path.join(DIRPATH, "gm.cbor") x_sparse, _ = 
ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -85,7 +95,7 @@ def test_saving_gm(self): def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" - filepath = "tests/files/saving/csvm.cbor" + filepath = os.path.join(DIRPATH, "csvm.cbor") x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) @@ -119,7 +129,7 @@ def test_saving_csvm(self): self.assertTrue(np.array_equal(coef_d, coef_d2)) def test_saving_rf_class(self): - filepath = "tests/files/saving/rf_class.cbor" + filepath = os.path.join(DIRPATH, "rf_class.cbor") x, y = make_classification( n_samples=3000, n_features=10, @@ -153,7 +163,7 @@ def test_saving_rf_class(self): self.assertGreater(accuracy2, 0.7) def test_saving_rf_regr(self): - filepath = "tests/files/saving/rf_regr.cbor" + filepath = os.path.join(DIRPATH, "rf_regr.cbor") def determination_coefficient(y_true, y_pred): u = np.sum(np.square(y_true - y_pred)) @@ -195,7 +205,7 @@ def determination_coefficient(y_true, y_pred): self.assertAlmostEqual(coef1, coef2) def test_saving_lasso(self): - filepath = "tests/files/saving/lasso.cbor" + filepath = os.path.join(DIRPATH, "lasso.cbor") np.random.seed(42) n_samples, n_features = 50, 100 @@ -229,7 +239,7 @@ def test_saving_lasso(self): self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) def test_saving_linear(self): - filepath = "tests/files/saving/linear_regression.cbor" + filepath = os.path.join(DIRPATH, "linear_regression.cbor") x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -266,7 +276,7 @@ def test_saving_linear(self): self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) def test_saving_als(self): - filepath = "tests/files/saving/als.cbor" + filepath = os.path.join(DIRPATH, "als.cbor") data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) ratings = csr_matrix(data) diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index 0a19429f..783f9f31 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -1,5 +1,6 @@ import unittest - +import os +import shutil import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score @@ -18,12 +19,21 @@ from pycompss.api.api import compss_wait_on +DIRPATH = "tests/files/saving" + class JSONSavingTest(unittest.TestCase): + def setUp(self) -> None: + os.makedirs(DIRPATH, exist_ok=True) + return super().setUp() + + def tearDown(self) -> None: + shutil.rmtree(DIRPATH) + return super().tearDown() def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/kmeans.json" + filepath = os.path.join(DIRPATH, "kmeans.json") x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -52,7 +62,7 @@ def test_saving_kmeans(self): def test_saving_gm(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/gm.json" + filepath = os.path.join(DIRPATH, "gm.json") x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -85,7 +95,7 @@ def test_saving_gm(self): def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" - filepath = "tests/files/saving/csvm.json" + filepath = os.path.join(DIRPATH, "csvm.json") x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) @@ -119,7 
+129,7 @@ def test_saving_csvm(self): self.assertTrue(np.array_equal(coef_d, coef_d2)) def test_saving_rf_class(self): - filepath = "tests/files/saving/rf_class.json" + filepath = os.path.join(DIRPATH, "rf_class.json") x, y = make_classification( n_samples=3000, n_features=10, @@ -153,7 +163,7 @@ def test_saving_rf_class(self): self.assertGreater(accuracy2, 0.7) def test_saving_rf_regr(self): - filepath = "tests/files/saving/rf_regr.json" + filepath = os.path.join(DIRPATH, "rf_regr.json") def determination_coefficient(y_true, y_pred): u = np.sum(np.square(y_true - y_pred)) @@ -195,7 +205,7 @@ def determination_coefficient(y_true, y_pred): self.assertAlmostEqual(coef1, coef2) def test_saving_lasso(self): - filepath = "tests/files/saving/lasso.json" + filepath = os.path.join(DIRPATH, "lasso.json") np.random.seed(42) n_samples, n_features = 50, 100 @@ -229,7 +239,7 @@ def test_saving_lasso(self): self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) def test_saving_linear(self): - filepath = "tests/files/saving/linear_regression.json" + filepath = os.path.join(DIRPATH, "linear_regression.json") x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -266,7 +276,7 @@ def test_saving_linear(self): self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) def test_saving_als(self): - filepath = "tests/files/saving/als.json" + filepath = os.path.join(DIRPATH, "als.json") data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) ratings = csr_matrix(data) From 1c8f7ef5fc8130e4b8ca918676c644a64f66a743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 2 Aug 2021 11:17:25 +0200 Subject: [PATCH 21/46] Updated user guide with RF Regressor --- docs/source/user-guide.rst | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index 8e91e7e2..3fb02dc1 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -294,7 +294,7 @@ scalability of the estimator is limited by the reduction phase of the cascade. Random forest classifier ........................ -:class:`RandomForestClassifier ` +:class:`RandomForestClassifier ` is a classifier that uses an ensemble of decision trees and aggregates their predictions. The process of building each decision tree includes some randomization in order to make them different. The accuracy of the joint @@ -565,6 +565,76 @@ shape ``(n_features, n_features)`` and process it as a single block. this with a distributed implementation of a method for solving a system of linear equations.) + +Random forest regressor +........................ + +:class:`RandomForestRegressor ` +is a regressor that uses an ensemble of decision trees and aggregates their +predictions. The process of building each decision tree includes some +randomization in order to make them different. The accuracy of the joint +prediction can be greater than that of individual decision trees. One advantage +of Random Forests is that you cannot overfit by increasing the number of +trees. Several variations of random forests have been proposed and implemented. +A fundamental paper that has been cited extensively is [Bre01]_, which +describes a method for classification problems that can be adapted to regression +problems: + + For building each tree, the original sample set is replaced by a set of the + same size, obtained by drawing with replacement (this method is called + bootstrap aggregating or bagging). 
At each tree node, a certain number of
+    random features is selected (random feature selection). The sample set
+    is split in two according to the values of these features, and a
+    metric called Mean Squared Error (MSE) is computed for every split. The
+    MSE measures the squared residuals with respect to the average value of
+    the target variables, which can be interpreted as a measure of the
+    sample variance. The split with the lowest MSE value is selected, and
+    the subsamples are propagated to the children nodes. The trees grown are
+    not pruned.
+
+Ensemble estimators can be implemented in an embarrassingly parallel pattern.
+You can do this with scikit-learn's RandomForestRegressor using a
+``joblib.parallel_backend`` and setting the ``n_jobs`` parameter. However, you
+need to be able to load your data into memory for each processor or to use
+memory-mapped arrays, which can be tricky, especially with a distributed
+backend.
+
+In our implementation, the samples as a whole are written into a binary file
+and accessed using memory maps (the COMPSs runtime manages the transfers to
+other nodes when needed). We used this approach because the performance penalty
+of using distributed data was too large. Storing the samples file and saving
+the decision trees places a heavy load on the disk storage of all nodes. If
+your execution fails because you reach your disk storage limits, you can try
+reducing the number of trees or reducing their size by setting the
+``max_depth`` parameter. If this is not enough, you may consider reducing the
+number of samples.
+
+In order to get further parallelism, each decision tree is not necessarily
+built in a single task: there are tasks for building just a subtree, just a
+node or even just part of a node. You can use the ``distr_depth`` parameter to
+control the number of tasks used for each tree. However, be aware that the
+number of tasks grows exponentially when you increase ``distr_depth``, and that
+the task loads become very unbalanced. The fitted decision trees are not
+synchronized, so the prediction is also performed in a distributed way.
+
+The results of the RandomForestRegressor can vary across executions, due to
+its random nature. To get reproducible results, a RandomState (pseudorandom
+number generator) or an int can be provided to the ``random_state``
+parameter of the constructor. This works by passing a seed (generated by the
+master's RandomState) to each task that uses randomness, and creating a new
+RandomState inside the task.
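+
+The snippet below is a minimal usage sketch: the data, block sizes and
+parameter values are only illustrative, and it assumes the estimator is
+exposed as ``dislib.regression.RandomForestRegressor``::
+
+    import numpy as np
+    import dislib as ds
+    from dislib.regression import RandomForestRegressor
+
+    x = ds.array(np.random.rand(900, 10), block_size=(300, 10))
+    y = ds.array(np.random.rand(900, 1), block_size=(300, 1))
+
+    # distr_depth controls how many tasks build each tree, and an int
+    # random_state makes the fitted forest reproducible.
+    forest = RandomForestRegressor(n_estimators=10, distr_depth=2,
+                                   random_state=0)
+    forest.fit(x, y)
+    y_pred = forest.predict(x).collect()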
+
+.. topic:: References:
+
+    .. [Chan79] `Updating Formulae and a Pairwise Algorithm for Computing Sample Variances. `_
+      T. F. Chan, G. H. Golub, R. J. LeVeque, 1979
+      Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
+    .. [Tor99] `Inductive Learning of Tree-based Regression Models `_
+      L. Torgo, 1999
+      Chapter 3, PhD Thesis, Faculdade de Ciências da Universidade do Porto
+
+
 Decomposition
 -------------

From 89b6db39d42ef38e83383255cc5c02f957ffeb13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Fri, 16 Jul 2021 16:15:32 +0200
Subject: [PATCH 22/46] Added saving and loading utils

---
 dislib/utils/__init__.py  |    3 +-
 dislib/utils/saving.py    |  363 ++++++++++
 tests/test_saving_cbor.py | 1403 +++++++++++++++++++++++++++++++++++++
 tests/test_saving_json.py | 1403 +++++++++++++++++++++++++++++++++++++
 4 files changed, 3171 insertions(+), 1 deletion(-)
 create mode 100644 dislib/utils/saving.py
 create mode 100644 tests/test_saving_cbor.py
 create mode 100644 tests/test_saving_json.py

diff --git a/dislib/utils/__init__.py b/dislib/utils/__init__.py
index 34b84166..299601a7 100644
--- a/dislib/utils/__init__.py
+++ b/dislib/utils/__init__.py
@@ -1,3 +1,4 @@
 from dislib.utils.base import shuffle
+from dislib.utils.saving import save_model, load_model

-__all__ = ['shuffle']
+__all__ = ["shuffle", "save_model", "load_model"]
diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
new file mode 100644
index 00000000..31159a7f
--- /dev/null
+++ b/dislib/utils/saving.py
@@ -0,0 +1,363 @@
+import json
+import os
+import numpy as np
+
+from pycompss.runtime.management.classes import Future
+from pycompss.api.api import compss_wait_on
+
+from sklearn.svm import SVC as SklearnSVC
+from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
+from sklearn.tree._tree import Tree as SklearnTree
+from scipy.sparse import csr_matrix
+
+import dislib as ds
+import dislib.classification
+import dislib.cluster
+import dislib.recommendation
+import dislib.regression
+from dislib.data.array import Array
+from dislib.classification.rf.decision_tree import (
+    DecisionTreeClassifier,
+    _Node,
+    _InnerNodeInfo,
+    _LeafInfo,
+    _SkTreeWrapper,
+)
+
+try:
+    import cbor2
+except ImportError:
+    cbor2 = None
+
+# Dislib models with saving tested (model: str -> module: str)
+_implemented_models = {
+    "KMeans": "cluster",
+    "GaussianMixture": "cluster",
+    "CascadeSVM": "classification",
+    "RandomForestClassifier": "classification",
+    "ALS": "recommendation",
+    "LinearRegression": "regression",
+    "Lasso": "regression",
+}
+
+# Classes used by models
+_dislib_classes = {
+    "KMeans": dislib.cluster.KMeans,
+    "DecisionTreeClassifier": DecisionTreeClassifier,
+    "_Node": _Node,
+    "_InnerNodeInfo": _InnerNodeInfo,
+    "_LeafInfo": _LeafInfo,
+    "_SkTreeWrapper": _SkTreeWrapper,
+}
+
+_sklearn_classes = {
+    "SVC": SklearnSVC,
+    "DecisionTreeClassifier": SklearnDTClassifier,
+}
+
+
+def save_model(model, filepath, overwrite=True, save_format=None):
+    """Saves a model to a file.
+    Usage:
+    >>> from dislib.cluster import KMeans
+    >>> from dislib.utils import save_model, load_model
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
+    >>> x_train = ds.array(x, (2, 2))
+    >>> model = KMeans(n_clusters=2, random_state=0)
+    >>> model.fit(x_train)
+    >>> save_model(model, '/tmp/model')
+    >>> loaded_model = load_model('/tmp/model')
+    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
+    >>> model_pred = model.predict(x_test)
+    >>> loaded_model_pred = loaded_model.predict(x_test)
+    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
+
+    The file contains:
+    - the model's class
+    - the model's attributes
+    The model is synchronized before saving and can be reinstantiated in the
+    exact same state, without any of the code used for model definition or
+    fitting.
+    Args:
+        model: `dislib` model instance to be saved.
+        filepath: String path where to save the model
+        overwrite: Whether to overwrite any existing model at the target
+            location. If False and the file already exists, the model is
+            not saved.
+        save_format: Format used to save the model. Defaults to `json`.
+    """
+    # Check overwrite
+    if not overwrite and os.path.isfile(filepath):
+        return
+
+    # Check for dislib model
+    model_name = model.__class__.__name__
+    if model_name not in _implemented_models.keys():
+        raise NotImplementedError(
+            "Saving has only been implemented for the following models:\n%s"
+            % _implemented_models.keys()
+        )
+
+    # Synchronize model
+    if model_name == "RandomForestClassifier":
+        _sync_rf(model)
+
+    _sync_obj(model.__dict__)
+    model_metadata = model.__dict__.copy()
+    model_metadata["model_name"] = model_name
+
+    # Save model
+    default_format = "json"
+    save_format = save_format or default_format
+    if save_format == "json":
+        with open(filepath, "w") as f:
+            json.dump(model_metadata, f, default=_encode_helper)
+    elif save_format == "cbor":
+        if cbor2 is None:
+            raise ModuleNotFoundError("No module named 'cbor2'")
+        with open(filepath, "wb") as f:
+            cbor2.dump(model_metadata, f, default=_encode_helper_cbor)
+    else:
+        raise ValueError("Save format must be either json or cbor.")
+
+
+def load_model(filepath, load_format=None):
+    """Loads a model from a file.
+    Usage:
+    >>> from dislib.cluster import KMeans
+    >>> from dislib.utils import save_model, load_model
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
+    >>> x_train = ds.array(x, (2, 2))
+    >>> model = KMeans(n_clusters=2, random_state=0)
+    >>> model.fit(x_train)
+    >>> save_model(model, '/tmp/model')
+    >>> loaded_model = load_model('/tmp/model')
+    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
+    >>> model_pred = model.predict(x_test)
+    >>> loaded_model_pred = loaded_model.predict(x_test)
+    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
+
+    The file must contain:
+    - the model's class
+    - the model's attributes
+    The model is reinstantiated in the exact same state in which it was saved,
+    without any of the code used for model definition or fitting.
+    Args:
+        filepath: String path to the saved model
+        load_format: Format used to load the model. Defaults to 'json'.
+    """
+    # Load model
+    default_format = "json"
+    load_format = load_format or default_format
+
+    if load_format == "json":
+        with open(filepath, "r") as f:
+            model_metadata = json.load(f, object_hook=_decode_helper)
+    elif load_format == "cbor":
+        if cbor2 is None:
+            raise ModuleNotFoundError("No module named 'cbor2'")
+        with open(filepath, "rb") as f:
+            model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor)
+    else:
+        raise ValueError("Load format must be either json or cbor.")
+
+    # Check for dislib model
+    model_name = model_metadata["model_name"]
+    if model_name not in _implemented_models.keys():
+        raise NotImplementedError(
+            "Loading has only been implemented for the following models:\n%s"
+            % _implemented_models.keys()
+        )
+    del model_metadata["model_name"]
+
+    # Create model
+    model_module = getattr(ds, _implemented_models[model_name])
+    model_class = getattr(model_module, model_name)
+    model = model_class()
+    model.__dict__.update(model_metadata)
+
+    # Set class methods
+    if model_name == "CascadeSVM" and "kernel" in model_metadata:
+        try:
+            model._kernel_f = getattr(
+                model, model._name_to_kernel[model_metadata["kernel"]]
+            )
+        except AttributeError:
+            model._kernel_f = getattr(model, "_rbf_kernel")
+
+    return model
+
+
+def _encode_helper_cbor(encoder, obj):
+    """Special encoder wrapper for dislib using cbor"""
+    encoder.encode(_encode_helper(obj))
+
+
+def _decode_helper_cbor(decoder, obj):
+    """Special decoder wrapper for dislib using cbor"""
+    return _decode_helper(obj)
+
+
+def _encode_helper(obj):
+    """Special encoder for dislib"""
+    if isinstance(obj, np.generic):
+        return obj.item()
+    elif isinstance(obj, range):
+        return {
+            "class_name": "range",
+            "start": obj.start,
+            "stop": obj.stop,
+            "step": obj.step,
+        }
+    elif isinstance(obj, csr_matrix):
+        return {
+            "class_name": "csr_matrix",
+            **obj.__dict__,
+        }
+    elif isinstance(obj, np.ndarray):
+        return {
+            "class_name": "ndarray",
+            "dtype_list": len(obj.dtype.descr) > 1,
+            "dtype": str(obj.dtype),
+            "items": obj.tolist(),
+        }
+    elif isinstance(obj, Array):
+        return {"class_name": "dsarray", **obj.__dict__}
+    elif isinstance(obj, np.random.RandomState):
+        return {"class_name": "RandomState", "items": obj.get_state()}
+    elif callable(obj):
+        return {
+            "class_name": "callable",
+            "module": obj.__module__,
+            "name": obj.__name__,
+        }
+    elif isinstance(obj, SklearnTree):
+        return {
+            "class_name": obj.__class__.__name__,
+            "n_features": obj.n_features,
+            "n_classes": obj.n_classes,
+            "n_outputs": obj.n_outputs,
+            "items": obj.__getstate__(),
+        }
+    elif isinstance(
+        obj, tuple(_dislib_classes.values()) + tuple(_sklearn_classes.values())
+    ):
+        return {
+            "class_name": obj.__class__.__name__,
+            "module_name": obj.__module__,
+            "items": obj.__dict__,
+        }
+    raise TypeError("Not JSON Serializable:", obj)
+
+
+def _decode_helper(obj):
+    """Special decoder for dislib"""
+    if isinstance(obj, dict) and "class_name" in obj:
+
+        class_name = obj["class_name"]
+        if class_name == "range":
+            return range(obj["start"], obj["stop"], obj["step"])
+        elif class_name == "tuple":
+            return tuple(obj["items"])
+        elif class_name == "ndarray":
+            if obj["dtype_list"]:
+                items = list(map(tuple, obj["items"]))
+                return np.rec.fromrecords(items, dtype=eval(obj["dtype"]))
+            else:
+                return np.array(obj["items"], dtype=obj["dtype"])
+        elif class_name == "csr_matrix":
+            return csr_matrix(
+                (obj["data"], obj["indices"], obj["indptr"]),
+                shape=obj["_shape"],
+            )
+        elif class_name == "dsarray":
+            return Array(
+                blocks=obj["_blocks"],
top_left_shape=obj["_top_left_shape"], + reg_shape=obj["_reg_shape"], + shape=obj["_shape"], + sparse=obj["_sparse"], + delete=obj["_delete"], + ) + elif class_name == "RandomState": + random_state = np.random.RandomState() + random_state.set_state(_decode_helper(obj["items"])) + return random_state + elif class_name == "Tree": + dict_ = _decode_helper(obj["items"]) + model = SklearnTree( + obj["n_features"], obj["n_classes"], obj["n_outputs"] + ) + model.__setstate__(dict_) + return model + elif ( + class_name in _dislib_classes.keys() + and "dislib" in obj["module_name"] + ): + dict_ = _decode_helper(obj["items"]) + if class_name == "DecisionTreeClassifier": + model = _dislib_classes[obj["class_name"]]( + try_features=dict_.pop("try_features"), + max_depth=dict_.pop("max_depth"), + distr_depth=dict_.pop("distr_depth"), + sklearn_max=dict_.pop("sklearn_max"), + bootstrap=dict_.pop("bootstrap"), + random_state=dict_.pop("random_state"), + ) + elif class_name == "_SkTreeWrapper": + sk_tree = _decode_helper(dict_.pop("sk_tree")) + model = _dislib_classes[obj["class_name"]](sk_tree) + else: + model = _dislib_classes[obj["class_name"]]() + model.__dict__.update(dict_) + return model + elif ( + class_name in _sklearn_classes.keys() + and "sklearn" in obj["module_name"] + ): + dict_ = _decode_helper(obj["items"]) + model = _sklearn_classes[obj["class_name"]]() + model.__dict__.update(dict_) + return model + elif class_name == "callable": + if obj["module"] == "numpy": + return getattr(np, obj["name"]) + return None + + return obj + + +def _sync_obj(obj): + """Recursively synchronizes the Future objects of a list or dictionary.""" + if isinstance(obj, dict): + iterator = iter(obj.items()) + elif isinstance(obj, list): + iterator = iter(enumerate(obj)) + else: + raise ValueError("Expected dict or list and received %s." % type(obj)) + + for key, val in iterator: + if isinstance(val, (dict, list)): + _sync_obj(obj[key]) + else: + obj[key] = compss_wait_on(val) + if isinstance(obj[key], Future): + raise TypeError( + "Could not synchronize Future (%s, %s)." 
% (key, val) + ) + if hasattr(obj[key], "__dict__"): + _sync_obj(obj[key].__dict__) + + +def _sync_rf(rf): + """Sync the `try_features` and 'n_classes' attribute of the different trees + """ + if isinstance(rf.trees[0].try_features, Future): + try_features = compss_wait_on(rf.trees[0].try_features) + n_classes = compss_wait_on(rf.trees[0].n_classes) + for tree in rf.trees: + tree.try_features = try_features + tree.n_classes = n_classes diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py new file mode 100644 index 00000000..64cd534a --- /dev/null +++ b/tests/test_saving_cbor.py @@ -0,0 +1,1403 @@ +import unittest + +import numpy as np +from numpy.random.mtrand import RandomState +from scipy.sparse import csr_matrix +from sklearn import datasets +from sklearn.cluster import KMeans as SKMeans +from sklearn.metrics import r2_score +from sklearn.datasets import make_classification +from sklearn.datasets import make_blobs, load_iris + +import dislib as ds +from dislib.cluster import KMeans +from dislib.cluster import GaussianMixture +from dislib.classification import CascadeSVM +from dislib.classification import RandomForestClassifier +from dislib.regression import Lasso +from dislib.regression import LinearRegression +from dislib.recommendation import ALS +from dislib.utils import save_model, load_model + +from pycompss.api.api import compss_wait_on + + +class KMeansSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/kmeans.cbor" + + def test_init_params_kmeans(self): + """Tests that KMeans correctly sets the initialization + parameters""" + n_clusters = 2 + max_iter = 1 + tol = 1e-4 + seed = 666 + arity = 2 + init = "random" + + km = KMeans( + n_clusters=n_clusters, + max_iter=max_iter, + tol=tol, + arity=arity, + random_state=seed, + ) + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + expected = (n_clusters, init, max_iter, tol, arity) + real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity) + real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, km2.arity) + self.assertEqual(expected, real) + self.assertEqual(expected, real2) + + def test_fit_kmeans(self): + """Tests that the fit method returns the expected centers using toy + data. 
+ """ + arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) + x = ds.array(arr, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666, verbose=False) + km.fit(x) + + expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue((km.centers == expected_centers).all()) + self.assertTrue((km2.centers == expected_centers).all()) + + def test_predict_kmeans(self): + """Tests that labels are correctly predicted using toy data.""" + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + arr1 = np.array([p1, p2, p3, p4]) + x = ds.array(arr1, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666) + km.fit(x) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + p5, p6 = [10, 10], [-10, -10] + + arr2 = np.array([p1, p2, p3, p4, p5, p6]) + x_test = ds.array(arr2, block_size=(2, 2)) + + labels = km.predict(x_test).collect() + labels2 = km2.predict(x_test).collect() + expected_labels = np.array([0, 0, 1, 1, 0, 1]) + + self.assertTrue(np.array_equal(labels, expected_labels)) + self.assertTrue(np.array_equal(labels2, expected_labels)) + + def test_fit_predict_kmeans(self): + """Tests fit_predict.""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + + x_train = ds.array(x_filtered, block_size=(300, 2)) + + kmeans = KMeans(n_clusters=3, random_state=170) + labels = kmeans.fit_predict(x_train).collect() + + save_model(kmeans, self.filepath, save_format="cbor") + kmeans = load_model(self.filepath, load_format="cbor") + + skmeans = SKMeans(n_clusters=3, random_state=170) + sklabels = skmeans.fit_predict(x_filtered) + + centers = np.array( + [ + [-8.941375656533449, -5.481371322614891], + [-4.524023204953875, 0.06235042593214654], + [2.332994701667008, 0.37681003933082696], + ] + ) + + self.assertTrue(np.allclose(centers, kmeans.centers)) + self.assertTrue(np.allclose(labels, sklabels)) + + def test_sparse_kmeans(self): + """Tests K-means produces the same results using dense and sparse + data structures.""" + file_ = "tests/files/libsvm/2" + + x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) + x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) + + kmeans = KMeans(random_state=170) + kmeans.fit(x_sp) + + save_model(kmeans, self.filepath, save_format="cbor") + kmeans2 = load_model(self.filepath, load_format="cbor") + + y_sparse = kmeans.predict(x_sp).collect() + y_sparse2 = kmeans2.predict(x_sp).collect() + + sparse_c = kmeans.centers.toarray() + sparse_c2 = kmeans2.centers.toarray() + + kmeans = KMeans(random_state=170) + + y_dense = kmeans.fit_predict(x_ds).collect() + dense_c = kmeans.centers + + self.assertTrue(np.allclose(sparse_c, dense_c)) + self.assertTrue(np.allclose(sparse_c2, dense_c)) + self.assertTrue(np.array_equal(y_sparse, y_dense)) + self.assertTrue(np.array_equal(y_sparse2, y_dense)) + + def test_init_kmeans(self): + # With dense data + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + x_train = ds.array(x_filtered, block_size=(300, 2)) + + init = np.random.random((5, 2)) + km = KMeans(n_clusters=5, init=init) + km.fit(x_train) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(np.array_equal(km.init, init)) + 
self.assertTrue(np.array_equal(km2.init, init)) + self.assertFalse(np.array_equal(km.centers, init)) + self.assertFalse(np.array_equal(km2.centers, init)) + + # With sparse data + x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) + init = csr_matrix(np.random.random((5, 2))) + + km = KMeans(n_clusters=5, init=init) + km.fit(x_sp) + + save_model(km, self.filepath, save_format="cbor") + km2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) + self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) + + +class GaussianMixtureSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/gm.cbor" + + def test_init_params(self): + """Tests that GaussianMixture params are set""" + n_components = 2 + covariance_type = "diag" + tol = 1e-4 + reg_covar = 1e-5 + max_iter = 3 + init_params = "random" + weights_init = np.array([0.4, 0.6]) + means_init = np.array([[0, 0], [2, 3]]) + precisions_init = "todo" + random_state = RandomState(666) + gm = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + init_params=init_params, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + random_state=random_state, + ) + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + real = ( + gm.n_components, + gm.covariance_type, + gm.tol, + gm.reg_covar, + gm.max_iter, + gm.init_params, + gm.weights_init.tolist(), + gm.means_init.tolist(), + gm.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm.random_state.get_state() + ], + ) + real2 = ( + gm2.n_components, + gm2.covariance_type, + gm2.tol, + gm2.reg_covar, + gm2.max_iter, + gm2.init_params, + gm2.weights_init.tolist(), + gm2.means_init.tolist(), + gm2.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm2.random_state.get_state() + ], + ) + + self.assertEqual(real, real2) + + def test_fit(self): + """Tests GaussianMixture.fit()""" + + x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) + ds_x = ds.array(x, block_size=(3, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x) + + expected_weights = np.array([0.5, 0.5]) + expected_means = np.array([[-2, -2], [2, 2]]) + expected_cov = np.array( + [ + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + ] + ) + expected_pc = np.array( + [ + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + ] + ) + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + gm.weights_ = compss_wait_on(gm.weights_) + gm.means_ = compss_wait_on(gm.means_) + gm.covariances_ = compss_wait_on(gm.covariances_) + gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) + + gm2.weights_ = compss_wait_on(gm2.weights_) + gm2.means_ = compss_wait_on(gm2.means_) + gm2.covariances_ = compss_wait_on(gm2.covariances_) + gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) + + self.assertTrue((np.allclose(gm.weights_, expected_weights))) + self.assertTrue((np.allclose(gm.means_, expected_means))) + self.assertTrue((np.allclose(gm.covariances_, expected_cov))) + 
self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) + + self.assertTrue((np.allclose(gm2.weights_, expected_weights))) + self.assertTrue((np.allclose(gm2.means_, expected_means))) + self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) + self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) + + def test_predict(self): + """Tests GaussianMixture.predict()""" + x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) + ds_x_train = ds.array(x_train, block_size=(2, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x_train) + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) + ds_x_test = ds.array(x_test, block_size=(2, 2)) + pred = gm.predict(ds_x_test).collect() + pred2 = gm2.predict(ds_x_test).collect() + + self.assertTrue(pred[0] != pred[1]) + self.assertTrue(pred[0] == pred[2] == pred[4]) + self.assertTrue(pred[1] == pred[3] == pred[5]) + + self.assertTrue(pred2[0] != pred2[1]) + self.assertTrue(pred2[0] == pred2[2] == pred2[4]) + self.assertTrue(pred2[1] == pred2[3] == pred2[5]) + + def test_fit_predict(self): + """Tests GaussianMixture.fit_predict()""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) + + ds_x = ds.array(x_filtered, block_size=(300, 2)) + + gm = GaussianMixture(n_components=3, random_state=170) + pred = gm.fit_predict(ds_x).collect() + + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + + pred2 = gm2.predict(ds_x).collect() + + self.assertEqual(len(pred), 610) + accuracy = np.count_nonzero(pred == y_real) / len(pred) + self.assertGreater(accuracy, 0.99) + + self.assertEqual(len(pred2), 610) + accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) + self.assertGreater(accuracy2, 0.99) + + def test_sparse(self): + """Tests GaussianMixture produces the same results using dense and + sparse data structures""" + file_ = "tests/files/libsvm/2" + + x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) + x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) + + covariance_types = "full", "tied", "diag", "spherical" + + for cov_type in covariance_types: + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_sparse) + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + labels_sparse = gm.predict(x_sparse).collect() + labels_sparse2 = gm2.predict(x_sparse).collect() + + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_dense) + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + labels_dense = gm.predict(x_dense).collect() + labels_dense2 = gm2.predict(x_dense).collect() + + self.assertTrue(np.array_equal(labels_sparse, labels_sparse2)) + self.assertTrue(np.array_equal(labels_sparse, labels_dense)) + self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) + + def test_init_random(self): + """Tests GaussianMixture random initialization""" + x = ds.random_array((50, 3), (10, 3), random_state=0) + gm = GaussianMixture( + init_params="random", n_components=4, arity=2, random_state=170 + ) + gm.fit(x) + save_model(gm, self.filepath, save_format="cbor") + gm2 = 
load_model(self.filepath, load_format="cbor") + self.assertGreater(gm.n_iter, 5) + self.assertGreater(gm2.n_iter, 5) + + def test_means_init_and_weights_init(self): + """Tests GaussianMixture means_init and weights_init parameters""" + x, _ = load_iris(return_X_y=True) + x_ds = ds.array(x, (75, 4)) + weights_init = [1 / 3, 1 / 3, 1 / 3] + means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) + gm = GaussianMixture( + random_state=0, + n_components=3, + weights_init=weights_init, + means_init=means_init, + ) + gm.fit(x_ds) + save_model(gm, self.filepath, save_format="cbor") + gm2 = load_model(self.filepath, load_format="cbor") + self.assertTrue(gm.converged_) + self.assertTrue(gm2.converged_) + + +class CSVMSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/csvm.cbor" + + def test_init_params(self): + """Test constructor parameters""" + cascade_arity = 3 + max_iter = 1 + tol = 1e-4 + kernel = "rbf" + c = 2 + gamma = 0.1 + check_convergence = True + seed = 666 + verbose = False + + csvm = CascadeSVM( + cascade_arity=cascade_arity, + max_iter=max_iter, + tol=tol, + kernel=kernel, + c=c, + gamma=gamma, + check_convergence=check_convergence, + random_state=seed, + verbose=verbose, + ) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + self.assertEqual(csvm.cascade_arity, cascade_arity) + self.assertEqual(csvm.max_iter, max_iter) + self.assertEqual(csvm.tol, tol) + self.assertEqual(csvm.kernel, kernel) + self.assertEqual(csvm.c, c) + self.assertEqual(csvm.gamma, gamma) + self.assertEqual(csvm.check_convergence, check_convergence) + self.assertEqual(csvm.random_state, seed) + self.assertEqual(csvm.verbose, verbose) + + self.assertEqual(csvm2.cascade_arity, cascade_arity) + self.assertEqual(csvm2.max_iter, max_iter) + self.assertEqual(csvm2.tol, tol) + self.assertEqual(csvm2.kernel, kernel) + self.assertEqual(csvm2.c, c) + self.assertEqual(csvm2.gamma, gamma) + self.assertEqual(csvm2.check_convergence, check_convergence) + self.assertEqual(csvm2.random_state, seed) + self.assertEqual(csvm2.verbose, verbose) + + def test_fit_private_params(self): + kernel = "rbf" + c = 2 + gamma = 0.1 + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm._clf_params["gamma"], gamma) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["gamma"], gamma) + + kernel, c = "linear", 0.3 + csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + + # # check for exception when incorrect kernel is passed + # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) + + def test_fit(self): + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + + csvm = CascadeSVM( + cascade_arity=3, + 
max_iter=5, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(csvm.converged) + self.assertTrue(csvm2.converged) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=1, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + self.assertFalse(csvm.converged) + self.assertEqual(csvm.iterations, 1) + self.assertFalse(csvm2.converged) + self.assertEqual(csvm2.iterations, 1) + + def test_predict(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + # p5 should belong to class 0, p6 to class 1 + p5, p6 = np.array([1, 1]), np.array([-1, -1]) + + x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) + + y_pred = csvm.predict(x_test) + y_pred2 = csvm2.predict(x_test) + + l1, l2, l3, l4, l5, l6 = y_pred.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + l1, l2, l3, l4, l5, l6 = y_pred2.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + def test_score(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + # points are separable, scoring the training dataset should have 100% + # accuracy + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) + + accuracy = compss_wait_on(csvm.score(x_test, y_test)) + accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) + + self.assertEqual(accuracy, 1.0) + self.assertEqual(accuracy2, 1.0) + + def test_decision_func(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + # all points are in the x-axis + p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + # p1 should be equidistant to p3, and p2 to p4 + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + + y_pred = 
csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d1, d2, d3, d4 = y_pred.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + d1, d2, d3, d4 = y_pred2.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + + # p5 and p6 should be in the decision function (distance=0) + p5, p6 = np.array([1, 0]), np.array([-1, 0]) + + x_test = ds.array(np.array([p5, p6]), (1, 2)) + + y_pred = csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d5, d6 = y_pred.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + d5, d6 = y_pred2.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + + def test_sparse(self): + """Tests that C-SVM produces the same results with sparse and dense + data""" + seed = 666 + train = "tests/files/libsvm/3" + + x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) + x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) + + csvm_sp = CascadeSVM(random_state=seed) + csvm_sp.fit(x_sp, y_sp) + save_model(csvm_sp, self.filepath, save_format="cbor") + csvm_sp2 = load_model(self.filepath, load_format="cbor") + + csvm_d = CascadeSVM(random_state=seed) + csvm_d.fit(x_d, y_d) + save_model(csvm_d, self.filepath, save_format="cbor") + csvm_d2 = load_model(self.filepath, load_format="cbor") + + sv_d = csvm_d._clf.support_vectors_ + sv_sp = csvm_sp._clf.support_vectors_.toarray() + sv_d2 = csvm_d2._clf.support_vectors_ + sv_sp2 = csvm_sp2._clf.support_vectors_.toarray() + + self.assertTrue(np.array_equal(sv_d, sv_sp)) + self.assertTrue(np.array_equal(sv_d2, sv_sp2)) + self.assertTrue(np.array_equal(sv_d, sv_d2)) + + coef_d = csvm_d._clf.dual_coef_ + coef_sp = csvm_sp._clf.dual_coef_.toarray() + coef_d2 = csvm_d2._clf.dual_coef_ + coef_sp2 = csvm_sp2._clf.dual_coef_.toarray() + + self.assertTrue(np.array_equal(coef_d, coef_sp)) + self.assertTrue(np.array_equal(coef_d2, coef_sp2)) + self.assertTrue(np.array_equal(coef_d, coef_d2)) + + def test_duplicates(self): + """Tests that C-SVM does not generate duplicate support vectors""" + x = ds.array( + np.array( + [ + [0, 1], + [1, 1], + [0, 1], + [1, 2], + [0, 0], + [2, 2], + [2, 1], + [1, 0], + ] + ), + (2, 2), + ) + + y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) + csvm.fit(x, y) + save_model(csvm, self.filepath, save_format="cbor") + csvm2 = load_model(self.filepath, load_format="cbor") + + csvm._collect_clf() + csvm2._collect_clf() + self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) + self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) + + +class RFSavingTestCBOR(unittest.TestCase): + filepath = "tests/files/saving/rf.cbor" + + def test_make_classification_score(self): + """Tests RandomForestClassifier fit and score with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = 
load_model(self.filepath, load_format="cbor") + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_predict_and_distr_depth(self): + """Tests RandomForestClassifier fit and predict with a distr_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(distr_depth=2, random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_fit_predict(self): + """Tests RandomForestClassifier fit_predict with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_train).collect() + y_pred2 = rf2.predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict(self): + """Tests RandomForestClassifier predict with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict_proba(self): + """Tests RandomForestClassifier predict_proba with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = 
ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + probabilities = rf.predict_proba(x_test).collect() + probabilities2 = rf2.predict_proba(x_test).collect() + rf.classes = compss_wait_on(rf.classes) + rf2.classes = compss_wait_on(rf2.classes) + y_pred = rf.classes[np.argmax(probabilities, axis=1)] + y_pred2 = rf2.classes[np.argmax(probabilities2, axis=1)] + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_predict(self): + """Tests RandomForestClassifier predict with hard_vote.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier( + random_state=0, sklearn_max=10, hard_vote=True + ) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_score_mix(self): + """Tests RandomForestClassifier score with hard_vote, sklearn_max, + distr_depth and max_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier( + random_state=0, + sklearn_max=100, + distr_depth=2, + max_depth=12, + hard_vote=True, + ) + rf.fit(x_train, y_train) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_iris(self): + """Tests RandomForestClassifier with a minimal example.""" + x, y = datasets.load_iris(return_X_y=True) + ds_fit = ds.array(x[::2], block_size=(30, 2)) + fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) + ds_validate = ds.array(x[1::2], block_size=(30, 2)) + validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) + + rf = RandomForestClassifier( + n_estimators=1, max_depth=1, random_state=0 + ) + rf.fit(ds_fit, fit_y) + save_model(rf, self.filepath, save_format="cbor") + rf2 = load_model(self.filepath, load_format="cbor") + + accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) + accuracy2 = 
compss_wait_on(rf2.score(ds_validate, validate_y))
+
+        # Accuracy should be <= 2/3 for any seed, often exactly equal.
+        self.assertAlmostEqual(accuracy, 2 / 3)
+        self.assertAlmostEqual(accuracy2, 2 / 3)
+
+
+class LassoSavingTestCBOR(unittest.TestCase):
+    filepath = "tests/files/saving/lasso.cbor"
+
+    def test_fit_predict(self):
+        """Tests fit and predict methods."""
+
+        np.random.seed(42)
+
+        n_samples, n_features = 50, 100
+        X = np.random.randn(n_samples, n_features)
+
+        # Decreasing coef w. alternated signs for visualization
+        idx = np.arange(n_features)
+        coef = (-1) ** idx * np.exp(-idx / 10)
+        coef[10:] = 0  # sparsify coef
+        y = np.dot(X, coef)
+
+        # Add noise
+        y += 0.01 * np.random.normal(size=n_samples)
+
+        n_samples = X.shape[0]
+        X_train, y_train = X[: n_samples // 2], y[: n_samples // 2]
+        X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
+
+        lasso = Lasso(lmbd=0.1, max_iter=50)
+
+        lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1)))
+        save_model(lasso, self.filepath, save_format="cbor")
+        lasso2 = load_model(self.filepath, load_format="cbor")
+
+        y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100)))
+        r2_score_lasso = r2_score(y_test, y_pred_lasso.collect())
+        y_pred_lasso2 = lasso2.predict(ds.array(X_test, (25, 100)))
+        r2_score_lasso2 = r2_score(y_test, y_pred_lasso2.collect())
+
+        self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124)
+        self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124)
+
+
+class LinearRegressionSavingTestCBOR(unittest.TestCase):
+    filepath = "tests/files/saving/linear_regression.cbor"
+
+    def test_univariate(self):
+        """Tests fit() and predict(), univariate."""
+        x_data = np.array([1, 2, 3, 4, 5])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 1
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), 0.6))
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3))
+        self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3))
+
+        # Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.1))
+        self.assertTrue(np.allclose(pred2, 2.1))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9]))
+        self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9]))
+
+    def test_univariate_no_intercept(self):
+        """Tests fit() and predict(), univariate, fit_intercept=False."""
+        x_data = np.array([1, 2, 3, 4, 5])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 1
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression(fit_intercept=False)
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818))
+        self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818))
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0))
+
+        
# Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.04545455))
+        self.assertTrue(np.allclose(pred2, 2.04545455))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091]))
+        self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 4.0909091]))
+
+    def test_multivariate(self):
+        """Tests fit() and predict(), multivariate."""
+        x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 2
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, 1))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875]))
+        self.assertTrue(
+            np.allclose(reg2.coef_.collect(), [0.421875, 0.296875])
+        )
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.240625))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.240625))
+
+        # Predict one sample
+        x_test = np.array([3, 2])
+        test_data = ds.array(x=x_test, block_size=(1, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.1))
+        self.assertTrue(np.allclose(pred2, 2.1))
+
+        # Predict multiple samples
+        x_test = np.array([[3, 2], [4, 4], [1, 3]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125]))
+
+    def test_multivariate_no_intercept(self):
+        """Tests fit() and predict(), multivariate, fit_intercept=False."""
+        x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 2
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, 1))
+
+        reg = LinearRegression(fit_intercept=False)
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertTrue(
+            np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232])
+        )
+        self.assertTrue(
+            np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232])
+        )
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0))
+
+        # Predict one sample
+        x_test = np.array([3, 2])
+        test_data = ds.array(x=x_test, block_size=(1, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.05649718]))
+        self.assertTrue(np.allclose(pred2, [2.05649718]))
+
+        # Predict multiple samples
+        x_test = np.array([[3, 2], [4, 4], [1, 3]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678]))
+        self.assertTrue(
+            np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678])
+        )
+
+    def test_multivariate_multiobjective(self):
+        """Tests fit() and predict(), multivariate, multiobjective."""
+        x_data = np.array(
+            [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]]
+        )
+        
y_data = np.array(
+            [
+                [2, 0, 3],
+                [1, 5, 2],
+                [1, 3, 4],
+                [2, 7, 9],
+                [4.5, -1, 4],
+                [0, 0, 0],
+            ]
+        )
+
+        bn, bm = 2, 2
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath, save_format="cbor")
+        reg2 = load_model(self.filepath, load_format="cbor")
+
+        # Predict one sample
+        x_test = np.array([3, 2, 1])
+        test_data = ds.array(x=x_test, block_size=(1, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906]))
+        self.assertTrue(
+            np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906])
+        )
+
+        # Predict multiple samples
+        x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(
+            np.allclose(
+                pred,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                pred2,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+
+        # Check attribute values
+        self.assertTrue(
+            np.allclose(
+                reg.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+
+
+def load_movielens(train_ratio=0.9):
+    file = "tests/files/sample_movielens_ratings.csv"
+
+    # 'user_id', 'movie_id', 'rating', 'timestamp'
+
+    data = np.genfromtxt(file, dtype="int", delimiter=",", usecols=range(3))
+
+    # just in case there are movies/users without ratings
+    # movie_id
+    n_m = max(len(np.unique(data[:, 1])), max(data[:, 1]) + 1)
+    # user_id
+    n_u = max(len(np.unique(data[:, 0])), max(data[:, 0]) + 1)
+
+    idx = int(data.shape[0] * train_ratio)
+
+    train_data = data[:idx]
+    test_data = data[idx:]
+
+    train = csr_matrix(
+        (train_data[:, 2], (train_data[:, 0], train_data[:, 1])),
+        shape=(n_u, n_m),
+    )
+
+    test = csr_matrix((test_data[:, 2], (test_data[:, 0], test_data[:, 1])))
+
+    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
+    train_arr = ds.array(train, block_size=(x_size, y_size))
+
+    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
+    test_arr = ds.array(test, block_size=(x_size, y_size))
+
+    return train_arr, test_arr
+
+
+class ALSSavingTestCBOR(unittest.TestCase):
+    filepath = "tests/files/saving/als.cbor"
+
+    def test_init_params(self):
+        # Test all parameters
+        seed = 666
+        n_f = 100
+        lambda_ = 0.001
+        convergence_threshold = 0.1
+        max_iter = 10
+        verbose = True
+        arity = 12
+
+        als = ALS(
+            random_state=seed,
+            n_f=n_f,
+            lambda_=lambda_,
+            tol=convergence_threshold,
+            max_iter=max_iter,
+            verbose=verbose,
+            arity=arity,
+        )
+        save_model(als, self.filepath, save_format="cbor")
+        als2 = load_model(self.filepath, load_format="cbor")
+
+        self.assertEqual(als.random_state, seed)
+        self.assertEqual(als.n_f, n_f)
+        
self.assertEqual(als.lambda_, lambda_) + self.assertEqual(als.tol, convergence_threshold) + self.assertEqual(als.max_iter, max_iter) + self.assertEqual(als.verbose, verbose) + self.assertEqual(als.arity, arity) + self.assertEqual(als2.random_state, seed) + self.assertEqual(als2.n_f, n_f) + self.assertEqual(als2.lambda_, lambda_) + self.assertEqual(als2.tol, convergence_threshold) + self.assertEqual(als2.max_iter, max_iter) + self.assertEqual(als2.verbose, verbose) + self.assertEqual(als2.arity, arity) + + def test_fit(self): + train, test = load_movielens() + + als = ALS( + tol=0.01, + random_state=666, + n_f=100, + verbose=False, + check_convergence=True, + ) + + als.fit(train, test) + self.assertTrue(als.converged) + + als.fit(train) + save_model(als, self.filepath, save_format="cbor") + als2 = load_model(self.filepath, load_format="cbor") + + self.assertTrue(als.converged) + self.assertTrue(als2.converged) + + def test_predict(self): + data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) + ratings = csr_matrix(data) + train = ds.array(x=ratings, block_size=(1, 1)) + als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) + als.fit(train) + save_model(als, self.filepath, save_format="cbor") + als2 = load_model(self.filepath, load_format="cbor") + + predictions = als.predict_user(user_id=0) + predictions2 = als2.predict_user(user_id=0) + + # Check that the ratings for user 0 are similar to user 1 because they + # share preferences (third movie), thus it is expected that user 0 + # will rate movie 1 similarly to user 1. + self.assertTrue( + 2.75 < predictions[0] < 3.25 + and predictions[1] < 1 + and predictions[2] > 4.5 + ) + self.assertTrue( + 2.75 < predictions2[0] < 3.25 + and predictions2[1] < 1 + and predictions2[2] > 4.5 + ) + self.assertTrue( + np.array_equal(predictions, predictions2, equal_nan=True) + ) + + +def main(): + unittest.main() + + +if __name__ == "__main__": + main() diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py new file mode 100644 index 00000000..be18474d --- /dev/null +++ b/tests/test_saving_json.py @@ -0,0 +1,1403 @@ +import unittest + +import numpy as np +from numpy.random.mtrand import RandomState +from scipy.sparse import csr_matrix +from sklearn import datasets +from sklearn.cluster import KMeans as SKMeans +from sklearn.metrics import r2_score +from sklearn.datasets import make_classification +from sklearn.datasets import make_blobs, load_iris + +import dislib as ds +from dislib.cluster import KMeans +from dislib.cluster import GaussianMixture +from dislib.classification import CascadeSVM +from dislib.classification import RandomForestClassifier +from dislib.regression import Lasso +from dislib.regression import LinearRegression +from dislib.recommendation import ALS +from dislib.utils import save_model, load_model + +from pycompss.api.api import compss_wait_on + + +class KMeansSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/kmeans.json" + + def test_init_params_kmeans(self): + """Tests that saved and loaded KMeans object correctly sets the initialization + parameters""" + n_clusters = 2 + max_iter = 1 + tol = 1e-4 + seed = 666 + arity = 2 + init = "random" + + km = KMeans( + n_clusters=n_clusters, + max_iter=max_iter, + tol=tol, + arity=arity, + random_state=seed, + ) + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + expected = (n_clusters, init, max_iter, tol, arity) + real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity) + real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, 
km2.arity) + self.assertEqual(expected, real) + self.assertEqual(expected, real2) + + def test_fit_kmeans(self): + """Tests that the fit method returns the expected centers using toy + data. + """ + arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) + x = ds.array(arr, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666, verbose=False) + km.fit(x) + + expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + self.assertTrue((km.centers == expected_centers).all()) + self.assertTrue((km2.centers == expected_centers).all()) + + def test_predict_kmeans(self): + """Tests that labels are correctly predicted using toy data.""" + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + arr1 = np.array([p1, p2, p3, p4]) + x = ds.array(arr1, block_size=(2, 2)) + + km = KMeans(n_clusters=2, random_state=666) + km.fit(x) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + p5, p6 = [10, 10], [-10, -10] + + arr2 = np.array([p1, p2, p3, p4, p5, p6]) + x_test = ds.array(arr2, block_size=(2, 2)) + + labels = km.predict(x_test).collect() + labels2 = km2.predict(x_test).collect() + expected_labels = np.array([0, 0, 1, 1, 0, 1]) + + self.assertTrue(np.array_equal(labels, expected_labels)) + self.assertTrue(np.array_equal(labels2, expected_labels)) + + def test_fit_predict_kmeans(self): + """Tests fit_predict.""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + + x_train = ds.array(x_filtered, block_size=(300, 2)) + + kmeans = KMeans(n_clusters=3, random_state=170) + labels = kmeans.fit_predict(x_train).collect() + + save_model(kmeans, self.filepath) + kmeans = load_model(self.filepath) + + skmeans = SKMeans(n_clusters=3, random_state=170) + sklabels = skmeans.fit_predict(x_filtered) + + centers = np.array( + [ + [-8.941375656533449, -5.481371322614891], + [-4.524023204953875, 0.06235042593214654], + [2.332994701667008, 0.37681003933082696], + ] + ) + + self.assertTrue(np.allclose(centers, kmeans.centers)) + self.assertTrue(np.allclose(labels, sklabels)) + + def test_sparse_kmeans(self): + """Tests K-means produces the same results using dense and sparse + data structures.""" + file_ = "tests/files/libsvm/2" + + x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) + x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) + + kmeans = KMeans(random_state=170) + kmeans.fit(x_sp) + + save_model(kmeans, self.filepath) + kmeans2 = load_model(self.filepath) + + y_sparse = kmeans.predict(x_sp).collect() + y_sparse2 = kmeans2.predict(x_sp).collect() + + sparse_c = kmeans.centers.toarray() + sparse_c2 = kmeans2.centers.toarray() + + kmeans = KMeans(random_state=170) + + y_dense = kmeans.fit_predict(x_ds).collect() + dense_c = kmeans.centers + + self.assertTrue(np.allclose(sparse_c, dense_c)) + self.assertTrue(np.allclose(sparse_c2, dense_c)) + self.assertTrue(np.array_equal(y_sparse, y_dense)) + self.assertTrue(np.array_equal(y_sparse2, y_dense)) + + def test_init_kmeans(self): + # With dense data + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + x_train = ds.array(x_filtered, block_size=(300, 2)) + + init = np.random.random((5, 2)) + km = KMeans(n_clusters=5, init=init) + km.fit(x_train) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + self.assertTrue(np.array_equal(km.init, init)) + 
self.assertTrue(np.array_equal(km2.init, init)) + self.assertFalse(np.array_equal(km.centers, init)) + self.assertFalse(np.array_equal(km2.centers, init)) + + # With sparse data + x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) + init = csr_matrix(np.random.random((5, 2))) + + km = KMeans(n_clusters=5, init=init) + km.fit(x_sp) + + save_model(km, self.filepath) + km2 = load_model(self.filepath) + + self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) + self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) + self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) + + +class GaussianMixtureSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/gm.json" + + def test_init_params(self): + """Tests that GaussianMixture params are set""" + n_components = 2 + covariance_type = "diag" + tol = 1e-4 + reg_covar = 1e-5 + max_iter = 3 + init_params = "random" + weights_init = np.array([0.4, 0.6]) + means_init = np.array([[0, 0], [2, 3]]) + precisions_init = "todo" + random_state = RandomState(666) + gm = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + init_params=init_params, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + random_state=random_state, + ) + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + real = ( + gm.n_components, + gm.covariance_type, + gm.tol, + gm.reg_covar, + gm.max_iter, + gm.init_params, + gm.weights_init.tolist(), + gm.means_init.tolist(), + gm.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm.random_state.get_state() + ], + ) + real2 = ( + gm2.n_components, + gm2.covariance_type, + gm2.tol, + gm2.reg_covar, + gm2.max_iter, + gm2.init_params, + gm2.weights_init.tolist(), + gm2.means_init.tolist(), + gm2.precisions_init, + *[ + list(x) if isinstance(x, np.ndarray) else x + for x in gm2.random_state.get_state() + ], + ) + + self.assertEqual(real, real2) + + def test_fit(self): + """Tests GaussianMixture.fit()""" + + x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) + ds_x = ds.array(x, block_size=(3, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x) + + expected_weights = np.array([0.5, 0.5]) + expected_means = np.array([[-2, -2], [2, 2]]) + expected_cov = np.array( + [ + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], + ] + ) + expected_pc = np.array( + [ + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + [[1.22469875, -0.70714834], [0.0, 1.4141944]], + ] + ) + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + gm.weights_ = compss_wait_on(gm.weights_) + gm.means_ = compss_wait_on(gm.means_) + gm.covariances_ = compss_wait_on(gm.covariances_) + gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) + + gm2.weights_ = compss_wait_on(gm2.weights_) + gm2.means_ = compss_wait_on(gm2.means_) + gm2.covariances_ = compss_wait_on(gm2.covariances_) + gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) + + self.assertTrue((np.allclose(gm.weights_, expected_weights))) + self.assertTrue((np.allclose(gm.means_, expected_means))) + self.assertTrue((np.allclose(gm.covariances_, expected_cov))) + self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) + + self.assertTrue((np.allclose(gm2.weights_, 
expected_weights))) + self.assertTrue((np.allclose(gm2.means_, expected_means))) + self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) + self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) + + def test_predict(self): + """Tests GaussianMixture.predict()""" + x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) + ds_x_train = ds.array(x_train, block_size=(2, 2)) + + gm = GaussianMixture(n_components=2, random_state=666) + gm.fit(ds_x_train) + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) + ds_x_test = ds.array(x_test, block_size=(2, 2)) + pred = gm.predict(ds_x_test).collect() + pred2 = gm2.predict(ds_x_test).collect() + + self.assertTrue(pred[0] != pred[1]) + self.assertTrue(pred[0] == pred[2] == pred[4]) + self.assertTrue(pred[1] == pred[3] == pred[5]) + + self.assertTrue(pred2[0] != pred2[1]) + self.assertTrue(pred2[0] == pred2[2] == pred2[4]) + self.assertTrue(pred2[1] == pred2[3] == pred2[5]) + + def test_fit_predict(self): + """Tests GaussianMixture.fit_predict()""" + x, y = make_blobs(n_samples=1500, random_state=170) + x_filtered = np.vstack( + (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) + ) + y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) + + ds_x = ds.array(x_filtered, block_size=(300, 2)) + + gm = GaussianMixture(n_components=3, random_state=170) + pred = gm.fit_predict(ds_x).collect() + + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + + pred2 = gm2.predict(ds_x).collect() + + self.assertEqual(len(pred), 610) + accuracy = np.count_nonzero(pred == y_real) / len(pred) + self.assertGreater(accuracy, 0.99) + + self.assertEqual(len(pred2), 610) + accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) + self.assertGreater(accuracy2, 0.99) + + def test_sparse(self): + """Tests GaussianMixture produces the same results using dense and + sparse data structures""" + file_ = "tests/files/libsvm/2" + + x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) + x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) + + covariance_types = "full", "tied", "diag", "spherical" + + for cov_type in covariance_types: + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_sparse) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + labels_sparse = gm.predict(x_sparse).collect() + labels_sparse2 = gm2.predict(x_sparse).collect() + + gm = GaussianMixture( + n_components=4, random_state=0, covariance_type=cov_type + ) + gm.fit(x_dense) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + labels_dense = gm.predict(x_dense).collect() + labels_dense2 = gm2.predict(x_dense).collect() + + self.assertTrue(np.array_equal(labels_sparse, labels_sparse2)) + self.assertTrue(np.array_equal(labels_sparse, labels_dense)) + self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) + + def test_init_random(self): + """Tests GaussianMixture random initialization""" + x = ds.random_array((50, 3), (10, 3), random_state=0) + gm = GaussianMixture( + init_params="random", n_components=4, arity=2, random_state=170 + ) + gm.fit(x) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + self.assertGreater(gm.n_iter, 5) + self.assertGreater(gm2.n_iter, 5) + + def test_means_init_and_weights_init(self): + """Tests GaussianMixture means_init and weights_init parameters""" + x, _ = load_iris(return_X_y=True) + x_ds = ds.array(x, (75, 4)) + weights_init = [1 / 3, 
1 / 3, 1 / 3] + means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) + gm = GaussianMixture( + random_state=0, + n_components=3, + weights_init=weights_init, + means_init=means_init, + ) + gm.fit(x_ds) + save_model(gm, self.filepath) + gm2 = load_model(self.filepath) + self.assertTrue(gm.converged_) + self.assertTrue(gm2.converged_) + + +class CSVMSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/csvm.json" + + def test_init_params(self): + """Test constructor parameters""" + cascade_arity = 3 + max_iter = 1 + tol = 1e-4 + kernel = "rbf" + c = 2 + gamma = 0.1 + check_convergence = True + seed = 666 + verbose = False + + csvm = CascadeSVM( + cascade_arity=cascade_arity, + max_iter=max_iter, + tol=tol, + kernel=kernel, + c=c, + gamma=gamma, + check_convergence=check_convergence, + random_state=seed, + verbose=verbose, + ) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + self.assertEqual(csvm.cascade_arity, cascade_arity) + self.assertEqual(csvm.max_iter, max_iter) + self.assertEqual(csvm.tol, tol) + self.assertEqual(csvm.kernel, kernel) + self.assertEqual(csvm.c, c) + self.assertEqual(csvm.gamma, gamma) + self.assertEqual(csvm.check_convergence, check_convergence) + self.assertEqual(csvm.random_state, seed) + self.assertEqual(csvm.verbose, verbose) + + self.assertEqual(csvm2.cascade_arity, cascade_arity) + self.assertEqual(csvm2.max_iter, max_iter) + self.assertEqual(csvm2.tol, tol) + self.assertEqual(csvm2.kernel, kernel) + self.assertEqual(csvm2.c, c) + self.assertEqual(csvm2.gamma, gamma) + self.assertEqual(csvm2.check_convergence, check_convergence) + self.assertEqual(csvm2.random_state, seed) + self.assertEqual(csvm2.verbose, verbose) + + def test_fit_private_params(self): + kernel = "rbf" + c = 2 + gamma = 0.1 + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm._clf_params["gamma"], gamma) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["gamma"], gamma) + + kernel, c = "linear", 0.3 + csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + self.assertEqual(csvm._clf_params["kernel"], kernel) + self.assertEqual(csvm._clf_params["C"], c) + self.assertEqual(csvm2._clf_params["kernel"], kernel) + self.assertEqual(csvm2._clf_params["C"], c) + + # # check for exception when incorrect kernel is passed + # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) + + def test_fit(self): + seed = 666 + file_ = "tests/files/libsvm/2" + + x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=5, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + self.assertTrue(csvm.converged) + self.assertTrue(csvm2.converged) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=1, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + 
save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + self.assertFalse(csvm.converged) + self.assertEqual(csvm.iterations, 1) + self.assertFalse(csvm2.converged) + self.assertEqual(csvm2.iterations, 1) + + def test_predict(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="linear", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + # p5 should belong to class 0, p6 to class 1 + p5, p6 = np.array([1, 1]), np.array([-1, -1]) + + x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) + + y_pred = csvm.predict(x_test) + y_pred2 = csvm2.predict(x_test) + + l1, l2, l3, l4, l5, l6 = y_pred.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + l1, l2, l3, l4, l5, l6 = y_pred2.collect() + self.assertTrue(l1 == l2 == l5 == 0) + self.assertTrue(l3 == l4 == l6 == 1) + + def test_score(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=True, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + # points are separable, scoring the training dataset should have 100% + # accuracy + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) + + accuracy = compss_wait_on(csvm.score(x_test, y_test)) + accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) + + self.assertEqual(accuracy, 1.0) + self.assertEqual(accuracy2, 1.0) + + def test_decision_func(self): + seed = 666 + + # negative points belong to class 1, positives to 0 + # all points are in the x-axis + p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] + + x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) + y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM( + cascade_arity=3, + max_iter=10, + tol=1e-4, + kernel="rbf", + c=2, + gamma=0.1, + check_convergence=False, + random_state=seed, + verbose=False, + ) + + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + # p1 should be equidistant to p3, and p2 to p4 + x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) + + y_pred = csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d1, d2, d3, d4 = y_pred.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + d1, d2, d3, d4 = y_pred2.collect() + self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) + self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) + + # p5 and p6 should be in the decision function (distance=0) + p5, p6 = np.array([1, 0]), np.array([-1, 0]) + + x_test = ds.array(np.array([p5, p6]), (1, 2)) + + y_pred = csvm.decision_function(x_test) + y_pred2 = csvm2.decision_function(x_test) + + d5, d6 = y_pred.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + 
d5, d6 = y_pred2.collect() + self.assertTrue(np.isclose(d5, 0)) + self.assertTrue(np.isclose(d6, 0)) + + def test_sparse(self): + """Tests that C-SVM produces the same results with sparse and dense + data""" + seed = 666 + train = "tests/files/libsvm/3" + + x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) + x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) + + csvm_sp = CascadeSVM(random_state=seed) + csvm_sp.fit(x_sp, y_sp) + save_model(csvm_sp, self.filepath) + csvm_sp2 = load_model(self.filepath) + + csvm_d = CascadeSVM(random_state=seed) + csvm_d.fit(x_d, y_d) + save_model(csvm_d, self.filepath) + csvm_d2 = load_model(self.filepath) + + sv_d = csvm_d._clf.support_vectors_ + sv_sp = csvm_sp._clf.support_vectors_.toarray() + sv_d2 = csvm_d2._clf.support_vectors_ + sv_sp2 = csvm_sp2._clf.support_vectors_.toarray() + + self.assertTrue(np.array_equal(sv_d, sv_sp)) + self.assertTrue(np.array_equal(sv_d2, sv_sp2)) + self.assertTrue(np.array_equal(sv_d, sv_d2)) + + coef_d = csvm_d._clf.dual_coef_ + coef_sp = csvm_sp._clf.dual_coef_.toarray() + coef_d2 = csvm_d2._clf.dual_coef_ + coef_sp2 = csvm_sp2._clf.dual_coef_.toarray() + + self.assertTrue(np.array_equal(coef_d, coef_sp)) + self.assertTrue(np.array_equal(coef_d2, coef_sp2)) + self.assertTrue(np.array_equal(coef_d, coef_d2)) + + def test_duplicates(self): + """Tests that C-SVM does not generate duplicate support vectors""" + x = ds.array( + np.array( + [ + [0, 1], + [1, 1], + [0, 1], + [1, 2], + [0, 0], + [2, 2], + [2, 1], + [1, 0], + ] + ), + (2, 2), + ) + + y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) + + csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) + csvm.fit(x, y) + save_model(csvm, self.filepath) + csvm2 = load_model(self.filepath) + + csvm._collect_clf() + csvm2._collect_clf() + self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) + self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) + + +class RFSavingTestJSON(unittest.TestCase): + filepath = "tests/files/saving/rf.json" + + def test_make_classification_score(self): + """Tests RandomForestClassifier fit and score with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_predict_and_distr_depth(self): + """Tests RandomForestClassifier fit and predict with a distr_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(distr_depth=2, random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 
= load_model(self.filepath) + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_fit_predict(self): + """Tests RandomForestClassifier fit_predict with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + + rf = RandomForestClassifier(random_state=0) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + y_pred = rf.predict(x_train).collect() + y_pred2 = rf2.predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict(self): + """Tests RandomForestClassifier predict with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + y_pred = rf.predict(x_test).collect() + y_pred2 = rf2.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_sklearn_max_predict_proba(self): + """Tests RandomForestClassifier predict_proba with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] + + rf = RandomForestClassifier(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) + save_model(rf, self.filepath) + rf2 = load_model(self.filepath) + + probabilities = rf.predict_proba(x_test).collect() + probabilities2 = rf2.predict_proba(x_test).collect() + rf.classes = compss_wait_on(rf.classes) + rf2.classes = compss_wait_on(rf2.classes) + y_pred = rf.classes[np.argmax(probabilities, axis=1)] + y_pred2 = rf2.classes[np.argmax(probabilities2, axis=1)] + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + self.assertGreater(accuracy2, 0.7) + + def test_make_classification_hard_vote_predict(self): + """Tests RandomForestClassifier predict with hard_vote.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + 
n_informative=4,
+            n_redundant=2,
+            n_repeated=1,
+            n_clusters_per_class=2,
+            shuffle=True,
+            random_state=0,
+        )
+        x_train = ds.array(x[: len(x) // 2], (300, 10))
+        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[len(x) // 2:], (300, 10))
+        y_test = y[len(y) // 2:]
+
+        rf = RandomForestClassifier(
+            random_state=0, sklearn_max=10, hard_vote=True
+        )
+        rf.fit(x_train, y_train)
+        save_model(rf, self.filepath)
+        rf2 = load_model(self.filepath)
+
+        y_pred = rf.predict(x_test).collect()
+        y_pred2 = rf2.predict(x_test).collect()
+        accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
+        accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test)
+        self.assertGreater(accuracy, 0.7)
+        self.assertGreater(accuracy2, 0.7)
+
+    def test_make_classification_hard_vote_score_mix(self):
+        """Tests RandomForestClassifier score with hard_vote, sklearn_max,
+        distr_depth and max_depth."""
+        x, y = make_classification(
+            n_samples=3000,
+            n_features=10,
+            n_classes=3,
+            n_informative=4,
+            n_redundant=2,
+            n_repeated=1,
+            n_clusters_per_class=2,
+            shuffle=True,
+            random_state=0,
+        )
+        x_train = ds.array(x[: len(x) // 2], (300, 10))
+        y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1))
+        x_test = ds.array(x[len(x) // 2:], (300, 10))
+        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))
+
+        rf = RandomForestClassifier(
+            random_state=0,
+            sklearn_max=100,
+            distr_depth=2,
+            max_depth=12,
+            hard_vote=True,
+        )
+        rf.fit(x_train, y_train)
+        save_model(rf, self.filepath)
+        rf2 = load_model(self.filepath)
+
+        accuracy = compss_wait_on(rf.score(x_test, y_test))
+        accuracy2 = compss_wait_on(rf2.score(x_test, y_test))
+        self.assertGreater(accuracy, 0.7)
+        self.assertGreater(accuracy2, 0.7)
+
+    def test_iris(self):
+        """Tests RandomForestClassifier with a minimal example."""
+        x, y = datasets.load_iris(return_X_y=True)
+        ds_fit = ds.array(x[::2], block_size=(30, 2))
+        fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1))
+        ds_validate = ds.array(x[1::2], block_size=(30, 2))
+        validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1))
+
+        rf = RandomForestClassifier(
+            n_estimators=1, max_depth=1, random_state=0
+        )
+        rf.fit(ds_fit, fit_y)
+        save_model(rf, self.filepath)
+        rf2 = load_model(self.filepath)
+
+        accuracy = compss_wait_on(rf.score(ds_validate, validate_y))
+        accuracy2 = compss_wait_on(rf2.score(ds_validate, validate_y))
+
+        # Accuracy should be <= 2/3 for any seed, often exactly equal.
+        self.assertAlmostEqual(accuracy, 2 / 3)
+        self.assertAlmostEqual(accuracy2, 2 / 3)
+
+
+class LassoSavingTestJSON(unittest.TestCase):
+    filepath = "tests/files/saving/lasso.json"
+
+    def test_fit_predict(self):
+        """Tests fit and predict methods."""
+
+        np.random.seed(42)
+
+        n_samples, n_features = 50, 100
+        X = np.random.randn(n_samples, n_features)
+
+        # Decreasing coef w. 
alternated signs for visualization
+        idx = np.arange(n_features)
+        coef = (-1) ** idx * np.exp(-idx / 10)
+        coef[10:] = 0  # sparsify coef
+        y = np.dot(X, coef)
+
+        # Add noise
+        y += 0.01 * np.random.normal(size=n_samples)
+
+        n_samples = X.shape[0]
+        X_train, y_train = X[: n_samples // 2], y[: n_samples // 2]
+        X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
+
+        lasso = Lasso(lmbd=0.1, max_iter=50)
+
+        lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1)))
+        save_model(lasso, self.filepath)
+        lasso2 = load_model(self.filepath)
+
+        y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100)))
+        r2_score_lasso = r2_score(y_test, y_pred_lasso.collect())
+        y_pred_lasso2 = lasso2.predict(ds.array(X_test, (25, 100)))
+        r2_score_lasso2 = r2_score(y_test, y_pred_lasso2.collect())
+
+        self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124)
+        self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124)
+
+
+class LinearRegressionSavingTestJSON(unittest.TestCase):
+    filepath = "tests/files/saving/linear_regression.json"
+
+    def test_univariate(self):
+        """Tests fit() and predict(), univariate."""
+        x_data = np.array([1, 2, 3, 4, 5])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 1
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression()
+        reg.fit(x, y)
+        save_model(reg, self.filepath)
+        reg2 = load_model(self.filepath)
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), 0.6))
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3))
+        self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3))
+
+        # Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.1))
+        self.assertTrue(np.allclose(pred2, 2.1))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9]))
+        self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9]))
+
+    def test_univariate_no_intercept(self):
+        """Tests fit() and predict(), univariate, fit_intercept=False."""
+        x_data = np.array([1, 2, 3, 4, 5])
+        y_data = np.array([2, 1, 1, 2, 4.5])
+
+        bn, bm = 2, 1
+
+        x = ds.array(x=x_data, block_size=(bn, bm))
+        y = ds.array(x=y_data, block_size=(bn, bm))
+
+        reg = LinearRegression(fit_intercept=False)
+        reg.fit(x, y)
+        save_model(reg, self.filepath)
+        reg2 = load_model(self.filepath)
+
+        self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818))
+        self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818))
+        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))
+        self.assertTrue(np.allclose(reg2.intercept_.collect(), 0))
+
+        # Predict one sample
+        x_test = np.array([3])
+        test_data = ds.array(x=x_test, block_size=(1, 1))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, 2.04545455))
+        self.assertTrue(np.allclose(pred2, 2.04545455))
+
+        # Predict multiple samples
+        x_test = np.array([3, 5, 6])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091]))
+        self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 
4.0909091])) + + def test_multivariate(self): + """Tests fit() and predict(), multivariate.""" + x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) + y_data = np.array([2, 1, 1, 2, 4.5]) + + bn, bm = 2, 2 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, 1)) + + reg = LinearRegression() + reg.fit(x, y) + save_model(reg, self.filepath) + reg2 = load_model(self.filepath) + + self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) + self.assertTrue( + np.allclose(reg2.coef_.collect(), [0.421875, 0.296875]) + ) + self.assertTrue(np.allclose(reg.intercept_.collect(), 0.240625)) + self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.240625)) + + # Predict one sample + x_test = np.array([3, 2]) + test_data = ds.array(x=x_test, block_size=(1, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, 2.1)) + self.assertTrue(np.allclose(pred2, 2.1)) + + # Predict multiple samples + x_test = np.array([[3, 2], [4, 4], [1, 3]]) + test_data = ds.array(x=x_test, block_size=(bn, bm)) + pred = reg.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) + + def test_multivariate_no_intercept(self): + """Tests fit() and predict(), multivariate, fit_intercept=False.""" + x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) + y_data = np.array([2, 1, 1, 2, 4.5]) + + bn, bm = 2, 2 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, 1)) + + reg = LinearRegression(fit_intercept=False) + reg.fit(x, y) + save_model(reg, self.filepath) + reg2 = load_model(self.filepath) + + self.assertTrue( + np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) + ) + self.assertTrue( + np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232]) + ) + self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) + self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) + + # Predict one sample + x_test = np.array([3, 2]) + test_data = ds.array(x=x_test, block_size=(1, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.05649718])) + self.assertTrue(np.allclose(pred2, [2.05649718])) + + # Predict multiple samples + x_test = np.array([[3, 2], [4, 4], [1, 3]]) + test_data = ds.array(x=x_test, block_size=(bn, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678])) + self.assertTrue( + np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) + ) + + def test_multivariate_multiobjective(self): + """Tests fit() and predict(), multivariate, multiobjective.""" + x_data = np.array( + [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]] + ) + y_data = np.array( + [ + [2, 0, 3], + [1, 5, 2], + [1, 3, 4], + [2, 7, 9], + [4.5, -1, 4], + [0, 0, 0], + ] + ) + + bn, bm = 2, 2 + + x = ds.array(x=x_data, block_size=(bn, bm)) + y = ds.array(x=y_data, block_size=(bn, bm)) + + reg = LinearRegression() + reg.fit(x, y) + save_model(reg, self.filepath) + reg2 = load_model(self.filepath) + + # Predict one sample + x_test = np.array([3, 2, 1]) + test_data = ds.array(x=x_test, block_size=(1, bm)) + pred = reg.predict(test_data).collect() + pred2 = reg2.predict(test_data).collect() + self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906])) + self.assertTrue( + np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906]) + ) + + # Predict 
multiple samples
+        x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]])
+        test_data = ds.array(x=x_test, block_size=(bn, bm))
+        pred = reg.predict(test_data).collect()
+        pred2 = reg2.predict(test_data).collect()
+        self.assertTrue(
+            np.allclose(
+                pred,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                pred2,
+                [
+                    [3.0318415, 1.97164872, 3.85410906],
+                    [2.5033157, 2.65809327, 5.05310495],
+                    [2.145797, 1.4840121, 1.5739791],
+                ],
+            )
+        )
+
+        # Check attribute values
+        self.assertTrue(
+            np.allclose(
+                reg.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.coef_.collect(),
+                [
+                    [0.65034768, 0.34673933, 1.22176283],
+                    [-0.41465084, -0.20584208, -0.16339571],
+                    [-0.38211131, 0.27277365, 0.07031439],
+                ],
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761]
+            )
+        )
+
+
+def load_movielens(train_ratio=0.9):
+    file = "tests/files/sample_movielens_ratings.csv"
+
+    # 'user_id', 'movie_id', 'rating', 'timestamp'
+
+    data = np.genfromtxt(file, dtype="int", delimiter=",", usecols=range(3))
+
+    # just in case there are movies/users without ratings
+    # movie_id
+    n_m = max(len(np.unique(data[:, 1])), max(data[:, 1]) + 1)
+    # user_id
+    n_u = max(len(np.unique(data[:, 0])), max(data[:, 0]) + 1)
+
+    idx = int(data.shape[0] * train_ratio)
+
+    train_data = data[:idx]
+    test_data = data[idx:]
+
+    train = csr_matrix(
+        (train_data[:, 2], (train_data[:, 0], train_data[:, 1])),
+        shape=(n_u, n_m),
+    )
+
+    test = csr_matrix((test_data[:, 2], (test_data[:, 0], test_data[:, 1])))
+
+    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
+    train_arr = ds.array(train, block_size=(x_size, y_size))
+
+    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
+    test_arr = ds.array(test, block_size=(x_size, y_size))
+
+    return train_arr, test_arr
+
+
+class ALSSavingTestJSON(unittest.TestCase):
+    filepath = "tests/files/saving/als.json"
+
+    def test_init_params(self):
+        # Test all parameters
+        seed = 666
+        n_f = 100
+        lambda_ = 0.001
+        convergence_threshold = 0.1
+        max_iter = 10
+        verbose = True
+        arity = 12
+
+        als = ALS(
+            random_state=seed,
+            n_f=n_f,
+            lambda_=lambda_,
+            tol=convergence_threshold,
+            max_iter=max_iter,
+            verbose=verbose,
+            arity=arity,
+        )
+        save_model(als, self.filepath)
+        als2 = load_model(self.filepath)
+
+        self.assertEqual(als.random_state, seed)
+        self.assertEqual(als.n_f, n_f)
+        self.assertEqual(als.lambda_, lambda_)
+        self.assertEqual(als.tol, convergence_threshold)
+        self.assertEqual(als.max_iter, max_iter)
+        self.assertEqual(als.verbose, verbose)
+        self.assertEqual(als.arity, arity)
+        self.assertEqual(als2.random_state, seed)
+        self.assertEqual(als2.n_f, n_f)
+        self.assertEqual(als2.lambda_, lambda_)
+        self.assertEqual(als2.tol, convergence_threshold)
+        self.assertEqual(als2.max_iter, max_iter)
+        self.assertEqual(als2.verbose, verbose)
+        self.assertEqual(als2.arity, arity)
+
+    def test_fit(self):
+        train, test = load_movielens()
+
+        als = ALS(
+            tol=0.01,
+            random_state=666,
+            n_f=100,
+            verbose=False,
+            check_convergence=True,
+        )
+
+        als.fit(train, test)
+        self.assertTrue(als.converged)
+
+        als.fit(train)
+        save_model(als, self.filepath)
+        als2 = 
load_model(self.filepath)
+
+        self.assertTrue(als.converged)
+        self.assertTrue(als2.converged)
+
+    def test_predict(self):
+        data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]])
+        ratings = csr_matrix(data)
+        train = ds.array(x=ratings, block_size=(1, 1))
+        als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False)
+        als.fit(train)
+        save_model(als, self.filepath)
+        als2 = load_model(self.filepath)
+
+        predictions = als.predict_user(user_id=0)
+        predictions2 = als2.predict_user(user_id=0)
+
+        # Check that the ratings for user 0 are similar to user 1 because they
+        # share preferences (third movie), thus it is expected that user 0
+        # will rate movie 1 similarly to user 1.
+        self.assertTrue(
+            2.75 < predictions[0] < 3.25
+            and predictions[1] < 1
+            and predictions[2] > 4.5
+        )
+        self.assertTrue(
+            2.75 < predictions2[0] < 3.25
+            and predictions2[1] < 1
+            and predictions2[2] > 4.5
+        )
+        self.assertTrue(
+            np.array_equal(predictions, predictions2, equal_nan=True)
+        )
+
+
+def main():
+    unittest.main()
+
+
+if __name__ == "__main__":
+    main()

From 2b9f8f370fcdde8992a93dda22e1805617000161 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Mon, 19 Jul 2021 10:38:34 +0200
Subject: [PATCH 23/46] Format and doc changes

---
 .gitignore | 3 ++
 dislib/utils/saving.py | 107 +++++++++++++++++++++--------------------
 requirements.txt | 1 +
 3 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 66a5171a..ac75ae24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -112,6 +112,9 @@ target/
 *compss*.out
 *compss*.err
 
+# Saving
+**/saving/*
+
 # ========== C & C++ ignores =================
 # Prerequisites
 *.d
diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index 31159a7f..30b015d8 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -1,6 +1,7 @@
 import json
 import os
 import numpy as np
+import cbor2
 
 from pycompss.runtime.management.classes import Future
 from pycompss.api.api import compss_wait_on
@@ -24,11 +25,6 @@
     _SkTreeWrapper,
 )
 
-try:
-    import cbor2
-except ImportError:
-    cbor2 = None
-
 # Dislib models with saving tested (model: str -> module: str)
 _implemented_models = {
     "KMeans": "cluster",
@@ -56,9 +52,27 @@
 }
 
 
-def save_model(model, filepath, overwrite=True, save_format=None):
-    """Saves a model to a file.
-    Usage:
+def save_model(model, filepath, overwrite=True, save_format="json"):
+    """ Saves a model to a file.
+
+    The model is synchronized before saving and can be reinstantiated in the
+    exact same state, without any of the code used for model definition or
+    fitting.
+
+    Parameters
+    ----------
+    model : dislib model
+        Dislib model to serialize and save.
+    filepath : str
+        Path where the model will be saved.
+    overwrite : bool, optional (default=True)
+        Whether any existing model at the target
+        location should be overwritten.
+    save_format : str, optional (default='json')
+        Format used to save the model.
+
+    Examples
+    --------
     >>> from dislib.cluster import KMeans
     >>> from dislib.utils import save_model, load_model
    >>> import numpy as np
    >>> import dislib as ds
    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
    >>> x_train = ds.array(x, (2, 2))
    >>> model = KMeans(n_clusters=2, random_state=0)
    >>> model.fit(x_train)
    >>> save_model(model, '/tmp/model')
    >>> loaded_model = load_model('/tmp/model')
    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
    >>> model_pred = model.predict(x_test)
    >>> loaded_model_pred = loaded_model.predict(x_test)
    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
-
-    The file contains:
-        - the model's class
-        - the model's attributes
-    The model is synchronized before saving and can be reinstantiated in the
-    exact same state, without any of the code used for model definition or
-    fitting.
-    Args:
-        model: `dislib` model instance to be saved.
-        filepath: String path where to save the model
-        overwrite: Whether we should overwrite any existing model at the target
-            location, or instead ask the user with a manual prompt.
-        save_format: Format used to save the model. Defaults to `json`.
     """
+
     # Check overwrite
     if not overwrite and os.path.isfile(filepath):
         return
@@ -108,23 +110,31 @@ def save_model(model, filepath, overwrite=True, save_format=None):
     model_metadata["model_name"] = model_name
 
     # Save model
-    default_format = "json"
-    save_format = save_format or default_format
     if save_format == "json":
         with open(filepath, "w") as f:
             json.dump(model_metadata, f, default=_encode_helper)
     elif save_format == "cbor":
-        if cbor2 is None:
-            raise ModuleNotFoundError("No module named 'cbor2'")
         with open(filepath, "wb") as f:
             cbor2.dump(model_metadata, f, default=_encode_helper_cbor)
     else:
-        raise ValueError("Save format must be either json or h5.")
+        raise ValueError("Save format must be either 'json' or 'cbor'.")
+
 
-def load_model(filepath, load_format=None):
-    """Loads a model from a file.
-    Usage:
+def load_model(filepath, load_format="json"):
+    """ Loads a model from a file.
+
+    The model is reinstantiated in the exact same state in which it was saved,
+    without any of the code used for model definition or fitting.
+
+    Parameters
+    ----------
+    filepath : str
+        Path of the saved model.
+    load_format : str, optional (default='json')
+        Format used to load the model.
+
+    Examples
+    --------
     >>> from dislib.cluster import KMeans
     >>> from dislib.utils import save_model, load_model
    >>> import numpy as np
    >>> import dislib as ds
    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
    >>> x_train = ds.array(x, (2, 2))
    >>> model = KMeans(n_clusters=2, random_state=0)
    >>> model.fit(x_train)
    >>> save_model(model, '/tmp/model')
    >>> loaded_model = load_model('/tmp/model')
    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
    >>> model_pred = model.predict(x_test)
    >>> loaded_model_pred = loaded_model.predict(x_test)
    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
-
-    The file must contain:
-        - the model's class
-        - the model's attributes
-    The model is reinstantiated in the exact same state in which it was saved,
-    without any of the code used for model definition or fitting.
-    Args:
-        filepath: String path where to save the model
-        load_format: Format used to load the model. Defaults to 'json'.
""" # Load model - default_format = "json" - load_format = load_format or default_format - if load_format == "json": with open(filepath, "r") as f: model_metadata = json.load(f, object_hook=_decode_helper) elif load_format == "cbor": - if cbor2 is None: - raise ModuleNotFoundError("No module named 'cbor2'") with open(filepath, "rb") as f: model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor) else: - raise ValueError("Load format must be either json or h5.") + raise ValueError("Wrong load format.") # Check for dislib model model_name = model_metadata["model_name"] if model_name not in _implemented_models.keys(): raise NotImplementedError( - "Loading has only been implemented for the following models:\n%s" + "Saving has only been implemented for the following models:\n%s" % _implemented_models.keys() ) del model_metadata["model_name"] @@ -179,7 +175,7 @@ def load_model(filepath, load_format=None): model = model_class() model.__dict__.update(model_metadata) - # Set class methodss + # Set class methods if model_name == "CascadeSVM" and "kernel" in model_metadata: try: model._kernel_f = getattr( @@ -192,17 +188,19 @@ def load_model(filepath, load_format=None): def _encode_helper_cbor(encoder, obj): - """Special encoder wrapper for dislib using cbor""" + """ Special encoder wrapper for dislib using cbor2""" encoder.encode(_encode_helper(obj)) def _decode_helper_cbor(decoder, obj): - """Special decoder wrapper for dislib using cbor""" + """ Special decoder wrapper for dislib using cbor2""" return _decode_helper(obj) def _encode_helper(obj): - """Special encoder for dislib""" + """ Special encoder for dislib that serializes the different objectes + and stores their state for future loading. + """ if isinstance(obj, np.generic): return obj.item() elif isinstance(obj, range): @@ -254,7 +252,9 @@ def _encode_helper(obj): def _decode_helper(obj): - """Special decoder for dislib""" + """ Special decoder for dislib that instantiates the different objects + and updates their attributes to recover the saved state. + """ if isinstance(obj, dict) and "class_name" in obj: class_name = obj["class_name"] @@ -331,7 +331,9 @@ def _decode_helper(obj): def _sync_obj(obj): - """Recursively synchronizes the Future objects of a list or dictionary.""" + """ Recursively synchronizes the Future objects of a list or dictionary + by using `compss_wait_on(obj)`. + """ if isinstance(obj, dict): iterator = iter(obj.items()) elif isinstance(obj, list): @@ -353,7 +355,8 @@ def _sync_obj(obj): def _sync_rf(rf): - """Sync the `try_features` and 'n_classes' attribute of the different trees + """ Sync the `try_features` and `n_classes` attribute of the different trees + since they cannot be synced recursively. 
""" if isinstance(rf.trees[0].try_features, Future): try_features = compss_wait_on(rf.trees[0].try_features) diff --git a/requirements.txt b/requirements.txt index 4100177f..3fc50ee3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ scipy>=1.3.0 numpy>=1.18.1, <=1.19.5 numpydoc>=0.8.0 cvxpy>=1.1.5 +cbor2>=5.4.0 \ No newline at end of file From 6dbd62572cb45f05da4b474d96f37d335dc7516d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 19 Jul 2021 16:43:00 +0200 Subject: [PATCH 24/46] cbor2 not always required --- dislib/utils/saving.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index 30b015d8..65065eab 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -1,7 +1,6 @@ import json import os import numpy as np -import cbor2 from pycompss.runtime.management.classes import Future from pycompss.api.api import compss_wait_on @@ -25,6 +24,11 @@ _SkTreeWrapper, ) +try: + import cbor2 +except ImportError: + cbor2 = None + # Dislib models with saving tested (model: str -> module: str) _implemented_models = { "KMeans": "cluster", @@ -114,6 +118,8 @@ def save_model(model, filepath, overwrite=True, save_format="json"): with open(filepath, "w") as f: json.dump(model_metadata, f, default=_encode_helper) elif save_format == "cbor": + if cbor2 is None: + raise ModuleNotFoundError("No module named 'cbor2'") with open(filepath, "wb") as f: cbor2.dump(model_metadata, f, default=_encode_helper_cbor) else: @@ -155,6 +161,8 @@ def load_model(filepath, load_format="json"): with open(filepath, "r") as f: model_metadata = json.load(f, object_hook=_decode_helper) elif load_format == "cbor": + if cbor2 is None: + raise ModuleNotFoundError("No module named 'cbor2'") with open(filepath, "rb") as f: model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor) else: @@ -188,12 +196,12 @@ def load_model(filepath, load_format="json"): def _encode_helper_cbor(encoder, obj): - """ Special encoder wrapper for dislib using cbor2""" + """ Special encoder wrapper for dislib using cbor2.""" encoder.encode(_encode_helper(obj)) def _decode_helper_cbor(decoder, obj): - """ Special decoder wrapper for dislib using cbor2""" + """ Special decoder wrapper for dislib using cbor2.""" return _decode_helper(obj) From 004e713e9f62bc6d8df29cf1a6c2f40721091cf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 12:12:32 +0200 Subject: [PATCH 25/46] Install dislib requirements --- Dockerfile | 2 ++ dislib/utils/saving.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index e8a72019..75fabe37 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ COPY . dislib/ ENV PYTHONPATH=$PYTHONPATH:/dislib +RUN python -m pip install -r /dislib/requirements.txt + # Expose SSH port and run SSHD EXPOSE 22 CMD ["/usr/sbin/sshd","-D"] diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index 65065eab..a6e4e0fd 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -28,7 +28,7 @@ import cbor2 except ImportError: cbor2 = None - + # Dislib models with saving tested (model: str -> module: str) _implemented_models = { "KMeans": "cluster", @@ -57,7 +57,7 @@ def save_model(model, filepath, overwrite=True, save_format="json"): - """ Saves a model to a file. + """Saves a model to a file. 
The model is synchronized before saving and can be reinstantiated in the exact same state, without any of the code used for model definition or @@ -127,7 +127,7 @@ def save_model(model, filepath, overwrite=True, save_format="json"): def load_model(filepath, load_format="json"): - """ Loads a model from a file. + """Loads a model from a file. The model is reinstantiated in the exact same state in which it was saved, without any of the code used for model definition or fitting. @@ -196,17 +196,17 @@ def load_model(filepath, load_format="json"): def _encode_helper_cbor(encoder, obj): - """ Special encoder wrapper for dislib using cbor2.""" + """Special encoder wrapper for dislib using cbor2.""" encoder.encode(_encode_helper(obj)) def _decode_helper_cbor(decoder, obj): - """ Special decoder wrapper for dislib using cbor2.""" + """Special decoder wrapper for dislib using cbor2.""" return _decode_helper(obj) def _encode_helper(obj): - """ Special encoder for dislib that serializes the different objectes + """Special encoder for dislib that serializes the different objectes and stores their state for future loading. """ if isinstance(obj, np.generic): @@ -260,7 +260,7 @@ def _encode_helper(obj): def _decode_helper(obj): - """ Special decoder for dislib that instantiates the different objects + """Special decoder for dislib that instantiates the different objects and updates their attributes to recover the saved state. """ if isinstance(obj, dict) and "class_name" in obj: @@ -339,7 +339,7 @@ def _decode_helper(obj): def _sync_obj(obj): - """ Recursively synchronizes the Future objects of a list or dictionary + """Recursively synchronizes the Future objects of a list or dictionary by using `compss_wait_on(obj)`. """ if isinstance(obj, dict): @@ -363,7 +363,7 @@ def _sync_obj(obj): def _sync_rf(rf): - """ Sync the `try_features` and `n_classes` attribute of the different trees + """Sync the `try_features` and `n_classes` attribute of the different trees since they cannot be synced recursively. """ if isinstance(rf.trees[0].try_features, Future): From 456e783c6fee6d9743983f87dd15b4ecb54e9fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 12:19:38 +0200 Subject: [PATCH 26/46] Added directory to save models during testing. --- .gitignore | 3 ++- tests/files/saving/saving.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 tests/files/saving/saving.txt diff --git a/.gitignore b/.gitignore index ac75ae24..ad8ef5a4 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,8 @@ target/ *compss*.err # Saving -**/saving/* +tests/files/saving/* +!tests/files/saving/*.txt # ========== C & C++ ignores ================= # Prerequisites diff --git a/tests/files/saving/saving.txt b/tests/files/saving/saving.txt new file mode 100644 index 00000000..d7d8541b --- /dev/null +++ b/tests/files/saving/saving.txt @@ -0,0 +1 @@ +Directory where the models generated by the tests regarding saving functionalities should be located. \ No newline at end of file From e67cfa1c505a1226d0095976b3d36e27ece40e2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 12:21:54 +0200 Subject: [PATCH 27/46] Install requirements using pip3. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 75fabe37..2fa20a5d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ COPY . 
dislib/ ENV PYTHONPATH=$PYTHONPATH:/dislib -RUN python -m pip install -r /dislib/requirements.txt +RUN pip3 install -r /dislib/requirements.txt # Expose SSH port and run SSHD EXPOSE 22 From addf6d54cf95bdad19eaed7bc685d55a03374120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 13:05:17 +0200 Subject: [PATCH 28/46] Changed environment language. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2fa20a5d..7b1ed215 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ MAINTAINER COMPSs Support COPY . dislib/ ENV PYTHONPATH=$PYTHONPATH:/dislib - +ENV LC_ALL=C.UTF-8 RUN pip3 install -r /dislib/requirements.txt # Expose SSH port and run SSHD From 8e40e411ec7e05fc692263fae7229f5efac51f95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 17:02:51 +0200 Subject: [PATCH 29/46] Changed Jenkins timeout from 2h to 3h --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 681857e0..eaf042e9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -11,7 +11,7 @@ def setGithubCommitStatus(state, description) { pipeline { options { - timeout(time: 4, unit: 'HOURS') + timeout(time: 3, unit: 'HOURS') } agent { node { From eb852e6319f1337c13342d338bcc3ae4500400ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 23 Jul 2021 12:53:22 +0200 Subject: [PATCH 30/46] Changed names of constant variables --- dislib/utils/saving.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index a6e4e0fd..f0b8313c 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -30,7 +30,7 @@ cbor2 = None # Dislib models with saving tested (model: str -> module: str) -_implemented_models = { +IMPLEMENTED_MODELS = { "KMeans": "cluster", "GaussianMixture": "cluster", "CascadeSVM": "classification", @@ -41,7 +41,7 @@ } # Classes used by models -_dislib_classes = { +DISLIB_CLASSES = { "KMeans": dislib.cluster.KMeans, "DecisionTreeClassifier": DecisionTreeClassifier, "_Node": _Node, @@ -50,7 +50,7 @@ "_SkTreeWrapper": _SkTreeWrapper, } -_sklearn_classes = { +SKLEARN_CLASSES = { "SVC": SklearnSVC, "DecisionTreeClassifier": SklearnDTClassifier, } @@ -99,10 +99,10 @@ def save_model(model, filepath, overwrite=True, save_format="json"): # Check for dislib model model_name = model.__class__.__name__ - if model_name not in _implemented_models.keys(): + if model_name not in IMPLEMENTED_MODELS.keys(): raise NotImplementedError( "Saving has only been implemented for the following models:\n%s" - % _implemented_models.keys() + % IMPLEMENTED_MODELS.keys() ) # Synchronize model @@ -170,15 +170,15 @@ def load_model(filepath, load_format="json"): # Check for dislib model model_name = model_metadata["model_name"] - if model_name not in _implemented_models.keys(): + if model_name not in IMPLEMENTED_MODELS.keys(): raise NotImplementedError( "Saving has only been implemented for the following models:\n%s" - % _implemented_models.keys() + % IMPLEMENTED_MODELS.keys() ) del model_metadata["model_name"] # Create model - model_module = getattr(ds, _implemented_models[model_name]) + model_module = getattr(ds, IMPLEMENTED_MODELS[model_name]) model_class = getattr(model_module, model_name) model = model_class() model.__dict__.update(model_metadata) @@ -249,7 +249,7 @@ def _encode_helper(obj): "items": 
obj.__getstate__(), } elif isinstance( - obj, tuple(_dislib_classes.values()) + tuple(_sklearn_classes.values()) + obj, tuple(DISLIB_CLASSES.values()) + tuple(SKLEARN_CLASSES.values()) ): return { "class_name": obj.__class__.__name__, @@ -302,12 +302,12 @@ def _decode_helper(obj): model.__setstate__(dict_) return model elif ( - class_name in _dislib_classes.keys() + class_name in DISLIB_CLASSES.keys() and "dislib" in obj["module_name"] ): dict_ = _decode_helper(obj["items"]) if class_name == "DecisionTreeClassifier": - model = _dislib_classes[obj["class_name"]]( + model = DISLIB_CLASSES[obj["class_name"]]( try_features=dict_.pop("try_features"), max_depth=dict_.pop("max_depth"), distr_depth=dict_.pop("distr_depth"), @@ -317,17 +317,17 @@ def _decode_helper(obj): ) elif class_name == "_SkTreeWrapper": sk_tree = _decode_helper(dict_.pop("sk_tree")) - model = _dislib_classes[obj["class_name"]](sk_tree) + model = DISLIB_CLASSES[obj["class_name"]](sk_tree) else: - model = _dislib_classes[obj["class_name"]]() + model = DISLIB_CLASSES[obj["class_name"]]() model.__dict__.update(dict_) return model elif ( - class_name in _sklearn_classes.keys() + class_name in SKLEARN_CLASSES.keys() and "sklearn" in obj["module_name"] ): dict_ = _decode_helper(obj["items"]) - model = _sklearn_classes[obj["class_name"]]() + model = SKLEARN_CLASSES[obj["class_name"]]() model.__dict__.update(dict_) return model elif class_name == "callable": From db0db92195a7c29a483f7006f0d5d21752bae63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 21 Jul 2021 17:02:05 +0200 Subject: [PATCH 31/46] Add RF Classifier and started modifying _data.py --- dislib/regression/rf/__init__.py | 0 dislib/regression/rf/_data.py | 279 ++++++++++++++ dislib/regression/rf/decision_tree.py | 520 ++++++++++++++++++++++++++ dislib/regression/rf/forest.py | 306 +++++++++++++++ dislib/regression/rf/test_split.py | 50 +++ 5 files changed, 1155 insertions(+) create mode 100644 dislib/regression/rf/__init__.py create mode 100644 dislib/regression/rf/_data.py create mode 100644 dislib/regression/rf/decision_tree.py create mode 100644 dislib/regression/rf/forest.py create mode 100644 dislib/regression/rf/test_split.py diff --git a/dislib/regression/rf/__init__.py b/dislib/regression/rf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dislib/regression/rf/_data.py b/dislib/regression/rf/_data.py new file mode 100644 index 00000000..360f705d --- /dev/null +++ b/dislib/regression/rf/_data.py @@ -0,0 +1,279 @@ +import tempfile + +import numpy as np +from numpy.lib import format +from pycompss.api.parameter import ( + FILE_IN, + FILE_INOUT, + COLLECTION_IN, + Depth, + Type, +) +from pycompss.api.task import task + +from dislib.data.array import Array + + +class RfDataset(object): + """Dataset format used by the fit() of the RandomForestRegressor. + + The RfDataset contains a file path for the samples and another one for the + targets. Optionally, a path can be provided for a transposed version of the + samples matrix, i.e., the features. + + Note: For a representation of a dataset distributed in multiple files, use + dislib.data.Dataset instead. + + Parameters + ---------- + samples_path : str + Path of the .npy file containing the 2-d array of samples. It can be a + pycompss.runtime.Future object. If so, self.n_samples and + self.n_features must be set manually (they can also be + pycompss.runtime.Future objects). + targets_path : str + Path of the .dat file containing the 1-d array of targets. 
It can be a + pycompss.runtime.Future object. + features_path : str, optional (default=None) + Path of the .npy file containing the 2-d array of samples transposed. + The array must be C-ordered. Providing this array may improve the + performance as it allows sequential access to the features. + + Attributes + ---------- + n_samples : int + The number of samples of the dataset. It can be a + pycompss.runtime.Future object. + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + y_targets : ndarray + The array of targets for this RfDataset. It can be a + pycompss.runtime.Future object. + + """ + + def __init__(self, samples_path, targets_path, features_path=None): + self.samples_path = samples_path + self.targets_path = targets_path + self.features_path = features_path + self.n_samples = None + self.n_features = None + + self.y_targets = None + + def get_n_samples(self): + """Gets the number of samples obtained from the samples file. + + Returns + ------- + n_samples : int + + Raises + ------ + AssertionError + If self.n_samples is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. + + """ + if self.n_samples is None: + assert isinstance(self.samples_path, str), ( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_samples + + def get_n_features(self): + """Gets the number of features obtained from the samples file. + + Returns + ------- + n_features : int + + Raises + ------ + AssertionError + If self.n_features is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. + + """ + if self.n_features is None: + assert isinstance(self.samples_path, str), ( + "self.n_features must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_features + + def get_y_targets(self): + """Obtains the array of targets. + + Returns + ------- + y_targets : ndarray + + """ + if self.y_targets is None: + targets = _get_targets(self.targets_path) + self.y_targets = targets + return self.y_targets + + def validate_features_file(self): + """Validates the features file header information. + + Raises + ------ + ValueError + If the shape of the array in the features_file doesn't match this + class n_samples and n_features or if the array is in fortran order. + + """ + features_npy_file = _NpyFile(self.features_path) + shape = features_npy_file.get_shape() + fortran_order = features_npy_file.get_fortran_order() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from features_file.") + if (self.get_n_features(), self.get_n_samples()) != shape: + raise ValueError("Invalid dimensions for the features_file.") + if fortran_order: + raise ValueError("Fortran order not supported for features array.") + + +def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: + """Creates a RfDataset object from samples x and targets y. + + This function creates a dislib.regression.rf.data.RfDataset by saving + x and y in files. 
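+    The samples are stored in a binary .npy file (as float32) and the targets
+    in a plain-text file, so the fit tasks can later read the samples through
+    a memory map instead of loading the whole array at once.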
+ + Parameters + ---------- + x : ds-array, shape = (n_samples, n_features) + The training input samples. + y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) + The target values. + + Returns + ------- + rf_dataset : dislib.regression.rf._data.RfDataset + + """ + n_samples = x.shape[0] + n_features = x.shape[1] + + samples_file = tempfile.NamedTemporaryFile( + mode="wb", prefix="tmp_rf_samples_", delete=False + ) + samples_path = samples_file.name + samples_file.close() + _allocate_samples_file(samples_path, n_samples, n_features) + + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_samples_file(samples_path, top_row._blocks, start_idx) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: + _fill_samples_file(samples_path, x_row._blocks, start_idx) + start_idx += x._reg_shape[0] + + targets_file = tempfile.NamedTemporaryFile( + mode="w", prefix="tmp_rf_targets_", delete=False + ) + targets_path = targets_file.name + targets_file.close() + for y_row in y._iterator(axis=0): + _fill_targets_file(targets_path, y_row._blocks) + + rf_dataset = RfDataset(samples_path, targets_path) + rf_dataset.n_samples = n_samples + rf_dataset.n_features = n_features + return rf_dataset + + +class _NpyFile(object): + def __init__(self, path): + self.path = path + + self.shape = None + self.fortran_order = None + self.dtype = None + + def get_shape(self): + if self.shape is None: + self._read_header() + return self.shape + + def get_fortran_order(self): + if self.fortran_order is None: + self._read_header() + return self.fortran_order + + def get_dtype(self): + if self.dtype is None: + self._read_header() + return self.dtype + + def _read_header(self): + with open(self.path, "rb") as fp: + version = format.read_magic(fp) + try: + format._check_version(version) + except ValueError: + raise ValueError("Invalid file format.") + header_data = format._read_array_header(fp, version) + self.shape, self.fortran_order, self.dtype = header_data + + +@task(targets_path=FILE_IN, returns=1) +def _get_targets(targets_path): + y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8") + return y + + +@task(returns=1) +def _get_samples_shape(subset): + return subset.samples.shape + + +@task(returns=3) +def _merge_shapes(*samples_shapes): + n_samples = 0 + n_features = samples_shapes[0][1] + for shape in samples_shapes: + n_samples += shape[0] + assert shape[1] == n_features, "Subsamples with different n_features." 
+ return samples_shapes, n_samples, n_features + + +@task(samples_path=FILE_INOUT) +def _allocate_samples_file(samples_path, n_samples, n_features): + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + shape=(int(n_samples), int(n_features)), + ) + + +@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_samples_file(samples_path, row_blocks, start_idx): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") + samples = np.lib.format.open_memmap(samples_path, mode="r+") + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples + + +@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_targets_file(targets_path, row_blocks): + rows_targets = Array._merge_blocks(row_blocks) + with open(targets_path, "at") as f: + np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8") diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py new file mode 100644 index 00000000..0725fcfa --- /dev/null +++ b/dislib/regression/rf/decision_tree.py @@ -0,0 +1,520 @@ +from sys import float_info + +import numpy as np +from numpy.random.mtrand import RandomState +from pycompss.api.api import compss_delete_object +from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth +from pycompss.api.task import task +from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier + +from dislib.classification.rf.test_split import test_split +from dislib.data.array import Array + + +class DecisionTreeClassifier: + """A distributed decision tree classifier. + + Parameters + ---------- + try_features : int + The number of features to consider when looking for the best split. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + to effectively inspect more than ``try_features`` features. + max_depth : int + The maximum depth of the tree. If np.inf, then nodes are expanded + until all leaves are pure. + distr_depth : int + Number of levels of the tree in which the nodes are split in a + distributed way. + bootstrap : bool + Randomly select n_instances samples with repetition (used in random + forests). + random_state : RandomState instance + The random number generator. + + Attributes + ---------- + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + n_classes : int + The number of classes of this RfDataset. It can be a + pycompss.runtime.Future object. + tree : None or _Node + The root node of the tree after the tree is fitted. + nodes_info : None or list of _InnerNodeInfo and _LeafInfo + List of the node information for the nodes of the tree in the same + order as obtained in the fit() method, up to ``distr_depth`` depth. + After fit(), it is a pycompss.runtime.Future object. + subtrees : None or list of _Node + List of subtrees of the tree at ``distr_depth`` depth obtained in the + fit() method. After fit(), it is a list of pycompss.runtime.Future + objects. + + Methods + ------- + fit(dataset) + Fits the DecisionTreeClassifier. + predict(x_row) + Predicts classes for the given samples using a fitted tree. + predict_proba(x_row) + Predicts class probabilities for the given smaples using a fitted tree. 
+ + """ + + def __init__(self, try_features, max_depth, distr_depth, sklearn_max, + bootstrap, random_state): + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.bootstrap = bootstrap + self.random_state = random_state + + self.n_features = None + self.n_classes = None + + self.tree = None + self.nodes_info = None + self.subtrees = None + + def fit(self, dataset): + """Fits the DecisionTreeClassifier. + + Parameters + ---------- + dataset : dislib.classification.rf._data.RfDataset + + """ + + self.n_features = dataset.get_n_features() + self.n_classes = dataset.get_n_classes() + samples_path = dataset.samples_path + features_path = dataset.features_path + n_samples = dataset.get_n_samples() + y_codes = dataset.get_y_codes() + + seed = self.random_state.randint(np.iinfo(np.int32).max) + + sample, y_s = _sample_selection(n_samples, y_codes, self.bootstrap, + seed) + + self.tree = _Node() + self.nodes_info = [] + self.subtrees = [] + tree_traversal = [(self.tree, sample, y_s, 0)] + while tree_traversal: + node, sample, y_s, depth = tree_traversal.pop() + if depth < self.distr_depth: + split = _split_node_wrapper(sample, self.n_features, y_s, + self.n_classes, self.try_features, + self.random_state, + samples_file=samples_path, + features_file=features_path) + node_info, left_group, y_l, right_group, y_r = split + compss_delete_object(sample) + compss_delete_object(y_s) + node.content = len(self.nodes_info) + self.nodes_info.append(node_info) + node.left = _Node() + node.right = _Node() + depth = depth + 1 + tree_traversal.append((node.right, right_group, y_r, depth)) + tree_traversal.append((node.left, left_group, y_l, depth)) + else: + subtree = _build_subtree_wrapper(sample, y_s, self.n_features, + self.max_depth - depth, + self.n_classes, + self.try_features, + self.sklearn_max, + self.random_state, + samples_path, features_path) + node.content = len(self.subtrees) + self.subtrees.append(subtree) + compss_delete_object(sample) + compss_delete_object(y_s) + self.nodes_info = _merge(*self.nodes_info) + + def predict(self, x_row): + """Predicts classes for the given samples using a fitted tree. + + Parameters + ---------- + x_row : ds-array + A row block of samples. + + Returns + ------- + predicted : ndarray + An array with the predicted classes for the given samples. The + values are codes of the fitted + dislib.classification.rf.data.RfDataset. The returned object can + be a pycompss.runtime.Future object. + + """ + + assert self.tree is not None, 'The decision tree is not fitted.' + + branch_predictions = [] + for i, subtree in enumerate(self.subtrees): + pred = _predict_branch(x_row._blocks, self.tree, self.nodes_info, + i, subtree, self.distr_depth) + branch_predictions.append(pred) + return _merge_branches(None, *branch_predictions) + + def predict_proba(self, x_row): + """Predicts class probabilities for a row block using a fitted tree. + + Parameters + ---------- + x_row : ds-array + A row block of samples. + + Returns + ------- + predicted_proba : ndarray + An array with the predicted probabilities for the given samples. + The shape is (len(subset.samples), self.n_classes), with the index + of the column being codes of the fitted + dislib.classification.rf.data.RfDataset. The returned object can be + a pycompss.runtime.Future object. + + """ + + assert self.tree is not None, 'The decision tree is not fitted.' 
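+        # Each subtree covers the samples that the distributed top levels of
+        # the tree route to its branch; every call below returns a
+        # (mask, probabilities) pair, and _merge_branches combines the pairs
+        # using the boolean masks.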
+ + branch_predictions = [] + for i, subtree in enumerate(self.subtrees): + pred = _predict_branch_proba(x_row._blocks, self.tree, + self.nodes_info, i, subtree, + self.distr_depth, self.n_classes) + branch_predictions.append(pred) + return _merge_branches(self.n_classes, *branch_predictions) + + +class _Node: + + def __init__(self): + self.content = None + self.left = None + self.right = None + + def predict(self, sample): + node_content = self.content + if isinstance(node_content, _LeafInfo): + return np.full((len(sample),), node_content.mode) + if isinstance(node_content, _SkTreeWrapper): + if len(sample) > 0: + return node_content.sk_tree.predict(sample) + if isinstance(node_content, _InnerNodeInfo): + pred = np.empty((len(sample),), dtype=np.int64) + left_mask = sample[:, node_content.index] <= node_content.value + pred[left_mask] = self.left.predict(sample[left_mask]) + pred[~left_mask] = self.right.predict(sample[~left_mask]) + return pred + assert len(sample) == 0, 'Type not supported' + return np.empty((0,), dtype=np.int64) + + def predict_proba(self, sample, n_classes): + node_content = self.content + if isinstance(node_content, _LeafInfo): + single_pred = node_content.frequencies / node_content.size + return np.tile(single_pred, (len(sample), 1)) + if isinstance(node_content, _SkTreeWrapper): + if len(sample) > 0: + sk_tree_pred = node_content.sk_tree.predict_proba(sample) + pred = np.zeros((len(sample), n_classes), dtype=np.float64) + pred[:, node_content.sk_tree.classes_] = sk_tree_pred + return pred + if isinstance(node_content, _InnerNodeInfo): + pred = np.empty((len(sample), n_classes), dtype=np.float64) + l_msk = sample[:, node_content.index] <= node_content.value + pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) + pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) + return pred + assert len(sample) == 0, 'Type not supported' + return np.empty((0, n_classes), dtype=np.float64) + + +class _InnerNodeInfo: + def __init__(self, index=None, value=None): + self.index = index + self.value = value + + +class _LeafInfo: + def __init__(self, size=None, frequencies=None, mode=None): + self.size = size + self.frequencies = frequencies + self.mode = mode + + +class _SkTreeWrapper: + def __init__(self, tree): + self.sk_tree = tree + self.classes = tree.classes_ + + +def _get_sample_attributes(samples_file, indices): + samples_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False) + x = samples_mmap[indices] + return x + + +def _get_feature_mmap(features_file, i): + return _get_features_mmap(features_file)[i] + + +def _get_features_mmap(features_file): + return np.load(features_file, mmap_mode='r', allow_pickle=False) + + +@task(priority=True, returns=2) +def _sample_selection(n_samples, y_codes, bootstrap, seed): + if bootstrap: + random_state = RandomState(seed) + selection = random_state.choice(n_samples, size=n_samples, + replace=True) + selection.sort() + return selection, y_codes[selection] + else: + return np.arange(n_samples), y_codes + + +def _feature_selection(untried_indices, m_try, random_state): + selection_len = min(m_try, len(untried_indices)) + return random_state.choice(untried_indices, size=selection_len, + replace=False) + + +def _get_groups(sample, y_s, features_mmap, index, value): + if index is None: + empty_sample = np.array([], dtype=np.int64) + empty_labels = np.array([], dtype=np.int8) + return sample, y_s, empty_sample, empty_labels + feature = features_mmap[index][sample] + mask = feature < value + left = sample[mask] + 
right = sample[~mask] + y_l = y_s[mask] + y_r = y_s[~mask] + return left, y_l, right, y_r + + +def _compute_leaf_info(y_s, n_classes): + frequencies = np.bincount(y_s, minlength=n_classes) + mode = np.argmax(frequencies) + return _LeafInfo(len(y_s), frequencies, mode) + + +def _split_node_wrapper(sample, n_features, y_s, n_classes, m_try, + random_state, samples_file=None, features_file=None): + seed = random_state.randint(np.iinfo(np.int32).max) + + if features_file is not None: + return _split_node_using_features(sample, n_features, y_s, n_classes, + m_try, features_file, seed) + elif samples_file is not None: + return _split_node(sample, n_features, y_s, n_classes, m_try, + samples_file, seed) + else: + raise ValueError('Invalid combination of arguments. samples_file is ' + 'None and features_file is None.') + + +@task(features_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node_using_features(sample, n_features, y_s, n_classes, m_try, + features_file, seed): + features_mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) + random_state = RandomState(seed) + return _compute_split(sample, n_features, y_s, n_classes, m_try, + features_mmap, random_state) + + +@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): + features_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T + random_state = RandomState(seed) + return _compute_split(sample, n_features, y_s, n_classes, m_try, + features_mmap, random_state) + + +def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, + random_state): + node_info = left_group = y_l = right_group = y_r = None + split_ended = False + tried_indices = [] + while not split_ended: + untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) + index_selection = _feature_selection(untried_indices, m_try, + random_state) + b_score = float_info.max + b_index = None + b_value = None + for index in index_selection: + feature = features_mmap[index] + score, value = test_split(sample, y_s, feature, n_classes) + if score < b_score: + b_score, b_value, b_index = score, value, index + groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) + left_group, y_l, right_group, y_r = groups + if left_group.size and right_group.size: + split_ended = True + node_info = _InnerNodeInfo(b_index, b_value) + else: + tried_indices.extend(list(index_selection)) + if len(tried_indices) == n_features: + split_ended = True + node_info = _compute_leaf_info(y_s, n_classes) + left_group = sample + y_l = y_s + right_group = np.array([], dtype=np.int64) + y_r = np.array([], dtype=np.int8) + + return node_info, left_group, y_l, right_group, y_r + + +def _build_subtree_wrapper(sample, y_s, n_features, max_depth, n_classes, + m_try, sklearn_max, random_state, samples_file, + features_file): + seed = random_state.randint(np.iinfo(np.int32).max) + if features_file is not None: + return _build_subtree_using_features(sample, y_s, n_features, + max_depth, n_classes, m_try, + sklearn_max, seed, samples_file, + features_file) + else: + return _build_subtree(sample, y_s, n_features, max_depth, n_classes, + m_try, sklearn_max, seed, samples_file) + + +@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) +def _build_subtree_using_features(sample, y_s, n_features, max_depth, + n_classes, m_try, sklearn_max, seed, + samples_file, features_file): + random_state = RandomState(seed) + return _compute_build_subtree(sample, 
y_s, n_features, max_depth, + n_classes, m_try, sklearn_max, random_state, + samples_file, features_file=features_file) + + +@task(samples_file=FILE_IN, returns=_Node) +def _build_subtree(sample, y_s, n_features, max_depth, n_classes, m_try, + sklearn_max, seed, samples_file): + random_state = RandomState(seed) + return _compute_build_subtree(sample, y_s, n_features, max_depth, + n_classes, m_try, sklearn_max, random_state, + samples_file) + + +def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, + m_try, sklearn_max, random_state, samples_file, + features_file=None, use_sklearn=True): + if not sample.size: + return _Node() + if features_file is not None: + mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) + else: + mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T + subtree = _Node() + tree_traversal = [(subtree, sample, y_s, 0)] + while tree_traversal: + node, sample, y_s, depth = tree_traversal.pop() + if depth < max_depth: + if use_sklearn and n_features * len(sample) <= sklearn_max: + if max_depth == np.inf: + sklearn_max_depth = None + else: + sklearn_max_depth = max_depth - depth + dt = SklearnDTClassifier(max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state) + unique = np.unique(sample, return_index=True, + return_counts=True) + sample, new_indices, sample_weight = unique + x = _get_sample_attributes(samples_file, sample) + y_s = y_s[new_indices] + dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) + node.content = _SkTreeWrapper(dt) + else: + split = _compute_split(sample, n_features, y_s, n_classes, + m_try, mmap, random_state) + node_info, left_group, y_l, right_group, y_r = split + node.content = node_info + if isinstance(node_info, _InnerNodeInfo): + node.left = _Node() + node.right = _Node() + tree_traversal.append((node.right, right_group, y_r, + depth + 1)) + tree_traversal.append((node.left, left_group, y_l, + depth + 1)) + else: + node.content = _compute_leaf_info(y_s, n_classes) + return subtree + + +@task(returns=list) +def _merge(*object_list): + return object_list + + +def _get_subtree_path(subtree_index, distr_depth): + if distr_depth == 0: + return '' + return bin(subtree_index)[2:].zfill(distr_depth) + + +def _get_predicted_indices(samples, tree, nodes_info, path): + idx_mask = np.full((len(samples),), True) + for direction in path: + node_info = nodes_info[tree.content] + if isinstance(node_info, _LeafInfo): + if direction == '1': + idx_mask[:] = 0 + else: + col = node_info.index + value = node_info.value + if direction == '0': + idx_mask[idx_mask] = samples[idx_mask, col] <= value + tree = tree.left + else: + idx_mask[idx_mask] = samples[idx_mask, col] > value + tree = tree.right + return idx_mask + + +@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, + distr_depth): + samples = Array._merge_blocks(row_blocks) + path = _get_subtree_path(subtree_index, distr_depth) + indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) + prediction = subtree.predict(samples[indices_mask]) + return indices_mask, prediction + + +@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _predict_branch_proba(row_blocks, tree, nodes_info, subtree_index, subtree, + distr_depth, n_classes): + samples = Array._merge_blocks(row_blocks) + path = _get_subtree_path(subtree_index, distr_depth) + indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) + prediction = 
subtree.predict_proba(samples[indices_mask], n_classes)
+    return indices_mask, prediction
+
+
+@task(returns=list)
+def _merge_branches(n_classes, *predictions):
+    samples_len = len(predictions[0][0])
+    if n_classes is not None:  # predict_proba
+        shape = (samples_len, n_classes)
+        dtype = np.float64
+    else:  # predict
+        shape = (samples_len,)
+        dtype = np.int64
+    merged_prediction = np.empty(shape, dtype=dtype)
+    for selected, prediction in predictions:
+        merged_prediction[selected] = prediction
+    return merged_prediction
diff --git a/dislib/regression/rf/forest.py b/dislib/regression/rf/forest.py
new file mode 100644
index 00000000..8f6c0f2a
--- /dev/null
+++ b/dislib/regression/rf/forest.py
@@ -0,0 +1,306 @@
+import math
+from collections import Counter
+
+import numpy as np
+from pycompss.api.api import compss_wait_on
+from pycompss.api.parameter import Type, COLLECTION_IN, Depth
+from pycompss.api.task import task
+from sklearn.base import BaseEstimator
+from sklearn.utils import check_random_state
+
+from dislib.classification.rf.decision_tree import DecisionTreeClassifier
+from dislib.data.array import Array
+from dislib.utils.base import _paired_partition
+from dislib.classification.rf._data import transform_to_rf_dataset
+
+
+class RandomForestClassifier(BaseEstimator):
+    """A distributed random forest classifier.
+
+    Parameters
+    ----------
+    n_estimators : int, optional (default=10)
+        Number of trees to fit.
+    try_features : int, str or None, optional (default='sqrt')
+        The number of features to consider when looking for the best split:
+
+        - If "sqrt", then `try_features=sqrt(n_features)`.
+        - If "third", then `try_features=n_features // 3`.
+        - If None, then `try_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int or np.inf, optional (default=np.inf)
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int or str, optional (default='auto')
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max : int or float, optional (default=1e8)
+        Maximum size (len(subsample)*n_features) of the arrays passed to
+        sklearn's DecisionTreeClassifier.fit(), which is called to fit
+        subtrees (subsamples) of our DecisionTreeClassifier. sklearn fit() is
+        used because it is faster, but it requires loading the data into
+        memory, which can cause memory problems for large datasets. This
+        parameter can be adjusted to fit the hardware capabilities.
+    hard_vote : bool, optional (default=False)
+        If True, it uses majority voting over the predict() result of the
+        decision tree predictions. If False, it takes the class with the
+        highest probability given by predict_proba(), which is an average of
+        the probabilities given by the decision trees.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    Attributes
+    ----------
+    classes : None or ndarray
+        Array of distinct classes, set at fit().
+    trees : list of DecisionTreeClassifier
+        List of the tree classifiers of this forest, populated at fit().
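+
+    Examples
+    --------
+    A minimal usage sketch on a toy ds-array (illustrative values only;
+    follows the fit/predict API described above):
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> x = ds.array(np.array([[1, 2], [1, 4], [4, 2], [4, 4]]), (2, 2))
+    >>> y = ds.array(np.array([[0], [0], [1], [1]]), (2, 1))
+    >>> forest = RandomForestClassifier(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> y_pred = forest.predict(x).collect()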
+ """ + + def __init__(self, + n_estimators=10, + try_features='sqrt', + max_depth=np.inf, + distr_depth='auto', + sklearn_max=1e8, + hard_vote=False, + random_state=None): + self.n_estimators = n_estimators + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.hard_vote = hard_vote + self.random_state = random_state + + def fit(self, x, y): + """Fits the RandomForestClassifier. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. Internally, its dtype will be converted + to ``dtype=np.float32``. + y : ds-array, shape=(n_samples, 1) + The target values. + + Returns + ------- + self : RandomForestClassifier + + """ + self.classes = None + self.trees = [] + + dataset = transform_to_rf_dataset(x, y) + + n_features = dataset.get_n_features() + try_features = _resolve_try_features(self.try_features, n_features) + random_state = check_random_state(self.random_state) + + self.classes = dataset.get_classes() + + if self.distr_depth == 'auto': + dataset.n_samples = compss_wait_on(dataset.get_n_samples()) + distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) + distr_depth = min(distr_depth, self.max_depth) + else: + distr_depth = self.distr_depth + + for i in range(self.n_estimators): + tree = DecisionTreeClassifier(try_features, self.max_depth, + distr_depth, self.sklearn_max, + bootstrap=True, + random_state=random_state) + self.trees.append(tree) + + for tree in self.trees: + tree.fit(dataset) + + return self + + def predict_proba(self, x): + """Predicts class probabilities using a fitted forest. + + The probabilities are obtained as an average of the probabilities of + each decision tree. + + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + probabilities : ds-array, shape=(n_samples, n_classes) + Predicted probabilities for the samples to belong to each class. + The columns of the array correspond to the classes given at + self.classes. + + """ + assert self.trees is not None, 'The random forest is not fitted.' + prob_blocks = [] + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + prob_blocks.append([_join_predictions(*tree_predictions)]) + self.classes = compss_wait_on(self.classes) + n_classes = len(self.classes) + + probabilities = Array(blocks=prob_blocks, + top_left_shape=(x._top_left_shape[0], n_classes), + reg_shape=(x._reg_shape[0], n_classes), + shape=(x.shape[0], n_classes), sparse=False) + return probabilities + + def predict(self, x): + """Predicts classes using a fitted forest. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y_pred : ds-array, shape=(n_samples, 1) + Predicted class labels for x. + + """ + assert self.trees is not None, 'The random forest is not fitted.' 
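+        # With hard_vote, each tree casts a single label per sample and the
+        # majority label wins; otherwise the per-class probabilities of all
+        # trees are summed and the class with the largest total is selected.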
+ pred_blocks = [] + if self.hard_vote: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + pred_blocks.append(_hard_vote(self.classes, *tree_predictions)) + else: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + pred_blocks.append(_soft_vote(self.classes, *tree_predictions)) + + y_pred = Array(blocks=[pred_blocks], + top_left_shape=(x._top_left_shape[0], 1), + reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1), + sparse=False) + + return y_pred + + def score(self, x, y): + """Accuracy classification score. + + Returns the mean accuracy on the given test data. + + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. + y : ds-array, shape (n_samples, 1) + The true labels. + + Returns + ------- + score : float (as future object) + Fraction of correctly classified samples. + + """ + assert self.trees is not None, 'The random forest is not fitted.' + partial_scores = [] + if self.hard_vote: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _hard_vote_score(y_row._blocks, self.classes, + *tree_predictions) + partial_scores.append(subset_score) + else: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + subset_score = _soft_vote_score(y_row._blocks, self.classes, + *tree_predictions) + partial_scores.append(subset_score) + + return _merge_scores(*partial_scores) + + +@task(returns=1) +def _resolve_try_features(try_features, n_features): + if try_features is None: + return n_features + elif try_features == 'sqrt': + return int(math.sqrt(n_features)) + elif try_features == 'third': + return max(1, n_features // 3) + else: + return int(try_features) + + +@task(returns=1) +def _join_predictions(*predictions): + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + labels = aggregate / len(predictions) + return labels + + +@task(returns=1) +def _soft_vote(classes, *predictions): + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + labels = classes[np.argmax(aggregate, axis=1)] + return labels + + +@task(returns=1) +def _hard_vote(classes, *predictions): + mode = np.empty((len(predictions[0]),), dtype=int) + for sample_i, votes in enumerate(zip(*predictions)): + mode[sample_i] = Counter(votes).most_common(1)[0][0] + labels = classes[mode] + return labels + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _soft_vote_score(y_blocks, classes, *predictions): + real_labels = Array._merge_blocks(y_blocks).flatten() + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + predicted_labels = classes[np.argmax(aggregate, axis=1)] + correct = np.count_nonzero(predicted_labels == real_labels) + return correct, len(real_labels) + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _hard_vote_score(y_blocks, classes, *predictions): + real_labels = Array._merge_blocks(y_blocks).flatten() + mode = np.empty((len(predictions[0]),), dtype=int) + for sample_i, votes in enumerate(zip(*predictions)): + mode[sample_i] = Counter(votes).most_common(1)[0][0] + predicted_labels = classes[mode] + correct = np.count_nonzero(predicted_labels == real_labels) + return correct, len(real_labels) + + 
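+# An illustration, not used by the code, of how the two voting modes above
+# differ for two trees, three samples and classes = np.array([0, 1]):
+#
+#     p1 = np.array([[0.9, 0.1], [0.4, 0.6], [0.5, 0.5]])
+#     p2 = np.array([[0.2, 0.8], [0.4, 0.6], [0.1, 0.9]])
+#     classes[np.argmax(p1 + p2, axis=1)]  # soft vote -> array([0, 1, 1])
+#
+# A hard vote counts one label per tree and sample, e.g. Counter((0, 1)) for
+# a sample where the trees disagree, and ties are broken by first occurrence.
+
+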
+@task(returns=1) +def _merge_scores(*partial_scores): + correct = sum(subset_score[0] for subset_score in partial_scores) + total = sum(subset_score[1] for subset_score in partial_scores) + return correct / total diff --git a/dislib/regression/rf/test_split.py b/dislib/regression/rf/test_split.py new file mode 100644 index 00000000..70922783 --- /dev/null +++ b/dislib/regression/rf/test_split.py @@ -0,0 +1,50 @@ +from sys import float_info + +import numpy as np + + +def gini_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): + """ + Maximizing the Gini gain is equivalent to minimizing this proxy function. + + """ + return -(l_weight / l_length + r_weight / r_length) * not_repeated + + +def test_split(sample, y_s, feature, n_classes): + size = y_s.shape[0] + if size == 0: + return float_info.max, np.float64(np.inf) + + f = feature[sample] + sort_indices = np.argsort(f) + y_sorted = y_s[sort_indices] + f_sorted = f[sort_indices] + + not_repeated = np.empty(size, dtype=np.bool_) + not_repeated[0: size - 1] = (f_sorted[1:] != f_sorted[:-1]) + not_repeated[size - 1] = True + + l_freq = np.zeros((n_classes, size), dtype=np.int64) + l_freq[y_sorted, np.arange(size)] = 1 + + r_freq = np.zeros((n_classes, size), dtype=np.int64) + r_freq[:, 1:] = l_freq[:, :0:-1] + + l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0) + r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1] + + l_length = np.arange(1, size + 1, dtype=np.int32) + r_length = np.arange(size - 1, -1, -1, dtype=np.int32) + r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways + + scores = gini_criteria_proxy(l_weight, l_length, r_weight, r_length, + not_repeated) + + min_index = size - np.argmin(scores[::-1]) - 1 + + if min_index + 1 == size: + b_value = np.float64(np.inf) + else: + b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2 + return scores[min_index], b_value From ff62b9b14dcf783c7725df3d2247e8ae5eeedf25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Thu, 22 Jul 2021 15:09:27 +0200 Subject: [PATCH 32/46] Added DecisionTreeRegressor with MSE criterion --- dislib/regression/rf/decision_tree.py | 397 ++++++++++++++------------ dislib/regression/rf/test_split.py | 28 +- 2 files changed, 234 insertions(+), 191 deletions(-) diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py index 0725fcfa..43ecaf79 100644 --- a/dislib/regression/rf/decision_tree.py +++ b/dislib/regression/rf/decision_tree.py @@ -5,14 +5,14 @@ from pycompss.api.api import compss_delete_object from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth from pycompss.api.task import task -from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor -from dislib.classification.rf.test_split import test_split +from dislib.regression.rf.test_split import test_split from dislib.data.array import Array -class DecisionTreeClassifier: - """A distributed decision tree classifier. +class DecisionTreeRegressor: + """A distributed decision tree regressor. Parameters ---------- @@ -39,9 +39,6 @@ class DecisionTreeClassifier: n_features : int The number of features of the dataset. It can be a pycompss.runtime.Future object. - n_classes : int - The number of classes of this RfDataset. It can be a - pycompss.runtime.Future object. tree : None or _Node The root node of the tree after the tree is fitted. 
nodes_info : None or list of _InnerNodeInfo and _LeafInfo @@ -56,7 +53,7 @@ class DecisionTreeClassifier: Methods ------- fit(dataset) - Fits the DecisionTreeClassifier. + Fits the DecisionTreeRegressor. predict(x_row) Predicts classes for the given samples using a fitted tree. predict_proba(x_row) @@ -64,8 +61,15 @@ class DecisionTreeClassifier: """ - def __init__(self, try_features, max_depth, distr_depth, sklearn_max, - bootstrap, random_state): + def __init__( + self, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + ): self.try_features = try_features self.max_depth = max_depth self.distr_depth = distr_depth @@ -74,14 +78,13 @@ def __init__(self, try_features, max_depth, distr_depth, sklearn_max, self.random_state = random_state self.n_features = None - self.n_classes = None self.tree = None self.nodes_info = None self.subtrees = None def fit(self, dataset): - """Fits the DecisionTreeClassifier. + """Fits the DecisionTreeRegressor. Parameters ---------- @@ -90,16 +93,16 @@ def fit(self, dataset): """ self.n_features = dataset.get_n_features() - self.n_classes = dataset.get_n_classes() samples_path = dataset.samples_path features_path = dataset.features_path n_samples = dataset.get_n_samples() - y_codes = dataset.get_y_codes() + y_targets = dataset.get_y_targets() seed = self.random_state.randint(np.iinfo(np.int32).max) - sample, y_s = _sample_selection(n_samples, y_codes, self.bootstrap, - seed) + sample, y_s = _sample_selection( + n_samples, y_targets, self.bootstrap, seed + ) self.tree = _Node() self.nodes_info = [] @@ -108,11 +111,15 @@ def fit(self, dataset): while tree_traversal: node, sample, y_s, depth = tree_traversal.pop() if depth < self.distr_depth: - split = _split_node_wrapper(sample, self.n_features, y_s, - self.n_classes, self.try_features, - self.random_state, - samples_file=samples_path, - features_file=features_path) + split = _split_node_wrapper( + sample, + self.n_features, + y_s, + self.try_features, + self.random_state, + samples_file=samples_path, + features_file=features_path, + ) node_info, left_group, y_l, right_group, y_r = split compss_delete_object(sample) compss_delete_object(y_s) @@ -124,13 +131,17 @@ def fit(self, dataset): tree_traversal.append((node.right, right_group, y_r, depth)) tree_traversal.append((node.left, left_group, y_l, depth)) else: - subtree = _build_subtree_wrapper(sample, y_s, self.n_features, - self.max_depth - depth, - self.n_classes, - self.try_features, - self.sklearn_max, - self.random_state, - samples_path, features_path) + subtree = _build_subtree_wrapper( + sample, + y_s, + self.n_features, + self.max_depth - depth, + self.try_features, + self.sklearn_max, + self.random_state, + samples_path, + features_path, + ) node.content = len(self.subtrees) self.subtrees.append(subtree) compss_delete_object(sample) @@ -155,47 +166,23 @@ def predict(self, x_row): """ - assert self.tree is not None, 'The decision tree is not fitted.' + assert self.tree is not None, "The decision tree is not fitted." branch_predictions = [] for i, subtree in enumerate(self.subtrees): - pred = _predict_branch(x_row._blocks, self.tree, self.nodes_info, - i, subtree, self.distr_depth) + pred = _predict_branch( + x_row._blocks, + self.tree, + self.nodes_info, + i, + subtree, + self.distr_depth, + ) branch_predictions.append(pred) return _merge_branches(None, *branch_predictions) - def predict_proba(self, x_row): - """Predicts class probabilities for a row block using a fitted tree. 
- - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted_proba : ndarray - An array with the predicted probabilities for the given samples. - The shape is (len(subset.samples), self.n_classes), with the index - of the column being codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can be - a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, 'The decision tree is not fitted.' - - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch_proba(x_row._blocks, self.tree, - self.nodes_info, i, subtree, - self.distr_depth, self.n_classes) - branch_predictions.append(pred) - return _merge_branches(self.n_classes, *branch_predictions) - class _Node: - def __init__(self): self.content = None self.left = None @@ -204,7 +191,7 @@ def __init__(self): def predict(self, sample): node_content = self.content if isinstance(node_content, _LeafInfo): - return np.full((len(sample),), node_content.mode) + return np.full((len(sample),), node_content.mean) if isinstance(node_content, _SkTreeWrapper): if len(sample) > 0: return node_content.sk_tree.predict(sample) @@ -214,29 +201,9 @@ def predict(self, sample): pred[left_mask] = self.left.predict(sample[left_mask]) pred[~left_mask] = self.right.predict(sample[~left_mask]) return pred - assert len(sample) == 0, 'Type not supported' + assert len(sample) == 0, "Type not supported" return np.empty((0,), dtype=np.int64) - def predict_proba(self, sample, n_classes): - node_content = self.content - if isinstance(node_content, _LeafInfo): - single_pred = node_content.frequencies / node_content.size - return np.tile(single_pred, (len(sample), 1)) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - sk_tree_pred = node_content.sk_tree.predict_proba(sample) - pred = np.zeros((len(sample), n_classes), dtype=np.float64) - pred[:, node_content.sk_tree.classes_] = sk_tree_pred - return pred - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample), n_classes), dtype=np.float64) - l_msk = sample[:, node_content.index] <= node_content.value - pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) - pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) - return pred - assert len(sample) == 0, 'Type not supported' - return np.empty((0, n_classes), dtype=np.float64) - class _InnerNodeInfo: def __init__(self, index=None, value=None): @@ -245,10 +212,9 @@ def __init__(self, index=None, value=None): class _LeafInfo: - def __init__(self, size=None, frequencies=None, mode=None): + def __init__(self, size=None, mean=None): self.size = size - self.frequencies = frequencies - self.mode = mode + self.mean = mean class _SkTreeWrapper: @@ -258,7 +224,7 @@ def __init__(self, tree): def _get_sample_attributes(samples_file, indices): - samples_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False) + samples_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False) x = samples_mmap[indices] return x @@ -268,25 +234,27 @@ def _get_feature_mmap(features_file, i): def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode='r', allow_pickle=False) + return np.load(features_file, mmap_mode="r", allow_pickle=False) @task(priority=True, returns=2) -def _sample_selection(n_samples, y_codes, bootstrap, seed): +def _sample_selection(n_samples, y_targets, bootstrap, seed): if bootstrap: random_state = RandomState(seed) - selection = random_state.choice(n_samples, 
size=n_samples, - replace=True) + selection = random_state.choice( + n_samples, size=n_samples, replace=True + ) selection.sort() - return selection, y_codes[selection] + return selection, y_targets[selection] else: - return np.arange(n_samples), y_codes + return np.arange(n_samples), y_targets def _feature_selection(untried_indices, m_try, random_state): selection_len = min(m_try, len(untried_indices)) - return random_state.choice(untried_indices, size=selection_len, - replace=False) + return random_state.choice( + untried_indices, size=selection_len, replace=False + ) def _get_groups(sample, y_s, features_mmap, index, value): @@ -303,59 +271,71 @@ def _get_groups(sample, y_s, features_mmap, index, value): return left, y_l, right, y_r -def _compute_leaf_info(y_s, n_classes): - frequencies = np.bincount(y_s, minlength=n_classes) - mode = np.argmax(frequencies) - return _LeafInfo(len(y_s), frequencies, mode) +def _compute_leaf_info(y_s): + return _LeafInfo(len(y_s), np.mean(y_s)) -def _split_node_wrapper(sample, n_features, y_s, n_classes, m_try, - random_state, samples_file=None, features_file=None): +def _split_node_wrapper( + sample, + n_features, + y_s, + m_try, + random_state, + samples_file=None, + features_file=None, +): seed = random_state.randint(np.iinfo(np.int32).max) if features_file is not None: - return _split_node_using_features(sample, n_features, y_s, n_classes, - m_try, features_file, seed) + return _split_node_using_features( + sample, n_features, y_s, m_try, features_file, seed + ) elif samples_file is not None: - return _split_node(sample, n_features, y_s, n_classes, m_try, - samples_file, seed) + return _split_node(sample, n_features, y_s, m_try, samples_file, seed) else: - raise ValueError('Invalid combination of arguments. samples_file is ' - 'None and features_file is None.') + raise ValueError( + "Invalid combination of arguments. samples_file is " + "None and features_file is None." 
+ ) @task(features_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node_using_features(sample, n_features, y_s, n_classes, m_try, - features_file, seed): - features_mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) +def _split_node_using_features( + sample, n_features, y_s, m_try, features_file, seed +): + features_mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) + return _compute_split( + sample, n_features, y_s, m_try, features_mmap, random_state + ) @task(samples_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): - features_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T +def _split_node(sample, n_features, y_s, m_try, samples_file, seed): + features_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) + return _compute_split( + sample, n_features, y_s, m_try, features_mmap, random_state + ) -def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, - random_state): +def _compute_split( + sample, n_features, y_s, m_try, features_mmap, random_state +): node_info = left_group = y_l = right_group = y_r = None split_ended = False tried_indices = [] while not split_ended: untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) - index_selection = _feature_selection(untried_indices, m_try, - random_state) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) b_score = float_info.max b_index = None b_value = None for index in index_selection: feature = features_mmap[index] - score, value = test_split(sample, y_s, feature, n_classes) + score, value = test_split(sample, y_s, feature) if score < b_score: b_score, b_value, b_index = score, value, index groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) @@ -367,7 +347,7 @@ def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, tried_indices.extend(list(index_selection)) if len(tried_indices) == n_features: split_ended = True - node_info = _compute_leaf_info(y_s, n_classes) + node_info = _compute_leaf_info(y_s) left_group = sample y_l = y_s right_group = np.array([], dtype=np.int64) @@ -376,48 +356,111 @@ def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, return node_info, left_group, y_l, right_group, y_r -def _build_subtree_wrapper(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file): +def _build_subtree_wrapper( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + features_file, +): seed = random_state.randint(np.iinfo(np.int32).max) if features_file is not None: - return _build_subtree_using_features(sample, y_s, n_features, - max_depth, n_classes, m_try, - sklearn_max, seed, samples_file, - features_file) + return _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, + features_file, + ) else: - return _build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, seed, samples_file) + return _build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, + ) 
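# --- Editor's note: a minimal, hypothetical sketch of the seeding pattern the
# wrappers above rely on. The master RandomState draws one plain int seed per
# task, and each (possibly remote) task rebuilds its own RandomState from it,
# so runs stay reproducible without sharing mutable RNG state across workers.
import numpy as np
from numpy.random.mtrand import RandomState

master = RandomState(0)

def remote_task(seed):  # stand-in for a @task-decorated function
    rng = RandomState(seed)
    return rng.choice(10, size=3, replace=True)

results = [remote_task(master.randint(np.iinfo(np.int32).max))
           for _ in range(4)]
# --- end of editor's note.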
@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) -def _build_subtree_using_features(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, seed, - samples_file, features_file): +def _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, + features_file, +): random_state = RandomState(seed) - return _compute_build_subtree(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file, features_file=features_file) + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=features_file, + ) @task(samples_file=FILE_IN, returns=_Node) -def _build_subtree(sample, y_s, n_features, max_depth, n_classes, m_try, - sklearn_max, seed, samples_file): +def _build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + seed, + samples_file, +): random_state = RandomState(seed) - return _compute_build_subtree(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file) - - -def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file=None, use_sklearn=True): + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + ) + + +def _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=None, + use_sklearn=True, +): if not sample.size: return _Node() if features_file is not None: - mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) + mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) else: - mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T + mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T subtree = _Node() tree_traversal = [(subtree, sample, y_s, 0)] while tree_traversal: @@ -428,30 +471,41 @@ def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, sklearn_max_depth = None else: sklearn_max_depth = max_depth - depth - dt = SklearnDTClassifier(max_features=m_try, - max_depth=sklearn_max_depth, - random_state=random_state) - unique = np.unique(sample, return_index=True, - return_counts=True) + dt = SklearnDTRegressor( + max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state, + ) + unique = np.unique( + sample, return_index=True, return_counts=True + ) sample, new_indices, sample_weight = unique x = _get_sample_attributes(samples_file, sample) y_s = y_s[new_indices] dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) node.content = _SkTreeWrapper(dt) else: - split = _compute_split(sample, n_features, y_s, n_classes, - m_try, mmap, random_state) + split = _compute_split( + sample, + n_features, + y_s, + m_try, + mmap, + random_state, + ) node_info, left_group, y_l, right_group, y_r = split node.content = node_info if isinstance(node_info, _InnerNodeInfo): node.left = _Node() node.right = _Node() - tree_traversal.append((node.right, right_group, y_r, - depth + 1)) - tree_traversal.append((node.left, left_group, y_l, - depth + 1)) + tree_traversal.append( + (node.right, right_group, y_r, depth + 1) + ) + tree_traversal.append( + (node.left, left_group, y_l, depth + 1) + ) else: - node.content = _compute_leaf_info(y_s, n_classes) + node.content = _compute_leaf_info(y_s) return subtree @@ 
-462,7 +516,7 @@ def _merge(*object_list): def _get_subtree_path(subtree_index, distr_depth): if distr_depth == 0: - return '' + return "" return bin(subtree_index)[2:].zfill(distr_depth) @@ -471,12 +525,12 @@ def _get_predicted_indices(samples, tree, nodes_info, path): for direction in path: node_info = nodes_info[tree.content] if isinstance(node_info, _LeafInfo): - if direction == '1': + if direction == "1": idx_mask[:] = 0 else: col = node_info.index value = node_info.value - if direction == '0': + if direction == "0": idx_mask[idx_mask] = samples[idx_mask, col] <= value tree = tree.left else: @@ -486,8 +540,9 @@ def _get_predicted_indices(samples, tree, nodes_info, path): @task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth): +def _predict_branch( + row_blocks, tree, nodes_info, subtree_index, subtree, distr_depth +): samples = Array._merge_blocks(row_blocks) path = _get_subtree_path(subtree_index, distr_depth) indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) @@ -495,16 +550,6 @@ def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, return indices_mask, prediction -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch_proba(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth, n_classes): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = subtree.predict_proba(samples[indices_mask], n_classes) - return indices_mask, prediction - - @task(returns=list) def _merge_branches(n_classes, *predictions): samples_len = len(predictions[0][0]) diff --git a/dislib/regression/rf/test_split.py b/dislib/regression/rf/test_split.py index 70922783..aa482b3c 100644 --- a/dislib/regression/rf/test_split.py +++ b/dislib/regression/rf/test_split.py @@ -3,15 +3,15 @@ import numpy as np -def gini_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): +def mse_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): """ - Maximizing the Gini gain is equivalent to minimizing this proxy function. + Maximizing the MSE gain is equivalent to minimizing this proxy function. 
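# --- Editor's note: a quick numerical check (not part of the patch) of the
# equivalence claimed above. For a split into left/right groups,
#   n_l * Var(left) + n_r * Var(right) = sum(y**2) - (S_l**2/n_l + S_r**2/n_r)
# with S_l, S_r the group sums; sum(y**2) is constant per node, so ranking
# splits by the proxy -(S_l**2/n_l + S_r**2/n_r) ranks them by weighted MSE.
import numpy as np

y = np.array([1.0, 2.0, 2.5, 7.0, 8.0])  # targets sorted by feature value
for i in range(1, len(y)):
    left, right = y[:i], y[i:]
    weighted_mse = left.size * left.var() + right.size * right.var()
    proxy = -(left.sum() ** 2 / left.size + right.sum() ** 2 / right.size)
    assert np.isclose(weighted_mse, (y ** 2).sum() + proxy)
# --- end of editor's note.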
""" return -(l_weight / l_length + r_weight / r_length) * not_repeated -def test_split(sample, y_s, feature, n_classes): +def test_split(sample, y_s, feature): size = y_s.shape[0] if size == 0: return float_info.max, np.float64(np.inf) @@ -21,28 +21,26 @@ def test_split(sample, y_s, feature, n_classes): y_sorted = y_s[sort_indices] f_sorted = f[sort_indices] + # Threshold value must not be that value of a sample not_repeated = np.empty(size, dtype=np.bool_) - not_repeated[0: size - 1] = (f_sorted[1:] != f_sorted[:-1]) + not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1] not_repeated[size - 1] = True - l_freq = np.zeros((n_classes, size), dtype=np.int64) - l_freq[y_sorted, np.arange(size)] = 1 - - r_freq = np.zeros((n_classes, size), dtype=np.int64) - r_freq[:, 1:] = l_freq[:, :0:-1] - - l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0) - r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1] + # Square of the sum of the y values of each branch + r_weight = np.zeros(size) + l_weight = np.square(np.cumsum(y_sorted, axis=-1)) + r_weight[:-1] = np.square(np.cumsum(y_sorted[::-1], axis=-1)[-2::-1]) + # Number of samples of each branch l_length = np.arange(1, size + 1, dtype=np.int32) r_length = np.arange(size - 1, -1, -1, dtype=np.int32) r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways - scores = gini_criteria_proxy(l_weight, l_length, r_weight, r_length, - not_repeated) + scores = mse_criteria_proxy( + l_weight, l_length, r_weight, r_length, not_repeated + ) min_index = size - np.argmin(scores[::-1]) - 1 - if min_index + 1 == size: b_value = np.float64(np.inf) else: From 99699647787b9a02d847af1746b39a6e4e2446ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 23 Jul 2021 12:51:08 +0200 Subject: [PATCH 33/46] Added RandomForestRegressor --- dislib/regression/__init__.py | 3 +- dislib/regression/rf/decision_tree.py | 13 +- dislib/regression/rf/forest.py | 242 +++++++++----------------- tests/test_rf_regressor.py | 105 +++++++++++ 4 files changed, 199 insertions(+), 164 deletions(-) create mode 100644 tests/test_rf_regressor.py diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py index e3287a0b..4a222968 100644 --- a/dislib/regression/__init__.py +++ b/dislib/regression/__init__.py @@ -1,4 +1,5 @@ from dislib.regression.linear.base import LinearRegression from dislib.regression.lasso.base import Lasso +from dislib.regression.rf.forest import RandomForestRegressor -__all__ = ['LinearRegression', 'Lasso'] +__all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"] diff --git a/dislib/regression/rf/decision_tree.py b/dislib/regression/rf/decision_tree.py index 43ecaf79..82730a5d 100644 --- a/dislib/regression/rf/decision_tree.py +++ b/dislib/regression/rf/decision_tree.py @@ -196,13 +196,13 @@ def predict(self, sample): if len(sample) > 0: return node_content.sk_tree.predict(sample) if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample),), dtype=np.int64) + pred = np.empty((len(sample),), dtype=np.float64) left_mask = sample[:, node_content.index] <= node_content.value pred[left_mask] = self.left.predict(sample[left_mask]) pred[~left_mask] = self.right.predict(sample[~left_mask]) return pred assert len(sample) == 0, "Type not supported" - return np.empty((0,), dtype=np.int64) + return np.empty((0,), dtype=np.float64) class _InnerNodeInfo: @@ -220,7 +220,6 @@ def __init__(self, size=None, mean=None): class _SkTreeWrapper: def __init__(self, tree): 
self.sk_tree = tree - self.classes = tree.classes_ def _get_sample_attributes(samples_file, indices): @@ -260,8 +259,8 @@ def _feature_selection(untried_indices, m_try, random_state): def _get_groups(sample, y_s, features_mmap, index, value): if index is None: empty_sample = np.array([], dtype=np.int64) - empty_labels = np.array([], dtype=np.int8) - return sample, y_s, empty_sample, empty_labels + empty_target = np.array([], dtype=np.float64) + return sample, y_s, empty_sample, empty_target feature = features_mmap[index][sample] mask = feature < value left = sample[mask] @@ -351,7 +350,7 @@ def _compute_split( left_group = sample y_l = y_s right_group = np.array([], dtype=np.int64) - y_r = np.array([], dtype=np.int8) + y_r = np.array([], dtype=np.float64) return node_info, left_group, y_l, right_group, y_r @@ -558,7 +557,7 @@ def _merge_branches(n_classes, *predictions): dtype = np.float64 else: # predict_proba shape = (samples_len,) - dtype = np.int64 + dtype = np.float64 merged_prediction = np.empty(shape, dtype=dtype) for selected, prediction in predictions: merged_prediction[selected] = prediction diff --git a/dislib/regression/rf/forest.py b/dislib/regression/rf/forest.py index 8f6c0f2a..faae07c8 100644 --- a/dislib/regression/rf/forest.py +++ b/dislib/regression/rf/forest.py @@ -8,14 +8,14 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_random_state -from dislib.classification.rf.decision_tree import DecisionTreeClassifier +from dislib.regression.rf.decision_tree import DecisionTreeRegressor from dislib.data.array import Array from dislib.utils.base import _paired_partition -from dislib.classification.rf._data import transform_to_rf_dataset +from dislib.regression.rf._data import transform_to_rf_dataset -class RandomForestClassifier(BaseEstimator): - """A distributed random forest classifier. +class RandomForestRegressor(BaseEstimator): + """A distributed random forest regressor. Parameters ---------- @@ -39,16 +39,11 @@ class RandomForestClassifier(BaseEstimator): distributed way. sklearn_max: int or float, optional (default=1e8) Maximum size (len(subsample)*n_features) of the arrays passed to - sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees - (subsamples) of our DecisionTreeClassifier. sklearn fit() is used + sklearn's DecisionTreeRegressor.fit(), which is called to fit subtrees + (subsamples) of our DecisionTreeRegressor. sklearn fit() is used because it's faster, but requires loading the data to memory, which can cause memory problems for large datasets. This parameter can be adjusted to fit the hardware capabilities. - hard_vote : bool, optional (default=False) - If True, it uses majority voting over the predict() result of the - decision tree predictions. If False, it takes the class with the higher - probability given by predict_proba(), which is an average of the - probabilities given by the decision trees. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -57,30 +52,28 @@ class RandomForestClassifier(BaseEstimator): Attributes ---------- - classes : None or ndarray - Array of distinct classes, set at fit(). - trees : list of DecisionTreeClassifier - List of the tree classifiers of this forest, populated at fit(). + trees : list of DecisionTreeRegressor + List of the tree regressors of this forest, populated at fit(). 
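# --- Editor's note: an illustrative end-to-end sketch of the estimator being
# introduced here. It assumes a running PyCOMPSs runtime, as for any dislib
# estimator, and uses synthetic data; the real tests are added further below.
import numpy as np
import dislib as ds
from dislib.regression import RandomForestRegressor

rng = np.random.RandomState(0)
x = ds.array(rng.rand(600, 10), (300, 10))
y = ds.array(rng.rand(600, 1), (300, 1))

forest = RandomForestRegressor(n_estimators=10, random_state=0)
forest.fit(x, y)
y_pred = forest.predict(x).collect()
# --- end of editor's note.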
""" - def __init__(self, - n_estimators=10, - try_features='sqrt', - max_depth=np.inf, - distr_depth='auto', - sklearn_max=1e8, - hard_vote=False, - random_state=None): + def __init__( + self, + n_estimators=10, + try_features="sqrt", + max_depth=np.inf, + distr_depth="auto", + sklearn_max=1e8, + random_state=None, + ): self.n_estimators = n_estimators self.try_features = try_features self.max_depth = max_depth self.distr_depth = distr_depth self.sklearn_max = sklearn_max - self.hard_vote = hard_vote self.random_state = random_state def fit(self, x, y): - """Fits the RandomForestClassifier. + """Fits the RandomForestRegressor. Parameters ---------- @@ -92,10 +85,9 @@ def fit(self, x, y): Returns ------- - self : RandomForestClassifier + self : RandomForestRegressor """ - self.classes = None self.trees = [] dataset = transform_to_rf_dataset(x, y) @@ -104,20 +96,22 @@ def fit(self, x, y): try_features = _resolve_try_features(self.try_features, n_features) random_state = check_random_state(self.random_state) - self.classes = dataset.get_classes() - - if self.distr_depth == 'auto': + if self.distr_depth == "auto": dataset.n_samples = compss_wait_on(dataset.get_n_samples()) distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) distr_depth = min(distr_depth, self.max_depth) else: distr_depth = self.distr_depth - for i in range(self.n_estimators): - tree = DecisionTreeClassifier(try_features, self.max_depth, - distr_depth, self.sklearn_max, - bootstrap=True, - random_state=random_state) + for _ in range(self.n_estimators): + tree = DecisionTreeRegressor( + try_features, + self.max_depth, + distr_depth, + self.sklearn_max, + bootstrap=True, + random_state=random_state, + ) self.trees.append(tree) for tree in self.trees: @@ -125,44 +119,8 @@ def fit(self, x, y): return self - def predict_proba(self, x): - """Predicts class probabilities using a fitted forest. - - The probabilities are obtained as an average of the probabilities of - each decision tree. - - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - probabilities : ds-array, shape=(n_samples, n_classes) - Predicted probabilities for the samples to belong to each class. - The columns of the array correspond to the classes given at - self.classes. - - """ - assert self.trees is not None, 'The random forest is not fitted.' - prob_blocks = [] - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - prob_blocks.append([_join_predictions(*tree_predictions)]) - self.classes = compss_wait_on(self.classes) - n_classes = len(self.classes) - - probabilities = Array(blocks=prob_blocks, - top_left_shape=(x._top_left_shape[0], n_classes), - reg_shape=(x._reg_shape[0], n_classes), - shape=(x.shape[0], n_classes), sparse=False) - return probabilities - def predict(self, x): - """Predicts classes using a fitted forest. + """Predicts target values using a fitted forest. Parameters ---------- @@ -172,36 +130,40 @@ def predict(self, x): Returns ------- y_pred : ds-array, shape=(n_samples, 1) - Predicted class labels for x. + Predicted target values for x. """ - assert self.trees is not None, 'The random forest is not fitted.' + assert self.trees is not None, "The random forest is not fitted." 
         pred_blocks = []
-        if self.hard_vote:
-            for x_row in x._iterator(axis=0):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict(x_row))
-                pred_blocks.append(_hard_vote(self.classes, *tree_predictions))
-        else:
-            for x_row in x._iterator(axis=0):
-                tree_predictions = []
-                for tree in self.trees:
-                    tree_predictions.append(tree.predict_proba(x_row))
-                pred_blocks.append(_soft_vote(self.classes, *tree_predictions))
+        for x_row in x._iterator(axis=0):
+            tree_predictions = []
+            for tree in self.trees:
+                tree_predictions.append(tree.predict(x_row))
+            pred_blocks.append(_join_predictions(*tree_predictions))

-        y_pred = Array(blocks=[pred_blocks],
-                       top_left_shape=(x._top_left_shape[0], 1),
-                       reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1),
-                       sparse=False)
+        y_pred = Array(
+            blocks=[pred_blocks],
+            top_left_shape=(x._top_left_shape[0], 1),
+            reg_shape=(x._reg_shape[0], 1),
+            shape=(x.shape[0], 1),
+            sparse=False,
+        )

         return y_pred

     def score(self, x, y):
-        """Accuracy classification score.
-
-        Returns the mean accuracy on the given test data.
-
+        """Coefficient of determination ($R^2$) regression score.
+
+        Returns the coefficient of determination $R^2$ of the prediction.
+        $R^2$ is defined as $(1 - u/v)$, where $u$ is the residual sum of
+        squares `((y_true - y_pred) ** 2).sum()` and $v$ is the total sum of
+        squares `((y_true - y_true.mean()) ** 2).sum()`. The best possible
+        score is 1.0, and it can be negative (because the model can be
+        arbitrarily worse). A constant model that always predicts the
+        expected value of y, disregarding the input features, would get an
+        $R^2$ score of 0.0.

         Parameters
         ----------
@@ -213,27 +175,17 @@ def score(self, x, y):

         Returns
         -------
         score : float (as future object)
-            Fraction of correctly classified samples.
+            Coefficient of determination $R^2$.

         """
-        assert self.trees is not None, 'The random forest is not fitted.'
+        assert self.trees is not None, "The random forest is not fitted."
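# --- Editor's note: a standalone check (not part of the patch) of the pooled
# statistics used by score(). Per-block residual and total sums of squares are
# combined with the pairwise update of Chan et al., so the merged R^2 equals
# the R^2 computed directly over the concatenated data.
import numpy as np

def merge_r2(blocks):
    u = v = avg = n = 0
    for y_true, y_pred in blocks:
        u_p = np.sum(np.square(y_true - y_pred))
        v_p = np.sum(np.square(y_true - y_true.mean()))
        avg_p, n_p = y_true.mean(), len(y_true)
        delta = avg_p - avg
        avg += delta * n_p / (n + n_p)
        v += v_p + delta ** 2 * n * n_p / (n + n_p)
        u += u_p
        n += n_p
    return 1 - u / v

rng = np.random.RandomState(0)
y = rng.rand(100)
pred = y + 0.1 * rng.rand(100)
direct = 1 - np.sum((y - pred) ** 2) / np.sum((y - y.mean()) ** 2)
assert np.isclose(merge_r2([(y[:30], pred[:30]), (y[30:], pred[30:])]), direct)
# --- end of editor's note.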
partial_scores = [] - if self.hard_vote: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - subset_score = _hard_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) - else: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - subset_score = _soft_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _partial_score(y_row._blocks, *tree_predictions) + partial_scores.append(subset_score) return _merge_scores(*partial_scores) @@ -242,9 +194,9 @@ def score(self, x, y): def _resolve_try_features(try_features, n_features): if try_features is None: return n_features - elif try_features == 'sqrt': + elif try_features == "sqrt": return int(math.sqrt(n_features)) - elif try_features == 'third': + elif try_features == "third": return max(1, n_features // 3) else: return int(try_features) @@ -255,52 +207,30 @@ def _join_predictions(*predictions): aggregate = predictions[0] for p in predictions[1:]: aggregate += p - labels = aggregate / len(predictions) - return labels - - -@task(returns=1) -def _soft_vote(classes, *predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - labels = classes[np.argmax(aggregate, axis=1)] - return labels - - -@task(returns=1) -def _hard_vote(classes, *predictions): - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - labels = classes[mode] - return labels + target = aggregate / len(predictions) + return target @task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _soft_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - predicted_labels = classes[np.argmax(aggregate, axis=1)] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _hard_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - predicted_labels = classes[mode] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) +def _partial_score(y_blocks, *predictions): + y_true = Array._merge_blocks(y_blocks).flatten() + y_pred = np.mean(predictions, axis=0) + n_samples = y_true.shape[0] + y_avg = np.mean(y_true) + u_partial = np.sum(np.square(y_true - y_pred), axis=0) + v_partial = np.sum(np.square(y_true - y_avg), axis=0) + return u_partial, v_partial, y_avg, n_samples @task(returns=1) def _merge_scores(*partial_scores): - correct = sum(subset_score[0] for subset_score in partial_scores) - total = sum(subset_score[1] for subset_score in partial_scores) - return correct / total + u = v = avg = n = 0 + for u_p, v_p, avg_p, n_p in partial_scores: + u += u_p + + delta = avg_p - avg + avg += delta * n_p / (n + n_p) + v += v_p + delta ** 2 * n * n_p / (n + 
n_p) + n += n_p + + return 1 - u / v diff --git a/tests/test_rf_regressor.py b/tests/test_rf_regressor.py new file mode 100644 index 00000000..2d82dbeb --- /dev/null +++ b/tests/test_rf_regressor.py @@ -0,0 +1,105 @@ +import unittest + +import numpy as np +from pycompss.api.api import compss_wait_on +from sklearn.datasets import make_regression + +import dislib as ds +from dislib.regression import RandomForestRegressor + + +def _determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v + + +class RandomForestRegressorTest(unittest.TestCase): + def test_make_regression(self): + """Tests RandomForestRegressor fit and score with default params.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + + rf = RandomForestRegressor(random_state=0) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2 :] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_make_regression_predict_and_distr_depth(self): + """Tests RandomForestRegressor fit and predict with a distr_depth.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + + rf = RandomForestRegressor(distr_depth=2, random_state=0) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2 :] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_make_regression_sklearn_max_predict(self): + """Tests RandomForestRegressor predict with sklearn_max.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[: len(x) // 2], (300, 10)) + y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + + rf = RandomForestRegressor(random_state=0, sklearn_max=10) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2 :] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + + +def main(): + unittest.main() + + +if __name__ == "__main__": + main() From 85d481b8a20a7e1c9680af1f2f53a0ec66792238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 26 Jul 2021 13:01:47 +0200 Subject: [PATCH 34/46] Added RF to a new 'commons' module --- dislib/classification/__init__.py | 4 +- 
dislib/classification/rf/_data.py | 71 +-- dislib/commons/rf/__init__.py | 0 dislib/commons/rf/_data.py | 404 ++++++++++++++ dislib/commons/rf/_decision_tree.py | 784 ++++++++++++++++++++++++++++ dislib/commons/rf/_forest.py | 486 +++++++++++++++++ dislib/commons/rf/_test_split.py | 59 +++ dislib/regression/__init__.py | 2 +- 8 files changed, 1777 insertions(+), 33 deletions(-) create mode 100644 dislib/commons/rf/__init__.py create mode 100644 dislib/commons/rf/_data.py create mode 100644 dislib/commons/rf/_decision_tree.py create mode 100644 dislib/commons/rf/_forest.py create mode 100644 dislib/commons/rf/_test_split.py diff --git a/dislib/classification/__init__.py b/dislib/classification/__init__.py index 55bc2877..f4a90db6 100644 --- a/dislib/classification/__init__.py +++ b/dislib/classification/__init__.py @@ -1,4 +1,4 @@ from dislib.classification.csvm.base import CascadeSVM -from dislib.classification.rf.forest import RandomForestClassifier +from dislib.commons.rf._forest import RandomForestClassifier -__all__ = ['CascadeSVM', 'RandomForestClassifier'] +__all__ = ["CascadeSVM", "RandomForestClassifier"] diff --git a/dislib/classification/rf/_data.py b/dislib/classification/rf/_data.py index 9bd178b5..1a8da41f 100644 --- a/dislib/classification/rf/_data.py +++ b/dislib/classification/rf/_data.py @@ -2,8 +2,13 @@ import numpy as np from numpy.lib import format -from pycompss.api.parameter import FILE_IN, FILE_INOUT, COLLECTION_IN, Depth, \ - Type +from pycompss.api.parameter import ( + FILE_IN, + FILE_INOUT, + COLLECTION_IN, + Depth, + Type, +) from pycompss.api.task import task from dislib.data.array import Array @@ -82,12 +87,13 @@ def get_n_samples(self): """ if self.n_samples is None: - assert isinstance(self.samples_path, str), \ - 'self.n_samples must be set manually if self.samples_path ' \ - 'is a pycompss.runtime.Future object' + assert isinstance(self.samples_path, str), ( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() if len(shape) != 2: - raise ValueError('Cannot read 2D array from the samples file.') + raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_samples @@ -107,12 +113,13 @@ def get_n_features(self): """ if self.n_features is None: - assert isinstance(self.samples_path, str), \ - 'self.n_features must be set manually if self.samples_path ' \ - 'is a pycompss.runtime.Future object' + assert isinstance(self.samples_path, str), ( + "self.n_features must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() if len(shape) != 2: - raise ValueError('Cannot read 2D array from the samples file.') + raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_features @@ -169,11 +176,11 @@ class n_samples and n_features or if the array is in fortran order. 
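# --- Editor's note: the validation above only needs the .npy header, which
# numpy's format module can read without touching the array payload. A small
# sketch (it uses numpy's private helpers, exactly as _NpyFile in this file
# does; the file path is hypothetical):
import numpy as np
from numpy.lib import format as npy_format

np.save("/tmp/npy_header_demo.npy", np.zeros((4, 3), dtype="float32"))
with open("/tmp/npy_header_demo.npy", "rb") as fp:
    version = npy_format.read_magic(fp)
    shape, fortran_order, dtype = npy_format._read_array_header(fp, version)
print(shape, fortran_order, dtype)  # (4, 3) False float32
# --- end of editor's note.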
shape = features_npy_file.get_shape() fortran_order = features_npy_file.get_fortran_order() if len(shape) != 2: - raise ValueError('Cannot read 2D array from features_file.') + raise ValueError("Cannot read 2D array from features_file.") if (self.get_n_features(), self.get_n_samples()) != shape: - raise ValueError('Invalid dimensions for the features_file.') + raise ValueError("Invalid dimensions for the features_file.") if fortran_order: - raise ValueError('Fortran order not supported for features array.') + raise ValueError("Fortran order not supported for features array.") def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: @@ -197,9 +204,9 @@ def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: n_samples = x.shape[0] n_features = x.shape[1] - samples_file = tempfile.NamedTemporaryFile(mode='wb', - prefix='tmp_rf_samples_', - delete=False) + samples_file = tempfile.NamedTemporaryFile( + mode="wb", prefix="tmp_rf_samples_", delete=False + ) samples_path = samples_file.name samples_file.close() _allocate_samples_file(samples_path, n_samples, n_features) @@ -213,9 +220,9 @@ def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: _fill_samples_file(samples_path, x_row._blocks, start_idx) start_idx += x._reg_shape[0] - labels_file = tempfile.NamedTemporaryFile(mode='w', - prefix='tmp_rf_labels_', - delete=False) + labels_file = tempfile.NamedTemporaryFile( + mode="w", prefix="tmp_rf_labels_", delete=False + ) labels_path = labels_file.name labels_file.close() for y_row in y._iterator(axis=0): @@ -251,19 +258,19 @@ def get_dtype(self): return self.dtype def _read_header(self): - with open(self.path, 'rb') as fp: + with open(self.path, "rb") as fp: version = format.read_magic(fp) try: format._check_version(version) except ValueError: - raise ValueError('Invalid file format.') + raise ValueError("Invalid file format.") header_data = format._read_array_header(fp, version) self.shape, self.fortran_order, self.dtype = header_data @task(labels_path=FILE_IN, returns=3) def _get_labels(labels_path): - y = np.genfromtxt(labels_path, dtype=None, encoding='utf-8') + y = np.genfromtxt(labels_path, dtype=None, encoding="utf-8") categories, codes = np.unique(y, return_inverse=True) return codes.astype(np.int8), categories, len(categories) @@ -279,26 +286,30 @@ def _merge_shapes(*samples_shapes): n_features = samples_shapes[0][1] for shape in samples_shapes: n_samples += shape[0] - assert shape[1] == n_features, 'Subsamples with different n_features.' + assert shape[1] == n_features, "Subsamples with different n_features." 
return samples_shapes, n_samples, n_features @task(samples_path=FILE_INOUT) def _allocate_samples_file(samples_path, n_samples, n_features): - np.lib.format.open_memmap(samples_path, mode='w+', dtype='float32', - shape=(int(n_samples), int(n_features))) + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + shape=(int(n_samples), int(n_features)), + ) @task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) def _fill_samples_file(samples_path, row_blocks, start_idx): rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype(dtype='float32', casting='same_kind') - samples = np.lib.format.open_memmap(samples_path, mode='r+') - samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") + samples = np.lib.format.open_memmap(samples_path, mode="r+") + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples @task(labels_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) def _fill_labels_file(labels_path, row_blocks): rows_labels = Array._merge_blocks(row_blocks) - with open(labels_path, 'at') as f: - np.savetxt(f, rows_labels, fmt='%s', encoding='utf-8') + with open(labels_path, "at") as f: + np.savetxt(f, rows_labels, fmt="%s", encoding="utf-8") diff --git a/dislib/commons/rf/__init__.py b/dislib/commons/rf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dislib/commons/rf/_data.py b/dislib/commons/rf/_data.py new file mode 100644 index 00000000..de692182 --- /dev/null +++ b/dislib/commons/rf/_data.py @@ -0,0 +1,404 @@ +import tempfile + +import numpy as np +from numpy.lib import format +from pycompss.api.parameter import ( + FILE_IN, + FILE_INOUT, + COLLECTION_IN, + Depth, + Type, +) +from pycompss.api.task import task + +from dislib.data.array import Array + + +class RfBaseDataset: + """Base class for Dataset format used by the fit() of the + RandomForestRegressor and RandomForestClassifier. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + def __init__(self, samples_path, targets_path, features_path=None): + self.samples_path = samples_path + self.targets_path = targets_path + self.features_path = features_path + self.n_samples = None + self.n_features = None + self.y_targets = None + + def get_n_samples(self): + """Gets the number of samples obtained from the samples file. + + Returns + ------- + n_samples : int + + Raises + ------ + AssertionError + If self.n_samples is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. + + """ + if self.n_samples is None: + assert isinstance(self.samples_path, str), ( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_samples + + def get_n_features(self): + """Gets the number of features obtained from the samples file. + + Returns + ------- + n_features : int + + Raises + ------ + AssertionError + If self.n_features is None and self.samples_path is not a string. + ValueError + If invalid content is encountered in the samples file. 
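# --- Editor's note: a minimal sketch of the allocate-then-fill memmap pattern
# used for the samples file that these accessors describe (the _allocate and
# _fill tasks appear later in this file). The .npy file is preallocated once
# and row blocks are written in place, so no single task ever materializes the
# full samples matrix in memory. The path below is hypothetical.
import numpy as np

path = "/tmp/rf_samples_demo.npy"
np.lib.format.open_memmap(path, mode="w+", dtype="float32", shape=(6, 2))

start = 0
for block in (np.ones((2, 2)), np.zeros((2, 2)), 2 * np.ones((2, 2))):
    samples = np.lib.format.open_memmap(path, mode="r+")
    samples[start: start + block.shape[0]] = block.astype("float32")
    start += block.shape[0]
# --- end of editor's note.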
+ + """ + if self.n_features is None: + assert isinstance(self.samples_path, str), ( + "self.n_features must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) + shape = _NpyFile(self.samples_path).get_shape() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from the samples file.") + self.n_samples, self.n_features = shape + return self.n_features + + def validate_features_file(self): + """Validates the features file header information. + + Raises + ------ + ValueError + If the shape of the array in the features_file doesn't match this + class n_samples and n_features or if the array is in fortran order. + + """ + features_npy_file = _NpyFile(self.features_path) + shape = features_npy_file.get_shape() + fortran_order = features_npy_file.get_fortran_order() + if len(shape) != 2: + raise ValueError("Cannot read 2D array from features_file.") + if (self.get_n_features(), self.get_n_samples()) != shape: + raise ValueError("Invalid dimensions for the features_file.") + if fortran_order: + raise ValueError("Fortran order not supported for features array.") + + +class RfClassifierDataset(RfBaseDataset): + """Dataset format used by the fit() of the RandomForestClassifier. + + The RfDataset contains a file path for the samples and another one for the + labels. Optionally, a path can be provided for a transposed version of the + samples matrix, i.e., the features. + + Note: For a representation of a dataset distributed in multiple files, use + dislib.data.Dataset instead. + + Parameters + ---------- + samples_path : str + Path of the .npy file containing the 2-d array of samples. It can be a + pycompss.runtime.Future object. If so, self.n_samples and + self.n_features must be set manually (they can also be + pycompss.runtime.Future objects). + targets_path : str + Path of the .dat file containing the 1-d array of target labels. + It can be a pycompss.runtime.Future object. + features_path : str, optional (default=None) + Path of the .npy file containing the 2-d array of samples transposed. + The array must be C-ordered. Providing this array may improve the + performance as it allows sequential access to the features. + + Attributes + ---------- + n_samples : int + The number of samples of the dataset. It can be a + pycompss.runtime.Future object. + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + y_targets : ndarray + The codified array of labels for this RfDataset. The values are indices + of the array of classes, which contains the corresponding labels. The + dtype is np.int8. It can be a pycompss.runtime.Future object. + y_categories : ndarray + The array of classes for this RfDataset. The values are unique. It can + be a pycompss.runtime.Future object. + n_classes : int + The number of classes of this RfDataset. It can be a + pycompss.runtime.Future object. + + """ + + def __init__(self, samples_path, targets_path, features_path=None): + super().__init__(samples_path, targets_path, features_path) + self.y_categories = None + self.n_classes = None + + def get_y_targets(self): + """Obtains the codified array of target labels. + + Returns + ------- + y_targets : ndarray + + """ + if self.y_targets is None: + labels = _get_labels(self.targets_path) + self.y_targets, self.y_categories, self.n_classes = labels + return self.y_targets + + def get_classes(self): + """Obtains the array of label categories. 
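# --- Editor's note: what the label codification in _get_labels() (defined
# later in this file) produces: np.unique returns the sorted distinct labels
# and, with return_inverse=True, per-sample indices into that array, which
# become the int8 codes stored in y_targets.
import numpy as np

y = np.array(["cat", "dog", "cat", "bird"])
categories, codes = np.unique(y, return_inverse=True)
print(categories)             # ['bird' 'cat' 'dog']
print(codes.astype(np.int8))  # [1 2 1 0]
# --- end of editor's note.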
+ + Returns + ------- + y_categories : ndarray + + """ + if self.y_categories is None: + labels = _get_labels(self.targets_path) + self.y_targets, self.y_categories, self.n_classes = labels + return self.y_categories + + def get_n_classes(self): + """Obtains the number of classes. + + Returns + ------- + n_classes : int + + """ + if self.n_classes is None: + labels = _get_labels(self.targets_path) + self.y_targets, self.y_categories, self.n_classes = labels + return self.n_classes + + +class RfRegressorDataset(RfBaseDataset): + """Dataset format used by the fit() of the RandomForestRegressor. + + The RfDataset contains a file path for the samples and another one for the + targets. Optionally, a path can be provided for a transposed version of the + samples matrix, i.e., the features. + + Note: For a representation of a dataset distributed in multiple files, use + dislib.data.Dataset instead. + + Parameters + ---------- + samples_path : str + Path of the .npy file containing the 2-d array of samples. It can be a + pycompss.runtime.Future object. If so, self.n_samples and + self.n_features must be set manually (they can also be + pycompss.runtime.Future objects). + targets_path : str + Path of the .dat file containing the 1-d array of target values. + It can be a pycompss.runtime.Future object. + features_path : str, optional (default=None) + Path of the .npy file containing the 2-d array of samples transposed. + The array must be C-ordered. Providing this array may improve the + performance as it allows sequential access to the features. + + Attributes + ---------- + n_samples : int + The number of samples of the dataset. It can be a + pycompss.runtime.Future object. + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + y_targets : ndarray + The array of targets for this RfDataset. It can be a + pycompss.runtime.Future object. + + """ + + def __init__(self, samples_path, targets_path, features_path=None): + super().__init__(samples_path, targets_path, features_path) + + def get_y_targets(self): + """Obtains the array of target values. + + Returns + ------- + y_targets : ndarray + + """ + if self.y_targets is None: + targets = _get_values(self.targets_path) + self.y_targets = targets + return self.y_targets + + def get_n_classes(self): + return None + + def get_classes(self): + return None + +def transform_to_rf_dataset( + x: Array, y: Array, task: str +) -> RfRegressorDataset or RfClassifierDataset: + """Creates a RfDataset object from samples x and targets y. + + This function creates a dislib.commons.rf.data.RfDataset by saving + x and y in files. + + Parameters + ---------- + x : ds-array, shape = (n_samples, n_features) + The training input samples. + y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) + The target values. + task : {"classification", "regression"} + Task of the Random Forest. 
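# --- Editor's note: an illustrative call of the dispatcher documented above
# (hypothetical shapes; requires a PyCOMPSs runtime, as elsewhere in dislib):
import numpy as np
import dislib as ds
from dislib.commons.rf._data import transform_to_rf_dataset

x = ds.array(np.random.rand(40, 5), (10, 5))
y = ds.array(np.random.rand(40, 1), (10, 1))
rf_dataset = transform_to_rf_dataset(x, y, "regression")
# --- end of editor's note.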
+ + Returns + ------- + rf_dataset : dislib.regression.rf._data.RfDataset + + """ + n_samples = x.shape[0] + n_features = x.shape[1] + + samples_file = tempfile.NamedTemporaryFile( + mode="wb", prefix="tmp_rf_samples_", delete=False + ) + samples_path = samples_file.name + samples_file.close() + _allocate_samples_file(samples_path, n_samples, n_features) + + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_samples_file(samples_path, top_row._blocks, start_idx) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: + _fill_samples_file(samples_path, x_row._blocks, start_idx) + start_idx += x._reg_shape[0] + + targets_file = tempfile.NamedTemporaryFile( + mode="w", prefix="tmp_rf_targets_", delete=False + ) + targets_path = targets_file.name + targets_file.close() + for y_row in y._iterator(axis=0): + _fill_targets_file(targets_path, y_row._blocks) + + if task == "classification": + rf_dataset = RfClassifierDataset(samples_path, targets_path) + elif task == "regression": + rf_dataset = RfRegressorDataset(samples_path, targets_path) + else: + raise ValueError("task must be either classification or regression.") + rf_dataset.n_samples = n_samples + rf_dataset.n_features = n_features + return rf_dataset + + +class _NpyFile(object): + def __init__(self, path): + self.path = path + + self.shape = None + self.fortran_order = None + self.dtype = None + + def get_shape(self): + if self.shape is None: + self._read_header() + return self.shape + + def get_fortran_order(self): + if self.fortran_order is None: + self._read_header() + return self.fortran_order + + def get_dtype(self): + if self.dtype is None: + self._read_header() + return self.dtype + + def _read_header(self): + with open(self.path, "rb") as fp: + version = format.read_magic(fp) + try: + format._check_version(version) + except ValueError: + raise ValueError("Invalid file format.") + header_data = format._read_array_header(fp, version) + self.shape, self.fortran_order, self.dtype = header_data + + +@task(targets_path=FILE_IN, returns=3) +def _get_labels(targets_path): + # Classification + y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8") + categories, codes = np.unique(y, return_inverse=True) + return codes.astype(np.int8), categories, len(categories) + + +@task(targets_path=FILE_IN, returns=1) +def _get_values(targets_path): + # Regression + y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8") + return y.astype(np.float64) + + +@task(returns=1) +def _get_samples_shape(subset): + return subset.samples.shape + + +@task(returns=3) +def _merge_shapes(*samples_shapes): + n_samples = 0 + n_features = samples_shapes[0][1] + for shape in samples_shapes: + n_samples += shape[0] + assert shape[1] == n_features, "Subsamples with different n_features." 
+ return samples_shapes, n_samples, n_features + + +@task(samples_path=FILE_INOUT) +def _allocate_samples_file(samples_path, n_samples, n_features): + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + shape=(int(n_samples), int(n_features)), + ) + + +@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_samples_file(samples_path, row_blocks, start_idx): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") + samples = np.lib.format.open_memmap(samples_path, mode="r+") + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples + + +@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_targets_file(targets_path, row_blocks): + rows_targets = Array._merge_blocks(row_blocks) + with open(targets_path, "at") as f: + np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8") diff --git a/dislib/commons/rf/_decision_tree.py b/dislib/commons/rf/_decision_tree.py new file mode 100644 index 00000000..07297a8d --- /dev/null +++ b/dislib/commons/rf/_decision_tree.py @@ -0,0 +1,784 @@ +from sys import float_info + +import numpy as np +from numpy.random.mtrand import RandomState +from pycompss.api.api import compss_delete_object +from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth +from pycompss.api.task import task +from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor + +from ._test_split import test_split +from dislib.data.array import Array + + +class BaseDecisionTree: + """Base class for distributed decision trees. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def __init__( + self, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + ): + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.bootstrap = bootstrap + self.random_state = random_state + + self.n_features = None + self.n_classes = None + + self.tree = None + self.nodes_info = None + self.subtrees = None + + def fit(self, dataset): + """Fits the DecisionTree. 
+
+        Parameters
+        ----------
+        dataset : dislib.commons.rf._data.RfBaseDataset
+
+        """
+
+        self.n_features = dataset.get_n_features()
+        self.n_classes = dataset.get_n_classes()
+        samples_path = dataset.samples_path
+        features_path = dataset.features_path
+        n_samples = dataset.get_n_samples()
+        y_targets = dataset.get_y_targets()
+
+        seed = self.random_state.randint(np.iinfo(np.int32).max)
+
+        sample, y_s = _sample_selection(
+            n_samples, y_targets, self.bootstrap, seed
+        )
+        Node = _ClassificationNode if self.n_classes else _RegressionNode
+
+        self.tree = Node()
+        self.nodes_info = []
+        self.subtrees = []
+        tree_traversal = [(self.tree, sample, y_s, 0)]
+        while tree_traversal:
+            node, sample, y_s, depth = tree_traversal.pop()
+            if depth < self.distr_depth:
+                split = _split_node_wrapper(
+                    sample,
+                    self.n_features,
+                    y_s,
+                    self.n_classes,
+                    self.try_features,
+                    self.random_state,
+                    samples_file=samples_path,
+                    features_file=features_path,
+                )
+                node_info, left_group, y_l, right_group, y_r = split
+                compss_delete_object(sample)
+                compss_delete_object(y_s)
+                node.content = len(self.nodes_info)
+                self.nodes_info.append(node_info)
+                node.left = Node()
+                node.right = Node()
+                depth = depth + 1
+                tree_traversal.append((node.right, right_group, y_r, depth))
+                tree_traversal.append((node.left, left_group, y_l, depth))
+            else:
+                subtree = _build_subtree_wrapper(
+                    sample,
+                    y_s,
+                    self.n_features,
+                    self.max_depth - depth,
+                    self.n_classes,
+                    self.try_features,
+                    self.sklearn_max,
+                    self.random_state,
+                    samples_path,
+                    features_path,
+                )
+                node.content = len(self.subtrees)
+                self.subtrees.append(subtree)
+                compss_delete_object(sample)
+                compss_delete_object(y_s)
+        self.nodes_info = _merge(*self.nodes_info)
+
+    def predict(self, x_row):
+        """Predicts target values or classes for the given samples using
+        a fitted tree.
+
+        Parameters
+        ----------
+        x_row : ds-array
+            A row block of samples.
+
+        Returns
+        -------
+        predicted : ndarray
+            An array with the predictions for the given samples: class codes
+            of the fitted dislib.commons.rf._data.RfClassifierDataset for
+            classification, or target values for regression. The returned
+            object can be a pycompss.runtime.Future object.
+
+        """
+
+        assert self.tree is not None, "The decision tree is not fitted."
+
+        branch_predictions = []
+        for i, subtree in enumerate(self.subtrees):
+            pred = _predict_branch(
+                x_row._blocks,
+                self.tree,
+                self.nodes_info,
+                i,
+                subtree,
+                self.distr_depth,
+            )
+            branch_predictions.append(pred)
+        return _merge_branches(
+            None,
+            *branch_predictions,
+            classification=self.n_classes is not None,
+        )
+
+
+class DecisionTreeClassifier(BaseDecisionTree):
+    """A distributed decision tree classifier.
+
+    Parameters
+    ----------
+    try_features : int
+        The number of features to consider when looking for the best split.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    bootstrap : bool
+        Randomly select n_instances samples with repetition (used in random
+        forests).
+    random_state : RandomState instance
+        The random number generator.
+
+    Attributes
+    ----------
+    n_features : int
+        The number of features of the dataset. It can be a
+        pycompss.runtime.Future object.
+    n_classes : int
+        The number of classes of the fitted dataset. It can be a
+        pycompss.runtime.Future object.
+    tree : None or _Node
+        The root node of the tree after the tree is fitted.
+    nodes_info : None or list of _InnerNodeInfo and _LeafInfo
+        List of the node information for the nodes of the tree in the same
+        order as obtained in the fit() method, up to ``distr_depth`` depth.
+        After fit(), it is a pycompss.runtime.Future object.
+    subtrees : None or list of _Node
+        List of subtrees of the tree at ``distr_depth`` depth obtained in the
+        fit() method. After fit(), it is a list of pycompss.runtime.Future
+        objects.
+
+    Methods
+    -------
+    fit(dataset)
+        Fits the DecisionTreeClassifier.
+    predict(x_row)
+        Predicts classes for the given samples using a fitted tree.
+    predict_proba(x_row)
+        Predicts class probabilities for the given samples using a fitted
+        tree.
+
+    """
+
+    def __init__(
+        self,
+        try_features,
+        max_depth,
+        distr_depth,
+        sklearn_max,
+        bootstrap,
+        random_state,
+    ):
+        super().__init__(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+        )
+
+    def predict_proba(self, x_row):
+        """Predicts class probabilities for a row block using a fitted tree.
+
+        Parameters
+        ----------
+        x_row : ds-array
+            A row block of samples.
+
+        Returns
+        -------
+        predicted_proba : ndarray
+            An array with the predicted probabilities for the given samples.
+            The shape is (n_samples, self.n_classes), with the index of the
+            column being class codes of the fitted
+            dislib.commons.rf._data.RfDataset. The returned object can be
+            a pycompss.runtime.Future object.
+
+        """
+
+        assert self.tree is not None, "The decision tree is not fitted."
+
+        branch_predictions = []
+        for i, subtree in enumerate(self.subtrees):
+            pred = _predict_branch_proba(
+                x_row._blocks,
+                self.tree,
+                self.nodes_info,
+                i,
+                subtree,
+                self.distr_depth,
+                self.n_classes,
+            )
+            branch_predictions.append(pred)
+        return _merge_branches(
+            self.n_classes, *branch_predictions, classification=True
+        )
+
+
+class DecisionTreeRegressor(BaseDecisionTree):
+    """A distributed decision tree regressor.
+
+    Parameters
+    ----------
+    try_features : int
+        The number of features to consider when looking for the best split.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    sklearn_max : int or float
+        Maximum size (len(subsample) * n_features) of the arrays passed to
+        sklearn's DecisionTreeRegressor.fit(), which is called to fit
+        subtrees (subsamples) of this DecisionTreeRegressor.
+    bootstrap : bool
+        Randomly select n_instances samples with repetition (used in random
+        forests).
+    random_state : RandomState instance
+        The random number generator.
+
+    Attributes
+    ----------
+    n_features : int
+        The number of features of the dataset. It can be a
+        pycompss.runtime.Future object.
+    tree : None or _Node
+        The root node of the tree after the tree is fitted.
+    nodes_info : None or list of _InnerNodeInfo and _LeafInfo
+        List of the node information for the nodes of the tree in the same
+        order as obtained in the fit() method, up to ``distr_depth`` depth.
+        After fit(), it is a pycompss.runtime.Future object.
+    subtrees : None or list of _Node
+        List of subtrees of the tree at ``distr_depth`` depth obtained in the
+        fit() method. After fit(), it is a list of pycompss.runtime.Future
+        objects.
+
+    Methods
+    -------
+    fit(dataset)
+        Fits the DecisionTreeRegressor.
+    predict(x_row)
+        Predicts target values for the given samples using a fitted tree.
+
+    """
+
+    def __init__(
+        self,
+        try_features,
+        max_depth,
+        distr_depth,
+        sklearn_max,
+        bootstrap,
+        random_state,
+    ):
+        super().__init__(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+        )
+
+
+class _Node:
+    """Base class for tree nodes"""
+
+    def __init__(self, is_classifier):
+        self.content = None
+        self.left = None
+        self.right = None
+        self.is_classifier = is_classifier
+        self.predict_dtype = np.int64 if is_classifier else np.float64
+
+    def predict(self, sample):
+        node_content = self.content
+        if isinstance(node_content, _LeafInfo):
+            return np.full((len(sample),), node_content.target)
+        if isinstance(node_content, _SkTreeWrapper):
+            if len(sample) > 0:
+                return node_content.sk_tree.predict(sample)
+        if isinstance(node_content, _InnerNodeInfo):
+            pred = np.empty((len(sample),), dtype=self.predict_dtype)
+            left_mask = sample[:, node_content.index] <= node_content.value
+            pred[left_mask] = self.left.predict(sample[left_mask])
+            pred[~left_mask] = self.right.predict(sample[~left_mask])
+            return pred
+        assert len(sample) == 0, "Type not supported"
+        return np.empty((0,), dtype=self.predict_dtype)
+
+
+class _ClassificationNode(_Node):
+    def __init__(self):
+        super().__init__(is_classifier=True)
+
+    def predict_proba(self, sample, n_classes):
+        node_content = self.content
+        if isinstance(node_content, _LeafInfo):
+            single_pred = node_content.frequencies / node_content.size
+            return np.tile(single_pred, (len(sample), 1))
+        if isinstance(node_content, _SkTreeWrapper):
+            if len(sample) > 0:
+                sk_tree_pred = node_content.sk_tree.predict_proba(sample)
+                pred = np.zeros((len(sample), n_classes), dtype=np.float64)
+                pred[:, node_content.sk_tree.classes_] = sk_tree_pred
+                return pred
+        if isinstance(node_content, _InnerNodeInfo):
+            pred = np.empty((len(sample), n_classes), dtype=np.float64)
+            l_msk = sample[:, node_content.index] <= node_content.value
+            pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes)
+            pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes)
+            return pred
+        assert len(sample) == 0, "Type not supported"
+        return np.empty((0, n_classes), dtype=np.float64)
+
+
+class _RegressionNode(_Node):
+    def __init__(self):
+        super().__init__(is_classifier=False)
+
+
+class _InnerNodeInfo:
+    def __init__(self, index=None, value=None):
+        self.index = index
+        self.value = value
+
+
+class _LeafInfo:
+    def __init__(self, size=None, frequencies=None, target=None):
+        self.size = size
+        self.frequencies = frequencies
+        self.target = target
+
+
+class _SkTreeWrapper:
+    def __init__(self, tree):
+        self.sk_tree = tree
+
+
+def _get_sample_attributes(samples_file, indices):
+    samples_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False)
+    x = samples_mmap[indices]
+    return x
+
+
+def _get_feature_mmap(features_file, i):
+    return _get_features_mmap(features_file)[i]
+
+
+def _get_features_mmap(features_file):
+    return np.load(features_file, mmap_mode="r", allow_pickle=False)
+
+
+@task(priority=True, returns=2)
+def _sample_selection(n_samples, y_targets, bootstrap, seed):
+    if bootstrap:
+        random_state = RandomState(seed)
+        selection = random_state.choice(
+            n_samples, size=n_samples, replace=True
+        )
+        selection.sort()
+        return selection, y_targets[selection]
+    else:
+        return 
np.arange(n_samples), y_targets + + +def _feature_selection(untried_indices, m_try, random_state): + selection_len = min(m_try, len(untried_indices)) + return random_state.choice( + untried_indices, size=selection_len, replace=False + ) + + +def _get_groups(sample, y_s, features_mmap, index, value): + if index is None: + empty_sample = np.array([], dtype=np.int64) + empty_target = np.array([], dtype=y_s.dtype) + return sample, y_s, empty_sample, empty_target + feature = features_mmap[index][sample] + mask = feature < value + left = sample[mask] + right = sample[~mask] + y_l = y_s[mask] + y_r = y_s[~mask] + return left, y_l, right, y_r + + +def _compute_leaf_info(y_s, n_classes): + if n_classes is not None: + frequencies = np.bincount(y_s, minlength=n_classes) + mode = np.argmax(frequencies) + return _LeafInfo(len(y_s), frequencies, mode) + else: + return _LeafInfo(len(y_s), None, np.mean(y_s)) + + +def _split_node_wrapper( + sample, + n_features, + y_s, + n_classes, + m_try, + random_state, + samples_file=None, + features_file=None, +): + seed = random_state.randint(np.iinfo(np.int32).max) + + if features_file is not None: + return _split_node_using_features( + sample, n_features, y_s, n_classes, m_try, features_file, seed + ) + elif samples_file is not None: + return _split_node( + sample, n_features, y_s, n_classes, m_try, samples_file, seed + ) + else: + raise ValueError( + "Invalid combination of arguments. samples_file is " + "None and features_file is None." + ) + + +@task(features_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node_using_features( + sample, n_features, y_s, n_classes, m_try, features_file, seed +): + features_mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) + random_state = RandomState(seed) + return _compute_split( + sample, n_features, y_s, n_classes, m_try, features_mmap, random_state + ) + + +@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) +def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): + features_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T + random_state = RandomState(seed) + return _compute_split( + sample, n_features, y_s, n_classes, m_try, features_mmap, random_state + ) + + +def _compute_split( + sample, n_features, y_s, n_classes, m_try, features_mmap, random_state +): + node_info = left_group = y_l = right_group = y_r = None + split_ended = False + tried_indices = [] + while not split_ended: + untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) + b_score = float_info.max + b_index = None + b_value = None + for index in index_selection: + feature = features_mmap[index] + score, value = test_split(sample, y_s, feature, n_classes) + if score < b_score: + b_score, b_value, b_index = score, value, index + groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) + left_group, y_l, right_group, y_r = groups + if left_group.size and right_group.size: + split_ended = True + node_info = _InnerNodeInfo(b_index, b_value) + else: + tried_indices.extend(list(index_selection)) + if len(tried_indices) == n_features: + split_ended = True + node_info = _compute_leaf_info(y_s, n_classes) + left_group = sample + y_l = y_s + right_group = np.array([], dtype=np.int64) + y_r = np.array([], dtype=y_s.dtype) + + return node_info, left_group, y_l, right_group, y_r + + +def _build_subtree_wrapper( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, 
+ sklearn_max, + random_state, + samples_file, + features_file, +): + seed = random_state.randint(np.iinfo(np.int32).max) + if features_file is not None: + return _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, + features_file, + ) + else: + return _build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, + ) + + +@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) +def _build_subtree_using_features( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, + features_file, +): + random_state = RandomState(seed) + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=features_file, + ) + + +@task(samples_file=FILE_IN, returns=_Node) +def _build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + seed, + samples_file, +): + random_state = RandomState(seed) + return _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + random_state, + samples_file, + ) + + +def _compute_build_subtree( + sample, + y_s, + n_features, + max_depth, + n_classes, + m_try, + sklearn_max, + random_state, + samples_file, + features_file=None, + use_sklearn=True, +): + Node = _ClassificationNode if n_classes else _RegressionNode + SklearnDT = SklearnDTClassifier if n_classes else SklearnDTRegressor + if not sample.size: + return Node() + if features_file is not None: + mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) + else: + mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T + subtree = Node() + tree_traversal = [(subtree, sample, y_s, 0)] + while tree_traversal: + node, sample, y_s, depth = tree_traversal.pop() + if depth < max_depth: + if use_sklearn and n_features * len(sample) <= sklearn_max: + if max_depth == np.inf: + sklearn_max_depth = None + else: + sklearn_max_depth = max_depth - depth + dt = SklearnDT( + max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state, + ) + unique = np.unique( + sample, return_index=True, return_counts=True + ) + sample, new_indices, sample_weight = unique + x = _get_sample_attributes(samples_file, sample) + y_s = y_s[new_indices] + dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) + node.content = _SkTreeWrapper(dt) + else: + split = _compute_split( + sample, + n_features, + y_s, + n_classes, + m_try, + mmap, + random_state, + ) + node_info, left_group, y_l, right_group, y_r = split + node.content = node_info + if isinstance(node_info, _InnerNodeInfo): + node.left = Node() + node.right = Node() + tree_traversal.append( + (node.right, right_group, y_r, depth + 1) + ) + tree_traversal.append( + (node.left, left_group, y_l, depth + 1) + ) + else: + node.content = _compute_leaf_info(y_s, n_classes) + return subtree + + +@task(returns=list) +def _merge(*object_list): + return object_list + + +def _get_subtree_path(subtree_index, distr_depth): + if distr_depth == 0: + return "" + return bin(subtree_index)[2:].zfill(distr_depth) + + +def _get_predicted_indices(samples, tree, nodes_info, path): + idx_mask = np.full((len(samples),), True) + for direction in path: + node_info = nodes_info[tree.content] + if isinstance(node_info, _LeafInfo): + if direction == "1": + idx_mask[:] = 0 + else: + col = node_info.index + value 
= node_info.value
+            if direction == "0":
+                idx_mask[idx_mask] = samples[idx_mask, col] <= value
+                tree = tree.left
+            else:
+                idx_mask[idx_mask] = samples[idx_mask, col] > value
+                tree = tree.right
+    return idx_mask
+
+
+@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
+def _predict_branch(
+    row_blocks, tree, nodes_info, subtree_index, subtree, distr_depth
+):
+    samples = Array._merge_blocks(row_blocks)
+    path = _get_subtree_path(subtree_index, distr_depth)
+    indices_mask = _get_predicted_indices(samples, tree, nodes_info, path)
+    prediction = subtree.predict(samples[indices_mask])
+    return indices_mask, prediction
+
+
+@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
+def _predict_branch_proba(
+    row_blocks,
+    tree,
+    nodes_info,
+    subtree_index,
+    subtree,
+    distr_depth,
+    n_classes,
+):
+    samples = Array._merge_blocks(row_blocks)
+    path = _get_subtree_path(subtree_index, distr_depth)
+    indices_mask = _get_predicted_indices(samples, tree, nodes_info, path)
+    prediction = subtree.predict_proba(samples[indices_mask], n_classes)
+    return indices_mask, prediction
+
+
+@task(returns=list)
+def _merge_branches(n_classes, *predictions, classification):
+    samples_len = len(predictions[0][0])
+    if classification:
+        if n_classes is not None:  # predict_proba
+            shape = (samples_len, n_classes)
+            dtype = np.float64
+        else:  # predict class
+            shape = (samples_len,)
+            dtype = np.int64
+    else:  # predict value
+        shape = (samples_len,)
+        dtype = np.float64
+
+    merged_prediction = np.empty(shape, dtype=dtype)
+    for selected, prediction in predictions:
+        merged_prediction[selected] = prediction
+    return merged_prediction
diff --git a/dislib/commons/rf/_forest.py b/dislib/commons/rf/_forest.py
new file mode 100644
index 00000000..e0f4561d
--- /dev/null
+++ b/dislib/commons/rf/_forest.py
@@ -0,0 +1,486 @@
+import math
+from collections import Counter
+
+import numpy as np
+from pycompss.api.api import compss_wait_on
+from pycompss.api.parameter import Type, COLLECTION_IN, Depth
+from pycompss.api.task import task
+from sklearn.base import BaseEstimator
+from sklearn.utils import check_random_state
+
+from dislib.commons.rf._decision_tree import (
+    DecisionTreeClassifier,
+    DecisionTreeRegressor,
+)
+from dislib.data.array import Array
+from dislib.utils.base import _paired_partition
+from ._data import transform_to_rf_dataset
+
+
+class BaseRandomForest(BaseEstimator):
+    """Base class for distributed random forests.
+
+    Warning: This class should not be used directly.
+    Use derived classes instead.
+    """
+
+    def __init__(
+        self,
+        n_estimators,
+        try_features,
+        max_depth,
+        distr_depth,
+        sklearn_max,
+        hard_vote,
+        random_state,
+    ):
+        self.n_estimators = n_estimators
+        self.try_features = try_features
+        self.max_depth = max_depth
+        self.distr_depth = distr_depth
+        self.sklearn_max = sklearn_max
+        self.hard_vote = hard_vote
+        self.random_state = random_state
+
+    def fit(self, x, y):
+        """Fits the RandomForest.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+            The training input samples. Internally, its dtype will be
+            converted to ``dtype=np.float32``.
+        y : ds-array, shape=(n_samples, 1)
+            The target values. 
+ + Returns + ------- + self : RandomForest + + """ + self.classes = None + self.trees = [] + + if self.hard_vote is not None: + # Classification + task = "classification" + Tree = DecisionTreeClassifier + else: + # Regression + task = "regression" + Tree = DecisionTreeRegressor + + dataset = transform_to_rf_dataset(x, y, task) + + n_features = dataset.get_n_features() + try_features = _resolve_try_features(self.try_features, n_features) + random_state = check_random_state(self.random_state) + + self.classes = dataset.get_classes() + + if self.distr_depth == "auto": + dataset.n_samples = compss_wait_on(dataset.get_n_samples()) + distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) + distr_depth = min(distr_depth, self.max_depth) + else: + distr_depth = self.distr_depth + + for i in range(self.n_estimators): + tree = Tree( + try_features, + self.max_depth, + distr_depth, + self.sklearn_max, + bootstrap=True, + random_state=random_state, + ) + self.trees.append(tree) + + for tree in self.trees: + tree.fit(dataset) + + return self + + def predict(self, x): + """Predicts target classes or values using a fitted forest. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y_pred : ds-array, shape=(n_samples, 1) + Predicted class labels or values for x. + + """ + assert self.trees is not None, "The random forest is not fitted." + pred_blocks = [] + if self.hard_vote is not None: + # Classification + if self.hard_vote: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + pred_blocks.append( + _hard_vote(self.classes, *tree_predictions) + ) + else: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + pred_blocks.append( + _soft_vote(self.classes, *tree_predictions) + ) + else: + # Regression + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + pred_blocks.append(_join_predictions(*tree_predictions)) + + y_pred = Array( + blocks=[pred_blocks], + top_left_shape=(x._top_left_shape[0], 1), + reg_shape=(x._reg_shape[0], 1), + shape=(x.shape[0], 1), + sparse=False, + ) + + return y_pred + + def score(self, x, y): + """Accuracy classification score. + + For classification returns the mean accuracy on the given test data. + + For regression returns the coefficient of determination $R^2$ of + the prediction. + The coefficient $R^2$ is defined as $(1-u/v)$, where $u$ + is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and + $v$ is the total sum of squares + `((y_true - y_true.mean()) ** 2).sum()`. + The best possible score is 1.0 and it can be negative + if the model is arbitrarily worse. + A constant model that always predicts the expected value of y, + disregarding the input features, would get a $R^2$ score of 0.0. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. + y : ds-array, shape (n_samples, 1) + The true labels. + + Returns + ------- + score : float (as future object) + Fraction of correctly classified samples for classification + or coefficient of determination $R^2$ for regression. + + """ + assert self.trees is not None, "The random forest is not fitted." 
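+        # Each row block yields a partial score: (correct, total) counts for
+        # classification, partial sums of squares for regression. The partial
+        # scores are then reduced by a merge task.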
+ partial_scores = [] + if self.hard_vote is not None: + # Classification + if self.hard_vote: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _hard_vote_score( + y_row._blocks, self.classes, *tree_predictions + ) + partial_scores.append(subset_score) + else: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + subset_score = _soft_vote_score( + y_row._blocks, self.classes, *tree_predictions + ) + partial_scores.append(subset_score) + score = _merge_classification_scores(*partial_scores) + else: + # Regression + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _regression_score( + y_row._blocks, *tree_predictions + ) + partial_scores.append(subset_score) + score = _merge_regression_scores(*partial_scores) + + return score + + +class RandomForestClassifier(BaseRandomForest): + """A distributed random forest classifier. + + Parameters + ---------- + n_estimators : int, optional (default=10) + Number of trees to fit. + try_features : int, str or None, optional (default='sqrt') + The number of features to consider when looking for the best split: + + - If "sqrt", then `try_features=sqrt(n_features)`. + - If "third", then `try_features=n_features // 3`. + - If None, then `try_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + to effectively inspect more than ``try_features`` features. + max_depth : int or np.inf, optional (default=np.inf) + The maximum depth of the tree. If np.inf, then nodes are expanded + until all leaves are pure. + distr_depth : int or str, optional (default='auto') + Number of levels of the tree in which the nodes are split in a + distributed way. + sklearn_max: int or float, optional (default=1e8) + Maximum size (len(subsample)*n_features) of the arrays passed to + sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees + (subsamples) of our DecisionTreeClassifier. sklearn fit() is used + because it's faster, but requires loading the data to memory, which can + cause memory problems for large datasets. This parameter can be + adjusted to fit the hardware capabilities. + hard_vote : bool, optional (default=False) + If True, it uses majority voting over the predict() result of the + decision tree predictions. If False, it takes the class with the higher + probability given by predict_proba(), which is an average of the + probabilities given by the decision trees. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Attributes + ---------- + classes : None or ndarray + Array of distinct classes, set at fit(). + trees : list of DecisionTreeClassifier + List of the tree classifiers of this forest, populated at fit(). 
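+
+    Examples
+    --------
+    A minimal usage sketch; it assumes a running PyCOMPSs environment, and
+    the data and block sizes are illustrative only:
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> from dislib.classification import RandomForestClassifier
+    >>> x = ds.array(np.array([[1, 2], [1, 4], [4, 2], [4, 4]]), (2, 2))
+    >>> y = ds.array(np.array([[0], [0], [1], [1]]), (2, 1))
+    >>> forest = RandomForestClassifier(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> y_pred = forest.predict(x).collect()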
+ """ + + def __init__( + self, + n_estimators=10, + try_features="sqrt", + max_depth=np.inf, + distr_depth="auto", + sklearn_max=1e8, + hard_vote=False, + random_state=None, + ): + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + ) + + def predict_proba(self, x): + """Predicts class probabilities using a fitted forest. + + The probabilities are obtained as an average of the probabilities of + each decision tree. + + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + probabilities : ds-array, shape=(n_samples, n_classes) + Predicted probabilities for the samples to belong to each class. + The columns of the array correspond to the classes given at + self.classes. + + """ + assert self.trees is not None, "The random forest is not fitted." + prob_blocks = [] + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + prob_blocks.append([_join_predictions(*tree_predictions)]) + self.classes = compss_wait_on(self.classes) + n_classes = len(self.classes) + + probabilities = Array( + blocks=prob_blocks, + top_left_shape=(x._top_left_shape[0], n_classes), + reg_shape=(x._reg_shape[0], n_classes), + shape=(x.shape[0], n_classes), + sparse=False, + ) + return probabilities + + +class RandomForestRegressor(BaseRandomForest): + """A distributed random forest regressor. + + Parameters + ---------- + n_estimators : int, optional (default=10) + Number of trees to fit. + try_features : int, str or None, optional (default='sqrt') + The number of features to consider when looking for the best split: + + - If "sqrt", then `try_features=sqrt(n_features)`. + - If "third", then `try_features=n_features // 3`. + - If None, then `try_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + to effectively inspect more than ``try_features`` features. + max_depth : int or np.inf, optional (default=np.inf) + The maximum depth of the tree. If np.inf, then nodes are expanded + until all leaves are pure. + distr_depth : int or str, optional (default='auto') + Number of levels of the tree in which the nodes are split in a + distributed way. + sklearn_max: int or float, optional (default=1e8) + Maximum size (len(subsample)*n_features) of the arrays passed to + sklearn's DecisionTreeRegressor.fit(), which is called to fit subtrees + (subsamples) of our DecisionTreeRegressor. sklearn fit() is used + because it's faster, but requires loading the data to memory, which can + cause memory problems for large datasets. This parameter can be + adjusted to fit the hardware capabilities. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Attributes + ---------- + trees : list of DecisionTreeRegressor + List of the tree regressors of this forest, populated at fit(). 
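+
+    Examples
+    --------
+    A minimal usage sketch; it assumes a running PyCOMPSs environment, and
+    the data and block sizes are illustrative only:
+
+    >>> import numpy as np
+    >>> import dislib as ds
+    >>> from dislib.regression import RandomForestRegressor
+    >>> x = ds.array(np.array([[1.0, 2.0], [1.0, 4.0], [4.0, 2.0]]), (2, 2))
+    >>> y = ds.array(np.array([[1.0], [2.0], [3.0]]), (2, 1))
+    >>> forest = RandomForestRegressor(n_estimators=2, random_state=0)
+    >>> forest.fit(x, y)
+    >>> y_pred = forest.predict(x).collect()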
+ """ + + def __init__( + self, + n_estimators=10, + try_features="sqrt", + max_depth=np.inf, + distr_depth="auto", + sklearn_max=1e8, + random_state=None, + ): + hard_vote = None + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + ) + + +@task(returns=1) +def _resolve_try_features(try_features, n_features): + if try_features is None: + return n_features + elif try_features == "sqrt": + return int(math.sqrt(n_features)) + elif try_features == "third": + return max(1, n_features // 3) + else: + return int(try_features) + + +@task(returns=1) +def _join_predictions(*predictions): + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + labels = aggregate / len(predictions) + return labels + + +@task(returns=1) +def _soft_vote(classes, *predictions): + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + labels = classes[np.argmax(aggregate, axis=1)] + return labels + + +@task(returns=1) +def _hard_vote(classes, *predictions): + mode = np.empty((len(predictions[0]),), dtype=int) + for sample_i, votes in enumerate(zip(*predictions)): + mode[sample_i] = Counter(votes).most_common(1)[0][0] + labels = classes[mode] + return labels + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _soft_vote_score(y_blocks, classes, *predictions): + real_labels = Array._merge_blocks(y_blocks).flatten() + aggregate = predictions[0] + for p in predictions[1:]: + aggregate += p + predicted_labels = classes[np.argmax(aggregate, axis=1)] + correct = np.count_nonzero(predicted_labels == real_labels) + return correct, len(real_labels) + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _hard_vote_score(y_blocks, classes, *predictions): + real_labels = Array._merge_blocks(y_blocks).flatten() + mode = np.empty((len(predictions[0]),), dtype=int) + for sample_i, votes in enumerate(zip(*predictions)): + mode[sample_i] = Counter(votes).most_common(1)[0][0] + predicted_labels = classes[mode] + correct = np.count_nonzero(predicted_labels == real_labels) + return correct, len(real_labels) + + +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) +def _regression_score(y_blocks, *predictions): + y_true = Array._merge_blocks(y_blocks).flatten() + y_pred = np.mean(predictions, axis=0) + n_samples = y_true.shape[0] + y_avg = np.mean(y_true) + u_partial = np.sum(np.square(y_true - y_pred), axis=0) + v_partial = np.sum(np.square(y_true - y_avg), axis=0) + return u_partial, v_partial, y_avg, n_samples + + +@task(returns=1) +def _merge_classification_scores(*partial_scores): + correct = sum(subset_score[0] for subset_score in partial_scores) + total = sum(subset_score[1] for subset_score in partial_scores) + return correct / total + + +@task(returns=1) +def _merge_regression_scores(*partial_scores): + u = v = avg = n = 0 + for u_p, v_p, avg_p, n_p in partial_scores: + u += u_p + + delta = avg_p - avg + avg += delta * n_p / (n + n_p) + v += v_p + delta ** 2 * n * n_p / (n + n_p) + n += n_p + + return 1 - u / v diff --git a/dislib/commons/rf/_test_split.py b/dislib/commons/rf/_test_split.py new file mode 100644 index 00000000..38b9015f --- /dev/null +++ b/dislib/commons/rf/_test_split.py @@ -0,0 +1,59 @@ +from sys import float_info + +import numpy as np + + +def criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated): + """ + Maximizing the MSE or Gini gain is equivalent to minimizing + this proxy function. 
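+
+    For classification, ``l_weight`` and ``r_weight`` hold the sum over
+    classes of the squared cumulative class counts of each branch, so
+    ``l_weight / l_length`` equals ``l_length * (1 - gini_left)``. For
+    regression, they hold the squared sums of the branch targets, so the
+    quotient equals ``l_length * mean_left ** 2``. In both cases, minimizing
+    this proxy maximizes the weighted impurity decrease of the split, and
+    ``not_repeated`` discards candidate thresholds that fall on repeated
+    feature values.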
+    """
+    return -(l_weight / l_length + r_weight / r_length) * not_repeated
+
+
+def test_split(sample, y_s, feature, n_classes):
+    size = y_s.shape[0]
+    if size == 0:
+        return float_info.max, np.float64(np.inf)
+
+    f = feature[sample]
+    sort_indices = np.argsort(f)
+    y_sorted = y_s[sort_indices]
+    f_sorted = f[sort_indices]
+
+    # The split threshold must not equal the value of any sample
+    not_repeated = np.empty(size, dtype=np.bool_)
+    not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1]
+    not_repeated[size - 1] = True
+
+    if n_classes is not None:  # Classification
+        l_freq = np.zeros((n_classes, size), dtype=np.int64)
+        l_freq[y_sorted, np.arange(size)] = 1
+
+        r_freq = np.zeros((n_classes, size), dtype=np.int64)
+        r_freq[:, 1:] = l_freq[:, :0:-1]
+
+        l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0)
+        r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1]
+
+    else:  # Regression
+        # Square of the sum of the y values of each branch
+        r_weight = np.zeros(size)
+        l_weight = np.square(np.cumsum(y_sorted, axis=-1))
+        r_weight[:-1] = np.square(np.cumsum(y_sorted[::-1], axis=-1)[-2::-1])
+
+    # Number of samples of each branch
+    l_length = np.arange(1, size + 1, dtype=np.int32)
+    r_length = np.arange(size - 1, -1, -1, dtype=np.int32)
+    r_length[size - 1] = 1  # Avoid div by zero, the right score is 0
+
+    scores = criteria_proxy(
+        l_weight, l_length, r_weight, r_length, not_repeated
+    )
+
+    min_index = size - np.argmin(scores[::-1]) - 1
+    if min_index + 1 == size:
+        b_value = np.float64(np.inf)
+    else:
+        b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2
+    return scores[min_index], b_value
diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py
index 4a222968..ecde22d8 100644
--- a/dislib/regression/__init__.py
+++ b/dislib/regression/__init__.py
@@ -1,5 +1,5 @@
 from dislib.regression.linear.base import LinearRegression
 from dislib.regression.lasso.base import Lasso
-from dislib.regression.rf.forest import RandomForestRegressor
+from dislib.commons.rf._forest import RandomForestRegressor
 
 __all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]

From b3c909e05ce301b9418c19f78b49b86ba72e1a2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Mon, 26 Jul 2021 13:02:51 +0200
Subject: [PATCH 35/46] Removed RF from 'classification' and 'regression'

---
 dislib/classification/rf/__init__.py      |   0
 dislib/classification/rf/_data.py         | 315 ------
 dislib/classification/rf/decision_tree.py | 520 --------
 dislib/classification/rf/test_split.py    |  50 --
 dislib/regression/rf/__init__.py          |   0
 dislib/regression/rf/_data.py             | 279 -----
 dislib/regression/rf/decision_tree.py     | 564 ----------
 dislib/regression/rf/forest.py            | 236 ---
 dislib/regression/rf/test_split.py        |  48 --
 9 files changed, 2012 deletions(-)
 delete mode 100644 dislib/classification/rf/__init__.py
 delete mode 100644 dislib/classification/rf/_data.py
 delete mode 100644 dislib/classification/rf/decision_tree.py
 delete mode 100644 dislib/classification/rf/test_split.py
 delete mode 100644 dislib/regression/rf/__init__.py
 delete mode 100644 dislib/regression/rf/_data.py
 delete mode 100644 dislib/regression/rf/decision_tree.py
 delete mode 100644 dislib/regression/rf/forest.py
 delete mode 100644 dislib/regression/rf/test_split.py

diff --git a/dislib/classification/rf/__init__.py b/dislib/classification/rf/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dislib/classification/rf/_data.py 
b/dislib/classification/rf/_data.py deleted file mode 100644 index 1a8da41f..00000000 --- a/dislib/classification/rf/_data.py +++ /dev/null @@ -1,315 +0,0 @@ -import tempfile - -import numpy as np -from numpy.lib import format -from pycompss.api.parameter import ( - FILE_IN, - FILE_INOUT, - COLLECTION_IN, - Depth, - Type, -) -from pycompss.api.task import task - -from dislib.data.array import Array - - -class RfDataset(object): - """Dataset format used by the fit() of the RandomForestClassifier. - - The RfDataset contains a file path for the samples and another one for the - labels. Optionally, a path can be provided for a transposed version of the - samples matrix, i.e., the features. - - Note: For a representation of a dataset distributed in multiple files, use - dislib.data.Dataset instead. - - Parameters - ---------- - samples_path : str - Path of the .npy file containing the 2-d array of samples. It can be a - pycompss.runtime.Future object. If so, self.n_samples and - self.n_features must be set manually (they can also be - pycompss.runtime.Future objects). - labels_path : str - Path of the .dat file containing the 1-d array of labels. It can be a - pycompss.runtime.Future object. - features_path : str, optional (default=None) - Path of the .npy file containing the 2-d array of samples transposed. - The array must be C-ordered. Providing this array may improve the - performance as it allows sequential access to the features. - - Attributes - ---------- - n_samples : int - The number of samples of the dataset. It can be a - pycompss.runtime.Future object. - n_features : int - The number of features of the dataset. It can be a - pycompss.runtime.Future object. - y_codes : ndarray - The codified array of labels for this RfDataset. The values are indices - of the array of classes, which contains the corresponding labels. The - dtype is np.int8. It can be a pycompss.runtime.Future object. - y_categories : ndarray - The array of classes for this RfDataset. The values are unique. It can - be a pycompss.runtime.Future object. - n_classes : int - The number of classes of this RfDataset. It can be a - pycompss.runtime.Future object. - - """ - - def __init__(self, samples_path, labels_path, features_path=None): - self.samples_path = samples_path - self.labels_path = labels_path - self.features_path = features_path - self.n_samples = None - self.n_features = None - - self.y_codes = None - self.y_categories = None - self.n_classes = None - - def get_n_samples(self): - """Gets the number of samples obtained from the samples file. - - Returns - ------- - n_samples : int - - Raises - ------ - AssertionError - If self.n_samples is None and self.samples_path is not a string. - ValueError - If invalid content is encountered in the samples file. - - """ - if self.n_samples is None: - assert isinstance(self.samples_path, str), ( - "self.n_samples must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) - shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") - self.n_samples, self.n_features = shape - return self.n_samples - - def get_n_features(self): - """Gets the number of features obtained from the samples file. - - Returns - ------- - n_features : int - - Raises - ------ - AssertionError - If self.n_features is None and self.samples_path is not a string. - ValueError - If invalid content is encountered in the samples file. 
- - """ - if self.n_features is None: - assert isinstance(self.samples_path, str), ( - "self.n_features must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) - shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") - self.n_samples, self.n_features = shape - return self.n_features - - def get_y_codes(self): - """Obtains the codified array of labels. - - Returns - ------- - y_codes : ndarray - - """ - if self.y_codes is None: - labels = _get_labels(self.labels_path) - self.y_codes, self.y_categories, self.n_classes = labels - return self.y_codes - - def get_classes(self): - """Obtains the array of label categories. - - Returns - ------- - y_categories : ndarray - - """ - if self.y_categories is None: - labels = _get_labels(self.labels_path) - self.y_codes, self.y_categories, self.n_classes = labels - return self.y_categories - - def get_n_classes(self): - """Obtains the number of classes. - - Returns - ------- - n_classes : int - - """ - if self.n_classes is None: - labels = _get_labels(self.labels_path) - self.y_codes, self.y_categories, self.n_classes = labels - return self.n_classes - - def validate_features_file(self): - """Validates the features file header information. - - Raises - ------ - ValueError - If the shape of the array in the features_file doesn't match this - class n_samples and n_features or if the array is in fortran order. - - """ - features_npy_file = _NpyFile(self.features_path) - shape = features_npy_file.get_shape() - fortran_order = features_npy_file.get_fortran_order() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from features_file.") - if (self.get_n_features(), self.get_n_samples()) != shape: - raise ValueError("Invalid dimensions for the features_file.") - if fortran_order: - raise ValueError("Fortran order not supported for features array.") - - -def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset: - """Creates a RfDataset object from samples x and labels y. - - This function creates a dislib.classification.rf.data.RfDataset by saving - x and y in files. - - Parameters - ---------- - x : ds-array, shape = (n_samples, n_features) - The training input samples. - y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) - The target values. 
- - Returns - ------- - rf_dataset : dislib.classification.rf._data.RfDataset - - """ - n_samples = x.shape[0] - n_features = x.shape[1] - - samples_file = tempfile.NamedTemporaryFile( - mode="wb", prefix="tmp_rf_samples_", delete=False - ) - samples_path = samples_file.name - samples_file.close() - _allocate_samples_file(samples_path, n_samples, n_features) - - start_idx = 0 - row_blocks_iterator = x._iterator(axis=0) - top_row = next(row_blocks_iterator) - _fill_samples_file(samples_path, top_row._blocks, start_idx) - start_idx += x._top_left_shape[0] - for x_row in row_blocks_iterator: - _fill_samples_file(samples_path, x_row._blocks, start_idx) - start_idx += x._reg_shape[0] - - labels_file = tempfile.NamedTemporaryFile( - mode="w", prefix="tmp_rf_labels_", delete=False - ) - labels_path = labels_file.name - labels_file.close() - for y_row in y._iterator(axis=0): - _fill_labels_file(labels_path, y_row._blocks) - - rf_dataset = RfDataset(samples_path, labels_path) - rf_dataset.n_samples = n_samples - rf_dataset.n_features = n_features - return rf_dataset - - -class _NpyFile(object): - def __init__(self, path): - self.path = path - - self.shape = None - self.fortran_order = None - self.dtype = None - - def get_shape(self): - if self.shape is None: - self._read_header() - return self.shape - - def get_fortran_order(self): - if self.fortran_order is None: - self._read_header() - return self.fortran_order - - def get_dtype(self): - if self.dtype is None: - self._read_header() - return self.dtype - - def _read_header(self): - with open(self.path, "rb") as fp: - version = format.read_magic(fp) - try: - format._check_version(version) - except ValueError: - raise ValueError("Invalid file format.") - header_data = format._read_array_header(fp, version) - self.shape, self.fortran_order, self.dtype = header_data - - -@task(labels_path=FILE_IN, returns=3) -def _get_labels(labels_path): - y = np.genfromtxt(labels_path, dtype=None, encoding="utf-8") - categories, codes = np.unique(y, return_inverse=True) - return codes.astype(np.int8), categories, len(categories) - - -@task(returns=1) -def _get_samples_shape(subset): - return subset.samples.shape - - -@task(returns=3) -def _merge_shapes(*samples_shapes): - n_samples = 0 - n_features = samples_shapes[0][1] - for shape in samples_shapes: - n_samples += shape[0] - assert shape[1] == n_features, "Subsamples with different n_features." 
- return samples_shapes, n_samples, n_features - - -@task(samples_path=FILE_INOUT) -def _allocate_samples_file(samples_path, n_samples, n_features): - np.lib.format.open_memmap( - samples_path, - mode="w+", - dtype="float32", - shape=(int(n_samples), int(n_features)), - ) - - -@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) -def _fill_samples_file(samples_path, row_blocks, start_idx): - rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") - samples = np.lib.format.open_memmap(samples_path, mode="r+") - samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples - - -@task(labels_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) -def _fill_labels_file(labels_path, row_blocks): - rows_labels = Array._merge_blocks(row_blocks) - with open(labels_path, "at") as f: - np.savetxt(f, rows_labels, fmt="%s", encoding="utf-8") diff --git a/dislib/classification/rf/decision_tree.py b/dislib/classification/rf/decision_tree.py deleted file mode 100644 index 0725fcfa..00000000 --- a/dislib/classification/rf/decision_tree.py +++ /dev/null @@ -1,520 +0,0 @@ -from sys import float_info - -import numpy as np -from numpy.random.mtrand import RandomState -from pycompss.api.api import compss_delete_object -from pycompss.api.parameter import FILE_IN, Type, COLLECTION_IN, Depth -from pycompss.api.task import task -from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier - -from dislib.classification.rf.test_split import test_split -from dislib.data.array import Array - - -class DecisionTreeClassifier: - """A distributed decision tree classifier. - - Parameters - ---------- - try_features : int - The number of features to consider when looking for the best split. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires - to effectively inspect more than ``try_features`` features. - max_depth : int - The maximum depth of the tree. If np.inf, then nodes are expanded - until all leaves are pure. - distr_depth : int - Number of levels of the tree in which the nodes are split in a - distributed way. - bootstrap : bool - Randomly select n_instances samples with repetition (used in random - forests). - random_state : RandomState instance - The random number generator. - - Attributes - ---------- - n_features : int - The number of features of the dataset. It can be a - pycompss.runtime.Future object. - n_classes : int - The number of classes of this RfDataset. It can be a - pycompss.runtime.Future object. - tree : None or _Node - The root node of the tree after the tree is fitted. - nodes_info : None or list of _InnerNodeInfo and _LeafInfo - List of the node information for the nodes of the tree in the same - order as obtained in the fit() method, up to ``distr_depth`` depth. - After fit(), it is a pycompss.runtime.Future object. - subtrees : None or list of _Node - List of subtrees of the tree at ``distr_depth`` depth obtained in the - fit() method. After fit(), it is a list of pycompss.runtime.Future - objects. - - Methods - ------- - fit(dataset) - Fits the DecisionTreeClassifier. - predict(x_row) - Predicts classes for the given samples using a fitted tree. - predict_proba(x_row) - Predicts class probabilities for the given smaples using a fitted tree. 
- - """ - - def __init__(self, try_features, max_depth, distr_depth, sklearn_max, - bootstrap, random_state): - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.bootstrap = bootstrap - self.random_state = random_state - - self.n_features = None - self.n_classes = None - - self.tree = None - self.nodes_info = None - self.subtrees = None - - def fit(self, dataset): - """Fits the DecisionTreeClassifier. - - Parameters - ---------- - dataset : dislib.classification.rf._data.RfDataset - - """ - - self.n_features = dataset.get_n_features() - self.n_classes = dataset.get_n_classes() - samples_path = dataset.samples_path - features_path = dataset.features_path - n_samples = dataset.get_n_samples() - y_codes = dataset.get_y_codes() - - seed = self.random_state.randint(np.iinfo(np.int32).max) - - sample, y_s = _sample_selection(n_samples, y_codes, self.bootstrap, - seed) - - self.tree = _Node() - self.nodes_info = [] - self.subtrees = [] - tree_traversal = [(self.tree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < self.distr_depth: - split = _split_node_wrapper(sample, self.n_features, y_s, - self.n_classes, self.try_features, - self.random_state, - samples_file=samples_path, - features_file=features_path) - node_info, left_group, y_l, right_group, y_r = split - compss_delete_object(sample) - compss_delete_object(y_s) - node.content = len(self.nodes_info) - self.nodes_info.append(node_info) - node.left = _Node() - node.right = _Node() - depth = depth + 1 - tree_traversal.append((node.right, right_group, y_r, depth)) - tree_traversal.append((node.left, left_group, y_l, depth)) - else: - subtree = _build_subtree_wrapper(sample, y_s, self.n_features, - self.max_depth - depth, - self.n_classes, - self.try_features, - self.sklearn_max, - self.random_state, - samples_path, features_path) - node.content = len(self.subtrees) - self.subtrees.append(subtree) - compss_delete_object(sample) - compss_delete_object(y_s) - self.nodes_info = _merge(*self.nodes_info) - - def predict(self, x_row): - """Predicts classes for the given samples using a fitted tree. - - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted : ndarray - An array with the predicted classes for the given samples. The - values are codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can - be a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, 'The decision tree is not fitted.' - - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch(x_row._blocks, self.tree, self.nodes_info, - i, subtree, self.distr_depth) - branch_predictions.append(pred) - return _merge_branches(None, *branch_predictions) - - def predict_proba(self, x_row): - """Predicts class probabilities for a row block using a fitted tree. - - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted_proba : ndarray - An array with the predicted probabilities for the given samples. - The shape is (len(subset.samples), self.n_classes), with the index - of the column being codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can be - a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, 'The decision tree is not fitted.' 
- - branch_predictions = [] - for i, subtree in enumerate(self.subtrees): - pred = _predict_branch_proba(x_row._blocks, self.tree, - self.nodes_info, i, subtree, - self.distr_depth, self.n_classes) - branch_predictions.append(pred) - return _merge_branches(self.n_classes, *branch_predictions) - - -class _Node: - - def __init__(self): - self.content = None - self.left = None - self.right = None - - def predict(self, sample): - node_content = self.content - if isinstance(node_content, _LeafInfo): - return np.full((len(sample),), node_content.mode) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - return node_content.sk_tree.predict(sample) - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample),), dtype=np.int64) - left_mask = sample[:, node_content.index] <= node_content.value - pred[left_mask] = self.left.predict(sample[left_mask]) - pred[~left_mask] = self.right.predict(sample[~left_mask]) - return pred - assert len(sample) == 0, 'Type not supported' - return np.empty((0,), dtype=np.int64) - - def predict_proba(self, sample, n_classes): - node_content = self.content - if isinstance(node_content, _LeafInfo): - single_pred = node_content.frequencies / node_content.size - return np.tile(single_pred, (len(sample), 1)) - if isinstance(node_content, _SkTreeWrapper): - if len(sample) > 0: - sk_tree_pred = node_content.sk_tree.predict_proba(sample) - pred = np.zeros((len(sample), n_classes), dtype=np.float64) - pred[:, node_content.sk_tree.classes_] = sk_tree_pred - return pred - if isinstance(node_content, _InnerNodeInfo): - pred = np.empty((len(sample), n_classes), dtype=np.float64) - l_msk = sample[:, node_content.index] <= node_content.value - pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) - pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) - return pred - assert len(sample) == 0, 'Type not supported' - return np.empty((0, n_classes), dtype=np.float64) - - -class _InnerNodeInfo: - def __init__(self, index=None, value=None): - self.index = index - self.value = value - - -class _LeafInfo: - def __init__(self, size=None, frequencies=None, mode=None): - self.size = size - self.frequencies = frequencies - self.mode = mode - - -class _SkTreeWrapper: - def __init__(self, tree): - self.sk_tree = tree - self.classes = tree.classes_ - - -def _get_sample_attributes(samples_file, indices): - samples_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False) - x = samples_mmap[indices] - return x - - -def _get_feature_mmap(features_file, i): - return _get_features_mmap(features_file)[i] - - -def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode='r', allow_pickle=False) - - -@task(priority=True, returns=2) -def _sample_selection(n_samples, y_codes, bootstrap, seed): - if bootstrap: - random_state = RandomState(seed) - selection = random_state.choice(n_samples, size=n_samples, - replace=True) - selection.sort() - return selection, y_codes[selection] - else: - return np.arange(n_samples), y_codes - - -def _feature_selection(untried_indices, m_try, random_state): - selection_len = min(m_try, len(untried_indices)) - return random_state.choice(untried_indices, size=selection_len, - replace=False) - - -def _get_groups(sample, y_s, features_mmap, index, value): - if index is None: - empty_sample = np.array([], dtype=np.int64) - empty_labels = np.array([], dtype=np.int8) - return sample, y_s, empty_sample, empty_labels - feature = features_mmap[index][sample] - mask = feature < value - left = sample[mask] - 
right = sample[~mask] - y_l = y_s[mask] - y_r = y_s[~mask] - return left, y_l, right, y_r - - -def _compute_leaf_info(y_s, n_classes): - frequencies = np.bincount(y_s, minlength=n_classes) - mode = np.argmax(frequencies) - return _LeafInfo(len(y_s), frequencies, mode) - - -def _split_node_wrapper(sample, n_features, y_s, n_classes, m_try, - random_state, samples_file=None, features_file=None): - seed = random_state.randint(np.iinfo(np.int32).max) - - if features_file is not None: - return _split_node_using_features(sample, n_features, y_s, n_classes, - m_try, features_file, seed) - elif samples_file is not None: - return _split_node(sample, n_features, y_s, n_classes, m_try, - samples_file, seed) - else: - raise ValueError('Invalid combination of arguments. samples_file is ' - 'None and features_file is None.') - - -@task(features_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node_using_features(sample, n_features, y_s, n_classes, m_try, - features_file, seed): - features_mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) - random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) - - -@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node(sample, n_features, y_s, n_classes, m_try, samples_file, seed): - features_mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T - random_state = RandomState(seed) - return _compute_split(sample, n_features, y_s, n_classes, m_try, - features_mmap, random_state) - - -def _compute_split(sample, n_features, y_s, n_classes, m_try, features_mmap, - random_state): - node_info = left_group = y_l = right_group = y_r = None - split_ended = False - tried_indices = [] - while not split_ended: - untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) - index_selection = _feature_selection(untried_indices, m_try, - random_state) - b_score = float_info.max - b_index = None - b_value = None - for index in index_selection: - feature = features_mmap[index] - score, value = test_split(sample, y_s, feature, n_classes) - if score < b_score: - b_score, b_value, b_index = score, value, index - groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) - left_group, y_l, right_group, y_r = groups - if left_group.size and right_group.size: - split_ended = True - node_info = _InnerNodeInfo(b_index, b_value) - else: - tried_indices.extend(list(index_selection)) - if len(tried_indices) == n_features: - split_ended = True - node_info = _compute_leaf_info(y_s, n_classes) - left_group = sample - y_l = y_s - right_group = np.array([], dtype=np.int64) - y_r = np.array([], dtype=np.int8) - - return node_info, left_group, y_l, right_group, y_r - - -def _build_subtree_wrapper(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file): - seed = random_state.randint(np.iinfo(np.int32).max) - if features_file is not None: - return _build_subtree_using_features(sample, y_s, n_features, - max_depth, n_classes, m_try, - sklearn_max, seed, samples_file, - features_file) - else: - return _build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, seed, samples_file) - - -@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) -def _build_subtree_using_features(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, seed, - samples_file, features_file): - random_state = RandomState(seed) - return _compute_build_subtree(sample, 
y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file, features_file=features_file) - - -@task(samples_file=FILE_IN, returns=_Node) -def _build_subtree(sample, y_s, n_features, max_depth, n_classes, m_try, - sklearn_max, seed, samples_file): - random_state = RandomState(seed) - return _compute_build_subtree(sample, y_s, n_features, max_depth, - n_classes, m_try, sklearn_max, random_state, - samples_file) - - -def _compute_build_subtree(sample, y_s, n_features, max_depth, n_classes, - m_try, sklearn_max, random_state, samples_file, - features_file=None, use_sklearn=True): - if not sample.size: - return _Node() - if features_file is not None: - mmap = np.load(features_file, mmap_mode='r', allow_pickle=False) - else: - mmap = np.load(samples_file, mmap_mode='r', allow_pickle=False).T - subtree = _Node() - tree_traversal = [(subtree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < max_depth: - if use_sklearn and n_features * len(sample) <= sklearn_max: - if max_depth == np.inf: - sklearn_max_depth = None - else: - sklearn_max_depth = max_depth - depth - dt = SklearnDTClassifier(max_features=m_try, - max_depth=sklearn_max_depth, - random_state=random_state) - unique = np.unique(sample, return_index=True, - return_counts=True) - sample, new_indices, sample_weight = unique - x = _get_sample_attributes(samples_file, sample) - y_s = y_s[new_indices] - dt.fit(x, y_s, sample_weight=sample_weight, check_input=False) - node.content = _SkTreeWrapper(dt) - else: - split = _compute_split(sample, n_features, y_s, n_classes, - m_try, mmap, random_state) - node_info, left_group, y_l, right_group, y_r = split - node.content = node_info - if isinstance(node_info, _InnerNodeInfo): - node.left = _Node() - node.right = _Node() - tree_traversal.append((node.right, right_group, y_r, - depth + 1)) - tree_traversal.append((node.left, left_group, y_l, - depth + 1)) - else: - node.content = _compute_leaf_info(y_s, n_classes) - return subtree - - -@task(returns=list) -def _merge(*object_list): - return object_list - - -def _get_subtree_path(subtree_index, distr_depth): - if distr_depth == 0: - return '' - return bin(subtree_index)[2:].zfill(distr_depth) - - -def _get_predicted_indices(samples, tree, nodes_info, path): - idx_mask = np.full((len(samples),), True) - for direction in path: - node_info = nodes_info[tree.content] - if isinstance(node_info, _LeafInfo): - if direction == '1': - idx_mask[:] = 0 - else: - col = node_info.index - value = node_info.value - if direction == '0': - idx_mask[idx_mask] = samples[idx_mask, col] <= value - tree = tree.left - else: - idx_mask[idx_mask] = samples[idx_mask, col] > value - tree = tree.right - return idx_mask - - -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = subtree.predict(samples[indices_mask]) - return indices_mask, prediction - - -@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _predict_branch_proba(row_blocks, tree, nodes_info, subtree_index, subtree, - distr_depth, n_classes): - samples = Array._merge_blocks(row_blocks) - path = _get_subtree_path(subtree_index, distr_depth) - indices_mask = _get_predicted_indices(samples, tree, nodes_info, path) - prediction = 
subtree.predict_proba(samples[indices_mask], n_classes)
-    return indices_mask, prediction
-
-
-@task(returns=list)
-def _merge_branches(n_classes, *predictions):
-    samples_len = len(predictions[0][0])
-    if n_classes is not None:  # predict_proba
-        shape = (samples_len, n_classes)
-        dtype = np.float64
-    else:  # predict
-        shape = (samples_len,)
-        dtype = np.int64
-    merged_prediction = np.empty(shape, dtype=dtype)
-    for selected, prediction in predictions:
-        merged_prediction[selected] = prediction
-    return merged_prediction
diff --git a/dislib/classification/rf/test_split.py b/dislib/classification/rf/test_split.py
deleted file mode 100644
index 70922783..00000000
--- a/dislib/classification/rf/test_split.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from sys import float_info
-
-import numpy as np
-
-
-def gini_criteria_proxy(l_weight, l_length, r_weight, r_length, not_repeated):
-    """
-    Maximizing the Gini gain is equivalent to minimizing this proxy function.
-
-    """
-    return -(l_weight / l_length + r_weight / r_length) * not_repeated
-
-
-def test_split(sample, y_s, feature, n_classes):
-    size = y_s.shape[0]
-    if size == 0:
-        return float_info.max, np.float64(np.inf)
-
-    f = feature[sample]
-    sort_indices = np.argsort(f)
-    y_sorted = y_s[sort_indices]
-    f_sorted = f[sort_indices]
-
-    not_repeated = np.empty(size, dtype=np.bool_)
-    not_repeated[0: size - 1] = (f_sorted[1:] != f_sorted[:-1])
-    not_repeated[size - 1] = True
-
-    l_freq = np.zeros((n_classes, size), dtype=np.int64)
-    l_freq[y_sorted, np.arange(size)] = 1
-
-    r_freq = np.zeros((n_classes, size), dtype=np.int64)
-    r_freq[:, 1:] = l_freq[:, :0:-1]
-
-    l_weight = np.sum(np.square(np.cumsum(l_freq, axis=-1)), axis=0)
-    r_weight = np.sum(np.square(np.cumsum(r_freq, axis=-1)), axis=0)[::-1]
-
-    l_length = np.arange(1, size + 1, dtype=np.int32)
-    r_length = np.arange(size - 1, -1, -1, dtype=np.int32)
-    r_length[size - 1] = 1  # Avoid div by zero, the right score is 0 anyway
-
-    scores = gini_criteria_proxy(l_weight, l_length, r_weight, r_length,
-                                 not_repeated)
-
-    min_index = size - np.argmin(scores[::-1]) - 1
-
-    if min_index + 1 == size:
-        b_value = np.float64(np.inf)
-    else:
-        b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2
-    return scores[min_index], b_value
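The deleted classification test_split above scores every candidate cut of a sorted feature in one vectorized pass over cumulative class frequencies. A small standalone illustration of the intermediate quantities it builds, with made-up labels (plain NumPy, no dislib imports):

import numpy as np

# Toy labels, already sorted by the feature value (3 classes).
y_sorted = np.array([0, 0, 1, 2, 1])
n_classes, size = 3, y_sorted.shape[0]

# One-hot matrix: l_freq[c, i] == 1 iff sample i has class c.
l_freq = np.zeros((n_classes, size), dtype=np.int64)
l_freq[y_sorted, np.arange(size)] = 1

# Cumulative sums give, for every cut position i, the class counts of the
# left branch (samples 0..i); the sum of squared counts over classes is the
# weighted left term fed to gini_criteria_proxy.
left_counts = np.cumsum(l_freq, axis=-1)
l_weight = np.sum(np.square(left_counts), axis=0)
print(left_counts[:, 2])  # class counts left of cut 2 -> [2 1 0]
print(l_weight)           # [1 4 5 6 9]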
diff --git a/dislib/regression/rf/__init__.py b/dislib/regression/rf/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dislib/regression/rf/_data.py b/dislib/regression/rf/_data.py
deleted file mode 100644
index 360f705d..00000000
--- a/dislib/regression/rf/_data.py
+++ /dev/null
@@ -1,279 +0,0 @@
-import tempfile
-
-import numpy as np
-from numpy.lib import format
-from pycompss.api.parameter import (
-    FILE_IN,
-    FILE_INOUT,
-    COLLECTION_IN,
-    Depth,
-    Type,
-)
-from pycompss.api.task import task
-
-from dislib.data.array import Array
-
-
-class RfDataset(object):
-    """Dataset format used by the fit() of the RandomForestRegressor.
-
-    The RfDataset contains a file path for the samples and another one for
-    the targets. Optionally, a path can be provided for a transposed version
-    of the samples matrix, i.e., the features.
-
-    Note: For a representation of a dataset distributed in multiple files,
-    use dislib.data.Dataset instead.
-
-    Parameters
-    ----------
-    samples_path : str
-        Path of the .npy file containing the 2-d array of samples. It can
-        be a pycompss.runtime.Future object. If so, self.n_samples and
-        self.n_features must be set manually (they can also be
-        pycompss.runtime.Future objects).
-    targets_path : str
-        Path of the .dat file containing the 1-d array of targets. It can
-        be a pycompss.runtime.Future object.
-    features_path : str, optional (default=None)
-        Path of the .npy file containing the 2-d array of samples
-        transposed. The array must be C-ordered. Providing this array may
-        improve the performance as it allows sequential access to the
-        features.
-
-    Attributes
-    ----------
-    n_samples : int
-        The number of samples of the dataset. It can be a
-        pycompss.runtime.Future object.
-    n_features : int
-        The number of features of the dataset. It can be a
-        pycompss.runtime.Future object.
-    y_targets : ndarray
-        The array of targets for this RfDataset. It can be a
-        pycompss.runtime.Future object.
-
-    """
-
-    def __init__(self, samples_path, targets_path, features_path=None):
-        self.samples_path = samples_path
-        self.targets_path = targets_path
-        self.features_path = features_path
-        self.n_samples = None
-        self.n_features = None
-
-        self.y_targets = None
-
-    def get_n_samples(self):
-        """Gets the number of samples obtained from the samples file.
-
-        Returns
-        -------
-        n_samples : int
-
-        Raises
-        ------
-        AssertionError
-            If self.n_samples is None and self.samples_path is not a string.
-        ValueError
-            If invalid content is encountered in the samples file.
-
-        """
-        if self.n_samples is None:
-            assert isinstance(self.samples_path, str), (
-                "self.n_samples must be set manually if self.samples_path "
-                "is a pycompss.runtime.Future object"
-            )
-            shape = _NpyFile(self.samples_path).get_shape()
-            if len(shape) != 2:
-                raise ValueError("Cannot read 2D array from the samples file.")
-            self.n_samples, self.n_features = shape
-        return self.n_samples
-
-    def get_n_features(self):
-        """Gets the number of features obtained from the samples file.
-
-        Returns
-        -------
-        n_features : int
-
-        Raises
-        ------
-        AssertionError
-            If self.n_features is None and self.samples_path is not a string.
-        ValueError
-            If invalid content is encountered in the samples file.
-
-        """
-        if self.n_features is None:
-            assert isinstance(self.samples_path, str), (
-                "self.n_features must be set manually if self.samples_path "
-                "is a pycompss.runtime.Future object"
-            )
-            shape = _NpyFile(self.samples_path).get_shape()
-            if len(shape) != 2:
-                raise ValueError("Cannot read 2D array from the samples file.")
-            self.n_samples, self.n_features = shape
-        return self.n_features
-
-    def get_y_targets(self):
-        """Obtains the array of targets.
-
-        Returns
-        -------
-        y_targets : ndarray
-
-        """
-        if self.y_targets is None:
-            targets = _get_targets(self.targets_path)
-            self.y_targets = targets
-        return self.y_targets
-
-    def validate_features_file(self):
-        """Validates the features file header information.
-
-        Raises
-        ------
-        ValueError
-            If the shape of the array in the features_file doesn't match
-            this class' n_samples and n_features, or if the array is in
-            Fortran order.
-
-        """
-        features_npy_file = _NpyFile(self.features_path)
-        shape = features_npy_file.get_shape()
-        fortran_order = features_npy_file.get_fortran_order()
-        if len(shape) != 2:
-            raise ValueError("Cannot read 2D array from features_file.")
-        if (self.get_n_features(), self.get_n_samples()) != shape:
-            raise ValueError("Invalid dimensions for the features_file.")
-        if fortran_order:
-            raise ValueError("Fortran order not supported for features array.")
-
-
-def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset:
-    """Creates a RfDataset object from samples x and targets y.
- - This function creates a dislib.regression.rf.data.RfDataset by saving - x and y in files. - - Parameters - ---------- - x : ds-array, shape = (n_samples, n_features) - The training input samples. - y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) - The target values. - - Returns - ------- - rf_dataset : dislib.regression.rf._data.RfDataset - - """ - n_samples = x.shape[0] - n_features = x.shape[1] - - samples_file = tempfile.NamedTemporaryFile( - mode="wb", prefix="tmp_rf_samples_", delete=False - ) - samples_path = samples_file.name - samples_file.close() - _allocate_samples_file(samples_path, n_samples, n_features) - - start_idx = 0 - row_blocks_iterator = x._iterator(axis=0) - top_row = next(row_blocks_iterator) - _fill_samples_file(samples_path, top_row._blocks, start_idx) - start_idx += x._top_left_shape[0] - for x_row in row_blocks_iterator: - _fill_samples_file(samples_path, x_row._blocks, start_idx) - start_idx += x._reg_shape[0] - - targets_file = tempfile.NamedTemporaryFile( - mode="w", prefix="tmp_rf_targets_", delete=False - ) - targets_path = targets_file.name - targets_file.close() - for y_row in y._iterator(axis=0): - _fill_targets_file(targets_path, y_row._blocks) - - rf_dataset = RfDataset(samples_path, targets_path) - rf_dataset.n_samples = n_samples - rf_dataset.n_features = n_features - return rf_dataset - - -class _NpyFile(object): - def __init__(self, path): - self.path = path - - self.shape = None - self.fortran_order = None - self.dtype = None - - def get_shape(self): - if self.shape is None: - self._read_header() - return self.shape - - def get_fortran_order(self): - if self.fortran_order is None: - self._read_header() - return self.fortran_order - - def get_dtype(self): - if self.dtype is None: - self._read_header() - return self.dtype - - def _read_header(self): - with open(self.path, "rb") as fp: - version = format.read_magic(fp) - try: - format._check_version(version) - except ValueError: - raise ValueError("Invalid file format.") - header_data = format._read_array_header(fp, version) - self.shape, self.fortran_order, self.dtype = header_data - - -@task(targets_path=FILE_IN, returns=1) -def _get_targets(targets_path): - y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8") - return y - - -@task(returns=1) -def _get_samples_shape(subset): - return subset.samples.shape - - -@task(returns=3) -def _merge_shapes(*samples_shapes): - n_samples = 0 - n_features = samples_shapes[0][1] - for shape in samples_shapes: - n_samples += shape[0] - assert shape[1] == n_features, "Subsamples with different n_features." 
-    return samples_shapes, n_samples, n_features
-
-
-@task(samples_path=FILE_INOUT)
-def _allocate_samples_file(samples_path, n_samples, n_features):
-    np.lib.format.open_memmap(
-        samples_path,
-        mode="w+",
-        dtype="float32",
-        shape=(int(n_samples), int(n_features)),
-    )
-
-
-@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
-def _fill_samples_file(samples_path, row_blocks, start_idx):
-    rows_samples = Array._merge_blocks(row_blocks)
-    rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
-    samples = np.lib.format.open_memmap(samples_path, mode="r+")
-    samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples
-
-
-@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
-def _fill_targets_file(targets_path, row_blocks):
-    rows_targets = Array._merge_blocks(row_blocks)
-    with open(targets_path, "at") as f:
-        np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8")
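For context on the on-disk layout these helpers produce, here is a minimal standalone sketch (plain NumPy, no PyCOMPSs; the path and shapes are made up) of allocating a float32 samples file, filling it block by block, and reading back only the header, mirroring _allocate_samples_file, _fill_samples_file and _NpyFile above. Note it relies on the same private numpy.lib.format helpers as the deleted code:

import numpy as np

samples_path = "/tmp/tmp_rf_samples_demo.npy"  # illustrative path

# Allocate the full (n_samples, n_features) array on disk without
# materializing it in memory, as _allocate_samples_file does.
np.lib.format.open_memmap(
    samples_path, mode="w+", dtype="float32", shape=(6, 2)
)

# Fill it one row block at a time, as _fill_samples_file does.
start_idx = 0
for block in (np.arange(8).reshape(4, 2), np.ones((2, 2))):
    samples = np.lib.format.open_memmap(samples_path, mode="r+")
    samples[start_idx : start_idx + block.shape[0]] = block
    start_idx += block.shape[0]

# The header alone is enough to recover shape/order/dtype, as _NpyFile does.
with open(samples_path, "rb") as fp:
    version = np.lib.format.read_magic(fp)
    header = np.lib.format._read_array_header(fp, version)
print(header)  # ((6, 2), False, dtype('float32'))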
- - """ - - def __init__( - self, - try_features, - max_depth, - distr_depth, - sklearn_max, - bootstrap, - random_state, - ): - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.bootstrap = bootstrap - self.random_state = random_state - - self.n_features = None - - self.tree = None - self.nodes_info = None - self.subtrees = None - - def fit(self, dataset): - """Fits the DecisionTreeRegressor. - - Parameters - ---------- - dataset : dislib.classification.rf._data.RfDataset - - """ - - self.n_features = dataset.get_n_features() - samples_path = dataset.samples_path - features_path = dataset.features_path - n_samples = dataset.get_n_samples() - y_targets = dataset.get_y_targets() - - seed = self.random_state.randint(np.iinfo(np.int32).max) - - sample, y_s = _sample_selection( - n_samples, y_targets, self.bootstrap, seed - ) - - self.tree = _Node() - self.nodes_info = [] - self.subtrees = [] - tree_traversal = [(self.tree, sample, y_s, 0)] - while tree_traversal: - node, sample, y_s, depth = tree_traversal.pop() - if depth < self.distr_depth: - split = _split_node_wrapper( - sample, - self.n_features, - y_s, - self.try_features, - self.random_state, - samples_file=samples_path, - features_file=features_path, - ) - node_info, left_group, y_l, right_group, y_r = split - compss_delete_object(sample) - compss_delete_object(y_s) - node.content = len(self.nodes_info) - self.nodes_info.append(node_info) - node.left = _Node() - node.right = _Node() - depth = depth + 1 - tree_traversal.append((node.right, right_group, y_r, depth)) - tree_traversal.append((node.left, left_group, y_l, depth)) - else: - subtree = _build_subtree_wrapper( - sample, - y_s, - self.n_features, - self.max_depth - depth, - self.try_features, - self.sklearn_max, - self.random_state, - samples_path, - features_path, - ) - node.content = len(self.subtrees) - self.subtrees.append(subtree) - compss_delete_object(sample) - compss_delete_object(y_s) - self.nodes_info = _merge(*self.nodes_info) - - def predict(self, x_row): - """Predicts classes for the given samples using a fitted tree. - - Parameters - ---------- - x_row : ds-array - A row block of samples. - - Returns - ------- - predicted : ndarray - An array with the predicted classes for the given samples. The - values are codes of the fitted - dislib.classification.rf.data.RfDataset. The returned object can - be a pycompss.runtime.Future object. - - """ - - assert self.tree is not None, "The decision tree is not fitted." 
-
-    def predict(self, x_row):
-        """Predicts target values for the given samples using a fitted tree.
-
-        Parameters
-        ----------
-        x_row : ds-array
-            A row block of samples.
-
-        Returns
-        -------
-        predicted : ndarray
-            An array with the predicted target values for the given samples.
-            The returned object can be a pycompss.runtime.Future object.
-
-        """
-
-        assert self.tree is not None, "The decision tree is not fitted."
-
-        branch_predictions = []
-        for i, subtree in enumerate(self.subtrees):
-            pred = _predict_branch(
-                x_row._blocks,
-                self.tree,
-                self.nodes_info,
-                i,
-                subtree,
-                self.distr_depth,
-            )
-            branch_predictions.append(pred)
-        return _merge_branches(None, *branch_predictions)
-
-
-class _Node:
-    def __init__(self):
-        self.content = None
-        self.left = None
-        self.right = None
-
-    def predict(self, sample):
-        node_content = self.content
-        if isinstance(node_content, _LeafInfo):
-            return np.full((len(sample),), node_content.mean)
-        if isinstance(node_content, _SkTreeWrapper):
-            if len(sample) > 0:
-                return node_content.sk_tree.predict(sample)
-        if isinstance(node_content, _InnerNodeInfo):
-            pred = np.empty((len(sample),), dtype=np.float64)
-            left_mask = sample[:, node_content.index] <= node_content.value
-            pred[left_mask] = self.left.predict(sample[left_mask])
-            pred[~left_mask] = self.right.predict(sample[~left_mask])
-            return pred
-        assert len(sample) == 0, "Type not supported"
-        return np.empty((0,), dtype=np.float64)
-
-
-class _InnerNodeInfo:
-    def __init__(self, index=None, value=None):
-        self.index = index
-        self.value = value
-
-
-class _LeafInfo:
-    def __init__(self, size=None, mean=None):
-        self.size = size
-        self.mean = mean
-
-
-class _SkTreeWrapper:
-    def __init__(self, tree):
-        self.sk_tree = tree
-
-
-def _get_sample_attributes(samples_file, indices):
-    samples_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False)
-    x = samples_mmap[indices]
-    return x
-
-
-def _get_feature_mmap(features_file, i):
-    return _get_features_mmap(features_file)[i]
-
-
-def _get_features_mmap(features_file):
-    return np.load(features_file, mmap_mode="r", allow_pickle=False)
-
-
-@task(priority=True, returns=2)
-def _sample_selection(n_samples, y_targets, bootstrap, seed):
-    if bootstrap:
-        random_state = RandomState(seed)
-        selection = random_state.choice(
-            n_samples, size=n_samples, replace=True
-        )
-        selection.sort()
-        return selection, y_targets[selection]
-    else:
-        return np.arange(n_samples), y_targets
-
-
-def _feature_selection(untried_indices, m_try, random_state):
-    selection_len = min(m_try, len(untried_indices))
-    return random_state.choice(
-        untried_indices, size=selection_len, replace=False
-    )
-
-
-def _get_groups(sample, y_s, features_mmap, index, value):
-    if index is None:
-        empty_sample = np.array([], dtype=np.int64)
-        empty_target = np.array([], dtype=np.float64)
-        return sample, y_s, empty_sample, empty_target
-    feature = features_mmap[index][sample]
-    mask = feature < value
-    left = sample[mask]
-    right = sample[~mask]
-    y_l = y_s[mask]
-    y_r = y_s[~mask]
-    return left, y_l, right, y_r
-
-
-def _compute_leaf_info(y_s):
-    return _LeafInfo(len(y_s), np.mean(y_s))
-
-
-def _split_node_wrapper(
-    sample,
-    n_features,
-    y_s,
-    m_try,
-    random_state,
-    samples_file=None,
-    features_file=None,
-):
-    seed = random_state.randint(np.iinfo(np.int32).max)
-
-    if features_file is not None:
-        return _split_node_using_features(
-            sample, n_features, y_s, m_try, features_file, seed
-        )
-    elif samples_file is not None:
-        return _split_node(sample, n_features, y_s, m_try, samples_file, seed)
-    else:
-        raise ValueError(
-            "Invalid combination of arguments. samples_file is "
-            "None and features_file is None."
- ) - - -@task(features_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node_using_features( - sample, n_features, y_s, m_try, features_file, seed -): - features_mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) - random_state = RandomState(seed) - return _compute_split( - sample, n_features, y_s, m_try, features_mmap, random_state - ) - - -@task(samples_file=FILE_IN, returns=(object, list, list, list, list)) -def _split_node(sample, n_features, y_s, m_try, samples_file, seed): - features_mmap = np.load(samples_file, mmap_mode="r", allow_pickle=False).T - random_state = RandomState(seed) - return _compute_split( - sample, n_features, y_s, m_try, features_mmap, random_state - ) - - -def _compute_split( - sample, n_features, y_s, m_try, features_mmap, random_state -): - node_info = left_group = y_l = right_group = y_r = None - split_ended = False - tried_indices = [] - while not split_ended: - untried_indices = np.setdiff1d(np.arange(n_features), tried_indices) - index_selection = _feature_selection( - untried_indices, m_try, random_state - ) - b_score = float_info.max - b_index = None - b_value = None - for index in index_selection: - feature = features_mmap[index] - score, value = test_split(sample, y_s, feature) - if score < b_score: - b_score, b_value, b_index = score, value, index - groups = _get_groups(sample, y_s, features_mmap, b_index, b_value) - left_group, y_l, right_group, y_r = groups - if left_group.size and right_group.size: - split_ended = True - node_info = _InnerNodeInfo(b_index, b_value) - else: - tried_indices.extend(list(index_selection)) - if len(tried_indices) == n_features: - split_ended = True - node_info = _compute_leaf_info(y_s) - left_group = sample - y_l = y_s - right_group = np.array([], dtype=np.int64) - y_r = np.array([], dtype=np.float64) - - return node_info, left_group, y_l, right_group, y_r - - -def _build_subtree_wrapper( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - features_file, -): - seed = random_state.randint(np.iinfo(np.int32).max) - if features_file is not None: - return _build_subtree_using_features( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, - features_file, - ) - else: - return _build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, - ) - - -@task(samples_file=FILE_IN, features_file=FILE_IN, returns=_Node) -def _build_subtree_using_features( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, - features_file, -): - random_state = RandomState(seed) - return _compute_build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - features_file=features_file, - ) - - -@task(samples_file=FILE_IN, returns=_Node) -def _build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - seed, - samples_file, -): - random_state = RandomState(seed) - return _compute_build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - ) - - -def _compute_build_subtree( - sample, - y_s, - n_features, - max_depth, - m_try, - sklearn_max, - random_state, - samples_file, - features_file=None, - use_sklearn=True, -): - if not sample.size: - return _Node() - if features_file is not None: - mmap = np.load(features_file, mmap_mode="r", allow_pickle=False) - else: - mmap = np.load(samples_file, mmap_mode="r", 
allow_pickle=False).T
-    subtree = _Node()
-    tree_traversal = [(subtree, sample, y_s, 0)]
-    while tree_traversal:
-        node, sample, y_s, depth = tree_traversal.pop()
-        if depth < max_depth:
-            if use_sklearn and n_features * len(sample) <= sklearn_max:
-                if max_depth == np.inf:
-                    sklearn_max_depth = None
-                else:
-                    sklearn_max_depth = max_depth - depth
-                dt = SklearnDTRegressor(
-                    max_features=m_try,
-                    max_depth=sklearn_max_depth,
-                    random_state=random_state,
-                )
-                unique = np.unique(
-                    sample, return_index=True, return_counts=True
-                )
-                sample, new_indices, sample_weight = unique
-                x = _get_sample_attributes(samples_file, sample)
-                y_s = y_s[new_indices]
-                dt.fit(x, y_s, sample_weight=sample_weight, check_input=False)
-                node.content = _SkTreeWrapper(dt)
-            else:
-                split = _compute_split(
-                    sample,
-                    n_features,
-                    y_s,
-                    m_try,
-                    mmap,
-                    random_state,
-                )
-                node_info, left_group, y_l, right_group, y_r = split
-                node.content = node_info
-                if isinstance(node_info, _InnerNodeInfo):
-                    node.left = _Node()
-                    node.right = _Node()
-                    tree_traversal.append(
-                        (node.right, right_group, y_r, depth + 1)
-                    )
-                    tree_traversal.append(
-                        (node.left, left_group, y_l, depth + 1)
-                    )
-        else:
-            node.content = _compute_leaf_info(y_s)
-    return subtree
-
-
-@task(returns=list)
-def _merge(*object_list):
-    return object_list
-
-
-def _get_subtree_path(subtree_index, distr_depth):
-    if distr_depth == 0:
-        return ""
-    return bin(subtree_index)[2:].zfill(distr_depth)
-
-
-def _get_predicted_indices(samples, tree, nodes_info, path):
-    idx_mask = np.full((len(samples),), True)
-    for direction in path:
-        node_info = nodes_info[tree.content]
-        if isinstance(node_info, _LeafInfo):
-            if direction == "1":
-                idx_mask[:] = 0
-        else:
-            col = node_info.index
-            value = node_info.value
-            if direction == "0":
-                idx_mask[idx_mask] = samples[idx_mask, col] <= value
-                tree = tree.left
-            else:
-                idx_mask[idx_mask] = samples[idx_mask, col] > value
-                tree = tree.right
-    return idx_mask
-
-
-@task(row_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
-def _predict_branch(
-    row_blocks, tree, nodes_info, subtree_index, subtree, distr_depth
-):
-    samples = Array._merge_blocks(row_blocks)
-    path = _get_subtree_path(subtree_index, distr_depth)
-    indices_mask = _get_predicted_indices(samples, tree, nodes_info, path)
-    prediction = subtree.predict(samples[indices_mask])
-    return indices_mask, prediction
-
-
-@task(returns=list)
-def _merge_branches(n_classes, *predictions):
-    samples_len = len(predictions[0][0])
-    if n_classes is not None:  # predict_proba
-        shape = (samples_len, n_classes)
-        dtype = np.float64
-    else:  # predict
-        shape = (samples_len,)
-        dtype = np.float64
-    merged_prediction = np.empty(shape, dtype=dtype)
-    for selected, prediction in predictions:
-        merged_prediction[selected] = prediction
-    return merged_prediction
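To make the distributed prediction flow above concrete, here is a small self-contained sketch of how subtree indices map to binary root paths and how per-branch predictions are stitched back together with boolean masks, in the spirit of _get_subtree_path and _merge_branches; the masks and values are made up:

import numpy as np

def get_subtree_path(subtree_index, distr_depth):
    # Same rule as _get_subtree_path: the binary representation of the
    # subtree index encodes left (0) / right (1) turns from the root.
    if distr_depth == 0:
        return ""
    return bin(subtree_index)[2:].zfill(distr_depth)

# With distr_depth=2 there are four subtrees, one per root path.
assert [get_subtree_path(i, 2) for i in range(4)] == ["00", "01", "10", "11"]

# Each branch returns (mask, prediction for the masked samples); merging
# scatters every branch prediction back into a single output array.
predictions = [
    (np.array([True, False, True, False]), np.array([1.0, 3.0])),
    (np.array([False, True, False, True]), np.array([2.0, 4.0])),
]
merged = np.empty((4,), dtype=np.float64)
for mask, pred in predictions:
    merged[mask] = pred
print(merged)  # [1. 2. 3. 4.]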
diff --git a/dislib/regression/rf/forest.py b/dislib/regression/rf/forest.py
deleted file mode 100644
index faae07c8..00000000
--- a/dislib/regression/rf/forest.py
+++ /dev/null
@@ -1,236 +0,0 @@
-import math
-from collections import Counter
-
-import numpy as np
-from pycompss.api.api import compss_wait_on
-from pycompss.api.parameter import Type, COLLECTION_IN, Depth
-from pycompss.api.task import task
-from sklearn.base import BaseEstimator
-from sklearn.utils import check_random_state
-
-from dislib.regression.rf.decision_tree import DecisionTreeRegressor
-from dislib.data.array import Array
-from dislib.utils.base import _paired_partition
-from dislib.regression.rf._data import transform_to_rf_dataset
-
-
-class RandomForestRegressor(BaseEstimator):
-    """A distributed random forest regressor.
-
-    Parameters
-    ----------
-    n_estimators : int, optional (default=10)
-        Number of trees to fit.
-    try_features : int, str or None, optional (default='sqrt')
-        The number of features to consider when looking for the best split:
-
-        - If "sqrt", then `try_features=sqrt(n_features)`.
-        - If "third", then `try_features=n_features // 3`.
-        - If None, then `try_features=n_features`.
-
-        Note: the search for a split does not stop until at least one
-        valid partition of the node samples is found, even if it requires
-        to effectively inspect more than ``try_features`` features.
-    max_depth : int or np.inf, optional (default=np.inf)
-        The maximum depth of the tree. If np.inf, then nodes are expanded
-        until all leaves are pure.
-    distr_depth : int or str, optional (default='auto')
-        Number of levels of the tree in which the nodes are split in a
-        distributed way.
-    sklearn_max : int or float, optional (default=1e8)
-        Maximum size (len(subsample)*n_features) of the arrays passed to
-        sklearn's DecisionTreeRegressor.fit(), which is called to fit
-        subtrees (subsamples) of our DecisionTreeRegressor. sklearn fit() is
-        used because it's faster, but it requires loading the data into
-        memory, which can cause memory problems for large datasets. This
-        parameter can be adjusted to fit the hardware capabilities.
-    random_state : int, RandomState instance or None, optional (default=None)
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance
-        used by `np.random`.
-
-    Attributes
-    ----------
-    trees : list of DecisionTreeRegressor
-        List of the tree regressors of this forest, populated at fit().
-    """
-
-    def __init__(
-        self,
-        n_estimators=10,
-        try_features="sqrt",
-        max_depth=np.inf,
-        distr_depth="auto",
-        sklearn_max=1e8,
-        random_state=None,
-    ):
-        self.n_estimators = n_estimators
-        self.try_features = try_features
-        self.max_depth = max_depth
-        self.distr_depth = distr_depth
-        self.sklearn_max = sklearn_max
-        self.random_state = random_state
-
-    def fit(self, x, y):
-        """Fits the RandomForestRegressor.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The training input samples. Internally, its dtype will be
-            converted to ``dtype=np.float32``.
-        y : ds-array, shape=(n_samples, 1)
-            The target values.
-
-        Returns
-        -------
-        self : RandomForestRegressor
-
-        """
-        self.trees = []
-
-        dataset = transform_to_rf_dataset(x, y)
-
-        n_features = dataset.get_n_features()
-        try_features = _resolve_try_features(self.try_features, n_features)
-        random_state = check_random_state(self.random_state)
-
-        if self.distr_depth == "auto":
-            dataset.n_samples = compss_wait_on(dataset.get_n_samples())
-            distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4)
-            distr_depth = min(distr_depth, self.max_depth)
-        else:
-            distr_depth = self.distr_depth
-
-        for _ in range(self.n_estimators):
-            tree = DecisionTreeRegressor(
-                try_features,
-                self.max_depth,
-                distr_depth,
-                self.sklearn_max,
-                bootstrap=True,
-                random_state=random_state,
-            )
-            self.trees.append(tree)
-
-        for tree in self.trees:
-            tree.fit(dataset)
-
-        return self
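To make the try_features resolution and the distr_depth='auto' heuristic used in fit() concrete, a small standalone check (the helper mirrored here, _resolve_try_features, is defined further down in this file; the sample counts are illustrative):

import math

def resolve_try_features(try_features, n_features):
    # Same mapping as _resolve_try_features, without the @task decorator.
    if try_features is None:
        return n_features
    elif try_features == "sqrt":
        return int(math.sqrt(n_features))
    elif try_features == "third":
        return max(1, n_features // 3)
    else:
        return int(try_features)

assert resolve_try_features(None, 10) == 10
assert resolve_try_features("sqrt", 100) == 10
assert resolve_try_features("third", 10) == 3

# distr_depth="auto" grows with the dataset: max(0, int(log10(n)) - 4)
# distributed levels, capped by max_depth.
for n_samples in (10_000, 100_000, 1_000_000):
    print(n_samples, max(0, int(math.log10(n_samples)) - 4))
# -> 0, 1 and 2 distributed levels, respectively.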
-
-    def predict(self, x):
-        """Predicts target values using a fitted forest.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The input samples.
-
-        Returns
-        -------
-        y_pred : ds-array, shape=(n_samples, 1)
-            Predicted target values for x.
-
-        """
-        assert self.trees is not None, "The random forest is not fitted."
-        pred_blocks = []
-        for x_row in x._iterator(axis=0):
-            tree_predictions = []
-            for tree in self.trees:
-                tree_predictions.append(tree.predict(x_row))
-            pred_blocks.append(_join_predictions(*tree_predictions))
-
-        y_pred = Array(
-            blocks=[pred_blocks],
-            top_left_shape=(x._top_left_shape[0], 1),
-            reg_shape=(x._reg_shape[0], 1),
-            shape=(x.shape[0], 1),
-            sparse=False,
-        )
-
-        return y_pred
-
-    def score(self, x, y):
-        """R2 regression score.
-
-        Returns the coefficient of determination $R^2$ of
-        the prediction.
-        The coefficient $R^2$ is defined as $(1-u/v)$, where $u$
-        is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and
-        $v$ is the total sum of squares
-        `((y_true - y_true.mean()) ** 2).sum()`.
-        The best possible score is 1.0 and it can be negative
-        (because the model can be arbitrarily worse).
-        A constant model that always predicts the expected value of y,
-        disregarding the input features, would get an $R^2$ score of 0.0.
-
-        Parameters
-        ----------
-        x : ds-array, shape=(n_samples, n_features)
-            The training input samples.
-        y : ds-array, shape=(n_samples, 1)
-            The true values.
-
-        Returns
-        -------
-        score : float (as future object)
-            Coefficient of determination $R^2$.
-
-        """
-        assert self.trees is not None, "The random forest is not fitted."
-        partial_scores = []
-        for x_row, y_row in _paired_partition(x, y):
-            tree_predictions = []
-            for tree in self.trees:
-                tree_predictions.append(tree.predict(x_row))
-            subset_score = _partial_score(y_row._blocks, *tree_predictions)
-            partial_scores.append(subset_score)
-
-        return _merge_scores(*partial_scores)
-
-
-@task(returns=1)
-def _resolve_try_features(try_features, n_features):
-    if try_features is None:
-        return n_features
-    elif try_features == "sqrt":
-        return int(math.sqrt(n_features))
-    elif try_features == "third":
-        return max(1, n_features // 3)
-    else:
-        return int(try_features)
-
-
-@task(returns=1)
-def _join_predictions(*predictions):
-    aggregate = predictions[0]
-    for p in predictions[1:]:
-        aggregate += p
-    target = aggregate / len(predictions)
-    return target
-
-
-@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1)
-def _partial_score(y_blocks, *predictions):
-    y_true = Array._merge_blocks(y_blocks).flatten()
-    y_pred = np.mean(predictions, axis=0)
-    n_samples = y_true.shape[0]
-    y_avg = np.mean(y_true)
-    u_partial = np.sum(np.square(y_true - y_pred), axis=0)
-    v_partial = np.sum(np.square(y_true - y_avg), axis=0)
-    return u_partial, v_partial, y_avg, n_samples
-
-
-@task(returns=1)
-def _merge_scores(*partial_scores):
-    u = v = avg = n = 0
-    for u_p, v_p, avg_p, n_p in partial_scores:
-        u += u_p
-
-        delta = avg_p - avg
-        avg += delta * n_p / (n + n_p)
-        v += v_p + delta ** 2 * n * n_p / (n + n_p)
-        n += n_p
-
-    return 1 - u / v
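The pairwise update in _merge_scores is the standard streaming combination of per-partition residual sums, centered sums of squares and means; a quick NumPy check of the idea against the direct R2 computation, with made-up partitions:

import numpy as np

def merge_scores(*partial_scores):
    # Mirrors _merge_scores: combine partial residual sums (u), partial
    # sums of squares around each partition mean (v), means and counts.
    u = v = avg = n = 0
    for u_p, v_p, avg_p, n_p in partial_scores:
        u += u_p
        delta = avg_p - avg
        avg += delta * n_p / (n + n_p)
        v += v_p + delta ** 2 * n * n_p / (n + n_p)
        n += n_p
    return 1 - u / v

y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1, 5.9])

partials = []
for sl in (slice(0, 3), slice(3, 6)):  # two toy partitions
    yt, yp = y_true[sl], y_pred[sl]
    u_p = np.sum(np.square(yt - yp))
    v_p = np.sum(np.square(yt - np.mean(yt)))
    partials.append((u_p, v_p, np.mean(yt), yt.shape[0]))

direct = 1 - np.sum(np.square(y_true - y_pred)) / np.sum(
    np.square(y_true - np.mean(y_true))
)
assert np.isclose(merge_scores(*partials), direct)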
- - """ - return -(l_weight / l_length + r_weight / r_length) * not_repeated - - -def test_split(sample, y_s, feature): - size = y_s.shape[0] - if size == 0: - return float_info.max, np.float64(np.inf) - - f = feature[sample] - sort_indices = np.argsort(f) - y_sorted = y_s[sort_indices] - f_sorted = f[sort_indices] - - # Threshold value must not be that value of a sample - not_repeated = np.empty(size, dtype=np.bool_) - not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1] - not_repeated[size - 1] = True - - # Square of the sum of the y values of each branch - r_weight = np.zeros(size) - l_weight = np.square(np.cumsum(y_sorted, axis=-1)) - r_weight[:-1] = np.square(np.cumsum(y_sorted[::-1], axis=-1)[-2::-1]) - - # Number of samples of each branch - l_length = np.arange(1, size + 1, dtype=np.int32) - r_length = np.arange(size - 1, -1, -1, dtype=np.int32) - r_length[size - 1] = 1 # Avoid div by zero, the right score is 0 anyways - - scores = mse_criteria_proxy( - l_weight, l_length, r_weight, r_length, not_repeated - ) - - min_index = size - np.argmin(scores[::-1]) - 1 - if min_index + 1 == size: - b_value = np.float64(np.inf) - else: - b_value = (f_sorted[min_index] + f_sorted[min_index + 1]) / 2 - return scores[min_index], b_value From f99f61bbd097abd02dc888e5b0a8b6b0aa452089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 26 Jul 2021 18:15:57 +0200 Subject: [PATCH 36/46] Edited tests. --- dislib/utils/saving.py | 11 +- tests/test_saving.py | 31 ++ tests/test_saving_cbor.py | 555 +------------------------------ tests/test_saving_json.py | 667 ++++---------------------------------- 4 files changed, 122 insertions(+), 1142 deletions(-) create mode 100644 tests/test_saving.py diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index f0b8313c..620cc90a 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -16,9 +16,12 @@ import dislib.recommendation import dislib.regression from dislib.data.array import Array -from dislib.classification.rf.decision_tree import ( +from dislib.commons.rf._decision_tree import ( DecisionTreeClassifier, + DecisionTreeRegressor, _Node, + _ClassificationNode, + _RegressionNode, _InnerNodeInfo, _LeafInfo, _SkTreeWrapper, @@ -44,7 +47,10 @@ DISLIB_CLASSES = { "KMeans": dislib.cluster.KMeans, "DecisionTreeClassifier": DecisionTreeClassifier, + "DecisionTreeRegressor": DecisionTreeRegressor, "_Node": _Node, + "_ClassificationNode": _ClassificationNode, + "_RegressionNode": _RegressionNode, "_InnerNodeInfo": _InnerNodeInfo, "_LeafInfo": _LeafInfo, "_SkTreeWrapper": _SkTreeWrapper, @@ -347,6 +353,7 @@ def _sync_obj(obj): elif isinstance(obj, list): iterator = iter(enumerate(obj)) else: + print(obj) raise ValueError("Expected dict or list and received %s." % type(obj)) for key, val in iterator: @@ -358,7 +365,7 @@ def _sync_obj(obj): raise TypeError( "Could not synchronize Future (%s, %s)." 
From f99f61bbd097abd02dc888e5b0a8b6b0aa452089 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Mon, 26 Jul 2021 18:15:57 +0200
Subject: [PATCH 36/46] Edited tests.

---
 dislib/utils/saving.py    |  11 +-
 tests/test_saving.py      |  31 ++
 tests/test_saving_cbor.py | 555 +------------------------------
 tests/test_saving_json.py | 667 ++++----------------------------------
 4 files changed, 122 insertions(+), 1142 deletions(-)
 create mode 100644 tests/test_saving.py

diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index f0b8313c..620cc90a 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -16,9 +16,12 @@ import dislib.recommendation
 import dislib.regression
 from dislib.data.array import Array
-from dislib.classification.rf.decision_tree import (
+from dislib.commons.rf._decision_tree import (
     DecisionTreeClassifier,
+    DecisionTreeRegressor,
     _Node,
+    _ClassificationNode,
+    _RegressionNode,
     _InnerNodeInfo,
     _LeafInfo,
     _SkTreeWrapper,
@@ -44,7 +47,10 @@ DISLIB_CLASSES = {
     "KMeans": dislib.cluster.KMeans,
     "DecisionTreeClassifier": DecisionTreeClassifier,
+    "DecisionTreeRegressor": DecisionTreeRegressor,
     "_Node": _Node,
+    "_ClassificationNode": _ClassificationNode,
+    "_RegressionNode": _RegressionNode,
     "_InnerNodeInfo": _InnerNodeInfo,
     "_LeafInfo": _LeafInfo,
     "_SkTreeWrapper": _SkTreeWrapper,
@@ -347,6 +353,7 @@ def _sync_obj(obj):
     elif isinstance(obj, list):
         iterator = iter(enumerate(obj))
     else:
+        print(obj)
         raise ValueError("Expected dict or list and received %s." % type(obj))

     for key, val in iterator:
@@ -358,7 +365,7 @@ def _sync_obj(obj):
             raise TypeError(
                 "Could not synchronize Future (%s, %s)." % (key, val)
             )
-        if hasattr(obj[key], "__dict__"):
+        if isinstance(getattr(obj[key], "__dict__", None), dict):
             _sync_obj(obj[key].__dict__)

diff --git a/tests/test_saving.py b/tests/test_saving.py
new file mode 100644
index 00000000..d1a8bb92
--- /dev/null
+++ b/tests/test_saving.py
@@ -0,0 +1,31 @@
+import unittest
+from unittest.mock import patch
+
+import numpy as np
+import sys
+
+from dislib.cluster import KMeans
+from dislib.utils import save_model, load_model
+
+
+class SavingTest(unittest.TestCase):
+    filepath = "tests/files/saving/kmeans.json"
+
+    def test_errors(self):
+        """Test that errors are raised"""
+        km = KMeans(n_clusters=2, verbose=False)
+
+        with patch("dislib.utils.saving.cbor2", None):
+            self.assertRaises(
+                ModuleNotFoundError,
+                save_model,
+                km, self.filepath, save_format="cbor",
+            )
+
+
+def main():
+    unittest.main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py
index 64cd534a..c8efd336 100644
--- a/tests/test_saving_cbor.py
+++ b/tests/test_saving_cbor.py
@@ -25,32 +25,6 @@ class KMeansSavingTestCBOR(unittest.TestCase):
     filepath = "tests/files/saving/kmeans.cbor"

-    def test_init_params_kmeans(self):
-        """Tests that KMeans correctly sets the initialization
-        parameters"""
-        n_clusters = 2
-        max_iter = 1
-        tol = 1e-4
-        seed = 666
-        arity = 2
-        init = "random"
-
-        km = KMeans(
-            n_clusters=n_clusters,
-            max_iter=max_iter,
-            tol=tol,
-            arity=arity,
-            random_state=seed,
-        )
-        save_model(km, self.filepath, save_format="cbor")
-        km2 = load_model(self.filepath, load_format="cbor")
-
-        expected = (n_clusters, init, max_iter, tol, arity)
-        real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity)
-        real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, km2.arity)
-        self.assertEqual(expected, real)
-        self.assertEqual(expected, real2)
-
     def test_fit_kmeans(self):
         """Tests that the fit method returns the expected centers using toy
         data.
@@ -94,35 +68,6 @@ def test_predict_kmeans(self): self.assertTrue(np.array_equal(labels, expected_labels)) self.assertTrue(np.array_equal(labels2, expected_labels)) - def test_fit_predict_kmeans(self): - """Tests fit_predict.""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - - x_train = ds.array(x_filtered, block_size=(300, 2)) - - kmeans = KMeans(n_clusters=3, random_state=170) - labels = kmeans.fit_predict(x_train).collect() - - save_model(kmeans, self.filepath, save_format="cbor") - kmeans = load_model(self.filepath, load_format="cbor") - - skmeans = SKMeans(n_clusters=3, random_state=170) - sklabels = skmeans.fit_predict(x_filtered) - - centers = np.array( - [ - [-8.941375656533449, -5.481371322614891], - [-4.524023204953875, 0.06235042593214654], - [2.332994701667008, 0.37681003933082696], - ] - ) - - self.assertTrue(np.allclose(centers, kmeans.centers)) - self.assertTrue(np.allclose(labels, sklabels)) - def test_sparse_kmeans(self): """Tests K-means produces the same results using dense and sparse data structures.""" @@ -192,67 +137,6 @@ def test_init_kmeans(self): class GaussianMixtureSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/gm.cbor" - def test_init_params(self): - """Tests that GaussianMixture params are set""" - n_components = 2 - covariance_type = "diag" - tol = 1e-4 - reg_covar = 1e-5 - max_iter = 3 - init_params = "random" - weights_init = np.array([0.4, 0.6]) - means_init = np.array([[0, 0], [2, 3]]) - precisions_init = "todo" - random_state = RandomState(666) - gm = GaussianMixture( - n_components=n_components, - covariance_type=covariance_type, - tol=tol, - reg_covar=reg_covar, - max_iter=max_iter, - init_params=init_params, - weights_init=weights_init, - means_init=means_init, - precisions_init=precisions_init, - random_state=random_state, - ) - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - - real = ( - gm.n_components, - gm.covariance_type, - gm.tol, - gm.reg_covar, - gm.max_iter, - gm.init_params, - gm.weights_init.tolist(), - gm.means_init.tolist(), - gm.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm.random_state.get_state() - ], - ) - real2 = ( - gm2.n_components, - gm2.covariance_type, - gm2.tol, - gm2.reg_covar, - gm2.max_iter, - gm2.init_params, - gm2.weights_init.tolist(), - gm2.means_init.tolist(), - gm2.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm2.random_state.get_state() - ], - ) - - self.assertEqual(real, real2) - def test_fit(self): """Tests GaussianMixture.fit()""" @@ -324,32 +208,6 @@ def test_predict(self): self.assertTrue(pred2[0] == pred2[2] == pred2[4]) self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - def test_fit_predict(self): - """Tests GaussianMixture.fit_predict()""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) - - ds_x = ds.array(x_filtered, block_size=(300, 2)) - - gm = GaussianMixture(n_components=3, random_state=170) - pred = gm.fit_predict(ds_x).collect() - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - - pred2 = gm2.predict(ds_x).collect() - - self.assertEqual(len(pred), 610) - accuracy = np.count_nonzero(pred == y_real) / len(pred) - self.assertGreater(accuracy, 0.99) - 
- self.assertEqual(len(pred2), 610) - accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) - self.assertGreater(accuracy2, 0.99) - def test_sparse(self): """Tests GaussianMixture produces the same results using dense and sparse data structures""" @@ -383,86 +241,10 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - def test_init_random(self): - """Tests GaussianMixture random initialization""" - x = ds.random_array((50, 3), (10, 3), random_state=0) - gm = GaussianMixture( - init_params="random", n_components=4, arity=2, random_state=170 - ) - gm.fit(x) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - self.assertGreater(gm.n_iter, 5) - self.assertGreater(gm2.n_iter, 5) - - def test_means_init_and_weights_init(self): - """Tests GaussianMixture means_init and weights_init parameters""" - x, _ = load_iris(return_X_y=True) - x_ds = ds.array(x, (75, 4)) - weights_init = [1 / 3, 1 / 3, 1 / 3] - means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) - gm = GaussianMixture( - random_state=0, - n_components=3, - weights_init=weights_init, - means_init=means_init, - ) - gm.fit(x_ds) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - self.assertTrue(gm.converged_) - self.assertTrue(gm2.converged_) - class CSVMSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/csvm.cbor" - def test_init_params(self): - """Test constructor parameters""" - cascade_arity = 3 - max_iter = 1 - tol = 1e-4 - kernel = "rbf" - c = 2 - gamma = 0.1 - check_convergence = True - seed = 666 - verbose = False - - csvm = CascadeSVM( - cascade_arity=cascade_arity, - max_iter=max_iter, - tol=tol, - kernel=kernel, - c=c, - gamma=gamma, - check_convergence=check_convergence, - random_state=seed, - verbose=verbose, - ) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - self.assertEqual(csvm.cascade_arity, cascade_arity) - self.assertEqual(csvm.max_iter, max_iter) - self.assertEqual(csvm.tol, tol) - self.assertEqual(csvm.kernel, kernel) - self.assertEqual(csvm.c, c) - self.assertEqual(csvm.gamma, gamma) - self.assertEqual(csvm.check_convergence, check_convergence) - self.assertEqual(csvm.random_state, seed) - self.assertEqual(csvm.verbose, verbose) - - self.assertEqual(csvm2.cascade_arity, cascade_arity) - self.assertEqual(csvm2.max_iter, max_iter) - self.assertEqual(csvm2.tol, tol) - self.assertEqual(csvm2.kernel, kernel) - self.assertEqual(csvm2.c, c) - self.assertEqual(csvm2.gamma, gamma) - self.assertEqual(csvm2.check_convergence, check_convergence) - self.assertEqual(csvm2.random_state, seed) - self.assertEqual(csvm2.verbose, verbose) - def test_fit_private_params(self): kernel = "rbf" c = 2 @@ -495,51 +277,6 @@ def test_fit_private_params(self): # # check for exception when incorrect kernel is passed # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - def test_fit(self): - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=5, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - 
self.assertTrue(csvm.converged) - self.assertTrue(csvm2.converged) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=1, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - self.assertFalse(csvm.converged) - self.assertEqual(csvm.iterations, 1) - self.assertFalse(csvm2.converged) - self.assertEqual(csvm2.iterations, 1) - def test_predict(self): seed = 666 @@ -617,60 +354,6 @@ def test_score(self): self.assertEqual(accuracy, 1.0) self.assertEqual(accuracy2, 1.0) - def test_decision_func(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - # all points are in the x-axis - p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - # p1 should be equidistant to p3, and p2 to p4 - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d1, d2, d3, d4 = y_pred.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - d1, d2, d3, d4 = y_pred2.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - - # p5 and p6 should be in the decision function (distance=0) - p5, p6 = np.array([1, 0]), np.array([-1, 0]) - - x_test = ds.array(np.array([p5, p6]), (1, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d5, d6 = y_pred.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - d5, d6 = y_pred2.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - def test_sparse(self): """Tests that C-SVM produces the same results with sparse and dense data""" @@ -708,36 +391,6 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - def test_duplicates(self): - """Tests that C-SVM does not generate duplicate support vectors""" - x = ds.array( - np.array( - [ - [0, 1], - [1, 1], - [0, 1], - [1, 2], - [0, 0], - [2, 2], - [2, 1], - [1, 0], - ] - ), - (2, 2), - ) - - y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - csvm._collect_clf() - csvm2._collect_clf() - self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) - self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) - class RFSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/rf.cbor" @@ -757,8 +410,8 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = 
ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) @@ -785,8 +438,8 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) @@ -800,35 +453,6 @@ def test_make_classification_predict_and_distr_depth(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_fit_predict(self): - """Tests RandomForestClassifier fit_predict with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_train).collect() - y_pred2 = rf2.predict(x_train).collect() - y_train = y_train.collect() - accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) - accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - def test_make_classification_sklearn_max_predict(self): """Tests RandomForestClassifier predict with sklearn_max.""" x, y = make_classification( @@ -844,8 +468,8 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -874,8 +498,8 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -908,8 +532,8 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True @@ -941,8 +565,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -960,28 +584,6 @@ def 
test_make_classification_hard_vote_score_mix(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_iris(self): - """Tests RandomForestClassifier with a minimal example.""" - x, y = datasets.load_iris(return_X_y=True) - ds_fit = ds.array(x[::2], block_size=(30, 2)) - fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) - ds_validate = ds.array(x[1::2], block_size=(30, 2)) - validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) - - rf = RandomForestClassifier( - n_estimators=1, max_depth=1, random_state=0 - ) - rf.fit(ds_fit, fit_y) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) - accuracy2 = compss_wait_on(rf2.score(ds_validate, validate_y)) - - # Accuracy should be <= 2/3 for any seed, often exactly equal. - self.assertAlmostEqual(accuracy, 2 / 3) - self.assertAlmostEqual(accuracy2, 2 / 3) - class LassoSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/lasso.cbor" @@ -1005,7 +607,7 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] + X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] lasso = Lasso(lmbd=0.1, max_iter=50) @@ -1175,100 +777,6 @@ def test_multivariate_no_intercept(self): np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) ) - def test_multivariate_multiobjective(self): - """Tests fit() and predict(), multivariate, multiobjective.""" - x_data = np.array( - [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]] - ) - y_data = np.array( - [ - [2, 0, 3], - [1, 5, 2], - [1, 3, 4], - [2, 7, 9], - [4.5, -1, 4], - [0, 0, 0], - ] - ) - - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") - - # Predict one sample - x_test = np.array([3, 2, 1]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906])) - self.assertTrue( - np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906]) - ) - - # Predict multiple samples - x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue( - np.allclose( - pred, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - self.assertTrue( - np.allclose( - pred2, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - - # Check attributes values - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - self.assertTrue( - np.allclose( 
- reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - def load_movielens(train_ratio=0.9): file = "tests/files/sample_movielens_ratings.csv" @@ -1307,43 +815,6 @@ def load_movielens(train_ratio=0.9): class ALSSavingTestCBOR(unittest.TestCase): filepath = "tests/files/saving/als.cbor" - def test_init_params(self): - # Test all parameters - seed = 666 - n_f = 100 - lambda_ = 0.001 - convergence_threshold = 0.1 - max_iter = 10 - verbose = True - arity = 12 - - als = ALS( - random_state=seed, - n_f=n_f, - lambda_=lambda_, - tol=convergence_threshold, - max_iter=max_iter, - verbose=verbose, - arity=arity, - ) - save_model(als, self.filepath, save_format="cbor") - als2 = load_model(self.filepath, load_format="cbor") - - self.assertEqual(als.random_state, seed) - self.assertEqual(als.n_f, n_f) - self.assertEqual(als.lambda_, lambda_) - self.assertEqual(als.tol, convergence_threshold) - self.assertEqual(als.max_iter, max_iter) - self.assertEqual(als.verbose, verbose) - self.assertEqual(als.arity, arity) - self.assertEqual(als2.random_state, seed) - self.assertEqual(als2.n_f, n_f) - self.assertEqual(als2.lambda_, lambda_) - self.assertEqual(als2.tol, convergence_threshold) - self.assertEqual(als2.max_iter, max_iter) - self.assertEqual(als2.verbose, verbose) - self.assertEqual(als2.arity, arity) - def test_fit(self): train, test = load_movielens() diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index be18474d..1488d83c 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -25,32 +25,6 @@ class KMeansSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/kmeans.json" - def test_init_params_kmeans(self): - """Tests that saved and loaded KMeans object correctly sets the initialization - parameters""" - n_clusters = 2 - max_iter = 1 - tol = 1e-4 - seed = 666 - arity = 2 - init = "random" - - km = KMeans( - n_clusters=n_clusters, - max_iter=max_iter, - tol=tol, - arity=arity, - random_state=seed, - ) - save_model(km, self.filepath) - km2 = load_model(self.filepath) - - expected = (n_clusters, init, max_iter, tol, arity) - real = (km.n_clusters, km.init, km.max_iter, km.tol, km.arity) - real2 = (km2.n_clusters, km2.init, km2.max_iter, km2.tol, km2.arity) - self.assertEqual(expected, real) - self.assertEqual(expected, real2) - def test_fit_kmeans(self): """Tests that the fit method returns the expected centers using toy data. 
@@ -63,8 +37,8 @@ def test_fit_kmeans(self): expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") self.assertTrue((km.centers == expected_centers).all()) self.assertTrue((km2.centers == expected_centers).all()) @@ -79,8 +53,8 @@ def test_predict_kmeans(self): km = KMeans(n_clusters=2, random_state=666) km.fit(x) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") p5, p6 = [10, 10], [-10, -10] @@ -94,35 +68,6 @@ def test_predict_kmeans(self): self.assertTrue(np.array_equal(labels, expected_labels)) self.assertTrue(np.array_equal(labels2, expected_labels)) - def test_fit_predict_kmeans(self): - """Tests fit_predict.""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - - x_train = ds.array(x_filtered, block_size=(300, 2)) - - kmeans = KMeans(n_clusters=3, random_state=170) - labels = kmeans.fit_predict(x_train).collect() - - save_model(kmeans, self.filepath) - kmeans = load_model(self.filepath) - - skmeans = SKMeans(n_clusters=3, random_state=170) - sklabels = skmeans.fit_predict(x_filtered) - - centers = np.array( - [ - [-8.941375656533449, -5.481371322614891], - [-4.524023204953875, 0.06235042593214654], - [2.332994701667008, 0.37681003933082696], - ] - ) - - self.assertTrue(np.allclose(centers, kmeans.centers)) - self.assertTrue(np.allclose(labels, sklabels)) - def test_sparse_kmeans(self): """Tests K-means produces the same results using dense and sparse data structures.""" @@ -134,8 +79,8 @@ def test_sparse_kmeans(self): kmeans = KMeans(random_state=170) kmeans.fit(x_sp) - save_model(kmeans, self.filepath) - kmeans2 = load_model(self.filepath) + save_model(kmeans, self.filepath, save_format="json") + kmeans2 = load_model(self.filepath, load_format="json") y_sparse = kmeans.predict(x_sp).collect() y_sparse2 = kmeans2.predict(x_sp).collect() @@ -165,8 +110,8 @@ def test_init_kmeans(self): km = KMeans(n_clusters=5, init=init) km.fit(x_train) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") self.assertTrue(np.array_equal(km.init, init)) self.assertTrue(np.array_equal(km2.init, init)) @@ -180,8 +125,8 @@ def test_init_kmeans(self): km = KMeans(n_clusters=5, init=init) km.fit(x_sp) - save_model(km, self.filepath) - km2 = load_model(self.filepath) + save_model(km, self.filepath, save_format="json") + km2 = load_model(self.filepath, load_format="json") self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) @@ -192,67 +137,6 @@ def test_init_kmeans(self): class GaussianMixtureSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/gm.json" - def test_init_params(self): - """Tests that GaussianMixture params are set""" - n_components = 2 - covariance_type = "diag" - tol = 1e-4 - reg_covar = 1e-5 - max_iter = 3 - init_params = "random" - weights_init = np.array([0.4, 0.6]) - means_init = np.array([[0, 0], [2, 3]]) - precisions_init = "todo" - random_state = RandomState(666) - gm = GaussianMixture( - n_components=n_components, - covariance_type=covariance_type, - tol=tol, - reg_covar=reg_covar, - 
max_iter=max_iter, - init_params=init_params, - weights_init=weights_init, - means_init=means_init, - precisions_init=precisions_init, - random_state=random_state, - ) - - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - - real = ( - gm.n_components, - gm.covariance_type, - gm.tol, - gm.reg_covar, - gm.max_iter, - gm.init_params, - gm.weights_init.tolist(), - gm.means_init.tolist(), - gm.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm.random_state.get_state() - ], - ) - real2 = ( - gm2.n_components, - gm2.covariance_type, - gm2.tol, - gm2.reg_covar, - gm2.max_iter, - gm2.init_params, - gm2.weights_init.tolist(), - gm2.means_init.tolist(), - gm2.precisions_init, - *[ - list(x) if isinstance(x, np.ndarray) else x - for x in gm2.random_state.get_state() - ], - ) - - self.assertEqual(real, real2) - def test_fit(self): """Tests GaussianMixture.fit()""" @@ -277,8 +161,8 @@ def test_fit(self): ] ) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") gm.weights_ = compss_wait_on(gm.weights_) gm.means_ = compss_wait_on(gm.means_) @@ -308,8 +192,8 @@ def test_predict(self): gm = GaussianMixture(n_components=2, random_state=666) gm.fit(ds_x_train) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) ds_x_test = ds.array(x_test, block_size=(2, 2)) @@ -324,32 +208,6 @@ def test_predict(self): self.assertTrue(pred2[0] == pred2[2] == pred2[4]) self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - def test_fit_predict(self): - """Tests GaussianMixture.fit_predict()""" - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - y_real = np.concatenate((np.zeros(500), np.ones(100), 2 * np.ones(10))) - - ds_x = ds.array(x_filtered, block_size=(300, 2)) - - gm = GaussianMixture(n_components=3, random_state=170) - pred = gm.fit_predict(ds_x).collect() - - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - - pred2 = gm2.predict(ds_x).collect() - - self.assertEqual(len(pred), 610) - accuracy = np.count_nonzero(pred == y_real) / len(pred) - self.assertGreater(accuracy, 0.99) - - self.assertEqual(len(pred2), 610) - accuracy2 = np.count_nonzero(pred2 == y_real) / len(pred2) - self.assertGreater(accuracy2, 0.99) - def test_sparse(self): """Tests GaussianMixture produces the same results using dense and sparse data structures""" @@ -365,8 +223,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_sparse) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") labels_sparse = gm.predict(x_sparse).collect() labels_sparse2 = gm2.predict(x_sparse).collect() @@ -374,8 +232,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_dense) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) + save_model(gm, self.filepath, save_format="json") + gm2 = load_model(self.filepath, load_format="json") labels_dense = gm.predict(x_dense).collect() labels_dense2 = gm2.predict(x_dense).collect() @@ -383,86 +241,10 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) 
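The sparse/dense agreement asserted here also exercises the csr_matrix support in the saving module. A csr_matrix is fully determined by its (data, indices, indptr) arrays plus a shape, which is all a JSON-friendly encoding has to retain; a minimal round-trip sketch in plain scipy (the field names are illustrative, not necessarily the ones _encode_helper uses):

    import numpy as np
    from scipy.sparse import csr_matrix

    sp = csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))
    payload = {
        "data": sp.data.tolist(),        # non-zero values
        "indices": sp.indices.tolist(),  # column index of each value
        "indptr": sp.indptr.tolist(),    # row start offsets into data
        "shape": list(sp.shape),
    }
    restored = csr_matrix(
        (payload["data"], payload["indices"], payload["indptr"]),
        shape=tuple(payload["shape"]),
    )
    assert (restored != sp).nnz == 0  # no entry changed in the round-trip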
self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - def test_init_random(self): - """Tests GaussianMixture random initialization""" - x = ds.random_array((50, 3), (10, 3), random_state=0) - gm = GaussianMixture( - init_params="random", n_components=4, arity=2, random_state=170 - ) - gm.fit(x) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - self.assertGreater(gm.n_iter, 5) - self.assertGreater(gm2.n_iter, 5) - - def test_means_init_and_weights_init(self): - """Tests GaussianMixture means_init and weights_init parameters""" - x, _ = load_iris(return_X_y=True) - x_ds = ds.array(x, (75, 4)) - weights_init = [1 / 3, 1 / 3, 1 / 3] - means_init = np.array([[5, 3, 2, 0], [6, 3, 4, 1], [7, 3, 6, 2]]) - gm = GaussianMixture( - random_state=0, - n_components=3, - weights_init=weights_init, - means_init=means_init, - ) - gm.fit(x_ds) - save_model(gm, self.filepath) - gm2 = load_model(self.filepath) - self.assertTrue(gm.converged_) - self.assertTrue(gm2.converged_) - class CSVMSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/csvm.json" - def test_init_params(self): - """Test constructor parameters""" - cascade_arity = 3 - max_iter = 1 - tol = 1e-4 - kernel = "rbf" - c = 2 - gamma = 0.1 - check_convergence = True - seed = 666 - verbose = False - - csvm = CascadeSVM( - cascade_arity=cascade_arity, - max_iter=max_iter, - tol=tol, - kernel=kernel, - c=c, - gamma=gamma, - check_convergence=check_convergence, - random_state=seed, - verbose=verbose, - ) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - self.assertEqual(csvm.cascade_arity, cascade_arity) - self.assertEqual(csvm.max_iter, max_iter) - self.assertEqual(csvm.tol, tol) - self.assertEqual(csvm.kernel, kernel) - self.assertEqual(csvm.c, c) - self.assertEqual(csvm.gamma, gamma) - self.assertEqual(csvm.check_convergence, check_convergence) - self.assertEqual(csvm.random_state, seed) - self.assertEqual(csvm.verbose, verbose) - - self.assertEqual(csvm2.cascade_arity, cascade_arity) - self.assertEqual(csvm2.max_iter, max_iter) - self.assertEqual(csvm2.tol, tol) - self.assertEqual(csvm2.kernel, kernel) - self.assertEqual(csvm2.c, c) - self.assertEqual(csvm2.gamma, gamma) - self.assertEqual(csvm2.check_convergence, check_convergence) - self.assertEqual(csvm2.random_state, seed) - self.assertEqual(csvm2.verbose, verbose) - def test_fit_private_params(self): kernel = "rbf" c = 2 @@ -473,8 +255,8 @@ def test_fit_private_params(self): x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") self.assertEqual(csvm._clf_params["kernel"], kernel) self.assertEqual(csvm._clf_params["C"], c) self.assertEqual(csvm._clf_params["gamma"], gamma) @@ -485,8 +267,8 @@ def test_fit_private_params(self): kernel, c = "linear", 0.3 csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") self.assertEqual(csvm._clf_params["kernel"], kernel) self.assertEqual(csvm._clf_params["C"], c) self.assertEqual(csvm2._clf_params["kernel"], kernel) @@ -495,51 +277,6 @@ def test_fit_private_params(self): # # check for exception when incorrect kernel is passed # 
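# (if this check is ever re-enabled, note that assertRaises expects the
# callable and its arguments passed separately, e.g.
#     self.assertRaises(AttributeError, CascadeSVM, kernel='fake_kernel')
# with the inline call below, CascadeSVM(...) would raise before
# assertRaises gets a chance to catch it)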
self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - def test_fit(self): - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=5, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - self.assertTrue(csvm.converged) - self.assertTrue(csvm2.converged) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=1, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - self.assertFalse(csvm.converged) - self.assertEqual(csvm.iterations, 1) - self.assertFalse(csvm2.converged) - self.assertEqual(csvm2.iterations, 1) - def test_predict(self): seed = 666 @@ -562,8 +299,8 @@ def test_predict(self): ) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") # p5 should belong to class 0, p6 to class 1 p5, p6 = np.array([1, 1]), np.array([-1, -1]) @@ -603,8 +340,8 @@ def test_score(self): ) csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) + save_model(csvm, self.filepath, save_format="json") + csvm2 = load_model(self.filepath, load_format="json") # points are separable, scoring the training dataset should have 100% # accuracy @@ -617,60 +354,6 @@ def test_score(self): self.assertEqual(accuracy, 1.0) self.assertEqual(accuracy2, 1.0) - def test_decision_func(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - # all points are in the x-axis - p1, p2, p3, p4 = [0, 2], [0, 1], [0, -2], [0, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - # p1 should be equidistant to p3, and p2 to p4 - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d1, d2, d3, d4 = y_pred.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - d1, d2, d3, d4 = y_pred2.collect() - self.assertTrue(np.isclose(abs(d1) - abs(d3), 0)) - self.assertTrue(np.isclose(abs(d2) - abs(d4), 0)) - - # p5 and p6 should be in the decision function (distance=0) - p5, p6 = np.array([1, 0]), np.array([-1, 0]) - - x_test = ds.array(np.array([p5, p6]), (1, 2)) - - y_pred = csvm.decision_function(x_test) - y_pred2 = csvm2.decision_function(x_test) - - d5, d6 = y_pred.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - d5, d6 = y_pred2.collect() - self.assertTrue(np.isclose(d5, 0)) - self.assertTrue(np.isclose(d6, 0)) - def test_sparse(self): """Tests that C-SVM produces the same results with sparse and dense data""" @@ -682,13 +365,13 @@ def test_sparse(self): csvm_sp = CascadeSVM(random_state=seed) csvm_sp.fit(x_sp, y_sp) - save_model(csvm_sp, self.filepath) - csvm_sp2 = load_model(self.filepath) + save_model(csvm_sp, 
self.filepath, save_format="json") + csvm_sp2 = load_model(self.filepath, load_format="json") csvm_d = CascadeSVM(random_state=seed) csvm_d.fit(x_d, y_d) - save_model(csvm_d, self.filepath) - csvm_d2 = load_model(self.filepath) + save_model(csvm_d, self.filepath, save_format="json") + csvm_d2 = load_model(self.filepath, load_format="json") sv_d = csvm_d._clf.support_vectors_ sv_sp = csvm_sp._clf.support_vectors_.toarray() @@ -708,36 +391,6 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - def test_duplicates(self): - """Tests that C-SVM does not generate duplicate support vectors""" - x = ds.array( - np.array( - [ - [0, 1], - [1, 1], - [0, 1], - [1, 2], - [0, 0], - [2, 2], - [2, 1], - [1, 0], - ] - ), - (2, 2), - ) - - y = ds.array(np.array([1, 0, 1, 0, 1, 0, 0, 1]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM(c=1, random_state=1, max_iter=100, tol=0) - csvm.fit(x, y) - save_model(csvm, self.filepath) - csvm2 = load_model(self.filepath) - - csvm._collect_clf() - csvm2._collect_clf() - self.assertEqual(csvm._clf.support_vectors_.shape[0], 6) - self.assertEqual(csvm2._clf.support_vectors_.shape[0], 6) - class RFSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/rf.json" @@ -757,13 +410,13 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") accuracy = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) @@ -785,13 +438,13 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") y_pred = rf.predict(x_test).collect() y_pred2 = rf2.predict(x_test).collect() @@ -800,35 +453,6 @@ def test_make_classification_predict_and_distr_depth(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_fit_predict(self): - """Tests RandomForestClassifier fit_predict with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) - - y_pred = rf.predict(x_train).collect() - y_pred2 = rf2.predict(x_train).collect() - y_train = y_train.collect() - accuracy 
= np.count_nonzero(y_pred == y_train) / len(y_train) - accuracy2 = np.count_nonzero(y_pred2 == y_train) / len(y_train) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - def test_make_classification_sklearn_max_predict(self): """Tests RandomForestClassifier predict with sklearn_max.""" x, y = make_classification( @@ -844,13 +468,13 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") y_pred = rf.predict(x_test).collect() y_pred2 = rf2.predict(x_test).collect() @@ -874,13 +498,13 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") probabilities = rf.predict_proba(x_test).collect() probabilities2 = rf2.predict_proba(x_test).collect() @@ -908,15 +532,15 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = y[len(y) // 2 :] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True ) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") y_pred = rf.predict(x_test).collect() y_pred2 = rf2.predict(x_test).collect() @@ -941,8 +565,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2 :], (300, 10)) + y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -952,36 +576,14 @@ def test_make_classification_hard_vote_score_mix(self): hard_vote=True, ) rf.fit(x_train, y_train) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) + save_model(rf, self.filepath, save_format="json") + rf2 = load_model(self.filepath, load_format="json") accuracy = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_iris(self): - """Tests RandomForestClassifier with a minimal example.""" - x, y = datasets.load_iris(return_X_y=True) - ds_fit = ds.array(x[::2], block_size=(30, 2)) - fit_y = 
ds.array(y[::2].reshape(-1, 1), block_size=(30, 1)) - ds_validate = ds.array(x[1::2], block_size=(30, 2)) - validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1)) - - rf = RandomForestClassifier( - n_estimators=1, max_depth=1, random_state=0 - ) - rf.fit(ds_fit, fit_y) - save_model(rf, self.filepath) - rf2 = load_model(self.filepath) - - accuracy = compss_wait_on(rf.score(ds_validate, validate_y)) - accuracy2 = compss_wait_on(rf2.score(ds_validate, validate_y)) - - # Accuracy should be <= 2/3 for any seed, often exactly equal. - self.assertAlmostEqual(accuracy, 2 / 3) - self.assertAlmostEqual(accuracy2, 2 / 3) - class LassoSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/lasso.json" @@ -1005,13 +607,13 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] + X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] lasso = Lasso(lmbd=0.1, max_iter=50) lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) - save_model(lasso, self.filepath) - lasso2 = load_model(self.filepath) + save_model(lasso, self.filepath, save_format="json") + lasso2 = load_model(self.filepath, load_format="json") y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) @@ -1037,8 +639,8 @@ def test_univariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) @@ -1073,8 +675,8 @@ def test_univariate_no_intercept(self): reg = LinearRegression(fit_intercept=False) reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) @@ -1109,8 +711,8 @@ def test_multivariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) self.assertTrue( @@ -1145,8 +747,8 @@ def test_multivariate_no_intercept(self): reg = LinearRegression(fit_intercept=False) reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = load_model(self.filepath) + save_model(reg, self.filepath, save_format="json") + reg2 = load_model(self.filepath, load_format="json") self.assertTrue( np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) @@ -1175,100 +777,6 @@ def test_multivariate_no_intercept(self): np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) ) - def test_multivariate_multiobjective(self): - """Tests fit() and predict(), multivariate, multiobjective.""" - x_data = np.array( - [[1, 2, 3], [2, 0, 4], [3, 1, 8], [4, 4, 2], [5, 3, 1], [2, 7, 1]] - ) - y_data = np.array( - [ - [2, 0, 3], - [1, 5, 2], - [1, 3, 4], - [2, 7, 9], - [4.5, -1, 4], - [0, 0, 0], - ] - ) - - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath) - reg2 = 
load_model(self.filepath) - - # Predict one sample - x_test = np.array([3, 2, 1]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [3.0318415, 1.97164872, 3.85410906])) - self.assertTrue( - np.allclose(pred2, [3.0318415, 1.97164872, 3.85410906]) - ) - - # Predict multiple samples - x_test = np.array([[3, 2, 1], [4, 3, 3], [1, 1, 1]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue( - np.allclose( - pred, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - self.assertTrue( - np.allclose( - pred2, - [ - [3.0318415, 1.97164872, 3.85410906], - [2.5033157, 2.65809327, 5.05310495], - [2.145797, 1.4840121, 1.5739791], - ], - ) - ) - - # Check attributes values - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg2.coef_.collect(), - [ - [0.65034768, 0.34673933, 1.22176283], - [-0.41465084, -0.20584208, -0.16339571], - [-0.38211131, 0.27277365, 0.07031439], - ], - ) - ) - self.assertTrue( - np.allclose( - reg.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - self.assertTrue( - np.allclose( - reg2.intercept_.collect(), [2.29221145, 1.07034124, 0.44529761] - ) - ) - def load_movielens(train_ratio=0.9): file = "tests/files/sample_movielens_ratings.csv" @@ -1307,43 +815,6 @@ def load_movielens(train_ratio=0.9): class ALSSavingTestJSON(unittest.TestCase): filepath = "tests/files/saving/als.json" - def test_init_params(self): - # Test all parameters - seed = 666 - n_f = 100 - lambda_ = 0.001 - convergence_threshold = 0.1 - max_iter = 10 - verbose = True - arity = 12 - - als = ALS( - random_state=seed, - n_f=n_f, - lambda_=lambda_, - tol=convergence_threshold, - max_iter=max_iter, - verbose=verbose, - arity=arity, - ) - save_model(als, self.filepath) - als2 = load_model(self.filepath) - - self.assertEqual(als.random_state, seed) - self.assertEqual(als.n_f, n_f) - self.assertEqual(als.lambda_, lambda_) - self.assertEqual(als.tol, convergence_threshold) - self.assertEqual(als.max_iter, max_iter) - self.assertEqual(als.verbose, verbose) - self.assertEqual(als.arity, arity) - self.assertEqual(als2.random_state, seed) - self.assertEqual(als2.n_f, n_f) - self.assertEqual(als2.lambda_, lambda_) - self.assertEqual(als2.tol, convergence_threshold) - self.assertEqual(als2.max_iter, max_iter) - self.assertEqual(als2.verbose, verbose) - self.assertEqual(als2.arity, arity) - def test_fit(self): train, test = load_movielens() @@ -1359,8 +830,8 @@ def test_fit(self): self.assertTrue(als.converged) als.fit(train) - save_model(als, self.filepath) - als2 = load_model(self.filepath) + save_model(als, self.filepath, save_format="json") + als2 = load_model(self.filepath, load_format="json") self.assertTrue(als.converged) self.assertTrue(als2.converged) @@ -1371,8 +842,8 @@ def test_predict(self): train = ds.array(x=ratings, block_size=(1, 1)) als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) als.fit(train) - save_model(als, self.filepath) - als2 = load_model(self.filepath) + save_model(als, self.filepath, save_format="json") + als2 = load_model(self.filepath, load_format="json") predictions = 
als.predict_user(user_id=0) predictions2 = als2.predict_user(user_id=0) From d5fae271de9804995f77965070370bcb2750d3b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Tue, 27 Jul 2021 11:32:34 +0200 Subject: [PATCH 37/46] Changed tests and file names in commons/rf --- dislib/classification/__init__.py | 2 +- dislib/commons/rf/{_data.py => data.py} | 51 +++++------ .../{_decision_tree.py => decision_tree.py} | 2 +- dislib/commons/rf/{_forest.py => forest.py} | 4 +- .../rf/{_test_split.py => test_split.py} | 2 +- dislib/regression/__init__.py | 2 +- dislib/utils/saving.py | 30 ++----- tests/{test_rf.py => test_rf_classifier.py} | 0 tests/test_rf_regressor.py | 18 ++-- tests/test_saving.py | 89 ++++++++++++++++--- tests/test_saving_cbor.py | 31 +++---- tests/test_saving_json.py | 31 +++---- 12 files changed, 151 insertions(+), 111 deletions(-) rename dislib/commons/rf/{_data.py => data.py} (93%) rename dislib/commons/rf/{_decision_tree.py => decision_tree.py} (99%) rename dislib/commons/rf/{_forest.py => forest.py} (99%) rename dislib/commons/rf/{_test_split.py => test_split.py} (96%) rename tests/{test_rf.py => test_rf_classifier.py} (100%) diff --git a/dislib/classification/__init__.py b/dislib/classification/__init__.py index f4a90db6..695dd571 100644 --- a/dislib/classification/__init__.py +++ b/dislib/classification/__init__.py @@ -1,4 +1,4 @@ from dislib.classification.csvm.base import CascadeSVM -from dislib.commons.rf._forest import RandomForestClassifier +from dislib.commons.rf.forest import RandomForestClassifier __all__ = ["CascadeSVM", "RandomForestClassifier"] diff --git a/dislib/commons/rf/_data.py b/dislib/commons/rf/data.py similarity index 93% rename from dislib/commons/rf/_data.py rename to dislib/commons/rf/data.py index de692182..a762e5b6 100644 --- a/dislib/commons/rf/_data.py +++ b/dislib/commons/rf/data.py @@ -35,7 +35,7 @@ def get_n_samples(self): Returns ------- - n_samples : int + n_samples: int Raises ------ @@ -61,7 +61,7 @@ def get_n_features(self): Returns ------- - n_features : int + n_features: int Raises ------ @@ -115,35 +115,35 @@ class RfClassifierDataset(RfBaseDataset): Parameters ---------- - samples_path : str + samples_path: str Path of the .npy file containing the 2-d array of samples. It can be a pycompss.runtime.Future object. If so, self.n_samples and self.n_features must be set manually (they can also be pycompss.runtime.Future objects). - targets_path : str + targets_path: str Path of the .dat file containing the 1-d array of target labels. It can be a pycompss.runtime.Future object. - features_path : str, optional (default=None) + features_path: str, optional (default=None) Path of the .npy file containing the 2-d array of samples transposed. The array must be C-ordered. Providing this array may improve the performance as it allows sequential access to the features. Attributes ---------- - n_samples : int + n_samples: int The number of samples of the dataset. It can be a pycompss.runtime.Future object. - n_features : int + n_features: int The number of features of the dataset. It can be a pycompss.runtime.Future object. - y_targets : ndarray + y_targets: ndarray The codified array of labels for this RfDataset. The values are indices of the array of classes, which contains the corresponding labels. The dtype is np.int8. It can be a pycompss.runtime.Future object. - y_categories : ndarray + y_categories: ndarray The array of classes for this RfDataset. The values are unique. 
It can be a pycompss.runtime.Future object. - n_classes : int + n_classes: int The number of classes of this RfDataset. It can be a pycompss.runtime.Future object. @@ -159,7 +159,7 @@ def get_y_targets(self): Returns ------- - y_targets : ndarray + y_targets: ndarray """ if self.y_targets is None: @@ -172,7 +172,7 @@ def get_classes(self): Returns ------- - y_categories : ndarray + y_categories: ndarray """ if self.y_categories is None: @@ -185,7 +185,7 @@ def get_n_classes(self): Returns ------- - n_classes : int + n_classes: int """ if self.n_classes is None: @@ -206,28 +206,28 @@ class RfRegressorDataset(RfBaseDataset): Parameters ---------- - samples_path : str + samples_path: str Path of the .npy file containing the 2-d array of samples. It can be a pycompss.runtime.Future object. If so, self.n_samples and self.n_features must be set manually (they can also be pycompss.runtime.Future objects). - targets_path : str + targets_path: str Path of the .dat file containing the 1-d array of target values. It can be a pycompss.runtime.Future object. - features_path : str, optional (default=None) + features_path: str, optional (default=None) Path of the .npy file containing the 2-d array of samples transposed. The array must be C-ordered. Providing this array may improve the performance as it allows sequential access to the features. Attributes ---------- - n_samples : int + n_samples: int The number of samples of the dataset. It can be a pycompss.runtime.Future object. - n_features : int + n_features: int The number of features of the dataset. It can be a pycompss.runtime.Future object. - y_targets : ndarray + y_targets: ndarray The array of targets for this RfDataset. It can be a pycompss.runtime.Future object. @@ -241,7 +241,7 @@ def get_y_targets(self): Returns ------- - y_targets : ndarray + y_targets: ndarray """ if self.y_targets is None: @@ -255,6 +255,7 @@ def get_n_classes(self): def get_classes(self): return None + def transform_to_rf_dataset( x: Array, y: Array, task: str ) -> RfRegressorDataset or RfClassifierDataset: @@ -265,16 +266,16 @@ def transform_to_rf_dataset( Parameters ---------- - x : ds-array, shape = (n_samples, n_features) + x: ds-array, shape = (n_samples, n_features) The training input samples. - y : ds-array, shape = (n_samples,) or (n_samples, n_outputs) + y: ds-array, shape = (n_samples,) or (n_samples, n_outputs) The target values. - task : {"classification", "regression"} + task: {"classification", "regression"} Task of the Random Forest. 
     Returns
     -------
     rf_dataset: dislib.commons.rf.data.RfDataset
     """
     n_samples = x.shape[0]
@@ -394,7 +395,7 @@ def _fill_samples_file(samples_path, row_blocks, start_idx):
     rows_samples = Array._merge_blocks(row_blocks)
     rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
     samples = np.lib.format.open_memmap(samples_path, mode="r+")
-    samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples
+    samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples
 
 
 @task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
diff --git a/dislib/commons/rf/_decision_tree.py b/dislib/commons/rf/decision_tree.py
similarity index 99%
rename from dislib/commons/rf/_decision_tree.py
rename to dislib/commons/rf/decision_tree.py
index 07297a8d..5a501240 100644
--- a/dislib/commons/rf/_decision_tree.py
+++ b/dislib/commons/rf/decision_tree.py
@@ -8,7 +8,7 @@
 from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
 from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor
 
-from ._test_split import test_split
+from dislib.commons.rf.test_split import test_split
 from dislib.data.array import Array
 
 
diff --git a/dislib/commons/rf/_forest.py b/dislib/commons/rf/forest.py
similarity index 99%
rename from dislib/commons/rf/_forest.py
rename to dislib/commons/rf/forest.py
index e0f4561d..bf121124 100644
--- a/dislib/commons/rf/_forest.py
+++ b/dislib/commons/rf/forest.py
@@ -8,13 +8,13 @@
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_random_state
 
-from dislib.commons.rf._decision_tree import (
+from dislib.commons.rf.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
 )
 from dislib.data.array import Array
 from dislib.utils.base import _paired_partition
-from ._data import transform_to_rf_dataset
+from dislib.commons.rf.data import transform_to_rf_dataset
 
 
 class BaseRandomForest(BaseEstimator):
diff --git a/dislib/commons/rf/_test_split.py b/dislib/commons/rf/test_split.py
similarity index 96%
rename from dislib/commons/rf/_test_split.py
rename to dislib/commons/rf/test_split.py
index 38b9015f..428fbc88 100644
--- a/dislib/commons/rf/_test_split.py
+++ b/dislib/commons/rf/test_split.py
@@ -23,7 +23,7 @@ def test_split(sample, y_s, feature, n_classes):
 
     # Threshold value must not be that value of a sample
     not_repeated = np.empty(size, dtype=np.bool_)
-    not_repeated[0 : size - 1] = f_sorted[1:] != f_sorted[:-1]
+    not_repeated[0: size - 1] = f_sorted[1:] != f_sorted[:-1]
     not_repeated[size - 1] = True
 
     if n_classes is not None:  # Classification
diff --git a/dislib/regression/__init__.py b/dislib/regression/__init__.py
index ecde22d8..a47cd17d 100644
--- a/dislib/regression/__init__.py
+++ b/dislib/regression/__init__.py
@@ -1,5 +1,5 @@
 from dislib.regression.linear.base import LinearRegression
 from dislib.regression.lasso.base import Lasso
-from dislib.commons.rf._forest import RandomForestRegressor
+from dislib.commons.rf.forest import RandomForestRegressor
 
 __all__ = ["LinearRegression", "Lasso", "RandomForestRegressor"]
diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py
index 620cc90a..53de386e 100644
--- a/dislib/utils/saving.py
+++ b/dislib/utils/saving.py
@@ -16,7 +16,7 @@
 import dislib.recommendation
 import dislib.regression
 from dislib.data.array import Array
-from dislib.commons.rf._decision_tree import (
+from dislib.commons.rf.decision_tree import (
     DecisionTreeClassifier,
     DecisionTreeRegressor,
     _Node,
@@ -187,16 +187,8 @@ def
load_model(filepath, load_format="json"): model_module = getattr(ds, IMPLEMENTED_MODELS[model_name]) model_class = getattr(model_module, model_name) model = model_class() - model.__dict__.update(model_metadata) - - # Set class methods - if model_name == "CascadeSVM" and "kernel" in model_metadata: - try: - model._kernel_f = getattr( - model, model._name_to_kernel[model_metadata["kernel"]] - ) - except AttributeError: - model._kernel_f = getattr(model, "_rbf_kernel") + for key, val in model_metadata.items(): + setattr(model, key, val) return model @@ -217,13 +209,6 @@ def _encode_helper(obj): """ if isinstance(obj, np.generic): return obj.item() - elif isinstance(obj, range): - return { - "class_name": "range", - "start": obj.start, - "stop": obj.stop, - "step": obj.step, - } elif isinstance(obj, csr_matrix): return { "class_name": "csr_matrix", @@ -272,11 +257,7 @@ def _decode_helper(obj): if isinstance(obj, dict) and "class_name" in obj: class_name = obj["class_name"] - if class_name == "range": - return range(obj["start"], obj["stop"], obj["step"]) - elif class_name == "tuple": - return tuple(obj["items"]) - elif class_name == "ndarray": + if class_name == "ndarray": if obj["dtype_list"]: items = list(map(tuple, obj["items"])) return np.rec.fromrecords(items, dtype=eval(obj["dtype"])) @@ -353,8 +334,7 @@ def _sync_obj(obj): elif isinstance(obj, list): iterator = iter(enumerate(obj)) else: - print(obj) - raise ValueError("Expected dict or list and received %s." % type(obj)) + raise TypeError("Expected dict or list and received %s." % type(obj)) for key, val in iterator: if isinstance(val, (dict, list)): diff --git a/tests/test_rf.py b/tests/test_rf_classifier.py similarity index 100% rename from tests/test_rf.py rename to tests/test_rf_classifier.py diff --git a/tests/test_rf_regressor.py b/tests/test_rf_regressor.py index 2d82dbeb..36da50f7 100644 --- a/tests/test_rf_regressor.py +++ b/tests/test_rf_regressor.py @@ -26,8 +26,8 @@ def test_make_regression(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestRegressor(random_state=0) @@ -35,7 +35,7 @@ def test_make_regression(self): accuracy1 = compss_wait_on(rf.score(x_test, y_test)) y_pred = rf.predict(x_test).collect() - y_true = y[len(y) // 2 :] + y_true = y[len(y) // 2:] accuracy2 = _determination_coefficient(y_true, y_pred) self.assertGreater(accuracy1, 0.85) @@ -53,8 +53,8 @@ def test_make_regression_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestRegressor(distr_depth=2, random_state=0) @@ -62,7 +62,7 @@ def test_make_regression_predict_and_distr_depth(self): accuracy1 = compss_wait_on(rf.score(x_test, y_test)) y_pred = rf.predict(x_test).collect() - y_true = y[len(y) // 2 :] + y_true = y[len(y) // 2:] accuracy2 = _determination_coefficient(y_true, y_pred) self.assertGreater(accuracy1, 0.85) @@ -80,8 +80,8 @@ def test_make_regression_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 
10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestRegressor(random_state=0, sklearn_max=10) @@ -89,7 +89,7 @@ def test_make_regression_sklearn_max_predict(self): accuracy1 = compss_wait_on(rf.score(x_test, y_test)) y_pred = rf.predict(x_test).collect() - y_true = y[len(y) // 2 :] + y_true = y[len(y) // 2:] accuracy2 = _determination_coefficient(y_true, y_pred) self.assertGreater(accuracy1, 0.85) diff --git a/tests/test_saving.py b/tests/test_saving.py index d1a8bb92..13445f1c 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,26 +1,91 @@ import unittest -from unittest.mock import patch +import sys +import json import numpy as np -import sys +# Workaround to mask cbor2 +if True: + sys.modules["cbor2"] = None from dislib.cluster import KMeans -from dislib.utils import save_model, load_model +from dislib.cluster import DBSCAN +from dislib.classification import RandomForestClassifier +from dislib.data import array +from dislib.utils.saving import save_model, load_model, _sync_obj + +from sklearn.datasets import make_classification +from pycompss.api.api import compss_wait_on class SavingTest(unittest.TestCase): - filepath = "tests/files/saving/kmeans.json" + filepath = "tests/files/saving/model.json" def test_errors(self): """Test that errors are raised""" - km = KMeans(n_clusters=2, verbose=False) - - with patch(sys.modules["cbor"]) as mock_cbor: - mock_cbor.return_value = None - self.assertRaises( - ModuleNotFoundError, - save_model(km, self.filepath, save_format="json"), - ) + + # Models + km = KMeans(n_clusters=2) + km2 = KMeans(n_clusters=10) + dbscan = DBSCAN() + rf = RandomForestClassifier() + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_clusters_per_class=2, + ) + x_train = array(x[: len(x) // 2], (300, 10)) + y_train = array(y[: len(y) // 2][:, np.newaxis], (300, 1)) + rf.fit(x_train, y_train) + + # Import error + with self.assertRaises(ModuleNotFoundError): + save_model(km, self.filepath, save_format="cbor") + with self.assertRaises(ModuleNotFoundError): + load_model(self.filepath, load_format="cbor") + + # Saving model not implemented + with self.assertRaises(NotImplementedError): + save_model(dbscan, self.filepath) + + # Wrong save format + with self.assertRaises(ValueError): + save_model(km, self.filepath, save_format="xxxx") + + # Overwrite + save_model(km, self.filepath, save_format="json") + with open(self.filepath, "r") as f: + json_str = f.read() + save_model(km2, self.filepath, overwrite=False, save_format="json") + with open(self.filepath, "r") as f: + json_str2 = f.read() + self.assertEqual(json_str, json_str2) + + # Wrong load format + with self.assertRaises(ValueError): + load_model(self.filepath, load_format="xxxx") + + # Load model not implemented + model_data = {"model_name": "dbscan"} + with open(self.filepath, "w") as f: + json.dump(model_data, f) + with self.assertRaises(NotImplementedError): + load_model(self.filepath, load_format="json") + + # Not JSON serializable + setattr(km, "n_clusters", dbscan) + with self.assertRaises(TypeError): + save_model(km, self.filepath, save_format="json") + + # Not dict or list + with self.assertRaises(TypeError): + _sync_obj(km) + + # Future not synchronized + compss_wait_on(rf.trees[0].try_features) + with 
self.assertRaises(TypeError): + save_model(rf, self.filepath, save_format="json") def main(): diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py index c8efd336..3d10c1c6 100644 --- a/tests/test_saving_cbor.py +++ b/tests/test_saving_cbor.py @@ -1,13 +1,10 @@ import unittest import numpy as np -from numpy.random.mtrand import RandomState from scipy.sparse import csr_matrix -from sklearn import datasets -from sklearn.cluster import KMeans as SKMeans from sklearn.metrics import r2_score from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs, load_iris +from sklearn.datasets import make_blobs import dislib as ds from dislib.cluster import KMeans @@ -410,8 +407,8 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) @@ -438,8 +435,8 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) @@ -468,8 +465,8 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -498,8 +495,8 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -532,8 +529,8 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True @@ -565,8 +562,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -607,7 +604,7 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] + X_test, y_test = X[n_samples // 2:], 
y[n_samples // 2:] lasso = Lasso(lmbd=0.1, max_iter=50) diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index 1488d83c..0c694cf8 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -1,13 +1,10 @@ import unittest import numpy as np -from numpy.random.mtrand import RandomState from scipy.sparse import csr_matrix -from sklearn import datasets -from sklearn.cluster import KMeans as SKMeans from sklearn.metrics import r2_score from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs, load_iris +from sklearn.datasets import make_blobs import dislib as ds from dislib.cluster import KMeans @@ -410,8 +407,8 @@ def test_make_classification_score(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier(random_state=0) rf.fit(x_train, y_train) @@ -438,8 +435,8 @@ def test_make_classification_predict_and_distr_depth(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(distr_depth=2, random_state=0) rf.fit(x_train, y_train) @@ -468,8 +465,8 @@ def test_make_classification_sklearn_max_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -498,8 +495,8 @@ def test_make_classification_sklearn_max_predict_proba(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) @@ -532,8 +529,8 @@ def test_make_classification_hard_vote_predict(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = y[len(y) // 2 :] + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = y[len(y) // 2:] rf = RandomForestClassifier( random_state=0, sklearn_max=10, hard_vote=True @@ -565,8 +562,8 @@ def test_make_classification_hard_vote_score_mix(self): ) x_train = ds.array(x[: len(x) // 2], (300, 10)) y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2 :], (300, 10)) - y_test = ds.array(y[len(y) // 2 :][:, np.newaxis], (300, 1)) + x_test = ds.array(x[len(x) // 2:], (300, 10)) + y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) rf = RandomForestClassifier( random_state=0, @@ -607,7 +604,7 @@ def test_fit_predict(self): n_samples = X.shape[0] X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] - X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] + X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] lasso = Lasso(lmbd=0.1, 
max_iter=50) From 25f68b31a17db659cf899d63a1952e1e0eb9cccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Tue, 27 Jul 2021 18:10:30 +0200 Subject: [PATCH 38/46] Modified test_saving.py to raise ModuleNotFound --- tests/test_saving.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_saving.py b/tests/test_saving.py index 13445f1c..0ca7deda 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,17 +1,12 @@ import unittest -import sys import json - import numpy as np -# Workaround to mask cbor2 -if True: - sys.modules["cbor2"] = None from dislib.cluster import KMeans from dislib.cluster import DBSCAN from dislib.classification import RandomForestClassifier from dislib.data import array -from dislib.utils.saving import save_model, load_model, _sync_obj +import dislib.utils.saving as saving from sklearn.datasets import make_classification from pycompss.api.api import compss_wait_on @@ -22,7 +17,8 @@ class SavingTest(unittest.TestCase): def test_errors(self): """Test that errors are raised""" - + cbor2_module = saving.cbor2 + saving.cbor2 = None # Models km = KMeans(n_clusters=2) km2 = KMeans(n_clusters=10) @@ -41,51 +37,55 @@ def test_errors(self): # Import error with self.assertRaises(ModuleNotFoundError): - save_model(km, self.filepath, save_format="cbor") + saving.save_model(km, self.filepath, save_format="cbor") with self.assertRaises(ModuleNotFoundError): - load_model(self.filepath, load_format="cbor") + saving.load_model(self.filepath, load_format="cbor") # Saving model not implemented with self.assertRaises(NotImplementedError): - save_model(dbscan, self.filepath) + saving.save_model(dbscan, self.filepath) # Wrong save format with self.assertRaises(ValueError): - save_model(km, self.filepath, save_format="xxxx") + saving.save_model(km, self.filepath, save_format="xxxx") # Overwrite - save_model(km, self.filepath, save_format="json") + saving.save_model(km, self.filepath, save_format="json") with open(self.filepath, "r") as f: json_str = f.read() - save_model(km2, self.filepath, overwrite=False, save_format="json") + saving.save_model( + km2, self.filepath, overwrite=False, save_format="json" + ) with open(self.filepath, "r") as f: json_str2 = f.read() self.assertEqual(json_str, json_str2) # Wrong load format with self.assertRaises(ValueError): - load_model(self.filepath, load_format="xxxx") + saving.load_model(self.filepath, load_format="xxxx") # Load model not implemented model_data = {"model_name": "dbscan"} with open(self.filepath, "w") as f: json.dump(model_data, f) with self.assertRaises(NotImplementedError): - load_model(self.filepath, load_format="json") + saving.load_model(self.filepath, load_format="json") # Not JSON serializable setattr(km, "n_clusters", dbscan) with self.assertRaises(TypeError): - save_model(km, self.filepath, save_format="json") + saving.save_model(km, self.filepath, save_format="json") # Not dict or list with self.assertRaises(TypeError): - _sync_obj(km) + saving._sync_obj(km) # Future not synchronized compss_wait_on(rf.trees[0].try_features) with self.assertRaises(TypeError): - save_model(rf, self.filepath, save_format="json") + saving.save_model(rf, self.filepath, save_format="json") + + saving.cbor2 = cbor2_module def main(): From bab5e7c4838f7453149853568388648e48809368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Wed, 28 Jul 2021 17:44:19 +0200 Subject: [PATCH 39/46] Reduced saving tests and added tests for 
RFRegr --- dislib/commons/rf/decision_tree.py | 18 +- dislib/utils/saving.py | 25 +- tests/test_saving.py | 56 +-- tests/test_saving_cbor.py | 688 ++++------------------------- tests/test_saving_json.py | 688 ++++------------------------- 5 files changed, 188 insertions(+), 1287 deletions(-) diff --git a/dislib/commons/rf/decision_tree.py b/dislib/commons/rf/decision_tree.py index 5a501240..751983d4 100644 --- a/dislib/commons/rf/decision_tree.py +++ b/dislib/commons/rf/decision_tree.py @@ -123,11 +123,10 @@ def predict(self, x_row): Returns ------- predicted : ndarray - An array with the predicted classes for the given samples. The - values are codes of the fitted + An array with the predicted classes or values for the given + samples. For classification, the values are codes of the fitted dislib.classification.rf.data.RfDataset. The returned object can be a pycompss.runtime.Future object. - """ assert self.tree is not None, "The decision tree is not fitted." @@ -301,10 +300,7 @@ class DecisionTreeRegressor(BaseDecisionTree): fit(dataset) Fits the DecisionTreeRegressor. predict(x_row) - Predicts classes for the given samples using a fitted tree. - predict_proba(x_row) - Predicts class probabilities for the given smaples using a fitted tree. - + Predicts target values for the given samples using a fitted tree. """ def __init__( @@ -407,14 +403,6 @@ def _get_sample_attributes(samples_file, indices): return x -def _get_feature_mmap(features_file, i): - return _get_features_mmap(features_file)[i] - - -def _get_features_mmap(features_file): - return np.load(features_file, mmap_mode="r", allow_pickle=False) - - @task(priority=True, returns=2) def _sample_selection(n_samples, y_targets, bootstrap, seed): if bootstrap: diff --git a/dislib/utils/saving.py b/dislib/utils/saving.py index 53de386e..02ecfb8a 100644 --- a/dislib/utils/saving.py +++ b/dislib/utils/saving.py @@ -2,11 +2,11 @@ import os import numpy as np -from pycompss.runtime.management.classes import Future from pycompss.api.api import compss_wait_on from sklearn.svm import SVC as SklearnSVC from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor from sklearn.tree._tree import Tree as SklearnTree from scipy.sparse import csr_matrix @@ -38,6 +38,7 @@ "GaussianMixture": "cluster", "CascadeSVM": "classification", "RandomForestClassifier": "classification", + "RandomForestRegressor": "regression", "ALS": "recommendation", "LinearRegression": "regression", "Lasso": "regression", @@ -59,6 +60,7 @@ SKLEARN_CLASSES = { "SVC": SklearnSVC, "DecisionTreeClassifier": SklearnDTClassifier, + "DecisionTreeRegressor": SklearnDTRegressor, } @@ -112,7 +114,7 @@ def save_model(model, filepath, overwrite=True, save_format="json"): ) # Synchronize model - if model_name == "RandomForestClassifier": + if model_name in ("RandomForestClassifier", "RandomForestRegressor"): _sync_rf(model) _sync_obj(model.__dict__) @@ -293,7 +295,9 @@ def _decode_helper(obj): and "dislib" in obj["module_name"] ): dict_ = _decode_helper(obj["items"]) - if class_name == "DecisionTreeClassifier": + if class_name in ( + "DecisionTreeClassifier", "DecisionTreeRegressor" + ): model = DISLIB_CLASSES[obj["class_name"]]( try_features=dict_.pop("try_features"), max_depth=dict_.pop("max_depth"), @@ -341,10 +345,6 @@ def _sync_obj(obj): _sync_obj(obj[key]) else: obj[key] = compss_wait_on(val) - if isinstance(obj[key], Future): - raise TypeError( - "Could not synchronize Future (%s, %s)." 
% (key, val) - ) if isinstance(getattr(obj[key], "__dict__", None), dict): _sync_obj(obj[key].__dict__) @@ -353,9 +353,8 @@ def _sync_rf(rf): """Sync the `try_features` and `n_classes` attribute of the different trees since they cannot be synced recursively. """ - if isinstance(rf.trees[0].try_features, Future): - try_features = compss_wait_on(rf.trees[0].try_features) - n_classes = compss_wait_on(rf.trees[0].n_classes) - for tree in rf.trees: - tree.try_features = try_features - tree.n_classes = n_classes + try_features = compss_wait_on(rf.trees[0].try_features) + n_classes = compss_wait_on(rf.trees[0].n_classes) + for tree in rf.trees: + tree.try_features = try_features + tree.n_classes = n_classes diff --git a/tests/test_saving.py b/tests/test_saving.py index 0ca7deda..7545e9ad 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,92 +1,70 @@ import unittest import json -import numpy as np from dislib.cluster import KMeans from dislib.cluster import DBSCAN -from dislib.classification import RandomForestClassifier -from dislib.data import array import dislib.utils.saving as saving -from sklearn.datasets import make_classification -from pycompss.api.api import compss_wait_on - class SavingTest(unittest.TestCase): - filepath = "tests/files/saving/model.json" def test_errors(self): """Test that errors are raised""" - cbor2_module = saving.cbor2 - saving.cbor2 = None + filepath = "tests/files/saving/model.json" + # Models km = KMeans(n_clusters=2) km2 = KMeans(n_clusters=10) dbscan = DBSCAN() - rf = RandomForestClassifier() - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_clusters_per_class=2, - ) - x_train = array(x[: len(x) // 2], (300, 10)) - y_train = array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - rf.fit(x_train, y_train) # Import error + cbor2_module = saving.cbor2 + saving.cbor2 = None with self.assertRaises(ModuleNotFoundError): - saving.save_model(km, self.filepath, save_format="cbor") + saving.save_model(km, filepath, save_format="cbor") with self.assertRaises(ModuleNotFoundError): - saving.load_model(self.filepath, load_format="cbor") + saving.load_model(filepath, load_format="cbor") + saving.cbor2 = cbor2_module # Saving model not implemented with self.assertRaises(NotImplementedError): - saving.save_model(dbscan, self.filepath) + saving.save_model(dbscan, filepath) # Wrong save format with self.assertRaises(ValueError): - saving.save_model(km, self.filepath, save_format="xxxx") + saving.save_model(km, filepath, save_format="xxxx") # Overwrite - saving.save_model(km, self.filepath, save_format="json") - with open(self.filepath, "r") as f: + saving.save_model(km, filepath, save_format="json") + with open(filepath, "r") as f: json_str = f.read() saving.save_model( - km2, self.filepath, overwrite=False, save_format="json" + km2, filepath, overwrite=False, save_format="json" ) - with open(self.filepath, "r") as f: + with open(filepath, "r") as f: json_str2 = f.read() self.assertEqual(json_str, json_str2) # Wrong load format with self.assertRaises(ValueError): - saving.load_model(self.filepath, load_format="xxxx") + saving.load_model(filepath, load_format="xxxx") # Load model not implemented model_data = {"model_name": "dbscan"} - with open(self.filepath, "w") as f: + with open(filepath, "w") as f: json.dump(model_data, f) with self.assertRaises(NotImplementedError): - saving.load_model(self.filepath, load_format="json") + saving.load_model(filepath, load_format="json") # Not JSON serializable setattr(km, 
"n_clusters", dbscan) with self.assertRaises(TypeError): - saving.save_model(km, self.filepath, save_format="json") + saving.save_model(km, filepath, save_format="json") # Not dict or list with self.assertRaises(TypeError): saving._sync_obj(km) - # Future not synchronized - compss_wait_on(rf.trees[0].try_features) - with self.assertRaises(TypeError): - saving.save_model(rf, self.filepath, save_format="json") - - saving.cbor2 = cbor2_module - def main(): unittest.main() diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py index 3d10c1c6..a5c2f23c 100644 --- a/tests/test_saving_cbor.py +++ b/tests/test_saving_cbor.py @@ -3,14 +3,14 @@ import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score -from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs +from sklearn.datasets import make_classification, make_regression import dislib as ds from dislib.cluster import KMeans from dislib.cluster import GaussianMixture from dislib.classification import CascadeSVM from dislib.classification import RandomForestClassifier +from dislib.regression import RandomForestRegressor from dislib.regression import Lasso from dislib.regression import LinearRegression from dislib.recommendation import ALS @@ -19,56 +19,11 @@ from pycompss.api.api import compss_wait_on -class KMeansSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/kmeans.cbor" +class CBORSavingTest(unittest.TestCase): - def test_fit_kmeans(self): - """Tests that the fit method returns the expected centers using toy - data. - """ - arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) - x = ds.array(arr, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666, verbose=False) - km.fit(x) - - expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue((km.centers == expected_centers).all()) - self.assertTrue((km2.centers == expected_centers).all()) - - def test_predict_kmeans(self): - """Tests that labels are correctly predicted using toy data.""" - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - arr1 = np.array([p1, p2, p3, p4]) - x = ds.array(arr1, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666) - km.fit(x) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - p5, p6 = [10, 10], [-10, -10] - - arr2 = np.array([p1, p2, p3, p4, p5, p6]) - x_test = ds.array(arr2, block_size=(2, 2)) - - labels = km.predict(x_test).collect() - labels2 = km2.predict(x_test).collect() - expected_labels = np.array([0, 0, 1, 1, 0, 1]) - - self.assertTrue(np.array_equal(labels, expected_labels)) - self.assertTrue(np.array_equal(labels2, expected_labels)) - - def test_sparse_kmeans(self): - """Tests K-means produces the same results using dense and sparse - data structures.""" + def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/kmeans.cbor" x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -76,8 +31,8 @@ def test_sparse_kmeans(self): kmeans = KMeans(random_state=170) kmeans.fit(x_sp) - save_model(kmeans, self.filepath, save_format="cbor") - kmeans2 = load_model(self.filepath, load_format="cbor") + save_model(kmeans, filepath, save_format="cbor") + kmeans2 = load_model(filepath, load_format="cbor") y_sparse = kmeans.predict(x_sp).collect() y_sparse2 = 
kmeans2.predict(x_sp).collect() @@ -95,120 +50,9 @@ def test_sparse_kmeans(self): self.assertTrue(np.array_equal(y_sparse, y_dense)) self.assertTrue(np.array_equal(y_sparse2, y_dense)) - def test_init_kmeans(self): - # With dense data - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - x_train = ds.array(x_filtered, block_size=(300, 2)) - - init = np.random.random((5, 2)) - km = KMeans(n_clusters=5, init=init) - km.fit(x_train) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.array_equal(km.init, init)) - self.assertTrue(np.array_equal(km2.init, init)) - self.assertFalse(np.array_equal(km.centers, init)) - self.assertFalse(np.array_equal(km2.centers, init)) - - # With sparse data - x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) - init = csr_matrix(np.random.random((5, 2))) - - km = KMeans(n_clusters=5, init=init) - km.fit(x_sp) - - save_model(km, self.filepath, save_format="cbor") - km2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) - self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) - - -class GaussianMixtureSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/gm.cbor" - - def test_fit(self): - """Tests GaussianMixture.fit()""" - - x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) - ds_x = ds.array(x, block_size=(3, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x) - - expected_weights = np.array([0.5, 0.5]) - expected_means = np.array([[-2, -2], [2, 2]]) - expected_cov = np.array( - [ - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - ] - ) - expected_pc = np.array( - [ - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - ] - ) - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") - - gm.weights_ = compss_wait_on(gm.weights_) - gm.means_ = compss_wait_on(gm.means_) - gm.covariances_ = compss_wait_on(gm.covariances_) - gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) - - gm2.weights_ = compss_wait_on(gm2.weights_) - gm2.means_ = compss_wait_on(gm2.means_) - gm2.covariances_ = compss_wait_on(gm2.covariances_) - gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) - - self.assertTrue((np.allclose(gm.weights_, expected_weights))) - self.assertTrue((np.allclose(gm.means_, expected_means))) - self.assertTrue((np.allclose(gm.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) - - self.assertTrue((np.allclose(gm2.weights_, expected_weights))) - self.assertTrue((np.allclose(gm2.means_, expected_means))) - self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) - - def test_predict(self): - """Tests GaussianMixture.predict()""" - x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) - ds_x_train = ds.array(x_train, block_size=(2, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x_train) - - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, 
load_format="cbor") - - x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) - ds_x_test = ds.array(x_test, block_size=(2, 2)) - pred = gm.predict(ds_x_test).collect() - pred2 = gm2.predict(ds_x_test).collect() - - self.assertTrue(pred[0] != pred[1]) - self.assertTrue(pred[0] == pred[2] == pred[4]) - self.assertTrue(pred[1] == pred[3] == pred[5]) - - self.assertTrue(pred2[0] != pred2[1]) - self.assertTrue(pred2[0] == pred2[2] == pred2[4]) - self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - - def test_sparse(self): - """Tests GaussianMixture produces the same results using dense and - sparse data structures""" + def test_saving_gm(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/gm.cbor" x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -220,8 +64,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_sparse) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") + save_model(gm, filepath, save_format="cbor") + gm2 = load_model(filepath, load_format="cbor") labels_sparse = gm.predict(x_sparse).collect() labels_sparse2 = gm2.predict(x_sparse).collect() @@ -229,8 +73,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_dense) - save_model(gm, self.filepath, save_format="cbor") - gm2 = load_model(self.filepath, load_format="cbor") + save_model(gm, filepath, save_format="cbor") + gm2 = load_model(filepath, load_format="cbor") labels_dense = gm.predict(x_dense).collect() labels_dense2 = gm2.predict(x_dense).collect() @@ -238,137 +82,23 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - -class CSVMSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/csvm.cbor" - - def test_fit_private_params(self): - kernel = "rbf" - c = 2 - gamma = 0.1 - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm._clf_params["gamma"], gamma) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["gamma"], gamma) - - kernel, c = "linear", 0.3 - csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - - # # check for exception when incorrect kernel is passed - # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - - def test_predict(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="linear", - c=2, - 
gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - # p5 should belong to class 0, p6 to class 1 - p5, p6 = np.array([1, 1]), np.array([-1, -1]) - - x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) - - y_pred = csvm.predict(x_test) - y_pred2 = csvm2.predict(x_test) - - l1, l2, l3, l4, l5, l6 = y_pred.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - l1, l2, l3, l4, l5, l6 = y_pred2.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - def test_score(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="cbor") - csvm2 = load_model(self.filepath, load_format="cbor") - - # points are separable, scoring the training dataset should have 100% - # accuracy - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) - - accuracy = compss_wait_on(csvm.score(x_test, y_test)) - accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) - - self.assertEqual(accuracy, 1.0) - self.assertEqual(accuracy2, 1.0) - - def test_sparse(self): - """Tests that C-SVM produces the same results with sparse and dense - data""" + def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" + filepath = "tests/files/saving/csvm.cbor" x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) csvm_sp = CascadeSVM(random_state=seed) csvm_sp.fit(x_sp, y_sp) - save_model(csvm_sp, self.filepath, save_format="cbor") - csvm_sp2 = load_model(self.filepath, load_format="cbor") + save_model(csvm_sp, filepath, save_format="cbor") + csvm_sp2 = load_model(filepath, load_format="cbor") csvm_d = CascadeSVM(random_state=seed) csvm_d.fit(x_d, y_d) - save_model(csvm_d, self.filepath, save_format="cbor") - csvm_d2 = load_model(self.filepath, load_format="cbor") + save_model(csvm_d, filepath, save_format="cbor") + csvm_d2 = load_model(filepath, load_format="cbor") sv_d = csvm_d._clf.support_vectors_ sv_sp = csvm_sp._clf.support_vectors_.toarray() @@ -388,100 +118,8 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - -class RFSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/rf.cbor" - - def test_make_classification_score(self): - """Tests RandomForestClassifier fit and score with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 
= load_model(self.filepath, load_format="cbor") - - accuracy = compss_wait_on(rf.score(x_test, y_test)) - accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_predict_and_distr_depth(self): - """Tests RandomForestClassifier fit and predict with a distr_depth.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(distr_depth=2, random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict(self): - """Tests RandomForestClassifier predict with sklearn_max.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(random_state=0, sklearn_max=10) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict_proba(self): - """Tests RandomForestClassifier predict_proba with sklearn_max.""" + def test_saving_rf_class(self): + filepath = "tests/files/saving/rf_class.cbor" x, y = make_classification( n_samples=3000, n_features=10, @@ -500,8 +138,8 @@ def test_make_classification_sklearn_max_predict_proba(self): rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") + save_model(rf, filepath, save_format="cbor") + rf2 = load_model(filepath, load_format="cbor") probabilities = rf.predict_proba(x_test).collect() probabilities2 = rf2.predict_proba(x_test).collect() @@ -514,49 +152,18 @@ def test_make_classification_sklearn_max_predict_proba(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_hard_vote_predict(self): - """Tests RandomForestClassifier predict with hard_vote.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - 
x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + def test_saving_rf_regr(self): + filepath = "tests/files/saving/rf_regr.cbor" - rf = RandomForestClassifier( - random_state=0, sklearn_max=10, hard_vote=True - ) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) + def determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v - def test_make_classification_hard_vote_score_mix(self): - """Tests RandomForestClassifier score with hard_vote, sklearn_max, - distr_depth and max_depth.""" - x, y = make_classification( + x, y = make_regression( n_samples=3000, n_features=10, - n_classes=3, n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, shuffle=True, random_state=0, ) @@ -565,29 +172,30 @@ def test_make_classification_hard_vote_score_mix(self): x_test = ds.array(x[len(x) // 2:], (300, 10)) y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - rf = RandomForestClassifier( - random_state=0, - sklearn_max=100, - distr_depth=2, - max_depth=12, - hard_vote=True, - ) + rf = RandomForestRegressor(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="cbor") - rf2 = load_model(self.filepath, load_format="cbor") + save_model(rf, filepath, save_format="cbor") + rf2 = load_model(filepath, load_format="cbor") - accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - -class LassoSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/lasso.cbor" - - def test_fit_predict(self): - """Tests fit and predicts methods""" - + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2:] + y_pred2 = rf2.predict(x_test).collect() + y_true2 = y[len(y) // 2:] + coef1 = determination_coefficient(y_true, y_pred) + coef2 = determination_coefficient(y_true2, y_pred2) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertGreater(coef1, 0.85) + self.assertGreater(coef2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + self.assertAlmostEqual(coef1, coef2) + + def test_saving_lasso(self): + filepath = "tests/files/saving/lasso.cbor" np.random.seed(42) n_samples, n_features = 50, 100 @@ -609,8 +217,8 @@ def test_fit_predict(self): lasso = Lasso(lmbd=0.1, max_iter=50) lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) - save_model(lasso, self.filepath, save_format="cbor") - lasso2 = load_model(self.filepath, load_format="cbor") + save_model(lasso, filepath, save_format="cbor") + lasso2 = load_model(filepath, load_format="cbor") y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) @@ -620,84 +228,9 @@ def test_fit_predict(self): self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124) self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) + def test_saving_linear(self): + filepath = "tests/files/saving/linear_regression.cbor" -class 
LinearRegressionSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/linear_regression.cbor" - - def test_univariate(self): - """Tests fit() and predict(), univariate.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.1)) - self.assertTrue(np.allclose(pred2, 2.1)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9])) - self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9])) - - def test_univariate_no_intercept(self): - """Tests fit() and predict(), univariate, fit_intercept=False.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.04545455)) - self.assertTrue(np.allclose(pred2, 2.04545455)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091])) - self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 4.0909091])) - - def test_multivariate(self): - """Tests fit() and predict(), multivariate.""" x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -708,8 +241,8 @@ def test_multivariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") + save_model(reg, filepath, save_format="cbor") + reg2 = load_model(filepath, load_format="cbor") self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) self.assertTrue( @@ -732,46 +265,35 @@ def test_multivariate(self): pred = reg.predict(test_data).collect() self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) - def test_multivariate_no_intercept(self): - """Tests fit() and predict(), multivariate, fit_intercept=False.""" - x_data = 
np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) - y_data = np.array([2, 1, 1, 2, 4.5]) + def test_saving_als(self): + filepath = "tests/files/saving/als.cbor" - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, 1)) + data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) + ratings = csr_matrix(data) + train = ds.array(x=ratings, block_size=(1, 1)) + als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) + als.fit(train) + save_model(als, filepath, save_format="cbor") + als2 = load_model(filepath, load_format="cbor") - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="cbor") - reg2 = load_model(self.filepath, load_format="cbor") + predictions = als.predict_user(user_id=0) + predictions2 = als2.predict_user(user_id=0) + # Check that the ratings for user 0 are similar to user 1 because they + # share preferences (third movie), thus it is expected that user 0 + # will rate movie 1 similarly to user 1. self.assertTrue( - np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions[0] < 3.25 + and predictions[1] < 1 + and predictions[2] > 4.5 ) self.assertTrue( - np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions2[0] < 3.25 + and predictions2[1] < 1 + and predictions2[2] > 4.5 ) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3, 2]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718])) - self.assertTrue(np.allclose(pred2, [2.05649718])) - - # Predict multiple samples - x_test = np.array([[3, 2], [4, 4], [1, 3]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678])) self.assertTrue( - np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) + np.array_equal(predictions, predictions2, equal_nan=True) ) @@ -809,60 +331,6 @@ def load_movielens(train_ratio=0.9): return train_arr, test_arr -class ALSSavingTestCBOR(unittest.TestCase): - filepath = "tests/files/saving/als.cbor" - - def test_fit(self): - train, test = load_movielens() - - als = ALS( - tol=0.01, - random_state=666, - n_f=100, - verbose=False, - check_convergence=True, - ) - - als.fit(train, test) - self.assertTrue(als.converged) - - als.fit(train) - save_model(als, self.filepath, save_format="cbor") - als2 = load_model(self.filepath, load_format="cbor") - - self.assertTrue(als.converged) - self.assertTrue(als2.converged) - - def test_predict(self): - data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) - ratings = csr_matrix(data) - train = ds.array(x=ratings, block_size=(1, 1)) - als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) - als.fit(train) - save_model(als, self.filepath, save_format="cbor") - als2 = load_model(self.filepath, load_format="cbor") - - predictions = als.predict_user(user_id=0) - predictions2 = als2.predict_user(user_id=0) - - # Check that the ratings for user 0 are similar to user 1 because they - # share preferences (third movie), thus it is expected that user 0 - # will rate movie 1 similarly to user 1. 
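The ALS cases above collapse the old fit/predict tests into a single save/load round trip. A minimal standalone sketch of that pattern (the /tmp path is illustrative; the suite writes under tests/files/saving):

    import numpy as np
    from scipy.sparse import csr_matrix

    import dislib as ds
    from dislib.recommendation import ALS
    from dislib.utils import save_model, load_model

    # Tiny ratings matrix; users 0 and 1 share a preference for movie 2.
    ratings = csr_matrix(np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]))
    train = ds.array(x=ratings, block_size=(1, 1))

    als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False)
    als.fit(train)

    save_model(als, "/tmp/als.cbor", save_format="cbor")
    als2 = load_model("/tmp/als.cbor", load_format="cbor")

    # The reloaded model must reproduce the original predictions exactly.
    assert np.array_equal(
        als.predict_user(user_id=0),
        als2.predict_user(user_id=0),
        equal_nan=True,
    )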
- self.assertTrue( - 2.75 < predictions[0] < 3.25 - and predictions[1] < 1 - and predictions[2] > 4.5 - ) - self.assertTrue( - 2.75 < predictions2[0] < 3.25 - and predictions2[1] < 1 - and predictions2[2] > 4.5 - ) - self.assertTrue( - np.array_equal(predictions, predictions2, equal_nan=True) - ) - - def main(): unittest.main() diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index 0c694cf8..0a19429f 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -3,14 +3,14 @@ import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score -from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs +from sklearn.datasets import make_classification, make_regression import dislib as ds from dislib.cluster import KMeans from dislib.cluster import GaussianMixture from dislib.classification import CascadeSVM from dislib.classification import RandomForestClassifier +from dislib.regression import RandomForestRegressor from dislib.regression import Lasso from dislib.regression import LinearRegression from dislib.recommendation import ALS @@ -19,56 +19,11 @@ from pycompss.api.api import compss_wait_on -class KMeansSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/kmeans.json" +class JSONSavingTest(unittest.TestCase): - def test_fit_kmeans(self): - """Tests that the fit method returns the expected centers using toy - data. - """ - arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]) - x = ds.array(arr, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666, verbose=False) - km.fit(x) - - expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]]) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - self.assertTrue((km.centers == expected_centers).all()) - self.assertTrue((km2.centers == expected_centers).all()) - - def test_predict_kmeans(self): - """Tests that labels are correctly predicted using toy data.""" - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - arr1 = np.array([p1, p2, p3, p4]) - x = ds.array(arr1, block_size=(2, 2)) - - km = KMeans(n_clusters=2, random_state=666) - km.fit(x) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - p5, p6 = [10, 10], [-10, -10] - - arr2 = np.array([p1, p2, p3, p4, p5, p6]) - x_test = ds.array(arr2, block_size=(2, 2)) - - labels = km.predict(x_test).collect() - labels2 = km2.predict(x_test).collect() - expected_labels = np.array([0, 0, 1, 1, 0, 1]) - - self.assertTrue(np.array_equal(labels, expected_labels)) - self.assertTrue(np.array_equal(labels2, expected_labels)) - - def test_sparse_kmeans(self): - """Tests K-means produces the same results using dense and sparse - data structures.""" + def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/kmeans.json" x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -76,8 +31,8 @@ def test_sparse_kmeans(self): kmeans = KMeans(random_state=170) kmeans.fit(x_sp) - save_model(kmeans, self.filepath, save_format="json") - kmeans2 = load_model(self.filepath, load_format="json") + save_model(kmeans, filepath, save_format="json") + kmeans2 = load_model(filepath, load_format="json") y_sparse = kmeans.predict(x_sp).collect() y_sparse2 = kmeans2.predict(x_sp).collect() @@ -95,120 +50,9 @@ def test_sparse_kmeans(self): self.assertTrue(np.array_equal(y_sparse, y_dense)) 
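A note on the ModuleNotFoundError tests introduced in patches 38 and 39: instead of masking cbor2 in sys.modules, the suite now swaps out the module attribute that dislib.utils.saving holds, which is trivial to undo. A short sketch of the technique (the /tmp path is illustrative):

    import dislib.utils.saving as saving
    from dislib.cluster import KMeans

    km = KMeans(n_clusters=2)
    cbor2_module = saving.cbor2
    saving.cbor2 = None  # simulate an environment without cbor2
    try:
        saving.save_model(km, "/tmp/model.cbor", save_format="cbor")
    except ModuleNotFoundError:
        pass  # expected: the CBOR backend is reported as missing
    finally:
        saving.cbor2 = cbor2_module  # always restore the real module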
self.assertTrue(np.array_equal(y_sparse2, y_dense)) - def test_init_kmeans(self): - # With dense data - x, y = make_blobs(n_samples=1500, random_state=170) - x_filtered = np.vstack( - (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]) - ) - x_train = ds.array(x_filtered, block_size=(300, 2)) - - init = np.random.random((5, 2)) - km = KMeans(n_clusters=5, init=init) - km.fit(x_train) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.array_equal(km.init, init)) - self.assertTrue(np.array_equal(km2.init, init)) - self.assertFalse(np.array_equal(km.centers, init)) - self.assertFalse(np.array_equal(km2.centers, init)) - - # With sparse data - x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2)) - init = csr_matrix(np.random.random((5, 2))) - - km = KMeans(n_clusters=5, init=init) - km.fit(x_sp) - - save_model(km, self.filepath, save_format="json") - km2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.array_equal(km.init.toarray(), init.toarray())) - self.assertTrue(np.array_equal(km2.init.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray())) - self.assertFalse(np.array_equal(km2.centers.toarray(), init.toarray())) - - -class GaussianMixtureSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/gm.json" - - def test_fit(self): - """Tests GaussianMixture.fit()""" - - x = np.array([[1, 2], [2, 1], [-3, -3], [-1, -2], [-2, -1], [3, 3]]) - ds_x = ds.array(x, block_size=(3, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x) - - expected_weights = np.array([0.5, 0.5]) - expected_means = np.array([[-2, -2], [2, 2]]) - expected_cov = np.array( - [ - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - [[0.66671688, 0.33338255], [0.33338255, 0.66671688]], - ] - ) - expected_pc = np.array( - [ - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - [[1.22469875, -0.70714834], [0.0, 1.4141944]], - ] - ) - - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") - - gm.weights_ = compss_wait_on(gm.weights_) - gm.means_ = compss_wait_on(gm.means_) - gm.covariances_ = compss_wait_on(gm.covariances_) - gm.precisions_cholesky_ = compss_wait_on(gm.precisions_cholesky_) - - gm2.weights_ = compss_wait_on(gm2.weights_) - gm2.means_ = compss_wait_on(gm2.means_) - gm2.covariances_ = compss_wait_on(gm2.covariances_) - gm2.precisions_cholesky_ = compss_wait_on(gm2.precisions_cholesky_) - - self.assertTrue((np.allclose(gm.weights_, expected_weights))) - self.assertTrue((np.allclose(gm.means_, expected_means))) - self.assertTrue((np.allclose(gm.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm.precisions_cholesky_, expected_pc))) - - self.assertTrue((np.allclose(gm2.weights_, expected_weights))) - self.assertTrue((np.allclose(gm2.means_, expected_means))) - self.assertTrue((np.allclose(gm2.covariances_, expected_cov))) - self.assertTrue((np.allclose(gm2.precisions_cholesky_, expected_pc))) - - def test_predict(self): - """Tests GaussianMixture.predict()""" - x_train = np.array([[1, 2], [-1, -2], [2, 1], [-2, -1]]) - ds_x_train = ds.array(x_train, block_size=(2, 2)) - - gm = GaussianMixture(n_components=2, random_state=666) - gm.fit(ds_x_train) - - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") - - x_test = np.concatenate((x_train, [[2, 2], [-1, -3]])) - ds_x_test = ds.array(x_test, block_size=(2, 2)) - pred = 
gm.predict(ds_x_test).collect() - pred2 = gm2.predict(ds_x_test).collect() - - self.assertTrue(pred[0] != pred[1]) - self.assertTrue(pred[0] == pred[2] == pred[4]) - self.assertTrue(pred[1] == pred[3] == pred[5]) - - self.assertTrue(pred2[0] != pred2[1]) - self.assertTrue(pred2[0] == pred2[2] == pred2[4]) - self.assertTrue(pred2[1] == pred2[3] == pred2[5]) - - def test_sparse(self): - """Tests GaussianMixture produces the same results using dense and - sparse data structures""" + def test_saving_gm(self): file_ = "tests/files/libsvm/2" + filepath = "tests/files/saving/gm.json" x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -220,8 +64,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_sparse) - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") + save_model(gm, filepath, save_format="json") + gm2 = load_model(filepath, load_format="json") labels_sparse = gm.predict(x_sparse).collect() labels_sparse2 = gm2.predict(x_sparse).collect() @@ -229,8 +73,8 @@ def test_sparse(self): n_components=4, random_state=0, covariance_type=cov_type ) gm.fit(x_dense) - save_model(gm, self.filepath, save_format="json") - gm2 = load_model(self.filepath, load_format="json") + save_model(gm, filepath, save_format="json") + gm2 = load_model(filepath, load_format="json") labels_dense = gm.predict(x_dense).collect() labels_dense2 = gm2.predict(x_dense).collect() @@ -238,137 +82,23 @@ def test_sparse(self): self.assertTrue(np.array_equal(labels_sparse, labels_dense)) self.assertTrue(np.array_equal(labels_sparse2, labels_dense2)) - -class CSVMSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/csvm.json" - - def test_fit_private_params(self): - kernel = "rbf" - c = 2 - gamma = 0.1 - seed = 666 - file_ = "tests/files/libsvm/2" - - x, y = ds.load_svmlight_file(file_, (10, 300), 780, False) - csvm = CascadeSVM(kernel=kernel, c=c, gamma=gamma, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm._clf_params["gamma"], gamma) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["gamma"], gamma) - - kernel, c = "linear", 0.3 - csvm = CascadeSVM(kernel=kernel, c=c, random_state=seed) - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - self.assertEqual(csvm._clf_params["kernel"], kernel) - self.assertEqual(csvm._clf_params["C"], c) - self.assertEqual(csvm2._clf_params["kernel"], kernel) - self.assertEqual(csvm2._clf_params["C"], c) - - # # check for exception when incorrect kernel is passed - # self.assertRaises(AttributeError, CascadeSVM(kernel='fake_kernel')) - - def test_predict(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="linear", - c=2, - gamma=0.1, - check_convergence=False, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, 
save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - - # p5 should belong to class 0, p6 to class 1 - p5, p6 = np.array([1, 1]), np.array([-1, -1]) - - x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2)) - - y_pred = csvm.predict(x_test) - y_pred2 = csvm2.predict(x_test) - - l1, l2, l3, l4, l5, l6 = y_pred.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - l1, l2, l3, l4, l5, l6 = y_pred2.collect() - self.assertTrue(l1 == l2 == l5 == 0) - self.assertTrue(l3 == l4 == l6 == 1) - - def test_score(self): - seed = 666 - - # negative points belong to class 1, positives to 0 - p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1] - - x = ds.array(np.array([p1, p4, p3, p2]), (2, 2)) - y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1)) - - csvm = CascadeSVM( - cascade_arity=3, - max_iter=10, - tol=1e-4, - kernel="rbf", - c=2, - gamma=0.1, - check_convergence=True, - random_state=seed, - verbose=False, - ) - - csvm.fit(x, y) - save_model(csvm, self.filepath, save_format="json") - csvm2 = load_model(self.filepath, load_format="json") - - # points are separable, scoring the training dataset should have 100% - # accuracy - x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2)) - y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1)) - - accuracy = compss_wait_on(csvm.score(x_test, y_test)) - accuracy2 = compss_wait_on(csvm2.score(x_test, y_test)) - - self.assertEqual(accuracy, 1.0) - self.assertEqual(accuracy2, 1.0) - - def test_sparse(self): - """Tests that C-SVM produces the same results with sparse and dense - data""" + def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" + filepath = "tests/files/saving/csvm.json" x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) csvm_sp = CascadeSVM(random_state=seed) csvm_sp.fit(x_sp, y_sp) - save_model(csvm_sp, self.filepath, save_format="json") - csvm_sp2 = load_model(self.filepath, load_format="json") + save_model(csvm_sp, filepath, save_format="json") + csvm_sp2 = load_model(filepath, load_format="json") csvm_d = CascadeSVM(random_state=seed) csvm_d.fit(x_d, y_d) - save_model(csvm_d, self.filepath, save_format="json") - csvm_d2 = load_model(self.filepath, load_format="json") + save_model(csvm_d, filepath, save_format="json") + csvm_d2 = load_model(filepath, load_format="json") sv_d = csvm_d._clf.support_vectors_ sv_sp = csvm_sp._clf.support_vectors_.toarray() @@ -388,100 +118,8 @@ def test_sparse(self): self.assertTrue(np.array_equal(coef_d2, coef_sp2)) self.assertTrue(np.array_equal(coef_d, coef_d2)) - -class RFSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/rf.json" - - def test_make_classification_score(self): - """Tests RandomForestClassifier fit and score with default params.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - - rf = RandomForestClassifier(random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - accuracy = compss_wait_on(rf.score(x_test, y_test)) - accuracy2 = 
compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_predict_and_distr_depth(self): - """Tests RandomForestClassifier fit and predict with a distr_depth.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(distr_depth=2, random_state=0) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict(self): - """Tests RandomForestClassifier predict with sklearn_max.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] - - rf = RandomForestClassifier(random_state=0, sklearn_max=10) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - def test_make_classification_sklearn_max_predict_proba(self): - """Tests RandomForestClassifier predict_proba with sklearn_max.""" + def test_saving_rf_class(self): + filepath = "tests/files/saving/rf_class.json" x, y = make_classification( n_samples=3000, n_features=10, @@ -500,8 +138,8 @@ def test_make_classification_sklearn_max_predict_proba(self): rf = RandomForestClassifier(random_state=0, sklearn_max=10) rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") + save_model(rf, filepath, save_format="json") + rf2 = load_model(filepath, load_format="json") probabilities = rf.predict_proba(x_test).collect() probabilities2 = rf2.predict_proba(x_test).collect() @@ -514,49 +152,18 @@ def test_make_classification_sklearn_max_predict_proba(self): self.assertGreater(accuracy, 0.7) self.assertGreater(accuracy2, 0.7) - def test_make_classification_hard_vote_predict(self): - """Tests RandomForestClassifier predict with hard_vote.""" - x, y = make_classification( - n_samples=3000, - n_features=10, - n_classes=3, - n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, - shuffle=True, - random_state=0, - ) - x_train = ds.array(x[: len(x) // 2], (300, 10)) - y_train = ds.array(y[: len(y) // 2][:, np.newaxis], (300, 1)) - x_test = ds.array(x[len(x) // 2:], (300, 10)) - y_test = y[len(y) // 2:] + def test_saving_rf_regr(self): + filepath = 
"tests/files/saving/rf_regr.json" - rf = RandomForestClassifier( - random_state=0, sklearn_max=10, hard_vote=True - ) - rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") - - y_pred = rf.predict(x_test).collect() - y_pred2 = rf2.predict(x_test).collect() - accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) - accuracy2 = np.count_nonzero(y_pred2 == y_test) / len(y_test) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) + def determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v - def test_make_classification_hard_vote_score_mix(self): - """Tests RandomForestClassifier score with hard_vote, sklearn_max, - distr_depth and max_depth.""" - x, y = make_classification( + x, y = make_regression( n_samples=3000, n_features=10, - n_classes=3, n_informative=4, - n_redundant=2, - n_repeated=1, - n_clusters_per_class=2, shuffle=True, random_state=0, ) @@ -565,29 +172,30 @@ def test_make_classification_hard_vote_score_mix(self): x_test = ds.array(x[len(x) // 2:], (300, 10)) y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1)) - rf = RandomForestClassifier( - random_state=0, - sklearn_max=100, - distr_depth=2, - max_depth=12, - hard_vote=True, - ) + rf = RandomForestRegressor(random_state=0, sklearn_max=10) + rf.fit(x_train, y_train) - save_model(rf, self.filepath, save_format="json") - rf2 = load_model(self.filepath, load_format="json") + save_model(rf, filepath, save_format="json") + rf2 = load_model(filepath, load_format="json") - accuracy = compss_wait_on(rf.score(x_test, y_test)) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) accuracy2 = compss_wait_on(rf2.score(x_test, y_test)) - self.assertGreater(accuracy, 0.7) - self.assertGreater(accuracy2, 0.7) - - -class LassoSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/lasso.json" - - def test_fit_predict(self): - """Tests fit and predicts methods""" - + y_pred = rf.predict(x_test).collect() + y_true = y[len(y) // 2:] + y_pred2 = rf2.predict(x_test).collect() + y_true2 = y[len(y) // 2:] + coef1 = determination_coefficient(y_true, y_pred) + coef2 = determination_coefficient(y_true2, y_pred2) + + self.assertGreater(accuracy1, 0.85) + self.assertGreater(accuracy2, 0.85) + self.assertGreater(coef1, 0.85) + self.assertGreater(coef2, 0.85) + self.assertAlmostEqual(accuracy1, accuracy2) + self.assertAlmostEqual(coef1, coef2) + + def test_saving_lasso(self): + filepath = "tests/files/saving/lasso.json" np.random.seed(42) n_samples, n_features = 50, 100 @@ -609,8 +217,8 @@ def test_fit_predict(self): lasso = Lasso(lmbd=0.1, max_iter=50) lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1))) - save_model(lasso, self.filepath, save_format="json") - lasso2 = load_model(self.filepath, load_format="json") + save_model(lasso, filepath, save_format="json") + lasso2 = load_model(filepath, load_format="json") y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100))) r2_score_lasso = r2_score(y_test, y_pred_lasso.collect()) @@ -620,84 +228,9 @@ def test_fit_predict(self): self.assertAlmostEqual(r2_score_lasso, 0.9481746925431124) self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) + def test_saving_linear(self): + filepath = "tests/files/saving/linear_regression.json" -class LinearRegressionSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/linear_regression.json" - - def test_univariate(self): - 
"""Tests fit() and predict(), univariate.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression() - reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.6)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0.3)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.1)) - self.assertTrue(np.allclose(pred2, 2.1)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9])) - self.assertTrue(np.allclose(pred2, [2.1, 3.3, 3.9])) - - def test_univariate_no_intercept(self): - """Tests fit() and predict(), univariate, fit_intercept=False.""" - x_data = np.array([1, 2, 3, 4, 5]) - y_data = np.array([2, 1, 1, 2, 4.5]) - - bn, bm = 2, 1 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, bm)) - - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") - - self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg2.coef_.collect(), 0.68181818)) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3]) - test_data = ds.array(x=x_test, block_size=(1, 1)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, 2.04545455)) - self.assertTrue(np.allclose(pred2, 2.04545455)) - - # Predict multiple samples - x_test = np.array([3, 5, 6]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091])) - self.assertTrue(np.allclose(pred2, [2.04545455, 3.4090909, 4.0909091])) - - def test_multivariate(self): - """Tests fit() and predict(), multivariate.""" x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -708,8 +241,8 @@ def test_multivariate(self): reg = LinearRegression() reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") + save_model(reg, filepath, save_format="json") + reg2 = load_model(filepath, load_format="json") self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875])) self.assertTrue( @@ -732,46 +265,35 @@ def test_multivariate(self): pred = reg.predict(test_data).collect() self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) - def test_multivariate_no_intercept(self): - """Tests fit() and predict(), multivariate, fit_intercept=False.""" - x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) - y_data = np.array([2, 1, 1, 2, 4.5]) + def test_saving_als(self): + filepath = 
"tests/files/saving/als.json" - bn, bm = 2, 2 - - x = ds.array(x=x_data, block_size=(bn, bm)) - y = ds.array(x=y_data, block_size=(bn, 1)) + data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) + ratings = csr_matrix(data) + train = ds.array(x=ratings, block_size=(1, 1)) + als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) + als.fit(train) + save_model(als, filepath, save_format="json") + als2 = load_model(filepath, load_format="json") - reg = LinearRegression(fit_intercept=False) - reg.fit(x, y) - save_model(reg, self.filepath, save_format="json") - reg2 = load_model(self.filepath, load_format="json") + predictions = als.predict_user(user_id=0) + predictions2 = als2.predict_user(user_id=0) + # Check that the ratings for user 0 are similar to user 1 because they + # share preferences (third movie), thus it is expected that user 0 + # will rate movie 1 similarly to user 1. self.assertTrue( - np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions[0] < 3.25 + and predictions[1] < 1 + and predictions[2] > 4.5 ) self.assertTrue( - np.allclose(reg2.coef_.collect(), [0.48305085, 0.30367232]) + 2.75 < predictions2[0] < 3.25 + and predictions2[1] < 1 + and predictions2[2] > 4.5 ) - self.assertTrue(np.allclose(reg.intercept_.collect(), 0)) - self.assertTrue(np.allclose(reg2.intercept_.collect(), 0)) - - # Predict one sample - x_test = np.array([3, 2]) - test_data = ds.array(x=x_test, block_size=(1, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718])) - self.assertTrue(np.allclose(pred2, [2.05649718])) - - # Predict multiple samples - x_test = np.array([[3, 2], [4, 4], [1, 3]]) - test_data = ds.array(x=x_test, block_size=(bn, bm)) - pred = reg.predict(test_data).collect() - pred2 = reg2.predict(test_data).collect() - self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678])) self.assertTrue( - np.allclose(pred2, [2.05649718, 3.14689266, 1.3940678]) + np.array_equal(predictions, predictions2, equal_nan=True) ) @@ -809,60 +331,6 @@ def load_movielens(train_ratio=0.9): return train_arr, test_arr -class ALSSavingTestJSON(unittest.TestCase): - filepath = "tests/files/saving/als.json" - - def test_fit(self): - train, test = load_movielens() - - als = ALS( - tol=0.01, - random_state=666, - n_f=100, - verbose=False, - check_convergence=True, - ) - - als.fit(train, test) - self.assertTrue(als.converged) - - als.fit(train) - save_model(als, self.filepath, save_format="json") - als2 = load_model(self.filepath, load_format="json") - - self.assertTrue(als.converged) - self.assertTrue(als2.converged) - - def test_predict(self): - data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) - ratings = csr_matrix(data) - train = ds.array(x=ratings, block_size=(1, 1)) - als = ALS(tol=0.01, random_state=666, n_f=5, verbose=False) - als.fit(train) - save_model(als, self.filepath, save_format="json") - als2 = load_model(self.filepath, load_format="json") - - predictions = als.predict_user(user_id=0) - predictions2 = als2.predict_user(user_id=0) - - # Check that the ratings for user 0 are similar to user 1 because they - # share preferences (third movie), thus it is expected that user 0 - # will rate movie 1 similarly to user 1. 
- self.assertTrue( - 2.75 < predictions[0] < 3.25 - and predictions[1] < 1 - and predictions[2] > 4.5 - ) - self.assertTrue( - 2.75 < predictions2[0] < 3.25 - and predictions2[1] < 1 - and predictions2[2] > 4.5 - ) - self.assertTrue( - np.array_equal(predictions, predictions2, equal_nan=True) - ) - - def main(): unittest.main() From 7ae3b9d982eb5023936ac62f9573be3345d3b7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 30 Jul 2021 10:15:16 +0200 Subject: [PATCH 40/46] Added tests for RF dataset --- dislib/commons/rf/data.py | 24 ++-- tests/test_rf_dataset.py | 241 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+), 14 deletions(-) create mode 100644 tests/test_rf_dataset.py diff --git a/dislib/commons/rf/data.py b/dislib/commons/rf/data.py index a762e5b6..af9fb066 100644 --- a/dislib/commons/rf/data.py +++ b/dislib/commons/rf/data.py @@ -46,13 +46,12 @@ def get_n_samples(self): """ if self.n_samples is None: - assert isinstance(self.samples_path, str), ( - "self.n_samples must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) + if not isinstance(self.samples_path, str): + raise TypeError( + "self.n_samples must be set manually if self.samples_path " + "is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_samples @@ -72,13 +71,12 @@ def get_n_features(self): """ if self.n_features is None: - assert isinstance(self.samples_path, str), ( - "self.n_features must be set manually if self.samples_path " - "is a pycompss.runtime.Future object" - ) + if not isinstance(self.samples_path, str): + raise TypeError( + "self.n_features must be set manually if self.samples_path" + " is a pycompss.runtime.Future object" + ) shape = _NpyFile(self.samples_path).get_shape() - if len(shape) != 2: - raise ValueError("Cannot read 2D array from the samples file.") self.n_samples, self.n_features = shape return self.n_features @@ -95,8 +93,6 @@ class n_samples and n_features or if the array is in fortran order. 
        features_npy_file = _NpyFile(self.features_path)
         shape = features_npy_file.get_shape()
         fortran_order = features_npy_file.get_fortran_order()
-        if len(shape) != 2:
-            raise ValueError("Cannot read 2D array from features_file.")
         if (self.get_n_features(), self.get_n_samples()) != shape:
             raise ValueError("Invalid dimensions for the features_file.")
         if fortran_order:
diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py
new file mode 100644
index 00000000..c70664e1
--- /dev/null
+++ b/tests/test_rf_dataset.py
@@ -0,0 +1,241 @@
+import unittest
+
+import os
+import shutil
+from sklearn.datasets import make_classification
+import dislib as ds
+from dislib.commons.rf import data
+from dislib.commons.rf import test_split
+from dislib.data.array import Array
+import numpy as np
+from sys import float_info
+from pycompss.api.api import compss_wait_on
+
+DIRPATH = "tests/files/saving"
+
+
+class RFDatasetTest(unittest.TestCase):
+    def setUp(self) -> None:
+        os.makedirs(DIRPATH, exist_ok=True)
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        shutil.rmtree(DIRPATH)
+        return super().tearDown()
+
+    def test_rf_dataset(self):
+        # Save samples and features
+        x, y = make_classification(
+            n_samples=900,
+            n_features=10,
+            n_classes=3,
+            n_informative=4,
+            random_state=0,
+        )
+        x_ds_1 = ds.array(x, (300, 10))
+        x_ds_2 = ds.array(x[:600], (300, 10))
+        y_ds_1 = ds.array(y[:, np.newaxis], (300, 1))
+        y_ds_2 = ds.array(y[:600][:, np.newaxis], (300, 1))
+        samples_path_1 = os.path.join(DIRPATH, "feats_1")
+        samples_path_2 = os.path.join(DIRPATH, "feats_2")
+        targets_path_1 = os.path.join(DIRPATH, "targets_1")
+        targets_path_2 = os.path.join(DIRPATH, "targets_2")
+        features_path_f = os.path.join(DIRPATH, "features_f")
+        save_samples(x_ds_1, samples_path_1, False)
+        save_samples(x_ds_2, samples_path_2, False)
+        save_targets(y_ds_1, targets_path_1)
+        save_targets(y_ds_2, targets_path_2)
+        save_features(x_ds_2, features_path_f, True)
+
+        # Regression and classification datasets
+        rf_regr = data.RfRegressorDataset(samples_path_1, targets_path_1)
+        rf_class = data.RfClassifierDataset(samples_path_1, targets_path_1)
+
+        # Test get number of samples and features
+        self.assertEqual(rf_regr.get_n_samples(), 900)
+        self.assertEqual(rf_class.get_n_samples(), 900)
+        self.assertEqual(rf_regr.get_n_features(), 10)
+        self.assertEqual(rf_class.get_n_features(), 10)
+
+        # Test get y targets
+        y_regr = compss_wait_on(rf_regr.get_y_targets())
+        y_class = compss_wait_on(rf_class.get_y_targets())
+        self.assertTrue(np.all(y_regr == y_ds_1.collect()))
+        self.assertTrue(np.all(y_class == y_ds_1.collect()))
+
+        # Test get number of classes and classes
+        n_class = compss_wait_on(rf_regr.get_n_classes())
+        classes = compss_wait_on(rf_regr.get_classes())
+        self.assertTrue(n_class is None)
+        self.assertTrue(classes is None)
+
+        rf_class.n_classes = None
+        n_class = compss_wait_on(rf_class.get_n_classes())
+        rf_class.y_categories = None
+        classes = compss_wait_on(rf_class.get_classes())
+        self.assertEqual(n_class, 3)
+        self.assertTrue(np.all(classes == [0, 1, 2]))
+
+        # Sample and feature paths must be str
+        rf_dataset = data.RfBaseDataset(None, None)
+        with self.assertRaises(TypeError):
+            rf_dataset.get_n_samples()
+        with self.assertRaises(TypeError):
+            rf_dataset.get_n_features()
+
+        # Task must be classification or regression
+        with self.assertRaises(ValueError):
+            rf_dataset = data.transform_to_rf_dataset(x_ds_1, y_ds_1, "aaa")
+
+        # Validate dimension
+        rf_dataset = data.RfBaseDataset(
+            samples_path_1, targets_path_1,
features_path_f + ) + rf_dataset.samples_path = samples_path_2 + with self.assertRaises(ValueError): + rf_dataset.validate_features_file() + + # Validate Fortran order + rf_dataset = data.RfBaseDataset( + samples_path_1, targets_path_1, features_path_f + ) + with self.assertRaises(ValueError): + rf_dataset.validate_features_file() + + # Dataset creation + rf_regr = data.transform_to_rf_dataset( + x_ds_1, y_ds_1, "regression" + ) + rf_class = data.transform_to_rf_dataset( + x_ds_1, y_ds_1, "classification" + ) + self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900) + self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10) + self.assertEquals(compss_wait_on(rf_class.get_n_samples()), 900) + self.assertEquals(compss_wait_on(rf_class.get_n_features()), 10) + + # Npy files + file = data._NpyFile(features_path_f) + file.shape = None + self.assertEqual(file.get_shape(), (10, 600)) + file.fortran_order = None + self.assertTrue(file.get_fortran_order()) + file.dtype = None + self.assertEqual(file.get_dtype().name, "float32") + + file = data._NpyFile(samples_path_2) + file.shape = None + self.assertEqual(file.get_shape(), (600, 10)) + file.fortran_order = None + self.assertFalse(file.get_fortran_order()) + file.dtype = None + self.assertEqual(file.get_dtype().name, "float32") + + # Test returns for empty size + score, value = test_split.test_split(None, np.array([]), None, None) + self.assertEqual(score, float_info.max) + self.assertEqual(value, np.float64(np.inf)) + + +def _fill_samples_file( + samples_path, row_blocks, start_idx, fortran_order +): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype( + dtype="float32", casting="same_kind" + ) + samples = np.lib.format.open_memmap( + samples_path, mode="r+", fortran_order=fortran_order + ) + samples[start_idx: start_idx + rows_samples.shape[0]] = ( + rows_samples + ) + + +def _fill_features_file( + samples_path, row_blocks, start_idx, fortran_order +): + rows_samples = Array._merge_blocks(row_blocks).T + rows_samples = rows_samples.astype( + dtype="float32", casting="same_kind" + ) + samples = np.lib.format.open_memmap( + samples_path, mode="r+", fortran_order=fortran_order + ) + samples[start_idx: start_idx + rows_samples.shape[1]] = ( + rows_samples + ) + + +def _fill_targets_file(targets_path, row_blocks): + rows_targets = Array._merge_blocks(row_blocks) + with open(targets_path, "at") as f: + np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8") + + +def save_samples(x, samples_path, fortran_order): + n_samples = x.shape[0] + n_features = x.shape[1] + + open(samples_path, 'w').close() + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + fortran_order=fortran_order, + shape=(int(n_samples), int(n_features)), + ) + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_samples_file( + samples_path, top_row._blocks, start_idx, fortran_order + ) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: + _fill_samples_file( + samples_path, x_row._blocks, start_idx, fortran_order + ) + start_idx += x._reg_shape[0] + + +def save_targets(y, targets_path): + open(targets_path, 'w').close() + for y_row in y._iterator(axis=0): + _fill_targets_file(targets_path, y_row._blocks) + + +def save_features(x, features_path, fortran_order): + n_samples = x.shape[0] + n_features = x.shape[1] + + if features_path is not None: + np.lib.format.open_memmap( + features_path, + mode="w+", + dtype="float32", + 
fortran_order=fortran_order, + shape=(int(n_features), int(n_samples)), + ) + start_idx = 0 + col_blocks_iterator = x._iterator(axis=1) + left_col = next(col_blocks_iterator) + _fill_features_file( + features_path, left_col._blocks, + start_idx, fortran_order + ) + start_idx += x._top_left_shape[1] + for x_row in col_blocks_iterator: + _fill_features_file( + features_path, x_row._blocks, + start_idx, fortran_order + ) + start_idx += x._reg_shape[1] + + +def main(): + unittest.main() + + +if __name__ == '__main__': + main() From fe86b92eea4340f770baff13debb587d58d6173b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 30 Jul 2021 10:17:45 +0200 Subject: [PATCH 41/46] Added setup and teardown for saving tests. --- .gitignore | 3 --- tests/files/saving/saving.txt | 1 - tests/test_saving.py | 15 ++++++++++++--- tests/test_saving_cbor.py | 28 +++++++++++++++++++--------- tests/test_saving_json.py | 28 +++++++++++++++++++--------- 5 files changed, 50 insertions(+), 25 deletions(-) delete mode 100644 tests/files/saving/saving.txt diff --git a/.gitignore b/.gitignore index ad8ef5a4..4b75fb4c 100644 --- a/.gitignore +++ b/.gitignore @@ -112,9 +112,6 @@ target/ *compss*.out *compss*.err -# Saving -tests/files/saving/* -!tests/files/saving/*.txt # ========== C & C++ ignores ================= # Prerequisites diff --git a/tests/files/saving/saving.txt b/tests/files/saving/saving.txt deleted file mode 100644 index d7d8541b..00000000 --- a/tests/files/saving/saving.txt +++ /dev/null @@ -1 +0,0 @@ -Directory where the models generated by the tests regarding saving functionalities should be located. \ No newline at end of file diff --git a/tests/test_saving.py b/tests/test_saving.py index 7545e9ad..523ed5cc 100644 --- a/tests/test_saving.py +++ b/tests/test_saving.py @@ -1,16 +1,25 @@ import unittest import json - +import os +import shutil from dislib.cluster import KMeans from dislib.cluster import DBSCAN import dislib.utils.saving as saving +DIRPATH = "tests/files/saving" + class SavingTest(unittest.TestCase): + def setUp(self) -> None: + os.makedirs(DIRPATH, exist_ok=True) + return super().setUp() + + def tearDown(self) -> None: + shutil.rmtree(DIRPATH) + return super().tearDown() def test_errors(self): - """Test that errors are raised""" - filepath = "tests/files/saving/model.json" + filepath = os.path.join(DIRPATH, "model.json") # Models km = KMeans(n_clusters=2) diff --git a/tests/test_saving_cbor.py b/tests/test_saving_cbor.py index a5c2f23c..5a0ef438 100644 --- a/tests/test_saving_cbor.py +++ b/tests/test_saving_cbor.py @@ -1,5 +1,6 @@ import unittest - +import os +import shutil import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score @@ -18,12 +19,21 @@ from pycompss.api.api import compss_wait_on +DIRPATH = "tests/files/saving" + class CBORSavingTest(unittest.TestCase): + def setUp(self) -> None: + os.makedirs(DIRPATH, exist_ok=True) + return super().setUp() + + def tearDown(self) -> None: + shutil.rmtree(DIRPATH) + return super().tearDown() def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/kmeans.cbor" + filepath = os.path.join(DIRPATH, "kmeans.cbor") x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -52,7 +62,7 @@ def test_saving_kmeans(self): def test_saving_gm(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/gm.cbor" + filepath = os.path.join(DIRPATH, "gm.cbor") x_sparse, _ = 
ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -85,7 +95,7 @@ def test_saving_gm(self): def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" - filepath = "tests/files/saving/csvm.cbor" + filepath = os.path.join(DIRPATH, "csvm.cbor") x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) @@ -119,7 +129,7 @@ def test_saving_csvm(self): self.assertTrue(np.array_equal(coef_d, coef_d2)) def test_saving_rf_class(self): - filepath = "tests/files/saving/rf_class.cbor" + filepath = os.path.join(DIRPATH, "rf_class.cbor") x, y = make_classification( n_samples=3000, n_features=10, @@ -153,7 +163,7 @@ def test_saving_rf_class(self): self.assertGreater(accuracy2, 0.7) def test_saving_rf_regr(self): - filepath = "tests/files/saving/rf_regr.cbor" + filepath = os.path.join(DIRPATH, "rf_regr.cbor") def determination_coefficient(y_true, y_pred): u = np.sum(np.square(y_true - y_pred)) @@ -195,7 +205,7 @@ def determination_coefficient(y_true, y_pred): self.assertAlmostEqual(coef1, coef2) def test_saving_lasso(self): - filepath = "tests/files/saving/lasso.cbor" + filepath = os.path.join(DIRPATH, "lasso.cbor") np.random.seed(42) n_samples, n_features = 50, 100 @@ -229,7 +239,7 @@ def test_saving_lasso(self): self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) def test_saving_linear(self): - filepath = "tests/files/saving/linear_regression.cbor" + filepath = os.path.join(DIRPATH, "linear_regression.cbor") x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -266,7 +276,7 @@ def test_saving_linear(self): self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) def test_saving_als(self): - filepath = "tests/files/saving/als.cbor" + filepath = os.path.join(DIRPATH, "als.cbor") data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) ratings = csr_matrix(data) diff --git a/tests/test_saving_json.py b/tests/test_saving_json.py index 0a19429f..783f9f31 100644 --- a/tests/test_saving_json.py +++ b/tests/test_saving_json.py @@ -1,5 +1,6 @@ import unittest - +import os +import shutil import numpy as np from scipy.sparse import csr_matrix from sklearn.metrics import r2_score @@ -18,12 +19,21 @@ from pycompss.api.api import compss_wait_on +DIRPATH = "tests/files/saving" + class JSONSavingTest(unittest.TestCase): + def setUp(self) -> None: + os.makedirs(DIRPATH, exist_ok=True) + return super().setUp() + + def tearDown(self) -> None: + shutil.rmtree(DIRPATH) + return super().tearDown() def test_saving_kmeans(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/kmeans.json" + filepath = os.path.join(DIRPATH, "kmeans.json") x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True) x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False) @@ -52,7 +62,7 @@ def test_saving_kmeans(self): def test_saving_gm(self): file_ = "tests/files/libsvm/2" - filepath = "tests/files/saving/gm.json" + filepath = os.path.join(DIRPATH, "gm.json") x_sparse, _ = ds.load_svmlight_file(file_, (10, 780), 780, True) x_dense, _ = ds.load_svmlight_file(file_, (10, 780), 780, False) @@ -85,7 +95,7 @@ def test_saving_gm(self): def test_saving_csvm(self): seed = 666 train = "tests/files/libsvm/3" - filepath = "tests/files/saving/csvm.json" + filepath = os.path.join(DIRPATH, "csvm.json") x_sp, y_sp = ds.load_svmlight_file(train, (10, 300), 780, True) x_d, y_d = ds.load_svmlight_file(train, (10, 300), 780, False) @@ -119,7 
+129,7 @@ def test_saving_csvm(self): self.assertTrue(np.array_equal(coef_d, coef_d2)) def test_saving_rf_class(self): - filepath = "tests/files/saving/rf_class.json" + filepath = os.path.join(DIRPATH, "rf_class.json") x, y = make_classification( n_samples=3000, n_features=10, @@ -153,7 +163,7 @@ def test_saving_rf_class(self): self.assertGreater(accuracy2, 0.7) def test_saving_rf_regr(self): - filepath = "tests/files/saving/rf_regr.json" + filepath = os.path.join(DIRPATH, "rf_regr.json") def determination_coefficient(y_true, y_pred): u = np.sum(np.square(y_true - y_pred)) @@ -195,7 +205,7 @@ def determination_coefficient(y_true, y_pred): self.assertAlmostEqual(coef1, coef2) def test_saving_lasso(self): - filepath = "tests/files/saving/lasso.json" + filepath = os.path.join(DIRPATH, "lasso.json") np.random.seed(42) n_samples, n_features = 50, 100 @@ -229,7 +239,7 @@ def test_saving_lasso(self): self.assertAlmostEqual(r2_score_lasso2, 0.9481746925431124) def test_saving_linear(self): - filepath = "tests/files/saving/linear_regression.json" + filepath = os.path.join(DIRPATH, "linear_regression.json") x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]]) y_data = np.array([2, 1, 1, 2, 4.5]) @@ -266,7 +276,7 @@ def test_saving_linear(self): self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125])) def test_saving_als(self): - filepath = "tests/files/saving/als.json" + filepath = os.path.join(DIRPATH, "als.json") data = np.array([[0, 0, 5], [3, 0, 5], [3, 1, 2]]) ratings = csr_matrix(data) From da904273b08a2f222c9cf5e4a672516ce5263d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 2 Aug 2021 11:17:25 +0200 Subject: [PATCH 42/46] Updated user guide with RF Regressor --- docs/source/user-guide.rst | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index 8e91e7e2..3fb02dc1 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -294,7 +294,7 @@ scalability of the estimator is limited by the reduction phase of the cascade. Random forest classifier ........................ -:class:`RandomForestClassifier ` +:class:`RandomForestClassifier ` is a classifier that uses an ensemble of decision trees and aggregates their predictions. The process of building each decision tree includes some randomization in order to make them different. The accuracy of the joint @@ -565,6 +565,76 @@ shape ``(n_features, n_features)`` and process it as a single block. this with a distributed implementation of a method for solving a system of linear equations.) + +Random forest regressor +........................ + +:class:`RandomForestRegressor ` +is a regressor that uses an ensemble of decision trees and aggregates their +predictions. The process of building each decision tree includes some +randomization in order to make them different. The accuracy of the joint +prediction can be greater than that of individual decision trees. One advantage +of Random Forests is that you cannot overfit by increasing the number of +trees. Several variations of random forests have been proposed and implemented. +A fundamental paper that has been cited extensively is [Bre01]_, which +describes a method for classification problems that can be adapted to regression +problems: + + For building each tree, the original sample set is replaced by a set of the + same size, obtained by drawing with replacement (this method is called + bootstrap aggregating or bagging). 
At each tree node, a certain number of
+    random features is selected (random feature selection). The sample set
+    is split in two according to the values of these features, and a
+    metric called 'Mean Squared Error' is computed for every split. The MSE
+    measures the squared residuals with respect to the average value of the
+    target variables, which can be interpreted as a measure of the sample
+    variance. The split with the lowest MSE value is selected, and
+    the subsamples are propagated to the child nodes. The trees grown are
+    not pruned.
+
+Ensemble estimators can be implemented in an embarrassingly parallel pattern.
+You can do this with scikit-learn's RandomForestClassifier using a
+``joblib.parallel_backend`` and setting the ``n_jobs`` parameter. However, you
+need to be able to load your data into memory for each processor or to use
+memory-mapped arrays, which can be tricky, especially with a distributed backend.
+
+In our implementation, the samples as a whole are written into a binary file
+and accessed using memory maps (the COMPSs runtime manages the transfers to
+other nodes when needed). We used this approach because the performance penalty
+of using distributed data was too large. Storing the samples file and saving
+the decision trees places a heavy load on the disk storage of all nodes. If
+your execution fails because you reach your disk storage limits, you can try
+reducing the number of trees or reducing their size by setting the
+``max_depth`` parameter. If this is not enough, you may consider reducing
+the number of samples.
+
+In order to get further parallelism, each decision tree is not necessarily
+built in a single task: there are tasks for building just a subtree, just a
+node, or even just part of a node. You can use the ``distr_depth`` parameter to
+control the number of tasks used for each tree. However, be aware that the
+number of tasks grows exponentially when you increase ``distr_depth``, and that
+the task loads become very unbalanced. The fitted decision trees are not
+synchronized, so prediction is likewise distributed.
+
+The results of the RandomForestRegressor can vary between executions due to
+its random nature. To get reproducible results, a RandomState (pseudorandom
+number generator) or an int can be provided to the ``random_state``
+parameter of the constructor. This works by passing a seed (generated by the
+master's RandomState) to each task that uses randomness and creating a new
+RandomState inside the task.
+
+.. topic:: References:
+
+    .. [Chan79] `Updating Formulae and a Pairwise Algorithm for Computing Sample Variances.
+      `_
+      T. F. Chan, G. H. Golub, R. J. LeVeque, 1979
+      Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
+    .. [Tor99] `Inductive Learning of Tree-based Regression Models
+      `_
+      L. Torgo, 1999
+      Chapter 3, PhD Thesis, Faculdade de Ciências da Universidade do Porto
+
+
 Decomposition
 -------------

From b79756f58d7595b104007815392f039723b67418 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?=
Date: Mon, 2 Aug 2021 11:55:09 +0200
Subject: [PATCH 43/46] Resolved conflicts with RF score

---
 dislib/classification/csvm/base.py |   2 +-
 dislib/classification/rf/forest.py | 310 -----------------------------
 dislib/commons/rf/forest.py        |   7 +-
 3 files changed, 6 insertions(+), 313 deletions(-)
 delete mode 100644 dislib/classification/rf/forest.py

diff --git a/dislib/classification/csvm/base.py b/dislib/classification/csvm/base.py
index 8a052e0d..16e30741 100644
--- a/dislib/classification/csvm/base.py
+++ b/dislib/classification/csvm/base.py
@@ -192,7 +192,7 @@ def score(self, x, y, collect=False):
             Test samples.
         y : ds-array, shape=(n_samples, 1)
             True labels for x.
-        collect : bool
+        collect : bool, optional (default=False)
             When True, a synchronized result is returned.
 
         Returns
diff --git a/dislib/classification/rf/forest.py b/dislib/classification/rf/forest.py
deleted file mode 100644
index dd78b9e0..00000000
--- a/dislib/classification/rf/forest.py
+++ /dev/null
@@ -1,310 +0,0 @@
-import math
-from collections import Counter
-
-import numpy as np
-from pycompss.api.api import compss_wait_on
-from pycompss.api.parameter import Type, COLLECTION_IN, Depth
-from pycompss.api.task import task
-from sklearn.base import BaseEstimator
-from sklearn.utils import check_random_state
-
-from dislib.classification.rf.decision_tree import DecisionTreeClassifier
-from dislib.data.array import Array
-from dislib.utils.base import _paired_partition
-from dislib.classification.rf._data import transform_to_rf_dataset
-
-
-class RandomForestClassifier(BaseEstimator):
-    """A distributed random forest classifier.
-
-    Parameters
-    ----------
-    n_estimators : int, optional (default=10)
-        Number of trees to fit.
-    try_features : int, str or None, optional (default='sqrt')
-        The number of features to consider when looking for the best split:
-
-        - If "sqrt", then `try_features=sqrt(n_features)`.
-        - If "third", then `try_features=n_features // 3`.
-        - If None, then `try_features=n_features`.
-
-        Note: the search for a split does not stop until at least one
-        valid partition of the node samples is found, even if it requires
-        to effectively inspect more than ``try_features`` features.
-    max_depth : int or np.inf, optional (default=np.inf)
-        The maximum depth of the tree. If np.inf, then nodes are expanded
-        until all leaves are pure.
-    distr_depth : int or str, optional (default='auto')
-        Number of levels of the tree in which the nodes are split in a
-        distributed way.
-    sklearn_max: int or float, optional (default=1e8)
-        Maximum size (len(subsample)*n_features) of the arrays passed to
-        sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees
-        (subsamples) of our DecisionTreeClassifier. sklearn fit() is used
-        because it's faster, but requires loading the data to memory, which can
-        cause memory problems for large datasets. This parameter can be
-        adjusted to fit the hardware capabilities.
-    hard_vote : bool, optional (default=False)
-        If True, it uses majority voting over the predict() result of the
-        decision tree predictions. If False, it takes the class with the higher
-        probability given by predict_proba(), which is an average of the
-        probabilities given by the decision trees.
- random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - classes : None or ndarray - Array of distinct classes, set at fit(). - trees : list of DecisionTreeClassifier - List of the tree classifiers of this forest, populated at fit(). - """ - - def __init__(self, - n_estimators=10, - try_features='sqrt', - max_depth=np.inf, - distr_depth='auto', - sklearn_max=1e8, - hard_vote=False, - random_state=None): - self.n_estimators = n_estimators - self.try_features = try_features - self.max_depth = max_depth - self.distr_depth = distr_depth - self.sklearn_max = sklearn_max - self.hard_vote = hard_vote - self.random_state = random_state - - def fit(self, x, y): - """Fits the RandomForestClassifier. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The training input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. - y : ds-array, shape=(n_samples, 1) - The target values. - - Returns - ------- - self : RandomForestClassifier - - """ - self.classes = None - self.trees = [] - - dataset = transform_to_rf_dataset(x, y) - - n_features = dataset.get_n_features() - try_features = _resolve_try_features(self.try_features, n_features) - random_state = check_random_state(self.random_state) - - self.classes = dataset.get_classes() - - if self.distr_depth == 'auto': - dataset.n_samples = compss_wait_on(dataset.get_n_samples()) - distr_depth = max(0, int(math.log10(dataset.n_samples)) - 4) - distr_depth = min(distr_depth, self.max_depth) - else: - distr_depth = self.distr_depth - - for i in range(self.n_estimators): - tree = DecisionTreeClassifier(try_features, self.max_depth, - distr_depth, self.sklearn_max, - bootstrap=True, - random_state=random_state) - self.trees.append(tree) - - for tree in self.trees: - tree.fit(dataset) - - return self - - def predict_proba(self, x): - """Predicts class probabilities using a fitted forest. - - The probabilities are obtained as an average of the probabilities of - each decision tree. - - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - probabilities : ds-array, shape=(n_samples, n_classes) - Predicted probabilities for the samples to belong to each class. - The columns of the array correspond to the classes given at - self.classes. - - """ - assert self.trees is not None, 'The random forest is not fitted.' - prob_blocks = [] - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - prob_blocks.append([_join_predictions(*tree_predictions)]) - self.classes = compss_wait_on(self.classes) - n_classes = len(self.classes) - - probabilities = Array(blocks=prob_blocks, - top_left_shape=(x._top_left_shape[0], n_classes), - reg_shape=(x._reg_shape[0], n_classes), - shape=(x.shape[0], n_classes), sparse=False) - return probabilities - - def predict(self, x): - """Predicts classes using a fitted forest. - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - y_pred : ds-array, shape=(n_samples, 1) - Predicted class labels for x. - - """ - assert self.trees is not None, 'The random forest is not fitted.' 
- pred_blocks = [] - if self.hard_vote: - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - pred_blocks.append(_hard_vote(self.classes, *tree_predictions)) - else: - for x_row in x._iterator(axis=0): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - pred_blocks.append(_soft_vote(self.classes, *tree_predictions)) - - y_pred = Array(blocks=[pred_blocks], - top_left_shape=(x._top_left_shape[0], 1), - reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1), - sparse=False) - - return y_pred - - def score(self, x, y, collect=False): - """Accuracy classification score. - - Returns the mean accuracy on the given test data. - - - Parameters - ---------- - x : ds-array, shape=(n_samples, n_features) - The training input samples. - y : ds-array, shape (n_samples, 1) - The true labels. - collect : bool - When True, a synchronized result is returned. - - Returns - ------- - score : float (as future object) - Fraction of correctly classified samples. - - """ - assert self.trees is not None, 'The random forest is not fitted.' - partial_scores = [] - if self.hard_vote: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict(x_row)) - subset_score = _hard_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) - else: - for x_row, y_row in _paired_partition(x, y): - tree_predictions = [] - for tree in self.trees: - tree_predictions.append(tree.predict_proba(x_row)) - subset_score = _soft_vote_score(y_row._blocks, self.classes, - *tree_predictions) - partial_scores.append(subset_score) - - score = _merge_scores(*partial_scores) - - return compss_wait_on(score) if collect else score - - -@task(returns=1) -def _resolve_try_features(try_features, n_features): - if try_features is None: - return n_features - elif try_features == 'sqrt': - return int(math.sqrt(n_features)) - elif try_features == 'third': - return max(1, n_features // 3) - else: - return int(try_features) - - -@task(returns=1) -def _join_predictions(*predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - labels = aggregate / len(predictions) - return labels - - -@task(returns=1) -def _soft_vote(classes, *predictions): - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - labels = classes[np.argmax(aggregate, axis=1)] - return labels - - -@task(returns=1) -def _hard_vote(classes, *predictions): - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - labels = classes[mode] - return labels - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _soft_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - aggregate = predictions[0] - for p in predictions[1:]: - aggregate += p - predicted_labels = classes[np.argmax(aggregate, axis=1)] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) - - -@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, returns=1) -def _hard_vote_score(y_blocks, classes, *predictions): - real_labels = Array._merge_blocks(y_blocks).flatten() - mode = np.empty((len(predictions[0]),), dtype=int) - for sample_i, votes in enumerate(zip(*predictions)): - mode[sample_i] = Counter(votes).most_common(1)[0][0] - 
predicted_labels = classes[mode] - correct = np.count_nonzero(predicted_labels == real_labels) - return correct, len(real_labels) - - -@task(returns=1) -def _merge_scores(*partial_scores): - correct = sum(subset_score[0] for subset_score in partial_scores) - total = sum(subset_score[1] for subset_score in partial_scores) - return correct / total diff --git a/dislib/commons/rf/forest.py b/dislib/commons/rf/forest.py index bf121124..be2e668c 100644 --- a/dislib/commons/rf/forest.py +++ b/dislib/commons/rf/forest.py @@ -153,7 +153,7 @@ def predict(self, x): return y_pred - def score(self, x, y): + def score(self, x, y, collect=False): """Accuracy classification score. For classification returns the mean accuracy on the given test data. @@ -175,6 +175,9 @@ def score(self, x, y): The training input samples. y : ds-array, shape (n_samples, 1) The true labels. + collect : bool, optional (default=False) + When True, a synchronized result is returned. + Returns ------- @@ -218,7 +221,7 @@ def score(self, x, y): partial_scores.append(subset_score) score = _merge_regression_scores(*partial_scores) - return score + return compss_wait_on(score) if collect else score class RandomForestClassifier(BaseRandomForest): From 0445030815de04dfc086ed6227bdbc962acd21b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 6 Aug 2021 19:12:15 +0200 Subject: [PATCH 44/46] Added creation of features file --- dislib/commons/rf/data.py | 67 ++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/dislib/commons/rf/data.py b/dislib/commons/rf/data.py index af9fb066..8e4bf546 100644 --- a/dislib/commons/rf/data.py +++ b/dislib/commons/rf/data.py @@ -253,7 +253,7 @@ def get_classes(self): def transform_to_rf_dataset( - x: Array, y: Array, task: str + x: Array, y: Array, task: str, features_file=False ) -> RfRegressorDataset or RfClassifierDataset: """Creates a RfDataset object from samples x and targets y. 
@@ -277,6 +277,7 @@ def transform_to_rf_dataset( n_samples = x.shape[0] n_features = x.shape[1] + # Samples samples_file = tempfile.NamedTemporaryFile( mode="wb", prefix="tmp_rf_samples_", delete=False ) @@ -293,6 +294,7 @@ def transform_to_rf_dataset( _fill_samples_file(samples_path, x_row._blocks, start_idx) start_idx += x._reg_shape[0] + # Targets targets_file = tempfile.NamedTemporaryFile( mode="w", prefix="tmp_rf_targets_", delete=False ) @@ -301,10 +303,34 @@ def transform_to_rf_dataset( for y_row in y._iterator(axis=0): _fill_targets_file(targets_path, y_row._blocks) + # Features + if features_file: + features_file = tempfile.NamedTemporaryFile( + mode="wb", prefix="tmp_rf_features_", delete=False + ) + features_path = features_file.name + features_file.close() + _allocate_features_file(features_path, n_samples, n_features) + + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_features_file(features_path, top_row._blocks, start_idx) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: + _fill_features_file(features_path, x_row._blocks, start_idx) + start_idx += x._reg_shape[0] + else: + features_path = None + if task == "classification": - rf_dataset = RfClassifierDataset(samples_path, targets_path) + rf_dataset = RfClassifierDataset( + samples_path, targets_path, features_path + ) elif task == "regression": - rf_dataset = RfRegressorDataset(samples_path, targets_path) + rf_dataset = RfRegressorDataset( + samples_path, targets_path, features_path + ) else: raise ValueError("task must be either classification or regression.") rf_dataset.n_samples = n_samples @@ -361,21 +387,6 @@ def _get_values(targets_path): return y.astype(np.float64) -@task(returns=1) -def _get_samples_shape(subset): - return subset.samples.shape - - -@task(returns=3) -def _merge_shapes(*samples_shapes): - n_samples = 0 - n_features = samples_shapes[0][1] - for shape in samples_shapes: - n_samples += shape[0] - assert shape[1] == n_features, "Subsamples with different n_features." 
- return samples_shapes, n_samples, n_features - - @task(samples_path=FILE_INOUT) def _allocate_samples_file(samples_path, n_samples, n_features): np.lib.format.open_memmap( @@ -386,12 +397,30 @@ def _allocate_samples_file(samples_path, n_samples, n_features): ) +@task(samples_path=FILE_INOUT) +def _allocate_features_file(samples_path, n_samples, n_features): + np.lib.format.open_memmap( + samples_path, + mode="w+", + dtype="float32", + shape=(int(n_features), int(n_samples)), + ) + + @task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) def _fill_samples_file(samples_path, row_blocks, start_idx): rows_samples = Array._merge_blocks(row_blocks) rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap(samples_path, mode="r+") - samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples + + +@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) +def _fill_features_file(samples_path, row_blocks, start_idx): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") + samples = np.lib.format.open_memmap(samples_path, mode="r+") + samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T @task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) From 579f9935ff5b6ae85a8ee7329e9f2620995e2d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Fri, 6 Aug 2021 19:37:23 +0200 Subject: [PATCH 45/46] Added tests for decision tree --- tests/test_decision_tree.py | 171 ++++++++++++++++++++++++++++++++++++ tests/test_rf_dataset.py | 77 +++++++--------- 2 files changed, 201 insertions(+), 47 deletions(-) create mode 100644 tests/test_decision_tree.py diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py new file mode 100644 index 00000000..e935dc56 --- /dev/null +++ b/tests/test_decision_tree.py @@ -0,0 +1,171 @@ +import unittest + +import numpy as np +from pycompss.api.api import compss_wait_on + +import dislib as ds +import dislib.commons.rf.decision_tree as dt +import dislib.commons.rf.data as data + + +class DecisionTreeTest(unittest.TestCase): + def test_decision_tree(self): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + x2 = np.array([[0.4, -0.3], [0.4, 0.3], [-0.4, -0.3]]) + y1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + y2 = np.array([0, 1, 2]) + + x1_ds = ds.array(x1, (3, 2)) + x2_ds = ds.array(x2, (3, 2)) + y1_ds = ds.array(y1[:, np.newaxis], (3, 1)) + + data1 = data.transform_to_rf_dataset( + x1_ds, y1_ds, "classification", features_file=True + ) + + # Model + try_features = 2 + max_depth = np.inf + distr_depth = 2 + sklearn_max = 1e8 + bootstrap = True + seed = 0 + random_state = np.random.RandomState(seed) + n_samples, n_features = x1.shape + n_classes = np.bincount(y1).shape[0] + features_mmap = x1.T + + # Test bootstrap + sample1, y_s1 = compss_wait_on( + dt._sample_selection(n_samples, y1, True, seed) + ) + sample2, y_s2 = compss_wait_on( + dt._sample_selection(n_samples, y1, False, seed) + ) + self.assertTrue( + np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7])) + ) + self.assertTrue( + np.array_equal(sample2, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])) + ) + self.assertTrue( + np.array_equal(y_s1, np.array([0, 0, 1, 1, 1, 1, 1, 1, 2])) + ) + 
self.assertTrue( + np.array_equal(y_s2, np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])) + ) + + # Assert split wrapper + sample, y_s = sample2, y_s2 + with self.assertRaises(ValueError): + dt._split_node_wrapper( + sample, + n_features, + y_s, + n_classes, + try_features, + random_state, + samples_file=None, + features_file=None, + ) + + split = dt._split_node_wrapper( + sample, + n_features, + y_s, + n_classes, + try_features, + random_state, + samples_file=data1.samples_path, + features_file=data1.features_path, + ) + split = compss_wait_on(split) + node_info, left_group, y_l, right_group, y_r = split + self.assertTrue(node_info.index in (0, 1)) + if node_info.index == 0: + self.assertTrue(np.array_equal(left_group, np.array([6, 7, 8]))) + self.assertTrue(np.array_equal(y_l, np.array([2, 2, 2]))) + self.assertTrue( + np.array_equal(right_group, np.array([0, 1, 2, 3, 4, 5])) + ) + self.assertTrue(np.array_equal(y_r, np.array([0, 0, 0, 1, 1, 1]))) + self.assertAlmostEqual(node_info.value, 0.0) + split_l = dt._compute_split( + left_group, + n_features, + y_l, + n_classes, + try_features, + features_mmap, + random_state, + ) + node_info, left_group, y_l, right_group, y_r = split_l + self.assertTrue(np.array_equal(left_group, np.array([6, 7, 8]))) + self.assertTrue(np.array_equal(y_l, np.array([2, 2, 2]))) + self.assertTrue(np.array_equal(right_group, np.array([]))) + self.assertTrue(np.array_equal(y_r, np.array([]))) + self.assertTrue( + np.array_equal(node_info.frequencies, np.array([0, 0, 3])) + ) + self.assertEqual(node_info.size, 3) + self.assertEqual(node_info.target, 2) + elif node_info.index == 1: + self.assertTrue( + np.array_equal(left_group, np.array([0, 1, 2, 6, 7, 8])) + ) + self.assertTrue(np.array_equal(y_l, np.array([0, 0, 0, 2, 2, 2]))) + self.assertTrue(np.array_equal(right_group, np.array([3, 4, 5]))) + self.assertTrue(np.array_equal(y_r, np.array([1, 1, 1]))) + self.assertAlmostEqual(node_info.value, 0.0) + split_r = dt._compute_split( + right_group, + n_features, + y_r, + n_classes, + try_features, + features_mmap, + random_state, + ) + node_info, left_group, y_l, right_group, y_r = split_r + self.assertTrue(np.array_equal(left_group, np.array([3, 4, 5]))) + self.assertTrue(np.array_equal(y_l, np.array([1, 1, 1]))) + self.assertTrue(np.array_equal(right_group, np.array([]))) + self.assertTrue(np.array_equal(y_r, np.array([]))) + self.assertTrue( + np.array_equal(node_info.frequencies, np.array([0, 3, 0])) + ) + self.assertEqual(node_info.size, 3) + self.assertEqual(node_info.target, 1) + + # Test tree + tree = dt.DecisionTreeClassifier( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + ) + tree.fit(data1) + y_pred = compss_wait_on(tree.predict(x2_ds)) + self.assertTrue(np.array_equal(y_pred, y2)) + + +def main(): + unittest.main() + + +if __name__ == "__main__": + main() diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py index c70664e1..86eceaf8 100644 --- a/tests/test_rf_dataset.py +++ b/tests/test_rf_dataset.py @@ -104,10 +104,10 @@ def test_rf_dataset(self): # Dataset creation rf_regr = data.transform_to_rf_dataset( - x_ds_1, y_ds_1, "regression" + x_ds_1, y_ds_1, "regression", features_file=True ) rf_class = data.transform_to_rf_dataset( - x_ds_1, y_ds_1, "classification" + x_ds_1, y_ds_1, "classification", features_file=True ) self.assertEquals(compss_wait_on(rf_regr.get_n_samples()), 900) self.assertEquals(compss_wait_on(rf_regr.get_n_features()), 10) @@ -137,34 +137,22 @@ def test_rf_dataset(self): 
self.assertEqual(value, np.float64(np.inf)) -def _fill_samples_file( - samples_path, row_blocks, start_idx, fortran_order -): +def _fill_samples_file(samples_path, row_blocks, start_idx, fortran_order): rows_samples = Array._merge_blocks(row_blocks) - rows_samples = rows_samples.astype( - dtype="float32", casting="same_kind" - ) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap( samples_path, mode="r+", fortran_order=fortran_order ) - samples[start_idx: start_idx + rows_samples.shape[0]] = ( - rows_samples - ) + samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples -def _fill_features_file( - samples_path, row_blocks, start_idx, fortran_order -): - rows_samples = Array._merge_blocks(row_blocks).T - rows_samples = rows_samples.astype( - dtype="float32", casting="same_kind" - ) +def _fill_features_file(samples_path, row_blocks, start_idx, fortran_order): + rows_samples = Array._merge_blocks(row_blocks) + rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap( samples_path, mode="r+", fortran_order=fortran_order ) - samples[start_idx: start_idx + rows_samples.shape[1]] = ( - rows_samples - ) + samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T def _fill_targets_file(targets_path, row_blocks): @@ -177,7 +165,7 @@ def save_samples(x, samples_path, fortran_order): n_samples = x.shape[0] n_features = x.shape[1] - open(samples_path, 'w').close() + open(samples_path, "w").close() np.lib.format.open_memmap( samples_path, mode="w+", @@ -188,9 +176,7 @@ def save_samples(x, samples_path, fortran_order): start_idx = 0 row_blocks_iterator = x._iterator(axis=0) top_row = next(row_blocks_iterator) - _fill_samples_file( - samples_path, top_row._blocks, start_idx, fortran_order - ) + _fill_samples_file(samples_path, top_row._blocks, start_idx, fortran_order) start_idx += x._top_left_shape[0] for x_row in row_blocks_iterator: _fill_samples_file( @@ -200,7 +186,7 @@ def save_samples(x, samples_path, fortran_order): def save_targets(y, targets_path): - open(targets_path, 'w').close() + open(targets_path, "w").close() for y_row in y._iterator(axis=0): _fill_targets_file(targets_path, y_row._blocks) @@ -209,33 +195,30 @@ def save_features(x, features_path, fortran_order): n_samples = x.shape[0] n_features = x.shape[1] - if features_path is not None: - np.lib.format.open_memmap( - features_path, - mode="w+", - dtype="float32", - fortran_order=fortran_order, - shape=(int(n_features), int(n_samples)), - ) - start_idx = 0 - col_blocks_iterator = x._iterator(axis=1) - left_col = next(col_blocks_iterator) + np.lib.format.open_memmap( + features_path, + mode="w+", + dtype="float32", + fortran_order=fortran_order, + shape=(int(n_features), int(n_samples)), + ) + start_idx = 0 + row_blocks_iterator = x._iterator(axis=0) + top_row = next(row_blocks_iterator) + _fill_features_file( + features_path, top_row._blocks, start_idx, fortran_order + ) + start_idx += x._top_left_shape[0] + for x_row in row_blocks_iterator: _fill_features_file( - features_path, left_col._blocks, - start_idx, fortran_order + features_path, x_row._blocks, start_idx, fortran_order ) - start_idx += x._top_left_shape[1] - for x_row in col_blocks_iterator: - _fill_features_file( - features_path, x_row._blocks, - start_idx, fortran_order - ) - start_idx += x._reg_shape[1] + start_idx += x._reg_shape[0] def main(): unittest.main() -if __name__ == '__main__': +if __name__ == "__main__": main() From 
4a44cd3ee7728880e19bb99a96912c5cdc411ed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Casades=C3=BAs=20Vila?= Date: Mon, 9 Aug 2021 09:38:31 +0200 Subject: [PATCH 46/46] Style changes --- dislib/commons/rf/data.py | 4 ++-- tests/test_rf_dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dislib/commons/rf/data.py b/dislib/commons/rf/data.py index 8e4bf546..e5155bdc 100644 --- a/dislib/commons/rf/data.py +++ b/dislib/commons/rf/data.py @@ -412,7 +412,7 @@ def _fill_samples_file(samples_path, row_blocks, start_idx): rows_samples = Array._merge_blocks(row_blocks) rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap(samples_path, mode="r+") - samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples + samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples @task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) @@ -420,7 +420,7 @@ def _fill_features_file(samples_path, row_blocks, start_idx): rows_samples = Array._merge_blocks(row_blocks) rows_samples = rows_samples.astype(dtype="float32", casting="same_kind") samples = np.lib.format.open_memmap(samples_path, mode="r+") - samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T + samples[:, start_idx: start_idx + rows_samples.shape[0]] = rows_samples.T @task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2}) diff --git a/tests/test_rf_dataset.py b/tests/test_rf_dataset.py index 86eceaf8..de55fc76 100644 --- a/tests/test_rf_dataset.py +++ b/tests/test_rf_dataset.py @@ -143,7 +143,7 @@ def _fill_samples_file(samples_path, row_blocks, start_idx, fortran_order): samples = np.lib.format.open_memmap( samples_path, mode="r+", fortran_order=fortran_order ) - samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples + samples[start_idx: start_idx + rows_samples.shape[0]] = rows_samples def _fill_features_file(samples_path, row_blocks, start_idx, fortran_order): @@ -152,7 +152,7 @@ def _fill_features_file(samples_path, row_blocks, start_idx, fortran_order): samples = np.lib.format.open_memmap( samples_path, mode="r+", fortran_order=fortran_order ) - samples[:, start_idx : start_idx + rows_samples.shape[0]] = rows_samples.T + samples[:, start_idx: start_idx + rows_samples.shape[0]] = rows_samples.T def _fill_targets_file(targets_path, row_blocks):
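
As a closing note for the series: the saving utilities from PATCH 01 also cover the
random forest refactored here ("RandomForestClassifier" is listed in
_implemented_models). Below is a minimal end-to-end sketch of that interaction. It
is not taken from any patch above; the file path, block sizes, and parameter values
are arbitrary examples.

    import numpy as np
    import dislib as ds
    from dislib.classification import RandomForestClassifier
    from dislib.utils import save_model, load_model

    # Small synthetic classification problem as ds-arrays.
    x = ds.array(np.random.rand(100, 4), block_size=(25, 4))
    y = ds.array(np.random.randint(0, 3, size=(100, 1)), block_size=(25, 1))

    forest = RandomForestClassifier(n_estimators=5, random_state=0)
    forest.fit(x, y)

    # JSON and CBOR are the two formats exercised by the tests.
    save_model(forest, "/tmp/rf.json", save_format="json")
    forest2 = load_model("/tmp/rf.json", load_format="json")

    # The restored model should predict exactly like the original.
    assert np.array_equal(
        forest.predict(x).collect(), forest2.predict(x).collect()
    )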