diff --git a/dislib/data/array.py b/dislib/data/array.py
index 096f9aa6..f6c36ecb 100644
--- a/dislib/data/array.py
+++ b/dislib/data/array.py
@@ -1484,7 +1484,7 @@ def _block_apply(func, block, *args, **kwargs):
 
 @task(block=INOUT)
 def _set_value(block, i, j, value):
-    block[i][j] = value
+    block[i, j] = value
 
 
 @task(blocks={Type: COLLECTION_IN, Depth: 1}, returns=1)
diff --git a/dislib/preprocessing/__init__.py b/dislib/preprocessing/__init__.py
index 5e1d35cc..36bd67fb 100644
--- a/dislib/preprocessing/__init__.py
+++ b/dislib/preprocessing/__init__.py
@@ -1,3 +1,4 @@
-from dislib.preprocessing.classes import StandardScaler
+from dislib.preprocessing.minmax_scaler import MinMaxScaler
+from dislib.preprocessing.standard_scaler import StandardScaler
 
-__all__ = ['StandardScaler']
+__all__ = ['MinMaxScaler', 'StandardScaler']
diff --git a/dislib/preprocessing/minmax_scaler.py b/dislib/preprocessing/minmax_scaler.py
new file mode 100644
index 00000000..2e7a5ce4
--- /dev/null
+++ b/dislib/preprocessing/minmax_scaler.py
@@ -0,0 +1,113 @@
+import numpy as np
+from pycompss.api.parameter import Depth, Type, COLLECTION_IN, COLLECTION_OUT
+from pycompss.api.task import task
+from scipy.sparse import csr_matrix, issparse
+
+from dislib.data.array import Array
+import dislib as ds
+
+
+class MinMaxScaler(object):
+    """ Transform features by rescaling each of them to the provided range.
+
+    Scaling happens independently on each feature by computing the relevant
+    statistics on the samples in the training set. The minimum and maximum
+    values are then stored to be used on later data with the transform
+    method.
+
+    Attributes
+    ----------
+    data_min_ : ds-array, shape=(1, n_features)
+        Per-feature minimum seen in the fitted data.
+    data_max_ : ds-array, shape=(1, n_features)
+        Per-feature maximum seen in the fitted data.
+    """
+
+    def __init__(self, feature_range=(0, 1)):
+        self._feature_range = feature_range
+        self.data_min_ = None
+        self.data_max_ = None
+
+    def fit(self, x):
+        """ Compute the min and max values for later scaling.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+
+        Returns
+        -------
+        self : MinMaxScaler
+        """
+
+        self.data_min_ = ds.apply_along_axis(np.min, 0, x)
+        self.data_max_ = ds.apply_along_axis(np.max, 0, x)
+
+        return self
+
+    def fit_transform(self, x):
+        """ Fit to data, then transform it.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+
+        Returns
+        -------
+        x_new : ds-array, shape=(n_samples, n_features)
+            Scaled data.
+        """
+        return self.fit(x).transform(x)
+
+    def transform(self, x):
+        """ Scale data to the feature range.
+
+        Parameters
+        ----------
+        x : ds-array, shape=(n_samples, n_features)
+
+        Returns
+        -------
+        x_new : ds-array, shape=(n_samples, n_features)
+            Scaled data.
+ """ + if self.data_min_ is None or self.data_max_ is None: + raise Exception("Model has not been initialized.") + + n_blocks = x._n_blocks[1] + blocks = [] + min_blocks = self.data_min_._blocks + max_blocks = self.data_max_._blocks + + for row in x._iterator(axis=0): + out_blocks = [object() for _ in range(n_blocks)] + _transform(row._blocks, min_blocks, max_blocks, out_blocks, + self._feature_range[0], self._feature_range[1]) + blocks.append(out_blocks) + + return Array(blocks, top_left_shape=x._top_left_shape, + reg_shape=x._reg_shape, shape=x.shape, + sparse=x._sparse) + + +@task(blocks={Type: COLLECTION_IN, Depth: 2}, + min_blocks={Type: COLLECTION_IN, Depth: 2}, + max_blocks={Type: COLLECTION_IN, Depth: 2}, + out_blocks=COLLECTION_OUT) +def _transform(blocks, min_blocks, max_blocks, out_blocks, + range_min, range_max): + x = Array._merge_blocks(blocks) + min_val = Array._merge_blocks(min_blocks) + max_val = Array._merge_blocks(max_blocks) + sparse = issparse(x) + + if sparse: + x = x.toarray() + min_val = min_val.toarray() + max_val = max_val.toarray() + + std_x = (x - min_val) / (max_val - min_val) + scaled_x = std_x * (range_max - range_min) + range_min + + constructor_func = np.array if not sparse else csr_matrix + start, end = 0, 0 + + for i, block in enumerate(blocks[0]): + end += block.shape[1] + out_blocks[i] = constructor_func(scaled_x[:, start:end]) diff --git a/dislib/preprocessing/classes.py b/dislib/preprocessing/standard_scaler.py similarity index 100% rename from dislib/preprocessing/classes.py rename to dislib/preprocessing/standard_scaler.py diff --git a/tests/test_preproc.py b/tests/test_preproc.py index 06edc90e..090b9e86 100644 --- a/tests/test_preproc.py +++ b/tests/test_preproc.py @@ -1,13 +1,105 @@ import unittest import numpy as np +from numpy.testing._private.parameterized import parameterized from pycompss.api.api import compss_wait_on from scipy.sparse import csr_matrix, issparse from sklearn.datasets import make_blobs -from sklearn.preprocessing import StandardScaler as SKScaler +from sklearn.preprocessing import StandardScaler as SkStandardScaler +from sklearn.preprocessing import MinMaxScaler as SkMinMaxScaler import dislib as ds -from dislib.preprocessing import StandardScaler +from dislib.preprocessing import StandardScaler, MinMaxScaler + + +class MinMaxScalerTest(unittest.TestCase): + @parameterized.expand([((0, 1),), + ((-1, 1),)]) + def test_fit_transform(self, feature_range): + """ Tests fit_transform against scikit-learn. 
+ """ + n_samples = 1500 + x, y = make_blobs(n_samples=n_samples, random_state=170) + transformation = [[0.6, -0.6], [-0.4, 0.8]] + x = np.dot(x, transformation) + ds_arr = ds.array(x, block_size=(300, 2)) + + sc1 = SkMinMaxScaler(feature_range=feature_range) + scaled_x = sc1.fit_transform(x) + sc2 = MinMaxScaler(feature_range=feature_range) + ds_scaled = sc2.fit_transform(ds_arr) + + self.assertTrue(np.allclose(scaled_x, ds_scaled.collect())) + self.assertTrue(np.allclose(sc1.data_min_, sc2.data_min_.collect())) + self.assertTrue(np.allclose(sc1.data_max_, sc2.data_max_.collect())) + self.assertEqual(ds_scaled._top_left_shape, + ds_scaled._blocks[0][0].shape) + self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape) + self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape) + self.assertEqual(ds_arr.shape, ds_scaled.shape) + self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks) + + @parameterized.expand([((0, 1),), + ((-1, 1),)]) + def test_sparse(self, feature_range): + """ Tests fit_transforms with sparse data""" + n_samples = 1500 + x, y = make_blobs(n_samples=n_samples, random_state=170) + transformation = [[0.6, -0.6], [-0.4, 0.8]] + x = np.dot(x, transformation) + + dense_arr = ds.array(x, block_size=(300, 2)) + sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2)) + + sc = MinMaxScaler(feature_range=feature_range) + dense_scaled = sc.fit_transform(dense_arr) + dense_min = sc.data_min_.collect() + dense_max = sc.data_max_.collect() + + sparse_scaled = sc.fit_transform(sparse_arr) + sparse_min = sc.data_min_.collect() + sparse_max = sc.data_max_.collect() + + csr_scaled = sparse_scaled.collect() + arr_scaled = dense_scaled.collect() + + self.assertTrue(issparse(csr_scaled)) + self.assertTrue(sparse_scaled._sparse) + self.assertTrue(sc.data_min_._sparse) + self.assertTrue(sc.data_max_._sparse) + self.assertTrue(issparse(sparse_min)) + self.assertTrue(issparse(sparse_max)) + + self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled)) + self.assertTrue(np.allclose(sparse_min.toarray(), dense_min)) + self.assertTrue(np.allclose(sparse_max.toarray(), dense_max)) + + @parameterized.expand([((0, 1),), + ((-1, 1),)]) + def test_irregular(self, feature_range): + """ Test with an irregular array """ + n_samples = 1500 + x, y = make_blobs(n_samples=n_samples, random_state=170) + transformation = [[0.6, -0.6], [-0.4, 0.8]] + x = np.dot(x, transformation) + ds_arr = ds.array(x, block_size=(300, 2)) + ds_arr = ds_arr[297:602] + x = x[297:602] + + sc1 = SkMinMaxScaler(feature_range=feature_range) + scaled_x = sc1.fit_transform(x) + sc2 = MinMaxScaler(feature_range=feature_range) + ds_scaled = sc2.fit_transform(ds_arr) + + self.assertTrue(np.allclose(scaled_x, ds_scaled.collect())) + self.assertTrue(np.allclose(sc1.data_min_, sc2.data_min_.collect())) + self.assertTrue(np.allclose(sc1.data_max_, sc2.data_max_.collect())) + self.assertEqual(ds_scaled._top_left_shape, + compss_wait_on(ds_scaled._blocks[0][0]).shape) + self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape) + self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape) + self.assertEqual(ds_arr.shape, ds_scaled.shape) + self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks) class StandardScalerTest(unittest.TestCase): @@ -20,7 +112,7 @@ def test_fit_transform(self): x = np.dot(x, transformation) ds_arr = ds.array(x, block_size=(300, 2)) - sc1 = SKScaler() + sc1 = SkStandardScaler() scaled_x = sc1.fit_transform(x) sc2 = StandardScaler() ds_scaled = sc2.fit_transform(ds_arr) @@ -78,7 +170,7 @@ def 
         ds_arr = ds_arr[297:602]
         x = x[297:602]
 
-        sc1 = SKScaler()
+        sc1 = SkStandardScaler()
         scaled_x = sc1.fit_transform(x)
         sc2 = StandardScaler()
         ds_scaled = sc2.fit_transform(ds_arr)
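
Note on the `_set_value` fix in `dislib/data/array.py`: the change from `block[i][j]` to `block[i, j]` matters because blocks can be SciPy sparse matrices, where chained indexing misbehaves. For a `csr_matrix`, `block[i]` already returns a 1-row matrix, so the second `[j]` indexes rows of that intermediate and raises `IndexError` for any `j > 0` (and for assignment the write may land on a temporary copy instead of the block). A quick illustration of the read side, using a throwaway array:

```python
import numpy as np
from scipy.sparse import csr_matrix

block = csr_matrix(np.arange(9).reshape(3, 3))

print(block[1, 2])  # 5 -- tuple indexing addresses a single element directly

# block[1][2] would raise IndexError: block[1] is already a 1x3 matrix,
# so the trailing [2] asks for its (nonexistent) third row.
```

Since the task is declared `INOUT` and must mutate the block in place, `block[i, j] = value` is the form that works for both dense and sparse blocks.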
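The arithmetic inside `_transform` is the standard min-max recipe: first normalize each feature to [0, 1] with the fitted minima and maxima, then stretch and shift into `feature_range`. A minimal NumPy check that mirrors the task's two lines (the sample values here are made up for illustration):

```python
import numpy as np

x = np.array([[2.0], [4.0], [10.0]])       # one feature, three samples
data_min, data_max = x.min(0), x.max(0)    # what fit() computes per feature
range_min, range_max = -1, 1               # feature_range=(-1, 1)

std_x = (x - data_min) / (data_max - data_min)        # [[0.], [0.25], [1.]]
scaled = std_x * (range_max - range_min) + range_min  # [[-1.], [-0.5], [1.]]
```

One caveat worth noting: a constant feature makes `max_val - min_val` zero, so the division produces NaN/inf. scikit-learn's `MinMaxScaler` guards against zero ranges internally; the task as written does not, so constant columns are a case the tests above do not cover.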
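For reference, a minimal usage sketch of the API this patch adds, mirroring what the tests exercise. The input values are arbitrary, and like all dislib code it needs to run under a PyCOMPSs runtime:

```python
import numpy as np
import dislib as ds
from dislib.preprocessing import MinMaxScaler

# Small ds-array: 3 samples, 2 features, split into 2-row blocks.
x = ds.array(np.array([[1.0, -1.0], [2.0, 0.0], [3.0, 1.0]]),
             block_size=(2, 2))

scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = scaler.fit_transform(x)

print(x_scaled.collect())           # each column rescaled to [0, 1]
print(scaler.data_min_.collect())   # per-feature minima, as asserted in the tests
print(scaler.data_max_.collect())   # per-feature maxima
```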