
Merge branch 'master' into trees-fix
michal-choinski committed Sep 30, 2021
commit 6aab32b (2 parents: 228faf8 + 8107c2e)
Showing 5 changed files with 213 additions and 7 deletions.
2 changes: 1 addition & 1 deletion dislib/data/array.py
@@ -1484,7 +1484,7 @@ def _block_apply(func, block, *args, **kwargs):

@task(block=INOUT)
def _set_value(block, i, j, value):
-    block[i][j] = value
+    block[i, j] = value


@task(blocks={Type: COLLECTION_IN, Depth: 1}, returns=1)
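For context (not part of the commit): chained indexing like block[i][j] first materializes block[i], which is a view for NumPy arrays but a copied single-row matrix for scipy sparse types, so the assignment can miss the original block. A minimal sketch of the difference, assuming NumPy and SciPy are available:

    import numpy as np
    from scipy.sparse import lil_matrix

    dense = np.zeros((2, 2))
    dense[0][1] = 5.0       # works: dense[0] is a view into the ndarray
    dense[0, 1] = 5.0       # equivalent single-element assignment

    sparse = lil_matrix((2, 2))
    sparse[0, 1] = 5.0      # correct: writes into the original matrix
    # sparse[0][1] = 5.0 would target a copied 1x2 row matrix instead,
    # so the write is lost or an IndexError is raised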
5 changes: 3 additions & 2 deletions dislib/preprocessing/__init__.py
@@ -1,3 +1,4 @@
-from dislib.preprocessing.classes import StandardScaler
+from dislib.preprocessing.minmax_scaler import MinMaxScaler
+from dislib.preprocessing.standard_scaler import StandardScaler

-__all__ = ['StandardScaler']
+__all__ = ['MinMaxScaler', 'StandardScaler']
113 changes: 113 additions & 0 deletions dislib/preprocessing/minmax_scaler.py
@@ -0,0 +1,113 @@
import numpy as np
from pycompss.api.parameter import Depth, Type, COLLECTION_IN, COLLECTION_OUT
from pycompss.api.task import task
from scipy.sparse import csr_matrix, issparse

from dislib.data.array import Array
import dislib as ds


class MinMaxScaler(object):
    """ Scale features by rescaling them to the provided range.

    Scaling happens independently on each feature by computing the
    relevant statistics on the samples in the training set. The minimum
    and maximum values are then stored and used to scale later data
    with the transform method.

    Attributes
    ----------
    data_min_ : ds-array
        Per-feature minimum of the fitted data.
    data_max_ : ds-array
        Per-feature maximum of the fitted data.
    """

    def __init__(self, feature_range=(0, 1)):
        self._feature_range = feature_range
        self.data_min_ = None
        self.data_max_ = None

    def fit(self, x):
        """ Compute the min and max values for later scaling.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)

        Returns
        -------
        self : MinMaxScaler
        """
        self.data_min_ = ds.apply_along_axis(np.min, 0, x)
        self.data_max_ = ds.apply_along_axis(np.max, 0, x)

        return self

    def fit_transform(self, x):
        """ Fit to data, then transform it.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)

        Returns
        -------
        x_new : ds-array, shape=(n_samples, n_features)
            Scaled data.
        """
        return self.fit(x).transform(x)

    def transform(self, x):
        """ Scale data to the feature range.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)

        Returns
        -------
        x_new : ds-array, shape=(n_samples, n_features)
            Scaled data.
        """
        if self.data_min_ is None or self.data_max_ is None:
            raise Exception("Model has not been initialized.")

        n_blocks = x._n_blocks[1]
        blocks = []
        min_blocks = self.data_min_._blocks
        max_blocks = self.data_max_._blocks

        for row in x._iterator(axis=0):
            out_blocks = [object() for _ in range(n_blocks)]
            _transform(row._blocks, min_blocks, max_blocks, out_blocks,
                       self._feature_range[0], self._feature_range[1])
            blocks.append(out_blocks)

        return Array(blocks, top_left_shape=x._top_left_shape,
                     reg_shape=x._reg_shape, shape=x.shape,
                     sparse=x._sparse)


@task(blocks={Type: COLLECTION_IN, Depth: 2},
      min_blocks={Type: COLLECTION_IN, Depth: 2},
      max_blocks={Type: COLLECTION_IN, Depth: 2},
      out_blocks=COLLECTION_OUT)
def _transform(blocks, min_blocks, max_blocks, out_blocks,
               range_min, range_max):
    x = Array._merge_blocks(blocks)
    min_val = Array._merge_blocks(min_blocks)
    max_val = Array._merge_blocks(max_blocks)
    sparse = issparse(x)

    if sparse:
        x = x.toarray()
        min_val = min_val.toarray()
        max_val = max_val.toarray()

    # min-max scaling: map each feature to [0, 1], then rescale to the
    # requested range [range_min, range_max]
    std_x = (x - min_val) / (max_val - min_val)
    scaled_x = std_x * (range_max - range_min) + range_min

    constructor_func = np.array if not sparse else csr_matrix
    start, end = 0, 0

    # split the scaled rows back into the original column blocks
    for i, block in enumerate(blocks[0]):
        end += block.shape[1]
        out_blocks[i] = constructor_func(scaled_x[:, start:end])
        start = end
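A minimal usage sketch of the new scaler (not part of the commit), assuming a working PyCOMPSs runtime and following the patterns in the tests below:

    import numpy as np
    import dislib as ds
    from dislib.preprocessing import MinMaxScaler

    x = ds.array(np.array([[1.0, -1.0], [2.0, 0.0], [4.0, 3.0]]),
                 block_size=(2, 2))

    scaler = MinMaxScaler(feature_range=(0, 1))
    x_scaled = scaler.fit_transform(x)   # fit() then transform()

    print(x_scaled.collect())            # each column mapped into [0, 1]
    print(scaler.data_min_.collect())    # per-feature minima: 1.0 and -1.0
    print(scaler.data_max_.collect())    # per-feature maxima: 4.0 and 3.0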
dislib/preprocessing/classes.py renamed to dislib/preprocessing/standard_scaler.py without changes.
100 changes: 96 additions & 4 deletions tests/test_preproc.py
@@ -1,13 +1,105 @@
import unittest

import numpy as np
from numpy.testing._private.parameterized import parameterized
from pycompss.api.api import compss_wait_on
from scipy.sparse import csr_matrix, issparse
from sklearn.datasets import make_blobs
-from sklearn.preprocessing import StandardScaler as SKScaler
+from sklearn.preprocessing import StandardScaler as SkStandardScaler
+from sklearn.preprocessing import MinMaxScaler as SkMinMaxScaler

import dislib as ds
-from dislib.preprocessing import StandardScaler
+from dislib.preprocessing import StandardScaler, MinMaxScaler


class MinMaxScalerTest(unittest.TestCase):
    @parameterized.expand([((0, 1),),
                           ((-1, 1),)])
    def test_fit_transform(self, feature_range):
        """ Tests fit_transform against scikit-learn. """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        ds_arr = ds.array(x, block_size=(300, 2))

        sc1 = SkMinMaxScaler(feature_range=feature_range)
        scaled_x = sc1.fit_transform(x)
        sc2 = MinMaxScaler(feature_range=feature_range)
        ds_scaled = sc2.fit_transform(ds_arr)

        self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
        self.assertTrue(np.allclose(sc1.data_min_, sc2.data_min_.collect()))
        self.assertTrue(np.allclose(sc1.data_max_, sc2.data_max_.collect()))
        self.assertEqual(ds_scaled._top_left_shape,
                         ds_scaled._blocks[0][0].shape)
        self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
        self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
        self.assertEqual(ds_arr.shape, ds_scaled.shape)
        self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)

    @parameterized.expand([((0, 1),),
                           ((-1, 1),)])
    def test_sparse(self, feature_range):
        """ Tests fit_transform with sparse data. """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)

        dense_arr = ds.array(x, block_size=(300, 2))
        sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2))

        sc = MinMaxScaler(feature_range=feature_range)
        dense_scaled = sc.fit_transform(dense_arr)
        dense_min = sc.data_min_.collect()
        dense_max = sc.data_max_.collect()

        sparse_scaled = sc.fit_transform(sparse_arr)
        sparse_min = sc.data_min_.collect()
        sparse_max = sc.data_max_.collect()

        csr_scaled = sparse_scaled.collect()
        arr_scaled = dense_scaled.collect()

        self.assertTrue(issparse(csr_scaled))
        self.assertTrue(sparse_scaled._sparse)
        self.assertTrue(sc.data_min_._sparse)
        self.assertTrue(sc.data_max_._sparse)
        self.assertTrue(issparse(sparse_min))
        self.assertTrue(issparse(sparse_max))

        self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled))
        self.assertTrue(np.allclose(sparse_min.toarray(), dense_min))
        self.assertTrue(np.allclose(sparse_max.toarray(), dense_max))

    @parameterized.expand([((0, 1),),
                           ((-1, 1),)])
    def test_irregular(self, feature_range):
        """ Tests fit_transform with an irregular array. """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        ds_arr = ds.array(x, block_size=(300, 2))
        ds_arr = ds_arr[297:602]
        x = x[297:602]

        sc1 = SkMinMaxScaler(feature_range=feature_range)
        scaled_x = sc1.fit_transform(x)
        sc2 = MinMaxScaler(feature_range=feature_range)
        ds_scaled = sc2.fit_transform(ds_arr)

        self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
        self.assertTrue(np.allclose(sc1.data_min_, sc2.data_min_.collect()))
        self.assertTrue(np.allclose(sc1.data_max_, sc2.data_max_.collect()))
        self.assertEqual(ds_scaled._top_left_shape,
                         compss_wait_on(ds_scaled._blocks[0][0]).shape)
        self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
        self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
        self.assertEqual(ds_arr.shape, ds_scaled.shape)
        self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)

class StandardScalerTest(unittest.TestCase):
@@ -20,7 +112,7 @@ def test_fit_transform(self):
        x = np.dot(x, transformation)
        ds_arr = ds.array(x, block_size=(300, 2))

-        sc1 = SKScaler()
+        sc1 = SkStandardScaler()
        scaled_x = sc1.fit_transform(x)
        sc2 = StandardScaler()
        ds_scaled = sc2.fit_transform(ds_arr)
@@ -78,7 +170,7 @@ def test_irregular(self):
        ds_arr = ds_arr[297:602]
        x = x[297:602]

-        sc1 = SKScaler()
+        sc1 = SkStandardScaler()
        scaled_x = sc1.fit_transform(x)
        sc2 = StandardScaler()
        ds_scaled = sc2.fit_transform(ds_arr)
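To exercise the new tests locally, one option is the standard unittest runner (an assumption, not from the commit; dislib tests also need a PyCOMPSs installation to run):

    python3 -m unittest tests.test_preproc.MinMaxScalerTest -v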

