Skip to content

Commit

Permalink
Update numerical transformer (#227)
Browse files Browse the repository at this point in the history
* Updates the baseclass

* Update numerical transformer

* Addresses feedback

* Makes get_input_type a class method

* Fix documentation

* Fix one line bug

* Various improvements + some test cases

* Create _add_prefix

* Return data if columns not in data

* Multiple improvements + test cases

* Added test cases

* Fix lint

* Remove test cases.

* Fix a bunch of bugs

* Addresses feedback

* Update baseclass fixes (#242)

* Implement basic integration tests for BaseTransformer

* Fix failing tests

* Fix lint and disable some PyLint warnings

* Fix lint + general improvements

* Fix merge conflicts

* Fix lint

* Update datatypes.

* Updates numerical transformer to match new base class transformer

* Fixes numerical transformer and unit tests

* Add is_composition_identity

* Fix setup.cfg pylint disable syntax

* Fix performance bug

Co-authored-by: Carles Sala <carles@pythiac.com>
Co-authored-by: Carles Sala <carles@sdv.dev>
  • Loading branch information
3 people authored Sep 27, 2021
1 parent 3a83496 commit 28a6679
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 154 deletions.
63 changes: 51 additions & 12 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ class NumericalTransformer(BaseTransformer):
is given, there won't be a maximum.
"""

INPUT_TYPE = 'numerical'
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = True

null_transformer = None
nan = None
_dtype = None
Expand All @@ -70,11 +75,39 @@ def __init__(self, dtype=None, nan='mean', null_column=None, rounding=None,
self.min_value = min_value
self.max_value = max_value

def get_output_types(self):
    """Describe the columns this transformer emits and their data types.

    A ``'value'`` entry typed ``'float'`` is always reported. An
    ``'is_null'`` entry, also typed ``'float'``, is added only when a
    null transformer is set and it reports that it creates a null column.

    Returns:
        dict:
            The mapping of output names to data types, after being passed
            through ``self._add_prefix``.
    """
    null_transformer = self.null_transformer

    mapping = {'value': 'float'}
    if null_transformer and null_transformer.creates_null_column():
        mapping['is_null'] = 'float'

    # NOTE(review): _add_prefix is defined outside this view; presumably it
    # qualifies each key with the fitted column name -- confirm in the base class.
    return self._add_prefix(mapping)

def is_composition_identity(self):
    """Tell whether ``transform`` followed by ``reverse_transform`` is an identity.

    Returns:
        bool:
            ``False`` when a null transformer is set and it does not create
            a null column; otherwise the class-level
            ``COMPOSITION_IS_IDENTITY`` flag.
    """
    null_transformer = self.null_transformer

    # Either no null transformer is set, or it creates a null column:
    # defer to the class-level flag.
    if not null_transformer or null_transformer.creates_null_column():
        return self.COMPOSITION_IS_IDENTITY

    return False

@staticmethod
def _learn_rounding_digits(data):
# check if data has any decimals
data = np.array(data)
roundable_data = data[~(np.isinf(data) | pd.isnull(data))]
if (roundable_data % 1 != 0).any():
if ((roundable_data % 1) != 0).any():
if not (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
return None

Expand All @@ -91,11 +124,11 @@ def _learn_rounding_digits(data):

return None

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.
Args:
data (pandas.Series or numpy.ndarray):
data (pandas.DataFrame or pandas.Series):
Data to fit.
"""
if isinstance(data, np.ndarray):
Expand All @@ -113,7 +146,7 @@ def fit(self, data):
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer.fit(data)

def transform(self, data):
def _transform(self, data):
"""Transform numerical data.
Integer values are replaced by their float equivalent. Non null float values
Expand All @@ -131,7 +164,7 @@ def transform(self, data):

return self.null_transformer.transform(data)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert data back into the original format.
Args:
Expand All @@ -141,6 +174,9 @@ def reverse_transform(self, data):
Returns:
numpy.ndarray
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if self._min_value is not None or self._max_value is not None:
if len(data.shape) > 1:
data[:, 0] = data[:, 0].clip(self._min_value, self._max_value)
Expand Down Expand Up @@ -296,7 +332,7 @@ def _get_univariate(self):

raise TypeError('Invalid distribution: {}'.format(distribution))

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.
Args:
Expand All @@ -305,8 +341,8 @@ def fit(self, data):
"""
self._univariate = self._get_univariate()

super().fit(data)
data = super().transform(data)
super()._fit(data)
data = super()._transform(data)
if data.ndim > 1:
data = data[:, 0]

Expand All @@ -316,7 +352,7 @@ def _copula_transform(self, data):
cdf = self._univariate.cdf(data)
return scipy.stats.norm.ppf(cdf.clip(0 + EPSILON, 1 - EPSILON))

def transform(self, data):
def _transform(self, data):
"""Transform numerical data.
Args:
Expand All @@ -326,15 +362,15 @@ def transform(self, data):
Returns:
numpy.ndarray
"""
transformed = super().transform(data)
transformed = super()._transform(data)
if transformed.ndim > 1:
transformed[:, 0] = self._copula_transform(transformed[:, 0])
else:
transformed = self._copula_transform(transformed)

return transformed

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert data back into the original format.
Args:
Expand All @@ -344,9 +380,12 @@ def reverse_transform(self, data):
Returns:
pandas.Series
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if data.ndim > 1:
data[:, 0] = self._univariate.ppf(scipy.stats.norm.cdf(data[:, 0]))
else:
data = self._univariate.ppf(scipy.stats.norm.cdf(data))

return super().reverse_transform(data)
return super()._reverse_transform(data)
77 changes: 39 additions & 38 deletions tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,61 @@
import numpy as np
import pandas as pd

from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer


class TestNumericalTransformer:

def test_null_column(self):
data = np.array([1, 2, 1, 2, np.nan, 1])
data = pd.DataFrame([1, 2, 1, 2, np.nan, 1], columns=['a'])

nt = NumericalTransformer()
nt.fit(data)
nt.fit(data, list(data.columns))
transformed = nt.transform(data)

assert isinstance(transformed, np.ndarray)
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 2)
assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0]
assert list(transformed.iloc[:, 1]) == [0, 0, 0, 0, 1, 0]

reverse = nt.reverse_transform(transformed)

np.testing.assert_array_almost_equal(reverse, data, decimal=2)

def test_not_null_column(self):
data = np.array([1, 2, 1, 2, np.nan, 1])
data = pd.DataFrame([1, 2, 1, 2, np.nan, 1], columns=['a'])

nt = NumericalTransformer(null_column=False)
nt.fit(data)
nt.fit(data, list(data.columns))
transformed = nt.transform(data)

assert isinstance(transformed, np.ndarray)
assert transformed.shape == (6, )
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 1)

reverse = nt.reverse_transform(transformed)

np.testing.assert_array_almost_equal(reverse, data, decimal=2)

def test_int(self):
data = np.array([1, 2, 1, 2, 1])
data = pd.DataFrame([1, 2, 1, 2, 1], columns=['a'])

nt = NumericalTransformer(dtype=int)
nt.fit(data)
nt.fit(data, list(data.columns))
transformed = nt.transform(data)

assert isinstance(transformed, np.ndarray)
assert transformed.shape == (5, )
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (5, 1)

reverse = nt.reverse_transform(transformed)
assert list(reverse) == [1, 2, 1, 2, 1]
assert list(reverse['a']) == [1, 2, 1, 2, 1]

def test_int_nan(self):
data = np.array([1, 2, 1, 2, 1, np.nan])
data = pd.DataFrame([1, 2, 1, 2, 1, np.nan], columns=['a'])

nt = NumericalTransformer(dtype=int)
nt.fit(data)
nt.fit(data, list(data.columns))
transformed = nt.transform(data)

assert isinstance(transformed, np.ndarray)
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 2)

reverse = nt.reverse_transform(transformed)
Expand All @@ -64,72 +65,72 @@ def test_int_nan(self):
class TestGaussianCopulaTransformer:

def test_stats(self):
data = np.random.normal(loc=4, scale=4, size=1000)
data = pd.DataFrame(np.random.normal(loc=4, scale=4, size=1000), columns=['a'])

ct = GaussianCopulaTransformer()
ct.fit(data)
ct.fit(data, list(data.columns))
transformed = ct.transform(data)

assert isinstance(transformed, np.ndarray)
assert transformed.shape == (1000, )
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (1000, 1)

np.testing.assert_almost_equal(transformed.mean(), 0, decimal=1)
np.testing.assert_almost_equal(transformed.std(), 1, decimal=1)
np.testing.assert_almost_equal(transformed['a.value'].mean(), 0, decimal=1)
np.testing.assert_almost_equal(transformed['a.value'].std(), 1, decimal=1)

reverse = ct.reverse_transform(transformed)

np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_null_column(self):
data = np.array([1, 2, 1, 2, np.nan, 1])
data = pd.DataFrame([1, 2, 1, 2, np.nan, 1], columns=['a'])

ct = GaussianCopulaTransformer()
ct.fit(data)
ct.fit(data, list(data.columns))
transformed = ct.transform(data)

assert isinstance(transformed, np.ndarray)
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 2)
assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0]
assert list(transformed.iloc[:, 1]) == [0, 0, 0, 0, 1, 0]

reverse = ct.reverse_transform(transformed)

np.testing.assert_array_almost_equal(reverse, data, decimal=2)

def test_not_null_column(self):
data = np.array([1, 2, 1, 2, np.nan, 1])
data = pd.DataFrame([1, 2, 1, 2, np.nan, 1], columns=['a'])

ct = GaussianCopulaTransformer(null_column=False)
ct.fit(data)
ct.fit(data, list(data.columns))
transformed = ct.transform(data)

assert isinstance(transformed, np.ndarray)
assert transformed.shape == (6, )
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 1)

reverse = ct.reverse_transform(transformed)

np.testing.assert_array_almost_equal(reverse, data, decimal=2)

def test_int(self):
data = np.array([1, 2, 1, 2, 1])
data = pd.DataFrame([1, 2, 1, 2, 1], columns=['a'])

ct = GaussianCopulaTransformer(dtype=int)
ct.fit(data)
ct.fit(data, list(data.columns))
transformed = ct.transform(data)

assert isinstance(transformed, np.ndarray)
assert transformed.shape == (5, )
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (5, 1)

reverse = ct.reverse_transform(transformed)
assert list(reverse) == [1, 2, 1, 2, 1]
assert list(reverse['a']) == [1, 2, 1, 2, 1]

def test_int_nan(self):
data = np.array([1, 2, 1, 2, 1, np.nan])
data = pd.DataFrame([1, 2, 1, 2, 1, np.nan], columns=['a'])

ct = GaussianCopulaTransformer(dtype=int)
ct.fit(data)
ct.fit(data, list(data.columns))
transformed = ct.transform(data)

assert isinstance(transformed, np.ndarray)
assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (6, 2)

reverse = ct.reverse_transform(transformed)
Expand Down
Loading

0 comments on commit 28a6679

Please sign in to comment.