Skip to content

Commit

Permalink
Update boolean transformer (#228)
Browse files Browse the repository at this point in the history
* Update boolean transformer

* Adapt the BooleanTransformer to the new BaseTransformer

* Temporarily support both new and old style transformers

* Make columns attributes public

* Temporarily support old and new style datasets

* Test new-style transformers properly

* Temporarily disable W0212 PyLint warning

Co-authored-by: Carles Sala <carles@pythiac.com>
  • Loading branch information
fealho and csala authored Sep 24, 2021
1 parent 03ecfec commit ef2bb78
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 88 deletions.
28 changes: 22 additions & 6 deletions rdt/hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,12 @@ def fit(self, data):
self._transformers = self._analyze(data)

for column_name, transformer in self._transformers.items():
column = data[column_name]
transformer.fit(column)
try:
column = data[column_name]
transformer.fit(column)
except TypeError:
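# Temporarily support both old and new style transformers: a new-style
# ``fit(data, columns)`` raises TypeError when called with only the column,
# so retry with the full table and the column name.
# NOTE(review): a TypeError raised inside an old-style ``fit`` is also caught here.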
transformer.fit(data, column_name)

def transform(self, data):
"""Transform the data.
Expand All @@ -176,8 +180,13 @@ def transform(self, data):
drop_columns = []
for column_name, transformer in self._transformers.items():
if column_name in data:
column = data[column_name]
transformed = transformer.transform(column)
try:
column = data[column_name]
transformed = transformer.transform(column)
except AttributeError:
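# Temporarily support both old and new style transformers: a new-style
# ``transform`` reads ``data.columns``, which a single column (Series) does
# not have, so the AttributeError means it needs the full table instead.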
transformed = transformer.transform(data)
transformed = transformed[transformer.output_columns].to_numpy()

shape = transformed.shape

Expand Down Expand Up @@ -244,8 +253,15 @@ def reverse_transform(self, data):
for column_name, transformer in self._transformers.items():
columns = self._get_columns(data, column_name)
if not columns.empty:
columns_data = data[columns].values
reversed_data = transformer.reverse_transform(columns_data)
try:
columns_data = data[columns].values
reversed_data = transformer.reverse_transform(columns_data)
except AttributeError:
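# Temporarily support both old and new style transformers: a new-style
# ``reverse_transform`` reads ``data.columns``, which the raw ``.values``
# array does not have, so pass the table with the columns renamed to the
# transformer's ``output_columns`` and keep only its first input column.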
rename = dict(zip(columns, transformer.output_columns))
reversed_data = transformer.reverse_transform(data.rename(columns=rename))
reversed_data = reversed_data[transformer.columns[0]].to_numpy()

data[column_name] = reversed_data
drop_columns.extend(set(columns) - {column_name})

Expand Down
40 changes: 20 additions & 20 deletions rdt/transformers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ class BaseTransformer:
COMPOSITION_IS_IDENTITY = None
NEXT_TRANSFORMERS = None

_columns = None
_column_prefix = None
_output_columns = None
columns = None
column_prefix = None
output_columns = None

@classmethod
def get_input_type(cls):
Expand All @@ -36,7 +36,7 @@ def _add_prefix(self, dictionary):

output = {}
for output_columns, output_type in dictionary.items():
output[f'{self._column_prefix}.{output_columns}'] = output_type
output[f'{self.column_prefix}.{output_columns}'] = output_type

return output

Expand Down Expand Up @@ -95,7 +95,7 @@ def _store_columns(self, columns, data):
if missing:
raise KeyError(f'Columns {missing} were not present in the data.')

self._columns = columns
self.columns = columns

@staticmethod
def _get_columns_data(data, columns):
Expand All @@ -112,14 +112,14 @@ def _set_columns_data(data, columns_data, columns):
data[columns] = columns_data

def _build_output_columns(self, data):
self._column_prefix = '#'.join(self._columns)
self._output_columns = list(self.get_output_types().keys())
self.column_prefix = '#'.join(self.columns)
self.output_columns = list(self.get_output_types().keys())

# make sure none of the generated `output_columns` exists in the data
data_columns = set(data.columns)
while data_columns & set(self._output_columns):
self._column_prefix += '#'
self._output_columns = list(self.get_output_types().keys())
while data_columns & set(self.output_columns):
self.column_prefix += '#'
self.output_columns = list(self.get_output_types().keys())

def _fit(self, columns_data):
"""Fit the transformer to the data.
Expand All @@ -141,7 +141,7 @@ def fit(self, data, columns):
"""
self._store_columns(columns, data)

columns_data = self._get_columns_data(data, self._columns)
columns_data = self._get_columns_data(data, self.columns)
self._fit(columns_data)

self._build_output_columns(data)
Expand All @@ -160,7 +160,7 @@ def _transform(self, columns_data):
raise NotImplementedError()

def transform(self, data):
"""Transform the `self._columns` of the `data`.
"""Transform the `self.columns` of the `data`.
Args:
data (pandas.DataFrame):
Expand All @@ -171,16 +171,16 @@ def transform(self, data):
The entire table, containing the transformed data.
"""
# if `data` doesn't have the columns that were fitted on, don't transform
if any(column not in data.columns for column in self._columns):
if any(column not in data.columns for column in self.columns):
return data

data = data.copy()

columns_data = self._get_columns_data(data, self._columns)
columns_data = self._get_columns_data(data, self.columns)
transformed_data = self._transform(columns_data)

self._set_columns_data(data, transformed_data, self._output_columns)
data.drop(self._columns, axis=1, inplace=True)
self._set_columns_data(data, transformed_data, self.output_columns)
data.drop(self.columns, axis=1, inplace=True)

return data

Expand Down Expand Up @@ -227,15 +227,15 @@ def reverse_transform(self, data):
The entire table, containing the reverted data.
"""
# if `data` doesn't have the columns that were transformed, don't reverse_transform
if any(column not in data.columns for column in self._output_columns):
if any(column not in data.columns for column in self.output_columns):
return data

data = data.copy()

columns_data = self._get_columns_data(data, self._output_columns)
columns_data = self._get_columns_data(data, self.output_columns)
reversed_data = self._reverse_transform(columns_data)

self._set_columns_data(data, reversed_data, self._columns)
data.drop(self._output_columns, axis=1, inplace=True)
self._set_columns_data(data, reversed_data, self.columns)
data.drop(self.output_columns, axis=1, inplace=True)

return data
35 changes: 29 additions & 6 deletions rdt/transformers/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,32 @@ class BooleanTransformer(BaseTransformer):
Defaults to ``None``.
"""

INPUT_TYPE = 'boolean'
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True

null_transformer = None

def __init__(self, nan=-1, null_column=None):
self.nan = nan
self.null_column = null_column

def fit(self, data):
def get_output_types(self):
"""Return the output types returned by this transformer.
Returns:
dict:
Mapping from the transformed column names to the produced data types.
"""
output_types = {
'value': 'float',
}
if self.null_transformer and self.null_transformer.creates_null_column():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)

def _fit(self, data):
"""Fit the transformer to the data.
Args:
Expand All @@ -46,7 +65,7 @@ def fit(self, data):
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer.fit(data)

def transform(self, data):
def _transform(self, data):
"""Transform boolean to float.
The boolean values will be replaced by the corresponding integer
Expand All @@ -57,7 +76,7 @@ def transform(self, data):
Data to transform.
Returns:
numpy.ndarray
pandas.DataFrame or pandas.Series
"""
if isinstance(data, np.ndarray):
data = pd.Series(data)
Expand All @@ -66,16 +85,20 @@ def transform(self, data):

return self.null_transformer.transform(data).astype(float)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Transform float values back to the original boolean values.
Args:
data (numpy.ndarray):
data (pandas.DataFrame or pandas.Series):
Data to revert.
Returns:
pandas.Series
pandas.Series:
Reverted data.
"""
if not isinstance(data, np.ndarray):
data = data.values

if self.nan is not None:
data = self.null_transformer.reverse_transform(data)

Expand Down
8 changes: 4 additions & 4 deletions tests/integration/transformers/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _fit(self, data):

def _transform(self, data):
out = pd.DataFrame(dict(zip(
self._output_columns,
self.output_columns,
[
data.astype(np.float).fillna(-1),
data.isnull().astype(np.float)
Expand All @@ -111,8 +111,8 @@ def _transform(self, data):
return out

def _reverse_transform(self, data):
out = data[self._output_columns[0]].round().astype(bool).astype(object)
out.iloc[data[self._output_columns[1]] == 1] = np.nan
out = data[self.output_columns[0]].round().astype(bool).astype(object)
out.iloc[data[self.output_columns[1]] == 1] = np.nan

return out

Expand Down Expand Up @@ -176,7 +176,7 @@ def _transform(self, data):
data = pd.to_datetime(data)

out = pd.DataFrame(dict(zip(
self._output_columns,
self.output_columns,
[
data.values.astype(np.float64),
data.isnull().astype(np.float64)
Expand Down
18 changes: 9 additions & 9 deletions tests/integration/transformers/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ def test_boolean_some_nans(self):
- The reversed transformed data
"""
# Setup
data = pd.Series([True, False, None, False])
data = pd.DataFrame([True, False, None, False], columns=['bool'])
transformer = BooleanTransformer()

# Run
transformer.fit(data)
transformer.fit(data, data.columns.to_list())
transformed = transformer.transform(data)
reverse = transformer.reverse_transform(transformed)

# Assert
pd.testing.assert_series_equal(reverse, data)
pd.testing.assert_frame_equal(reverse, data)

def test_boolean_all_nans(self):
"""Test BooleanTransformer on input with all nan values.
Expand All @@ -43,16 +43,16 @@ def test_boolean_all_nans(self):
- The reversed transformed data
"""
# Setup
data = pd.Series([None, None, None, None])
data = pd.DataFrame([None, None, None, None], columns=['bool'])
transformer = BooleanTransformer()

# Run
transformer.fit(data)
transformer.fit(data, data.columns.to_list())
transformed = transformer.transform(data)
reverse = transformer.reverse_transform(transformed)

# Assert
pd.testing.assert_series_equal(reverse, data)
pd.testing.assert_frame_equal(reverse, data)

def test_boolean_input_unchanged(self):
"""Test BooleanTransformer does not modify the transformed data when reversing.
Expand All @@ -69,15 +69,15 @@ def test_boolean_input_unchanged(self):
- The intermediate transformed data is unchanged.
"""
# Setup
data = pd.Series([True, False, None, False])
data = pd.DataFrame([True, False, None, False], columns=['bool'])
transformer = BooleanTransformer()

# Run
transformer.fit(data)
transformer.fit(data, data.columns.to_list())
transformed = transformer.transform(data)
unchanged_transformed = transformed.copy()
reverse = transformer.reverse_transform(transformed)

# Assert
pd.testing.assert_series_equal(reverse, data)
pd.testing.assert_frame_equal(reverse, data)
np.testing.assert_array_equal(unchanged_transformed, transformed)
39 changes: 29 additions & 10 deletions tests/performance/profiling.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Functions to profile performance of RDT Transformers."""

# pylint: disable=W0212

import timeit
import tracemalloc
from copy import deepcopy
Expand Down Expand Up @@ -35,7 +37,10 @@ def _set_memory_for_method(method, dataset, peak_memory):

def _profile_memory(method, dataset):
peak_memory = Value('i', 0)
profiling_process = Process(target=_set_memory_for_method, args=(method, dataset, peak_memory))
profiling_process = Process(
target=_set_memory_for_method,
args=(method, dataset, peak_memory)
)
profiling_process.start()
profiling_process.join()
return peak_memory.value
Expand Down Expand Up @@ -66,18 +71,32 @@ def profile_transformer(transformer, dataset_generator, transform_size, fit_size
"""
fit_size = fit_size or transform_size
fit_dataset = dataset_generator.generate(fit_size)
fit_time = _profile_time(transformer, 'fit', fit_dataset, copy=True)
fit_memory = _profile_memory(transformer.fit, fit_dataset)
transformer.fit(fit_dataset)

replace = transform_size > fit_size
transform_dataset = np.random.choice(fit_dataset, transform_size, replace=replace)
transform_time = _profile_time(transformer, 'transform', transform_dataset)
transform_memory = _profile_memory(transformer.transform, transform_dataset)

reverse_dataset = transformer.transform(transform_dataset)
reverse_time = _profile_time(transformer, 'reverse_transform', reverse_dataset)
reverse_memory = _profile_memory(transformer.reverse_transform, reverse_dataset)
try:
fit_time = _profile_time(transformer, 'fit', fit_dataset, copy=True)
fit_memory = _profile_memory(transformer.fit, fit_dataset)
transformer.fit(fit_dataset)

transform_time = _profile_time(transformer, 'transform', transform_dataset)
transform_memory = _profile_memory(transformer.transform, transform_dataset)

reverse_dataset = transformer.transform(transform_dataset)
reverse_time = _profile_time(transformer, 'reverse_transform', reverse_dataset)
reverse_memory = _profile_memory(transformer.reverse_transform, reverse_dataset)
except TypeError:
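# Temporarily support both old and new style transformers: the public
# ``fit`` of a new-style transformer also requires ``columns`` and raises
# TypeError here, so profile its private ``_fit`` / ``_transform`` /
# ``_reverse_transform`` hooks directly (hence the W0212 disable above).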
fit_time = _profile_time(transformer, '_fit', fit_dataset, copy=True)
fit_memory = _profile_memory(transformer._fit, fit_dataset)
transformer._fit(fit_dataset)

transform_time = _profile_time(transformer, '_transform', transform_dataset)
transform_memory = _profile_memory(transformer._transform, transform_dataset)

reverse_dataset = transformer._transform(transform_dataset)
reverse_time = _profile_time(transformer, '_reverse_transform', reverse_dataset)
reverse_memory = _profile_memory(transformer._reverse_transform, reverse_dataset)

print('Fit Time', fit_time)
print('Fit Memory', fit_memory)
Expand Down
Loading

0 comments on commit ef2bb78

Please sign in to comment.