Skip to content

Commit

Permalink
Correctly sort the columns after transforming/reverse_transforming in…
Browse files Browse the repository at this point in the history
… the `HyperTransformer` (#410)

* Correct order

* Update logic

* Revert changes

* Sort output columns

* Fix lint

* Add docstrings

* Add test case

* Drop support for multiple column transformer

* Remove support for multi-column transformers

* Add add_field_to_set tests

* Fix lint
  • Loading branch information
fealho authored Mar 1, 2022
1 parent 1af42fb commit 6c4d9d6
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 186 deletions.
13 changes: 8 additions & 5 deletions rdt/hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def get_final_output_columns(self, field):
else:
final_outputs.append(output)

return final_outputs
return sorted(final_outputs, reverse=True)

def get_transformer_tree_yaml(self):
"""Return yaml representation of transformers tree.
Expand Down Expand Up @@ -371,10 +371,6 @@ def _fit_field_transformer(self, data, field, transformer):
if self._field_in_data(output_field, data):
self._fit_field_transformer(data, output_field, next_transformer)

else:
if output_name not in self._output_columns:
self._output_columns.append(output_name)

return data

def _validate_all_fields_fitted(self):
Expand All @@ -383,6 +379,12 @@ def _validate_all_fields_fitted(self):
warnings.warn('The following fields were specified in the input arguments but not'
+ f'found in the data: {non_fitted_fields}')

def _sort_output_columns(self):
"""Sort ``_output_columns`` to follow the same order as the ``_input_columns``."""
for input_column in self._input_columns:
output_columns = self.get_final_output_columns(input_column)
self._output_columns.extend(output_columns)

def fit(self, data):
"""Fit the transformers to the data.
Expand All @@ -409,6 +411,7 @@ def fit(self, data):

self._validate_all_fields_fitted()
self._fitted = True
self._sort_output_columns()

def transform(self, data):
"""Transform the data.
Expand Down
104 changes: 0 additions & 104 deletions tests/integration/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,44 +47,6 @@ def _reverse_transform(self, data):
return data.astype('datetime64')


class DummyTransformerMultiColumn(BaseTransformer):

INPUT_TYPE = 'datetime'
OUTPUT_TYPES = {
'value': 'float',
}

def _fit(self, data):
pass

def _transform(self, data):
# Convert multiple columns into a single datetime
data.columns = [c.replace('_str.value', '') for c in data.columns]
data = pd.to_datetime(data)

float_data = data.to_numpy().astype(np.float64)
data_is_nan = data.isna().to_numpy().astype(np.float64)

output = dict(zip(
self.output_columns,
[float_data, data_is_nan]
))

output = pd.DataFrame(output).fillna(-1)

return output

def _reverse_transform(self, data):
datetimes = data.round().astype('datetime64[ns]')
out = pd.DataFrame({
'year': datetimes.dt.year,
'month': datetimes.dt.month,
'day': datetimes.dt.day,
})

return out


TEST_DATA_INDEX = [4, 6, 3, 8, 'a', 1.0, 2.0, 3.0]


Expand Down Expand Up @@ -239,72 +201,6 @@ def test_hypertransformer_field_transformers():
pd.testing.assert_frame_equal(expected_reversed, reverse_transformed)


def test_hypertransformer_field_transformers_multi_column_fields():
"""Test the HyperTransformer with ``field_transformers`` provided.
This test will make sure that fields made up of multiple columns are
properly handled if they are specified in the ``field_transformers``
argument. If the field has one column that is derived from another
transformer, then the other transformer should be transformed and
the multi-column field should be handled when all its columns are
present.
Setup:
- A dummy transformer that takes in a column for day, year and
month and creates one numeric value from it.
- A dummy transformer that takes a string value and parses the
float from it.
Input:
- A dict mapping each field to a dummy transformer.
- A dataframe with a numerical year, month and day column as well
as a string year, month and day column.
Expected behavior:
- The transformed data should contain all the ML ready data.
- The reverse transformed data should be the same as the input.
"""
# Setup
data = pd.DataFrame({
'year': [2001, 2002, 2003],
'month': [1, 2, 3],
'day': [1, 2, 3],
'year_str': ['2001', '2002', '2003'],
'month_str': ['1', '2', '3'],
'day_str': ['1', '2', '3'],
})
field_transformers = {
('year', 'month', 'day'): DummyTransformerMultiColumn,
'year_str': DummyTransformerNumerical,
'day_str': DummyTransformerNumerical,
'month_str': DummyTransformerNumerical,
('year_str.value', 'month_str.value', 'day_str.value'): DummyTransformerMultiColumn
}
expected_transformed = pd.DataFrame({
'year#month#day.value': [
9.783072e+17,
1.012608e+18,
1.046650e+18
],
'year_str.value#month_str.value#day_str.value.value': [
9.783072e+17,
1.012608e+18,
1.046650e+18
]
})
expected_reversed = data.copy()

# Run
ht = HyperTransformer(field_transformers=field_transformers)
ht.fit(data)
transformed = ht.transform(data)
reverse_transformed = ht.reverse_transform(transformed)

# Assert
pd.testing.assert_frame_equal(transformed, expected_transformed)
pd.testing.assert_frame_equal(expected_reversed, reverse_transformed)


def test_single_category():
"""Test that categorical variables with a single value are supported."""
# Setup
Expand Down
154 changes: 77 additions & 77 deletions tests/unit/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,52 @@

class TestHyperTransformer(TestCase):

def test__add_field_to_set_string(self):
"""Test the ``_add_field_to_set`` method.
Test that ``field`` is added to the ``field_set``.
Input:
- a field name.
- a set of field names.
Expected behavior:
- the passed field name should be added to the set of field names.
"""
# Setup
ht = HyperTransformer()
field = 'abc'
field_set = {'def', 'ghi'}

# Run
ht._add_field_to_set(field, field_set)

# Assert
assert field_set == {'abc', 'def', 'ghi'}

def test__add_field_to_set_tuple(self):
"""Test the ``_add_field_to_set`` method when given a tuple.
Test that each ``field`` name is added to the ``field_set``.
Input:
- a tuple of field names.
- a set of field names.
Expected behavior:
- the passed field names should be added to the set of field names.
"""
# Setup
ht = HyperTransformer()
field = ('abc', 'jkl')
field_set = {'def', 'ghi'}

# Run
ht._add_field_to_set(field, field_set)

# Assert
assert field_set == {'abc', 'def', 'ghi', 'jkl'}

def test__validate_field_transformers(self):
"""Test the ``_validate_field_transformers`` method.
Expand Down Expand Up @@ -322,7 +368,6 @@ def test__fit_field_transformer(self, get_transformer_instance_mock):
Output:
- A DataFrame with columns that result from transforming the
outputs of the original transformer.
- ``_output_columns`` should add the appropriate column names.
"""
# Setup
data = pd.DataFrame({'a': [1, 2, 3]})
Expand Down Expand Up @@ -366,7 +411,6 @@ def test__fit_field_transformer(self, get_transformer_instance_mock):
'a.out1': ['2', '4', '6'],
'a.out2': [1, 2, 3]
})
assert ht._output_columns == ['a.out1.value', 'a.out2']
pd.testing.assert_frame_equal(out, expected)
transformer1.fit.assert_called_once()
transformer1.transform.assert_called_once_with(data)
Expand Down Expand Up @@ -443,81 +487,6 @@ def test__fit_field_transformer_multi_column_field_not_ready(
transformer2.fit.assert_not_called()
assert ht._transformers_sequence == [transformer1]

@patch('rdt.hyper_transformer.get_transformer_instance')
def test__fit_field_transformer_multi_column_field_ready(self, get_transformer_instance_mock):
"""Test the ``_fit_field_transformer`` method.
This tests that the ``_fit_field_transformer`` behaves as expected.
If the column is part of a multi-column field, and the other columns
are present in the data, then it should fit the next transformer.
It should also transform the data.
Setup:
- A mock for ``get_transformer_instance``.
- A mock for the transformer returned by ``get_transformer_instance``.
The ``get_output_types`` method will return one output that is part of
a multi-column field.
- A mock for ``_multi_column_fields`` to return the multi-column field.
Input:
- A DataFrame with the other columns in the multi-column field.
- A column name to fit the transformer to.
- A transformer.
Output:
- A DataFrame with columns that result from transforming the
outputs of the original transformer.
- ``_output_columns`` should add the column name of the output of
the transformer used on the multi-column field.
"""
# Setup
data = pd.DataFrame({
'a': [1, 2, 3],
'b.out1': ['4', '5', '6']
})
transformed_data1 = pd.DataFrame({
'a.out1': ['1', '2', '3'],
'b.out1': ['4', '5', '6']
})
transformer1 = Mock()
transformer2 = Mock()
transformer1.get_output_types.return_value = {
'a.out1': 'categorical'
}
transformer1.get_next_transformers.return_value = None
transformer1.transform.return_value = transformed_data1
transformer2.get_output_types.return_value = {
'a.out1#b.out1': 'numerical'
}
get_transformer_instance_mock.side_effect = [
transformer1,
transformer2
]
ht = HyperTransformer()
ht._get_next_transformer = Mock()
ht._get_next_transformer.side_effect = [
transformer2,
None
]
ht._multi_column_fields = Mock()
ht._multi_column_fields.get.return_value = ('a.out1', 'b.out1')

# Run
out = ht._fit_field_transformer(data, 'a', transformer1)

# Assert
expected = pd.DataFrame({
'a.out1': ['1', '2', '3'],
'b.out1': ['4', '5', '6']
})
assert ht._output_columns == ['a.out1#b.out1']
pd.testing.assert_frame_equal(out, expected)
transformer1.fit.assert_called_once()
transformer1.transform.assert_called_once_with(data)
transformer2.fit.assert_called_once()
transformer2.transform.assert_called_once()
assert ht._transformers_sequence == [transformer1, transformer2]

@patch('rdt.hyper_transformer.warnings')
def test__validate_all_fields_fitted(self, warnings_mock):
"""Test the ``_validate_all_fields_fitted`` method.
Expand Down Expand Up @@ -550,6 +519,35 @@ def test__validate_all_fields_fitted(self, warnings_mock):
# Assert
warnings_mock.warn.assert_called_once()

def test__sort_output_columns(self):
"""Test the ``_sort_output_columns`` method.
Assert the method correctly sorts the ``_output_columns`` attribute
according to ``_input_columns``.
Setup:
- Initialize the ``HyperTransformer`` with:
- A mock for the ``get_final_output_columns`` method, with ``side_effect``
set to the lists of generated output columns for each input column.
- A list of columns names for ``_input_columns``.
Expected behavior:
- ``_output_columns`` should be sorted according to the ``_input_columns``.
"""
# Setup
ht = HyperTransformer()
ht.get_final_output_columns = Mock()
ht.get_final_output_columns.side_effect = [
['a.is_null'], ['b.value', 'b.is_null'], ['c.value']
]
ht._input_columns = ['a', 'b', 'c']

# Run
ht._sort_output_columns()

# Assert
assert ht._output_columns == ['a.is_null', 'b.value', 'b.is_null', 'c.value']

def get_data(self):
return pd.DataFrame({
'integer': [1, 2, 1, 3],
Expand Down Expand Up @@ -641,6 +639,7 @@ def test_fit(self, get_default_transformer_mock):
ht._field_in_set = Mock()
ht._field_in_set.side_effect = [True, True, False, False, False]
ht._validate_all_fields_fitted = Mock()
ht._sort_output_columns = Mock()

# Run
ht.fit(data)
Expand All @@ -654,6 +653,7 @@ def test_fit(self, get_default_transformer_mock):
call(data, 'datetime', datetime_transformer)
])
ht._validate_all_fields_fitted.assert_called_once()
ht._sort_output_columns.assert_called_once()

def test_transform(self):
"""Test the ``transform`` method.
Expand Down

0 comments on commit 6c4d9d6

Please sign in to comment.