From 6c4d9d6177d0a9d4302f6073a62fa419e0f92bb1 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Tue, 1 Mar 2022 09:27:03 -0800 Subject: [PATCH] Correctly sort the columns after transforming/reverse_transforming in the `HyperTransformer` (#410) * Correct order * Update logic * Revert changes * Sort output columns * Fix lint * Add docstrings * Add test case * Drop support for multiple column transformer * Remove support for multi-column transformers * Add add_field_to_set tests * Fix lint --- rdt/hyper_transformer.py | 13 +- tests/integration/test_hyper_transformer.py | 104 ------------- tests/unit/test_hyper_transformer.py | 154 ++++++++++---------- 3 files changed, 85 insertions(+), 186 deletions(-) diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index cf24207ff..ac012e5e3 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -289,7 +289,7 @@ def get_final_output_columns(self, field): else: final_outputs.append(output) - return final_outputs + return sorted(final_outputs, reverse=True) def get_transformer_tree_yaml(self): """Return yaml representation of transformers tree. @@ -371,10 +371,6 @@ def _fit_field_transformer(self, data, field, transformer): if self._field_in_data(output_field, data): self._fit_field_transformer(data, output_field, next_transformer) - else: - if output_name not in self._output_columns: - self._output_columns.append(output_name) - return data def _validate_all_fields_fitted(self): @@ -383,6 +379,12 @@ def _validate_all_fields_fitted(self): warnings.warn('The following fields were specified in the input arguments but not' + f'found in the data: {non_fitted_fields}') + def _sort_output_columns(self): + """Sort ``_output_columns`` to follow the same order as the ``_input_columns``.""" + for input_column in self._input_columns: + output_columns = self.get_final_output_columns(input_column) + self._output_columns.extend(output_columns) + def fit(self, data): """Fit the transformers to the data. @@ -409,6 +411,7 @@ def fit(self, data): self._validate_all_fields_fitted() self._fitted = True + self._sort_output_columns() def transform(self, data): """Transform the data. diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py index 0c07d60b4..4dafe3a73 100644 --- a/tests/integration/test_hyper_transformer.py +++ b/tests/integration/test_hyper_transformer.py @@ -47,44 +47,6 @@ def _reverse_transform(self, data): return data.astype('datetime64') -class DummyTransformerMultiColumn(BaseTransformer): - - INPUT_TYPE = 'datetime' - OUTPUT_TYPES = { - 'value': 'float', - } - - def _fit(self, data): - pass - - def _transform(self, data): - # Convert multiple columns into a single datetime - data.columns = [c.replace('_str.value', '') for c in data.columns] - data = pd.to_datetime(data) - - float_data = data.to_numpy().astype(np.float64) - data_is_nan = data.isna().to_numpy().astype(np.float64) - - output = dict(zip( - self.output_columns, - [float_data, data_is_nan] - )) - - output = pd.DataFrame(output).fillna(-1) - - return output - - def _reverse_transform(self, data): - datetimes = data.round().astype('datetime64[ns]') - out = pd.DataFrame({ - 'year': datetimes.dt.year, - 'month': datetimes.dt.month, - 'day': datetimes.dt.day, - }) - - return out - - TEST_DATA_INDEX = [4, 6, 3, 8, 'a', 1.0, 2.0, 3.0] @@ -239,72 +201,6 @@ def test_hypertransformer_field_transformers(): pd.testing.assert_frame_equal(expected_reversed, reverse_transformed) -def test_hypertransformer_field_transformers_multi_column_fields(): - """Test the HyperTransformer with ``field_transformers`` provided. - - This test will make sure that fields made up of multiple columns are - properly handled if they are specified in the ``field_transformers`` - argument. If the field has one column that is derived from another - transformer, then the other transformer should be transformed and - the multi-column field should be handled when all its columns are - present. - - Setup: - - A dummy transformer that takes in a column for day, year and - month and creates one numeric value from it. - - A dummy transformer that takes a string value and parses the - float from it. - - Input: - - A dict mapping each field to a dummy transformer. - - A dataframe with a numerical year, month and day column as well - as a string year, month and day column. - - Expected behavior: - - The transformed data should contain all the ML ready data. - - The reverse transformed data should be the same as the input. - """ - # Setup - data = pd.DataFrame({ - 'year': [2001, 2002, 2003], - 'month': [1, 2, 3], - 'day': [1, 2, 3], - 'year_str': ['2001', '2002', '2003'], - 'month_str': ['1', '2', '3'], - 'day_str': ['1', '2', '3'], - }) - field_transformers = { - ('year', 'month', 'day'): DummyTransformerMultiColumn, - 'year_str': DummyTransformerNumerical, - 'day_str': DummyTransformerNumerical, - 'month_str': DummyTransformerNumerical, - ('year_str.value', 'month_str.value', 'day_str.value'): DummyTransformerMultiColumn - } - expected_transformed = pd.DataFrame({ - 'year#month#day.value': [ - 9.783072e+17, - 1.012608e+18, - 1.046650e+18 - ], - 'year_str.value#month_str.value#day_str.value.value': [ - 9.783072e+17, - 1.012608e+18, - 1.046650e+18 - ] - }) - expected_reversed = data.copy() - - # Run - ht = HyperTransformer(field_transformers=field_transformers) - ht.fit(data) - transformed = ht.transform(data) - reverse_transformed = ht.reverse_transform(transformed) - - # Assert - pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(expected_reversed, reverse_transformed) - - def test_single_category(): """Test that categorical variables with a single value are supported.""" # Setup diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py index f5ad28187..90c136009 100644 --- a/tests/unit/test_hyper_transformer.py +++ b/tests/unit/test_hyper_transformer.py @@ -15,6 +15,52 @@ class TestHyperTransformer(TestCase): + def test__add_field_to_set_string(self): + """Test the ``_add_field_to_set`` method. + + Test that ``field`` is added to the ``field_set``. + + Input: + - a field name. + - a set of field names. + + Expected behavior: + - the passed field name should be added to the set of field names. + """ + # Setup + ht = HyperTransformer() + field = 'abc' + field_set = {'def', 'ghi'} + + # Run + ht._add_field_to_set(field, field_set) + + # Assert + assert field_set == {'abc', 'def', 'ghi'} + + def test__add_field_to_set_tuple(self): + """Test the ``_add_field_to_set`` method when given a tuple. + + Test that each ``field`` name is added to the ``field_set``. + + Input: + - a tuple of field names. + - a set of field names. + + Expected behavior: + - the passed field names should be added to the set of field names. + """ + # Setup + ht = HyperTransformer() + field = ('abc', 'jkl') + field_set = {'def', 'ghi'} + + # Run + ht._add_field_to_set(field, field_set) + + # Assert + assert field_set == {'abc', 'def', 'ghi', 'jkl'} + def test__validate_field_transformers(self): """Test the ``_validate_field_transformers`` method. @@ -322,7 +368,6 @@ def test__fit_field_transformer(self, get_transformer_instance_mock): Output: - A DataFrame with columns that result from transforming the outputs of the original transformer. - - ``_output_columns`` should add the appropriate column names. """ # Setup data = pd.DataFrame({'a': [1, 2, 3]}) @@ -366,7 +411,6 @@ def test__fit_field_transformer(self, get_transformer_instance_mock): 'a.out1': ['2', '4', '6'], 'a.out2': [1, 2, 3] }) - assert ht._output_columns == ['a.out1.value', 'a.out2'] pd.testing.assert_frame_equal(out, expected) transformer1.fit.assert_called_once() transformer1.transform.assert_called_once_with(data) @@ -443,81 +487,6 @@ def test__fit_field_transformer_multi_column_field_not_ready( transformer2.fit.assert_not_called() assert ht._transformers_sequence == [transformer1] - @patch('rdt.hyper_transformer.get_transformer_instance') - def test__fit_field_transformer_multi_column_field_ready(self, get_transformer_instance_mock): - """Test the ``_fit_field_transformer`` method. - - This tests that the ``_fit_field_transformer`` behaves as expected. - If the column is part of a multi-column field, and the other columns - are present in the data, then it should fit the next transformer. - It should also transform the data. - - Setup: - - A mock for ``get_transformer_instance``. - - A mock for the transformer returned by ``get_transformer_instance``. - The ``get_output_types`` method will return one output that is part of - a multi-column field. - - A mock for ``_multi_column_fields`` to return the multi-column field. - - Input: - - A DataFrame with the other columns in the multi-column field. - - A column name to fit the transformer to. - - A transformer. - - Output: - - A DataFrame with columns that result from transforming the - outputs of the original transformer. - - ``_output_columns`` should add the column name of the output of - the transformer used on the multi-column field. - """ - # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b.out1': ['4', '5', '6'] - }) - transformed_data1 = pd.DataFrame({ - 'a.out1': ['1', '2', '3'], - 'b.out1': ['4', '5', '6'] - }) - transformer1 = Mock() - transformer2 = Mock() - transformer1.get_output_types.return_value = { - 'a.out1': 'categorical' - } - transformer1.get_next_transformers.return_value = None - transformer1.transform.return_value = transformed_data1 - transformer2.get_output_types.return_value = { - 'a.out1#b.out1': 'numerical' - } - get_transformer_instance_mock.side_effect = [ - transformer1, - transformer2 - ] - ht = HyperTransformer() - ht._get_next_transformer = Mock() - ht._get_next_transformer.side_effect = [ - transformer2, - None - ] - ht._multi_column_fields = Mock() - ht._multi_column_fields.get.return_value = ('a.out1', 'b.out1') - - # Run - out = ht._fit_field_transformer(data, 'a', transformer1) - - # Assert - expected = pd.DataFrame({ - 'a.out1': ['1', '2', '3'], - 'b.out1': ['4', '5', '6'] - }) - assert ht._output_columns == ['a.out1#b.out1'] - pd.testing.assert_frame_equal(out, expected) - transformer1.fit.assert_called_once() - transformer1.transform.assert_called_once_with(data) - transformer2.fit.assert_called_once() - transformer2.transform.assert_called_once() - assert ht._transformers_sequence == [transformer1, transformer2] - @patch('rdt.hyper_transformer.warnings') def test__validate_all_fields_fitted(self, warnings_mock): """Test the ``_validate_all_fields_fitted`` method. @@ -550,6 +519,35 @@ def test__validate_all_fields_fitted(self, warnings_mock): # Assert warnings_mock.warn.assert_called_once() + def test__sort_output_columns(self): + """Test the ``_sort_output_columns`` method. + + Assert the method correctly sorts the ``_output_columns`` attribute + according to ``_input_columns``. + + Setup: + - Initialize the ``HyperTransformer`` with: + - A mock for the ``get_final_output_columns`` method, with ``side_effect`` + set to the lists of generated output columns for each input column. + - A list of columns names for ``_input_columns``. + + Expected behavior: + - ``_output_columns`` should be sorted according to the ``_input_columns``. + """ + # Setup + ht = HyperTransformer() + ht.get_final_output_columns = Mock() + ht.get_final_output_columns.side_effect = [ + ['a.is_null'], ['b.value', 'b.is_null'], ['c.value'] + ] + ht._input_columns = ['a', 'b', 'c'] + + # Run + ht._sort_output_columns() + + # Assert + assert ht._output_columns == ['a.is_null', 'b.value', 'b.is_null', 'c.value'] + def get_data(self): return pd.DataFrame({ 'integer': [1, 2, 1, 3], @@ -641,6 +639,7 @@ def test_fit(self, get_default_transformer_mock): ht._field_in_set = Mock() ht._field_in_set.side_effect = [True, True, False, False, False] ht._validate_all_fields_fitted = Mock() + ht._sort_output_columns = Mock() # Run ht.fit(data) @@ -654,6 +653,7 @@ def test_fit(self, get_default_transformer_mock): call(data, 'datetime', datetime_transformer) ]) ht._validate_all_fields_fitted.assert_called_once() + ht._sort_output_columns.assert_called_once() def test_transform(self): """Test the ``transform`` method.