Correctly sort the columns after transforming/reverse_transforming in…

… the `HyperTransformer` (#410) * Correct order * Update logic * Revert changes * Sort output columns * Fix lint * Add docstrings * Add test case * Drop support for multiple column transformer * Remove support for multi-column transformers * Add add_field_to_set tests * Fix lint
sdv-dev · Mar 1, 2022 · 6c4d9d6 · 6c4d9d6
1 parent 1af42fb
commit 6c4d9d6
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 186 deletions.
diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py
@@ -289,7 +289,7 @@ def get_final_output_columns(self, field):
             else:
                 final_outputs.append(output)
 
-        return final_outputs
+        return sorted(final_outputs, reverse=True)
 
     def get_transformer_tree_yaml(self):
         """Return yaml representation of transformers tree.
@@ -371,10 +371,6 @@ def _fit_field_transformer(self, data, field, transformer):
                 if self._field_in_data(output_field, data):
                     self._fit_field_transformer(data, output_field, next_transformer)
 
-            else:
-                if output_name not in self._output_columns:
-                    self._output_columns.append(output_name)
-
         return data
 
     def _validate_all_fields_fitted(self):
@@ -383,6 +379,12 @@ def _validate_all_fields_fitted(self):
             warnings.warn('The following fields were specified in the input arguments but not'
                           + f'found in the data: {non_fitted_fields}')
 
+    def _sort_output_columns(self):
+        """Sort ``_output_columns`` to follow the same order as the ``_input_columns``."""
+        for input_column in self._input_columns:
+            output_columns = self.get_final_output_columns(input_column)
+            self._output_columns.extend(output_columns)
+
     def fit(self, data):
         """Fit the transformers to the data.
 
@@ -409,6 +411,7 @@ def fit(self, data):
 
         self._validate_all_fields_fitted()
         self._fitted = True
+        self._sort_output_columns()
 
     def transform(self, data):
         """Transform the data.

diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py
@@ -47,44 +47,6 @@ def _reverse_transform(self, data):
         return data.astype('datetime64')
 
 
-class DummyTransformerMultiColumn(BaseTransformer):
-
-    INPUT_TYPE = 'datetime'
-    OUTPUT_TYPES = {
-        'value': 'float',
-    }
-
-    def _fit(self, data):
-        pass
-
-    def _transform(self, data):
-        # Convert multiple columns into a single datetime
-        data.columns = [c.replace('_str.value', '') for c in data.columns]
-        data = pd.to_datetime(data)
-
-        float_data = data.to_numpy().astype(np.float64)
-        data_is_nan = data.isna().to_numpy().astype(np.float64)
-
-        output = dict(zip(
-            self.output_columns,
-            [float_data, data_is_nan]
-        ))
-
-        output = pd.DataFrame(output).fillna(-1)
-
-        return output
-
-    def _reverse_transform(self, data):
-        datetimes = data.round().astype('datetime64[ns]')
-        out = pd.DataFrame({
-            'year': datetimes.dt.year,
-            'month': datetimes.dt.month,
-            'day': datetimes.dt.day,
-        })
-
-        return out
-
-
 TEST_DATA_INDEX = [4, 6, 3, 8, 'a', 1.0, 2.0, 3.0]
 
 
@@ -239,72 +201,6 @@ def test_hypertransformer_field_transformers():
     pd.testing.assert_frame_equal(expected_reversed, reverse_transformed)
 
 
-def test_hypertransformer_field_transformers_multi_column_fields():
-    """Test the HyperTransformer with ``field_transformers`` provided.
-
-    This test will make sure that fields made up of multiple columns are
-    properly handled if they are specified in the ``field_transformers``
-    argument. If the field has one column that is derived from another
-    transformer, then the other transformer should be transformed and
-    the multi-column field should be handled when all its columns are
-    present.
-
-    Setup:
-        - A dummy transformer that takes in a column for day, year and
-        month and creates one numeric value from it.
-        - A dummy transformer that takes a string value and parses the
-        float from it.
-
-    Input:
-        - A dict mapping each field to a dummy transformer.
-        - A dataframe with a numerical year, month and day column as well
-        as a string year, month and day column.
-
-    Expected behavior:
-        - The transformed data should contain all the ML ready data.
-        - The reverse transformed data should be the same as the input.
-    """
-    # Setup
-    data = pd.DataFrame({
-        'year': [2001, 2002, 2003],
-        'month': [1, 2, 3],
-        'day': [1, 2, 3],
-        'year_str': ['2001', '2002', '2003'],
-        'month_str': ['1', '2', '3'],
-        'day_str': ['1', '2', '3'],
-    })
-    field_transformers = {
-        ('year', 'month', 'day'): DummyTransformerMultiColumn,
-        'year_str': DummyTransformerNumerical,
-        'day_str': DummyTransformerNumerical,
-        'month_str': DummyTransformerNumerical,
-        ('year_str.value', 'month_str.value', 'day_str.value'): DummyTransformerMultiColumn
-    }
-    expected_transformed = pd.DataFrame({
-        'year#month#day.value': [
-            9.783072e+17,
-            1.012608e+18,
-            1.046650e+18
-        ],
-        'year_str.value#month_str.value#day_str.value.value': [
-            9.783072e+17,
-            1.012608e+18,
-            1.046650e+18
-        ]
-    })
-    expected_reversed = data.copy()
-
-    # Run
-    ht = HyperTransformer(field_transformers=field_transformers)
-    ht.fit(data)
-    transformed = ht.transform(data)
-    reverse_transformed = ht.reverse_transform(transformed)
-
-    # Assert
-    pd.testing.assert_frame_equal(transformed, expected_transformed)
-    pd.testing.assert_frame_equal(expected_reversed, reverse_transformed)
-
-
 def test_single_category():
     """Test that categorical variables with a single value are supported."""
     # Setup

diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py
@@ -15,6 +15,52 @@
 
 class TestHyperTransformer(TestCase):
 
+    def test__add_field_to_set_string(self):
+        """Test the ``_add_field_to_set`` method.
+
+        Test that ``field`` is added to the ``field_set``.
+
+        Input:
+            - a field name.
+            - a set of field names.
+
+        Expected behavior:
+            - the passed field name should be added to the set of field names.
+        """
+        # Setup
+        ht = HyperTransformer()
+        field = 'abc'
+        field_set = {'def', 'ghi'}
+
+        # Run
+        ht._add_field_to_set(field, field_set)
+
+        # Assert
+        assert field_set == {'abc', 'def', 'ghi'}
+
+    def test__add_field_to_set_tuple(self):
+        """Test the ``_add_field_to_set`` method when given a tuple.
+
+        Test that each ``field`` name is added to the ``field_set``.
+
+        Input:
+            - a tuple of field names.
+            - a set of field names.
+
+        Expected behavior:
+            - the passed field names should be added to the set of field names.
+        """
+        # Setup
+        ht = HyperTransformer()
+        field = ('abc', 'jkl')
+        field_set = {'def', 'ghi'}
+
+        # Run
+        ht._add_field_to_set(field, field_set)
+
+        # Assert
+        assert field_set == {'abc', 'def', 'ghi', 'jkl'}
+
     def test__validate_field_transformers(self):
         """Test the ``_validate_field_transformers`` method.
 
@@ -322,7 +368,6 @@ def test__fit_field_transformer(self, get_transformer_instance_mock):
         Output:
             - A DataFrame with columns that result from transforming the
             outputs of the original transformer.
-            - ``_output_columns`` should add the appropriate column names.
         """
         # Setup
         data = pd.DataFrame({'a': [1, 2, 3]})
@@ -366,7 +411,6 @@ def test__fit_field_transformer(self, get_transformer_instance_mock):
             'a.out1': ['2', '4', '6'],
             'a.out2': [1, 2, 3]
         })
-        assert ht._output_columns == ['a.out1.value', 'a.out2']
         pd.testing.assert_frame_equal(out, expected)
         transformer1.fit.assert_called_once()
         transformer1.transform.assert_called_once_with(data)
@@ -443,81 +487,6 @@ def test__fit_field_transformer_multi_column_field_not_ready(
         transformer2.fit.assert_not_called()
         assert ht._transformers_sequence == [transformer1]
 
-    @patch('rdt.hyper_transformer.get_transformer_instance')
-    def test__fit_field_transformer_multi_column_field_ready(self, get_transformer_instance_mock):
-        """Test the ``_fit_field_transformer`` method.
-
-        This tests that the ``_fit_field_transformer`` behaves as expected.
-        If the column is part of a multi-column field, and the other columns
-        are present in the data, then it should fit the next transformer.
-        It should also transform the data.
-
-        Setup:
-            - A mock for ``get_transformer_instance``.
-            - A mock for the transformer returned by ``get_transformer_instance``.
-            The ``get_output_types`` method will return one output that is part of
-            a multi-column field.
-            - A mock for ``_multi_column_fields`` to return the multi-column field.
-
-        Input:
-            - A DataFrame with the other columns in the multi-column field.
-            - A column name to fit the transformer to.
-            - A transformer.
-
-        Output:
-            - A DataFrame with columns that result from transforming the
-            outputs of the original transformer.
-            - ``_output_columns`` should add the column name of the output of
-            the transformer used on the multi-column field.
-        """
-        # Setup
-        data = pd.DataFrame({
-            'a': [1, 2, 3],
-            'b.out1': ['4', '5', '6']
-        })
-        transformed_data1 = pd.DataFrame({
-            'a.out1': ['1', '2', '3'],
-            'b.out1': ['4', '5', '6']
-        })
-        transformer1 = Mock()
-        transformer2 = Mock()
-        transformer1.get_output_types.return_value = {
-            'a.out1': 'categorical'
-        }
-        transformer1.get_next_transformers.return_value = None
-        transformer1.transform.return_value = transformed_data1
-        transformer2.get_output_types.return_value = {
-            'a.out1#b.out1': 'numerical'
-        }
-        get_transformer_instance_mock.side_effect = [
-            transformer1,
-            transformer2
-        ]
-        ht = HyperTransformer()
-        ht._get_next_transformer = Mock()
-        ht._get_next_transformer.side_effect = [
-            transformer2,
-            None
-        ]
-        ht._multi_column_fields = Mock()
-        ht._multi_column_fields.get.return_value = ('a.out1', 'b.out1')
-
-        # Run
-        out = ht._fit_field_transformer(data, 'a', transformer1)
-
-        # Assert
-        expected = pd.DataFrame({
-            'a.out1': ['1', '2', '3'],
-            'b.out1': ['4', '5', '6']
-        })
-        assert ht._output_columns == ['a.out1#b.out1']
-        pd.testing.assert_frame_equal(out, expected)
-        transformer1.fit.assert_called_once()
-        transformer1.transform.assert_called_once_with(data)
-        transformer2.fit.assert_called_once()
-        transformer2.transform.assert_called_once()
-        assert ht._transformers_sequence == [transformer1, transformer2]
-
     @patch('rdt.hyper_transformer.warnings')
     def test__validate_all_fields_fitted(self, warnings_mock):
         """Test the ``_validate_all_fields_fitted`` method.
@@ -550,6 +519,35 @@ def test__validate_all_fields_fitted(self, warnings_mock):
         # Assert
         warnings_mock.warn.assert_called_once()
 
+    def test__sort_output_columns(self):
+        """Test the ``_sort_output_columns`` method.
+
+        Assert the method correctly sorts the ``_output_columns`` attribute
+        according to ``_input_columns``.
+
+        Setup:
+            - Initialize the ``HyperTransformer`` with:
+                - A mock for the ``get_final_output_columns`` method, with ``side_effect``
+                set to the lists of generated output columns for each input column.
+                - A list of columns names for ``_input_columns``.
+
+        Expected behavior:
+            - ``_output_columns`` should be sorted according to the ``_input_columns``.
+        """
+        # Setup
+        ht = HyperTransformer()
+        ht.get_final_output_columns = Mock()
+        ht.get_final_output_columns.side_effect = [
+            ['a.is_null'], ['b.value', 'b.is_null'], ['c.value']
+        ]
+        ht._input_columns = ['a', 'b', 'c']
+
+        # Run
+        ht._sort_output_columns()
+
+        # Assert
+        assert ht._output_columns == ['a.is_null', 'b.value', 'b.is_null', 'c.value']
+
     def get_data(self):
         return pd.DataFrame({
             'integer': [1, 2, 1, 3],
@@ -641,6 +639,7 @@ def test_fit(self, get_default_transformer_mock):
         ht._field_in_set = Mock()
         ht._field_in_set.side_effect = [True, True, False, False, False]
         ht._validate_all_fields_fitted = Mock()
+        ht._sort_output_columns = Mock()
 
         # Run
         ht.fit(data)
@@ -654,6 +653,7 @@ def test_fit(self, get_default_transformer_mock):
             call(data, 'datetime', datetime_transformer)
         ])
         ht._validate_all_fields_fitted.assert_called_once()
+        ht._sort_output_columns.assert_called_once()
 
     def test_transform(self):
         """Test the ``transform`` method.