def _get_output_to_property(self, property_):
    """Map each output column to its value for one property.

    Args:
        property_ (str):
            Name of the property to look up in every entry of
            ``self.output_properties`` (for example ``'sdtype'``).

    Returns:
        dict:
            Dictionary mapping output column names to the value stored under
            ``property_``. Output columns that do not define ``property_``
            are left out of the result.
    """
    return {
        output_column: properties[property_]
        for output_column, properties in self.output_properties.items()
        # An output column is not required to declare every property; skip
        # the ones that do not instead of failing with ``KeyError``.
        if property_ in properties
    }
+ """ + self.fit(data, columns_to_sdtypes) + return self.transform(data) diff --git a/tests/integration/transformers/test_base.py b/tests/integration/transformers/test_base.py index de96dfb99..defa5d901 100644 --- a/tests/integration/transformers/test_base.py +++ b/tests/integration/transformers/test_base.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from rdt.transformers.base import BaseTransformer +from rdt.transformers.base import BaseMultiColumnTransformer, BaseTransformer def test_dummy_transformer_series_output(): @@ -129,3 +129,172 @@ def _reverse_transform(self, data): }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data) + + +def test_multi_column_transformer_same_number_of_columns_input_output(): + """Test a multi-column transformer when the same of input and output columns.""" + # Setup + class AdditionTransformer(BaseMultiColumnTransformer): + """This transformer takes 3 columns and return the cumulative sum of each row.""" + def _fit(self, columns_data, columns_to_sdtypes): + self.output_properties = { + column: {'sdtype': 'numerical'} for column in self.columns + } + self.dtypes = columns_data.dtypes + + def _transform(self, data): + return data.cumsum(axis=1) + + def _reverse_transform(self, data): + result = data.diff(axis=1) + result.iloc[:, 0] = data.iloc[:, 0] + + return result.astype(self.dtypes) + + data_test = pd.DataFrame({ + 'col_1': [1, 2, 3], + 'col_2': [10, 20, 30], + 'col_3': [100, 200, 300] + }) + + column_to_sdtype = { + 'col_1': 'numerical', + 'col_2': 'numerical', + 'col_3': 'numerical' + } + transformer = AdditionTransformer() + + # Run + transformed = transformer.fit_transform(data_test, column_to_sdtype) + reverse = transformer.reverse_transform(transformed) + + # Assert + expected_transform = pd.DataFrame({ + 'col_1': [1, 2, 3], + 'col_2': [11, 22, 33], + 'col_3': [111, 222, 333] + }) + pd.testing.assert_frame_equal(expected_transform, transformed) + 
def test_multi_column_transformer_less_output_than_input_columns():
    """Check a multi-column transformer whose output has fewer columns than its input."""
    # Setup
    class ConcatenateTransformer(BaseMultiColumnTransformer):
        """Merge four input columns into two ``#``-separated output columns.

        The first two input columns are joined into one output column and the
        last two input columns into the other.
        """

        def _fit(self, columns_data, columns_to_sdtypes):
            first, second, third, fourth = (self.columns[index] for index in range(4))
            self.name_1 = first + '#' + second
            self.name_2 = third + '#' + fourth
            self.output_properties = {
                name: {'sdtype': 'categorical'}
                for name in (self.name_1, self.name_2)
            }
            # Remember the input dtypes so the reverse transform can restore them.
            self.dtypes = columns_data.dtypes

        def _transform(self, data):
            first_pair = data.iloc[:, 0] + '#' + data.iloc[:, 1]
            data[self.name_1] = first_pair
            second_pair = data.iloc[:, 2] + '#' + data.iloc[:, 3]
            data[self.name_2] = second_pair

            return data.drop(columns=self.columns)

        def _reverse_transform(self, data):
            merged_names = list(data.columns)
            restored = data.copy()
            for position in (0, 1):
                merged = merged_names[position]
                # The merged column name itself encodes the two original names.
                left, right = merged.split('#')
                restored[[left, right]] = restored[merged].str.split('#', expand=True)

            return restored.astype(self.dtypes).drop(columns=merged_names)

    data_test = pd.DataFrame({
        'col_1': ['A', 'B', 'C'],
        'col_2': ['D', 'E', 'F'],
        'col_3': ['G', 'H', 'I'],
        'col_4': ['J', 'K', 'L'],
    })
    column_to_sdtype = {column: 'categorical' for column in data_test.columns}
    transformer = ConcatenateTransformer()

    # Run
    transformer.fit(data_test, column_to_sdtype)
    transformed = transformer.transform(data_test)
    reverse = transformer.reverse_transform(transformed)

    # Assert
    expected_transform = pd.DataFrame({
        'col_1#col_2': ['A#D', 'B#E', 'C#F'],
        'col_3#col_4': ['G#J', 'H#K', 'I#L'],
    })
    pd.testing.assert_frame_equal(expected_transform, transformed)
    pd.testing.assert_frame_equal(reverse, data_test)
def test_fit_transform(self):
    """Test the ``fit_transform`` method.

    ``fit`` and ``transform`` are replaced with mocks, so this test only
    checks that ``fit_transform`` calls each of them exactly once with the
    expected arguments and hands back whatever ``transform`` returned.
    """
    # Setup
    transformer = BaseMultiColumnTransformer()
    columns_to_sdtypes = {
        'a': 'numerical',
        'b': 'categorical',
        'c': 'boolean'
    }
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': ['a', 'b', 'c'],
    })
    transformer.columns = ['a', 'b']
    mock_fit = Mock()
    mock_transform = Mock(return_value=data)
    transformer.fit = mock_fit
    transformer.transform = mock_transform

    # Run
    output = transformer.fit_transform(data, columns_to_sdtypes)

    # Assert
    mock_fit.assert_called_once_with(data, columns_to_sdtypes)
    mock_transform.assert_called_once_with(data)
    # Without this check the test would still pass if ``fit_transform``
    # dropped its ``return`` statement.
    assert output is mock_transform.return_value