From 054d9eb667cd46235b4f195ab703c204ca480d7f Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 7 Sep 2023 09:39:59 +0200 Subject: [PATCH] ordered_columns + prefix --- rdt/transformers/base.py | 74 ++++++++++--- tests/integration/transformers/test_base.py | 113 +++++++++++--------- tests/unit/transformers/test_base.py | 99 +++++++++++++---- 3 files changed, 200 insertions(+), 86 deletions(-) diff --git a/rdt/transformers/base.py b/rdt/transformers/base.py index 51f7fa7fe..1dadbcaa2 100644 --- a/rdt/transformers/base.py +++ b/rdt/transformers/base.py @@ -486,8 +486,19 @@ class BaseMultiColumnTransformer(BaseTransformer): The ``BaseMultiColumnTransformer`` class contains methods that must be implemented in order to create a new multi column transformer. + + Attributes: + ordered_columns (tuple): + Order of the columns to be used for the transformer. + prefixes (dict): + Dictionary mapping each output column to its prefix. """ + def __init__(self): + super().__init__() + self.ordered_columns = None + self.prefixes = {} + def get_input_column(self): """Override ``get_input_column`` method from ``BaseTransformer``. @@ -510,52 +521,85 @@ def get_input_columns(self): def _get_output_to_property(self, property_): result = { - output_column: properties[property_] + f'{self.prefixes[output_column]}.{output_column}': properties[property_] for output_column, properties in self.output_properties.items() } return result - def _fit(self, columns_data, columns_to_sdtypes): + def _validate_ordered_columns(self, data, ordered_columns): + """Check that all the columns in ``ordered_columns`` are present in the data.""" + missing = set(ordered_columns) - set(data.columns) + if missing: + missing_to_print = ', '.join(missing) + raise KeyError(f'Columns ({missing_to_print}) are not present in the data.') + + def _generate_prefixes(self, ordered_columns): + """Generate prefixes for the output columns to specify which column they come from. 
+ + Returns: + dict: + Dictionary mapping each output column to its prefix. + The key is the output column name and the value is the prefix. + """ + raise NotImplementedError() + + def _validate_prefixes(self, ordered_columns): + """Check that the prefixes are valid. + + Every prefix must include the name of at least one column in the data. + """ + for prefix in self.prefixes.values(): + if not any(column in prefix for column in ordered_columns): + raise ValueError( + f"The prefix '{prefix}' does not include the name of any column in the data." + ) + + def _fit(self, data, ordered_columns): """Fit the transformer to the data. Args: - columns_data (pandas.DataFrame): + data (pandas.DataFrame): Data to transform. - columns_to_sdtypes (dict): - Dictionary mapping column names to their sdtypes. + ordered_columns (tuple): + Order of the columns to be used for the transformer. """ raise NotImplementedError() @random_state - def fit(self, data, columns_to_sdtypes): + def fit(self, data, ordered_columns): """Fit the transformer to a ``column`` of the ``data``. Args: data (pandas.DataFrame): The entire table. - columns_to_sdtypes (dict): - Dictionary mapping column names to their sdtypes. + ordered_columns (tuple): + Order of the columns to be used for the transformer. 
""" - column_names = tuple(columns_to_sdtypes.keys()) - self._store_columns(column_names, data) + self._validate_ordered_columns(data, ordered_columns) + self.ordered_columns = ordered_columns + self._store_columns(ordered_columns, data) self._set_seed(data) + columns_data = self._get_columns_data(data, self.columns) - self._fit(columns_data, columns_to_sdtypes) + self._fit(columns_data, ordered_columns) + + self.prefixes = self._generate_prefixes(ordered_columns) + self._validate_prefixes(ordered_columns) self._build_output_columns(data) - def fit_transform(self, data, columns_to_sdtypes): + def fit_transform(self, data, ordered_columns): """Fit the transformer to a `column` of the `data` and then transform it. Args: data (pandas.DataFrame): The entire table. - columns_to_sdtypes (dict): - Dictionary mapping column names to their sdtypes. + ordered_columns (tuple): + Order of the columns to be used for the transformer. Returns: pd.DataFrame: The entire table, containing the transformed data. 
""" - self.fit(data, columns_to_sdtypes) + self.fit(data, ordered_columns) return self.transform(data) diff --git a/tests/integration/transformers/test_base.py b/tests/integration/transformers/test_base.py index defa5d901..25f70cd3a 100644 --- a/tests/integration/transformers/test_base.py +++ b/tests/integration/transformers/test_base.py @@ -136,11 +136,17 @@ def test_multi_column_transformer_same_number_of_columns_input_output(): # Setup class AdditionTransformer(BaseMultiColumnTransformer): """This transformer takes 3 columns and return the cumulative sum of each row.""" - def _fit(self, columns_data, columns_to_sdtypes): + def _fit(self, columns_data, ordered_columns): self.output_properties = { column: {'sdtype': 'numerical'} for column in self.columns } - self.dtypes = columns_data.dtypes + + def _generate_prefixes(self, ordered_columns): + prefixes = {} + for idx, column in enumerate(self.output_properties): + prefixes[column] = '#'.join(ordered_columns[:idx + 1]) + + return prefixes def _transform(self, data): return data.cumsum(axis=1) @@ -149,7 +155,7 @@ def _reverse_transform(self, data): result = data.diff(axis=1) result.iloc[:, 0] = data.iloc[:, 0] - return result.astype(self.dtypes) + return result.astype(int) data_test = pd.DataFrame({ 'col_1': [1, 2, 3], @@ -157,22 +163,18 @@ def _reverse_transform(self, data): 'col_3': [100, 200, 300] }) - column_to_sdtype = { - 'col_1': 'numerical', - 'col_2': 'numerical', - 'col_3': 'numerical' - } + order_columns = ('col_1', 'col_2', 'col_3') transformer = AdditionTransformer() # Run - transformed = transformer.fit_transform(data_test, column_to_sdtype) + transformed = transformer.fit_transform(data_test, order_columns) reverse = transformer.reverse_transform(transformed) # Assert expected_transform = pd.DataFrame({ - 'col_1': [1, 2, 3], - 'col_2': [11, 22, 33], - 'col_3': [111, 222, 333] + 'col_1.col_1': [1, 2, 3], + 'col_1#col_2.col_2': [11, 22, 33], + 'col_1#col_2#col_3.col_3': [111, 222, 333] }) 
pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) @@ -184,14 +186,20 @@ class ConcatenateTransformer(BaseMultiColumnTransformer): """This transformer takes 4 columns and concatenate them into 2 columns. The two first and last columns are concatenated together. """ - def _fit(self, columns_data, columns_to_sdtypes): + def _fit(self, columns_data, ordered_columns): self.name_1 = self.columns[0] + '#' + self.columns[1] self.name_2 = self.columns[2] + '#' + self.columns[3] self.output_properties = { - self.name_1: {'sdtype': 'categorical'}, - self.name_2: {'sdtype': 'categorical'} + 'concatenate_1': {'sdtype': 'categorical'}, + 'concatenate_2': {'sdtype': 'categorical'} } - self.dtypes = columns_data.dtypes + + def _generate_prefixes(self, ordered_columns): + prefixes = {} + for idx, column in enumerate(self.output_properties): + prefixes[column] = self.name_1 if idx == 0 else self.name_2 + + return prefixes def _transform(self, data): data[self.name_1] = data.iloc[:, 0] + '#' + data.iloc[:, 1] @@ -209,7 +217,7 @@ def _reverse_transform(self, data): col3, col4 = column_names[1].split('#') result[[col3, col4]] = result[column_names[1]].str.split('#', expand=True) - return result.astype(self.dtypes).drop(columns=column_names) + return result.drop(columns=column_names) data_test = pd.DataFrame({ 'col_1': ['A', 'B', 'C'], @@ -218,23 +226,18 @@ def _reverse_transform(self, data): 'col_4': ['J', 'K', 'L'] }) - column_to_sdtype = { - 'col_1': 'categorical', - 'col_2': 'categorical', - 'col_3': 'categorical', - 'col_4': 'categorical' - } + ordered_columns = ('col_1', 'col_2', 'col_3', 'col_4') transformer = ConcatenateTransformer() # Run - transformer.fit(data_test, column_to_sdtype) + transformer.fit(data_test, ordered_columns) transformed = transformer.transform(data_test) reverse = transformer.reverse_transform(transformed) # Assert expected_transform = pd.DataFrame({ - 'col_1#col_2': ['A#D', 'B#E', 'C#F'], - 
'col_3#col_4': ['G#J', 'H#K', 'I#L'] + 'col_1#col_2.concatenate_1': ['A#D', 'B#E', 'C#F'], + 'col_3#col_4.concatenate_2': ['G#J', 'H#K', 'I#L'] }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) @@ -244,57 +247,61 @@ def test_multi_column_transformer_more_output_than_input_columns(): """Test a multi-column transformer when the output has more columns than the input.""" class ExpandTransformer(BaseMultiColumnTransformer): - def _fit(self, columns_data, columns_to_sdtypes): - name_1 = self.columns[0] + '.first_part' - name_2 = self.columns[0] + '.second_part' - name_3 = self.columns[1] + '.first_part' - name_4 = self.columns[1] + '.second_part' + def _fit(self, columns_data, ordered_columns): self.output_properties = { - name_1: {'sdtype': 'categorical'}, - name_2: {'sdtype': 'categorical'}, - name_3: {'sdtype': 'categorical'}, - name_4: {'sdtype': 'categorical'} + 'first_part_1': {'sdtype': 'categorical'}, + 'second_part_1': {'sdtype': 'categorical'}, + 'first_part_2': {'sdtype': 'categorical'}, + 'second_part_2': {'sdtype': 'categorical'} } - self.names = [name_1, name_2, name_3, name_4] - self.dtypes = columns_data.dtypes + + def _generate_prefixes(self, ordered_columns): + list_prefixes = [ + self.columns[0], self.columns[0], + self.columns[1], self.columns[1] + ] + prefixes = {} + for idx, column in enumerate(self.output_properties): + prefixes[column] = list_prefixes[idx] + + return prefixes def _transform(self, data): - data[self.names[0]] = data[self.columns[0]].str[0] - data[self.names[1]] = data[self.columns[0]].str[1] - data[self.names[2]] = data[self.columns[1]].str[0] - data[self.names[3]] = data[self.columns[1]].str[1] + data[self.output_columns[0]] = data[self.columns[0]].str[0] + data[self.output_columns[1]] = data[self.columns[0]].str[1] + data[self.output_columns[2]] = data[self.columns[1]].str[0] + data[self.output_columns[3]] = data[self.columns[1]].str[1] return 
data.drop(columns=self.columns) def _reverse_transform(self, data): result = data.copy() - result[self.columns[0]] = result[self.names[0]] + result[self.names[1]] - result[self.columns[1]] = result[self.names[2]] + result[self.names[3]] + reverse_1 = result[self.output_columns[0]] + result[self.output_columns[1]] + reverse_2 = result[self.output_columns[2]] + result[self.output_columns[3]] + result[self.columns[0]] = reverse_1 + result[self.columns[1]] = reverse_2 - return result.astype(self.dtypes).drop(columns=self.names) + return result.drop(columns=self.output_columns) data_test = pd.DataFrame({ 'col_1': ['AB', 'CD', 'EF'], 'col_2': ['GH', 'IJ', 'KL'], }) - column_to_sdtype = { - 'col_1': 'categorical', - 'col_2': 'categorical', - } + ordered_columns = ('col_1', 'col_2') transformer = ExpandTransformer() # Run - transformer.fit(data_test, column_to_sdtype) + transformer.fit(data_test, ordered_columns) transformed = transformer.transform(data_test) reverse = transformer.reverse_transform(transformed) # Assert expected_transform = pd.DataFrame({ - 'col_1.first_part': ['A', 'C', 'E'], - 'col_1.second_part': ['B', 'D', 'F'], - 'col_2.first_part': ['G', 'I', 'K'], - 'col_2.second_part': ['H', 'J', 'L'] + 'col_1.first_part_1': ['A', 'C', 'E'], + 'col_1.second_part_1': ['B', 'D', 'F'], + 'col_2.first_part_2': ['G', 'I', 'K'], + 'col_2.second_part_2': ['H', 'J', 'L'] }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) diff --git a/tests/unit/transformers/test_base.py b/tests/unit/transformers/test_base.py index d7430eb21..90e624a86 100644 --- a/tests/unit/transformers/test_base.py +++ b/tests/unit/transformers/test_base.py @@ -1276,6 +1276,15 @@ def _reverse_transform(self, data): class TestBaseMultiColumnTransformer: + def test___init__(self): + """Test the ``__init__`` method.""" + # Setup + transformer = BaseMultiColumnTransformer() + + # Assert + assert transformer.ordered_columns is None + assert 
transformer.prefixes == {} + def test_get_input_column(self): """Test the ``get_input_column`` method. @@ -1303,6 +1312,15 @@ def test_get_input_columns(self): # Assert assert output == ['a', 'b', 'c'] + def test__generate_prefixes(self): + """Test the ``_generate_prefixes`` method.""" + # Setup + transformer = BaseMultiColumnTransformer() + + # Run and Assert + with pytest.raises(NotImplementedError): + transformer._generate_prefixes(None) + def test__get_output_to_property(self): """Test the ``_get_output_to_property`` method.""" # Setup @@ -1312,18 +1330,64 @@ def test__get_output_to_property(self): 'col_2': {'sdtype': 'categorical'}, 'col_3': {'sdtype': 'boolean'}, } + transformer.prefixes = { + 'col_1': 'prefix_1', + 'col_2': 'prefix_2', + 'col_3': 'prefix_3', + } # Run output = transformer._get_output_to_property('sdtype') # Assert expected_output = { - 'col_1': 'numerical', - 'col_2': 'categorical', - 'col_3': 'boolean', + 'prefix_1.col_1': 'numerical', + 'prefix_2.col_2': 'categorical', + 'prefix_3.col_3': 'boolean', } assert output == expected_output + def test__validate_ordered_columns(self): + """Test the ``_validate_ordered_columns`` method.""" + # Setup + transformer = BaseMultiColumnTransformer() + data = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c'], + 'c': [True, False, True], + }) + ordered_columns = ('a', 'b', 'c') + + # Run and Assert + transformer._validate_ordered_columns(data, ordered_columns) + + wrong_ordered_columns = ('a', 'b', 'c', 'd') + expected_error_msg = re.escape( + 'Columns (d) are not present in the data.' 
+ ) + with pytest.raises(KeyError, match=expected_error_msg): + transformer._validate_ordered_columns(data, wrong_ordered_columns) + + def test__validate_prefixes(self): + """Test the ``_validate_prefixes`` method.""" + # Setup + transformer = BaseMultiColumnTransformer() + transformer.prefixes = { + 'col_1': 'col_1', + 'col_2': 'col_2', + 'col_3': 'col_3', + } + + # Run and Assert + transformer._validate_prefixes(('col_1', 'col_2', 'col_3')) + + transformer.prefixes['col_1'] = 'error' + expected_error_msg = ( + "The prefix 'error' does not include the name of any column in the data." + ) + with pytest.raises(ValueError, match=expected_error_msg): + transformer._validate_prefixes(('col_1', 'col_2', 'col_3')) + def test__fit(self): """Test the ``_fit`` method. @@ -1341,43 +1405,42 @@ def test_fit(self): # Setup transformer = BaseMultiColumnTransformer() data = Mock() - columns_to_sdtypes = { - 'a': 'numerical', - 'b': 'categorical', - 'c': 'boolean' - } + ordered_columns = ('a', 'b', 'c') data_transformer = pd.DataFrame({ 'a': [1, 2, 3], 'b': ['a', 'b', 'c'], }) transformer.columns = ['a', 'b'] + + transformer._validate_ordered_columns = Mock() transformer._store_columns = Mock() transformer._get_columns_data = Mock(return_value=data_transformer) transformer._set_seed = Mock() transformer._fit = Mock() + transformer._generate_prefixes = Mock() + transformer._validate_prefixes = Mock() transformer._build_output_columns = Mock() # Run - transformer.fit(data, columns_to_sdtypes) + transformer.fit(data, ordered_columns) # Assert + transformer._validate_ordered_columns.assert_called_once_with(data, ordered_columns) transformer._store_columns.assert_called_once_with( - tuple(columns_to_sdtypes.keys()), data + ordered_columns, data ) transformer._set_seed.assert_called_once_with(data) transformer._get_columns_data.assert_called_once_with(data, ['a', 'b']) - transformer._fit.assert_called_once_with(data_transformer, columns_to_sdtypes) + 
transformer._fit.assert_called_once_with(data_transformer, ordered_columns) + transformer._generate_prefixes.assert_called_once_with(ordered_columns) + transformer._validate_prefixes.assert_called_once_with(ordered_columns) transformer._build_output_columns.assert_called_once_with(data) def test_fit_transform(self): """Test the ``fit_transform`` method.""" # Setup transformer = BaseMultiColumnTransformer() - columns_to_sdtypes = { - 'a': 'numerical', - 'b': 'categorical', - 'c': 'boolean' - } + ordered_columns = ('a', 'b', 'c') data = pd.DataFrame({ 'a': [1, 2, 3], 'b': ['a', 'b', 'c'], @@ -1389,8 +1452,8 @@ def test_fit_transform(self): transformer.transform = mock_transform # Run - transformer.fit_transform(data, columns_to_sdtypes) + transformer.fit_transform(data, ordered_columns) # Assert - mock_fit.assert_called_once_with(data, columns_to_sdtypes) + mock_fit.assert_called_once_with(data, ordered_columns) mock_transform.assert_called_once_with(data)