From ab0a5f670b835596196ce0618dbc178adc3977e7 Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Thu, 3 Feb 2022 10:10:58 -0800 Subject: [PATCH] Delete unnecessary transformers (#379) * Remove transformers * Remove prints * Lower TEST_THRESHOLD --- rdt/transformers/categorical.py | 28 ------ rdt/transformers/numerical.py | 103 -------------------- tests/quality/test_quality.py | 2 +- tests/unit/transformers/test_categorical.py | 14 +-- tests/unit/transformers/test_numerical.py | 51 +--------- 5 files changed, 3 insertions(+), 195 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index b0b0b1636..58cba9bfd 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -256,34 +256,6 @@ def _reverse_transform(self, data): return self._reverse_transform_by_row(data) -class CategoricalFuzzyTransformer(CategoricalTransformer): - """Transformer for categorical data. - - This transformer computes a float representative for each one of the categories - found in the fit data. Then, when transforming, it replaces the instances of these - categories with the corresponding representatives plus some added gaussian noise. - - The representatives are decided by sorting the categorical values by their relative - frequency, then dividing the ``[0, 1]`` interval by these relative frequencies, and - finally assigning the middle point of each interval to the corresponding category. - - When the transformation is reverted, each value is assigned the category that - corresponds to the interval it falls in. - - Null values are considered just another category. - - This class behaves exactly as the ``CategoricalTransformer`` with ``fuzzy=True``. - - Args: - clip (bool): - If ``True``, clip the values to [0, 1]. Otherwise normalize them using modulo 1. - Defaults to ``False``. - """ - - def __init__(self, clip=False): - super().__init__(fuzzy=True, clip=clip) - - class OneHotEncodingTransformer(BaseTransformer): """OneHotEncoding for categorical data. diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 3725c6a5a..970f26305 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -191,109 +191,6 @@ def _reverse_transform(self, data): return data.astype(self._dtype) -class NumericalRoundedBoundedTransformer(NumericalTransformer): - """Transformer for numerical data. - - This transformer replaces integer values with their float equivalent, bounded by the fitted - data (the minimum and maximum values seen while fitting). It will also round all values to - the maximum number of decimal places detected in the fitted data. - - Non null float values are not modified. - - This class behaves exactly as the ``NumericalTransformer`` with ``min_value='auto'``, - ``max_value='auto'`` and ``rounding='auto'``. - - Args: - dtype (data type): - Data type of the data to transform. It will be used when reversing the - transformation. If not provided, the dtype of the fit data will be used. - Defaults to ``None``. - nan (int, str or None): - Indicate what to do with the null values. If an integer is given, replace them - with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace - them with the corresponding aggregation. If ``None`` is given, do not replace them. - Defaults to ``'mean'``. - null_column (bool): - Whether to create a new column to indicate which values were null or not. - If ``None``, only create a new column when the data contains null values. - If ``True``, always create the new column whether there are null values or not. - If ``False``, do not create the new column. - Defaults to ``None``. - """ - - def __init__(self, dtype=None, nan='mean', null_column=None): - super().__init__(dtype=dtype, nan=nan, null_column=null_column, min_value='auto', - max_value='auto', rounding='auto') - - -class NumericalBoundedTransformer(NumericalTransformer): - """Transformer for numerical data. - - This transformer replaces integer values with their float equivalent, bounded by the fitted - data (the minimum and maximum values seen while fitting). - - Non null float values are not modified. - - This class behaves exactly as the ``NumericalTransformer`` with ``min_value='auto'``, - ``max_value='auto'`` and ``rounding=None``. - - Args: - dtype (data type): - Data type of the data to transform. It will be used when reversing the - transformation. If not provided, the dtype of the fit data will be used. - Defaults to ``None``. - nan (int, str or None): - Indicate what to do with the null values. If an integer is given, replace them - with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace - them with the corresponding aggregation. If ``None`` is given, do not replace them. - Defaults to ``'mean'``. - null_column (bool): - Whether to create a new column to indicate which values were null or not. - If ``None``, only create a new column when the data contains null values. - If ``True``, always create the new column whether there are null values or not. - If ``False``, do not create the new column. - Defaults to ``None``. - """ - - def __init__(self, dtype=None, nan='mean', null_column=None): - super().__init__(dtype=dtype, nan=nan, null_column=null_column, min_value='auto', - max_value='auto', rounding=None) - - -class NumericalRoundedTransformer(NumericalTransformer): - """Transformer for numerical data. - - This transformer replaces integer values with their float equivalent, rounding all values to - the maximum number of decimal places detected in the fitted data. - - Non null float values are not modified. - - This class behaves exactly as the ``NumericalTransformer`` with ``min_value=None``, - ``max_value=None`` and ``rounding='auto'``. - - Args: - dtype (data type): - Data type of the data to transform. It will be used when reversing the - transformation. If not provided, the dtype of the fit data will be used. - Defaults to ``None``. - nan (int, str or None): - Indicate what to do with the null values. If an integer is given, replace them - with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace - them with the corresponding aggregation. If ``None`` is given, do not replace them. - Defaults to ``'mean'``. - null_column (bool): - Whether to create a new column to indicate which values were null or not. - If ``None``, only create a new column when the data contains null values. - If ``True``, always create the new column whether there are null values or not. - If ``False``, do not create the new column. - Defaults to ``None``. - """ - - def __init__(self, dtype=None, nan='mean', null_column=None): - super().__init__(dtype=dtype, nan=nan, null_column=null_column, min_value=None, - max_value=None, rounding='auto') - - class GaussianCopulaTransformer(NumericalTransformer): r"""Transformer for numerical data based on copulas transformation. diff --git a/tests/quality/test_quality.py b/tests/quality/test_quality.py index 0edfdc504..982b8a5db 100644 --- a/tests/quality/test_quality.py +++ b/tests/quality/test_quality.py @@ -11,7 +11,7 @@ from tests.quality.utils import download_single_table R2_THRESHOLD = 0.2 -TEST_THRESHOLD = 0.35 +TEST_THRESHOLD = 0.3 MAX_SIZE = 5000000 TYPES_TO_SKIP = {'numerical', 'float', 'integer', 'id', None} diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index 9fdd1cd63..c08e9ba22 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -6,8 +6,7 @@ import pytest from rdt.transformers.categorical import ( - CategoricalFuzzyTransformer, CategoricalTransformer, LabelEncodingTransformer, - OneHotEncodingTransformer) + CategoricalTransformer, LabelEncodingTransformer, OneHotEncodingTransformer) RE_SSN = re.compile(r'\d\d\d-\d\d-\d\d\d\d') @@ -1582,14 +1581,3 @@ def test__reverse_transform_clips_values(self): # Assert pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c'])) - - -class TestCategoricalFuzzyTransformer: - - def test___init__(self): - """Test that the ``__init__`` method uses ``fuzzy==True`` by default.""" - # Setup - transformer = CategoricalFuzzyTransformer() - - # Assert - assert transformer.fuzzy diff --git a/tests/unit/transformers/test_numerical.py b/tests/unit/transformers/test_numerical.py index 7d80fb3bb..4b22159c2 100644 --- a/tests/unit/transformers/test_numerical.py +++ b/tests/unit/transformers/test_numerical.py @@ -9,8 +9,7 @@ from rdt.transformers.null import NullTransformer from rdt.transformers.numerical import ( - BayesGMMTransformer, GaussianCopulaTransformer, NumericalBoundedTransformer, - NumericalRoundedBoundedTransformer, NumericalRoundedTransformer, NumericalTransformer) + BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer) class TestNumericalTransformer(TestCase): @@ -838,54 +837,6 @@ def test__reverse_transform_min_an_max_with_nulls(self): np.testing.assert_array_equal(result, expected_data) -class TestNumericalBoundedTransformer(TestCase): - - def test___init__(self): - """super() arguments are properly passed and set as attributes.""" - # Run - nt = NumericalBoundedTransformer(dtype='int', null_column=False) - - # Assert - assert nt.dtype == 'int' - assert nt.nan == 'mean' - assert nt.null_column is False - assert nt.min_value == 'auto' - assert nt.max_value == 'auto' - assert nt.rounding is None - - -class TestNumericalRoundedTransformer(TestCase): - - def test___init__(self): - """super() arguments are properly passed and set as attributes.""" - # Run - nt = NumericalRoundedTransformer(dtype='int', null_column=False) - - # Assert - assert nt.dtype == 'int' - assert nt.nan == 'mean' - assert nt.null_column is False - assert nt.min_value is None - assert nt.max_value is None - assert nt.rounding == 'auto' - - -class TestNumericalRoundedBoundedTransformer(TestCase): - - def test___init__(self): - """super() arguments are properly passed and set as attributes.""" - # Run - nt = NumericalRoundedBoundedTransformer(dtype='int', null_column=False) - - # Assert - assert nt.dtype == 'int' - assert nt.nan == 'mean' - assert nt.null_column is False - assert nt.min_value == 'auto' - assert nt.max_value == 'auto' - assert nt.rounding == 'auto' - - class TestGaussianCopulaTransformer: def test___init__super_attrs(self):