Skip to content

Commit

Permalink
Delete unnecessary transformers (#379)
Browse files Browse the repository at this point in the history
* Remove transformers

* Remove prints

* Lower TEST_THRESHOLD
  • Loading branch information
fealho authored Feb 3, 2022
1 parent 04c58f6 commit 9507870
Show file tree
Hide file tree
Showing 5 changed files with 3 additions and 195 deletions.
28 changes: 0 additions & 28 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,34 +256,6 @@ def _reverse_transform(self, data):
return self._reverse_transform_by_row(data)


class CategoricalFuzzyTransformer(CategoricalTransformer):
    """Categorical transformer preset with fuzzy (noisy) encoding always enabled.

    Equivalent to building a ``CategoricalTransformer`` with ``fuzzy=True``.

    During fit, every category (null values count as one more category) gets a float
    representative: categories are sorted by relative frequency, the ``[0, 1]`` interval
    is split proportionally to those frequencies, and each category is assigned the
    middle point of its interval. On transform, each value is replaced by its
    representative plus some gaussian noise. On reverse transform, each value is mapped
    back to the category whose interval contains it.

    Args:
        clip (bool):
            If ``True``, clip the values to [0, 1]. Otherwise normalize them using
            modulo 1. Defaults to ``False``.
    """

    def __init__(self, clip=False):
        # ``fuzzy`` is pinned on by this preset; only ``clip`` is left to the caller.
        super().__init__(clip=clip, fuzzy=True)


class OneHotEncodingTransformer(BaseTransformer):
"""OneHotEncoding for categorical data.
Expand Down
103 changes: 0 additions & 103 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,109 +191,6 @@ def _reverse_transform(self, data):
return data.astype(self._dtype)


class NumericalRoundedBoundedTransformer(NumericalTransformer):
    """Numerical transformer preset with automatic bounds and automatic rounding.

    Equivalent to building a ``NumericalTransformer`` with ``min_value='auto'``,
    ``max_value='auto'`` and ``rounding='auto'``.

    Integer values are replaced with their float equivalent, bounded by the minimum and
    maximum values seen while fitting, and every value is rounded to the maximum number
    of decimal places detected in the fitted data. Non null float values are not
    modified.

    Args:
        dtype (data type):
            Data type of the data to transform, used when reversing the transformation.
            If not provided, the dtype of the fit data will be used.
            Defaults to ``None``.
        nan (int, str or None):
            What to do with null values. An integer replaces them with that value;
            ``'mean'`` or ``'mode'`` replaces them with the corresponding aggregation;
            ``None`` leaves them untouched.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column indicating which values were null.
            ``None`` creates it only when the data contains null values, ``True``
            always creates it and ``False`` never creates it.
            Defaults to ``None``.
    """

    def __init__(self, dtype=None, nan='mean', null_column=None):
        # Settings fixed by this preset; everything else is forwarded from the caller.
        preset = {'min_value': 'auto', 'max_value': 'auto', 'rounding': 'auto'}
        super().__init__(dtype=dtype, nan=nan, null_column=null_column, **preset)


class NumericalBoundedTransformer(NumericalTransformer):
    """Numerical transformer preset with automatic bounds and no rounding.

    Equivalent to building a ``NumericalTransformer`` with ``min_value='auto'``,
    ``max_value='auto'`` and ``rounding=None``.

    Integer values are replaced with their float equivalent, bounded by the minimum and
    maximum values seen while fitting. Non null float values are not modified.

    Args:
        dtype (data type):
            Data type of the data to transform, used when reversing the transformation.
            If not provided, the dtype of the fit data will be used.
            Defaults to ``None``.
        nan (int, str or None):
            What to do with null values. An integer replaces them with that value;
            ``'mean'`` or ``'mode'`` replaces them with the corresponding aggregation;
            ``None`` leaves them untouched.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column indicating which values were null.
            ``None`` creates it only when the data contains null values, ``True``
            always creates it and ``False`` never creates it.
            Defaults to ``None``.
    """

    def __init__(self, dtype=None, nan='mean', null_column=None):
        # Settings fixed by this preset; everything else is forwarded from the caller.
        preset = {'min_value': 'auto', 'max_value': 'auto', 'rounding': None}
        super().__init__(dtype=dtype, nan=nan, null_column=null_column, **preset)


class NumericalRoundedTransformer(NumericalTransformer):
    """Numerical transformer preset with automatic rounding and no bounds.

    Equivalent to building a ``NumericalTransformer`` with ``min_value=None``,
    ``max_value=None`` and ``rounding='auto'``.

    Integer values are replaced with their float equivalent, and every value is rounded
    to the maximum number of decimal places detected in the fitted data. Non null float
    values are not modified.

    Args:
        dtype (data type):
            Data type of the data to transform, used when reversing the transformation.
            If not provided, the dtype of the fit data will be used.
            Defaults to ``None``.
        nan (int, str or None):
            What to do with null values. An integer replaces them with that value;
            ``'mean'`` or ``'mode'`` replaces them with the corresponding aggregation;
            ``None`` leaves them untouched.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column indicating which values were null.
            ``None`` creates it only when the data contains null values, ``True``
            always creates it and ``False`` never creates it.
            Defaults to ``None``.
    """

    def __init__(self, dtype=None, nan='mean', null_column=None):
        # Settings fixed by this preset; everything else is forwarded from the caller.
        preset = {'min_value': None, 'max_value': None, 'rounding': 'auto'}
        super().__init__(dtype=dtype, nan=nan, null_column=null_column, **preset)


class GaussianCopulaTransformer(NumericalTransformer):
r"""Transformer for numerical data based on copulas transformation.
Expand Down
2 changes: 1 addition & 1 deletion tests/quality/test_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from tests.quality.utils import download_single_table

R2_THRESHOLD = 0.2
# Diff rendering: the next two lines are the old and new sides of the single change in
# this hunk ("1 addition & 1 deletion"); the commit message says "Lower TEST_THRESHOLD".
TEST_THRESHOLD = 0.35
TEST_THRESHOLD = 0.3
MAX_SIZE = 5000000
TYPES_TO_SKIP = {'numerical', 'float', 'integer', 'id', None}

Expand Down
14 changes: 1 addition & 13 deletions tests/unit/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pytest

from rdt.transformers.categorical import (
CategoricalFuzzyTransformer, CategoricalTransformer, LabelEncodingTransformer,
OneHotEncodingTransformer)
CategoricalTransformer, LabelEncodingTransformer, OneHotEncodingTransformer)

RE_SSN = re.compile(r'\d\d\d-\d\d-\d\d\d\d')

Expand Down Expand Up @@ -1582,14 +1581,3 @@ def test__reverse_transform_clips_values(self):

# Assert
pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c']))


class TestCategoricalFuzzyTransformer:
    """Unit tests for ``CategoricalFuzzyTransformer``."""

    def test___init__(self):
        """A default-constructed instance has a truthy ``fuzzy`` attribute."""
        # Run
        instance = CategoricalFuzzyTransformer()

        # Assert
        assert instance.fuzzy
51 changes: 1 addition & 50 deletions tests/unit/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@

from rdt.transformers.null import NullTransformer
from rdt.transformers.numerical import (
BayesGMMTransformer, GaussianCopulaTransformer, NumericalBoundedTransformer,
NumericalRoundedBoundedTransformer, NumericalRoundedTransformer, NumericalTransformer)
BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer)


class TestNumericalTransformer(TestCase):
Expand Down Expand Up @@ -838,54 +837,6 @@ def test__reverse_transform_min_an_max_with_nulls(self):
np.testing.assert_array_equal(result, expected_data)


class TestNumericalBoundedTransformer(TestCase):
    """Unit tests for ``NumericalBoundedTransformer``."""

    def test___init__(self):
        """Caller arguments and the preset bounds/rounding end up as attributes."""
        # Run
        transformer = NumericalBoundedTransformer(dtype='int', null_column=False)

        # Assert: values supplied by the caller (``nan`` is left at its default)
        assert transformer.dtype == 'int'
        assert transformer.nan == 'mean'
        assert transformer.null_column is False

        # Assert: values fixed by the preset
        assert transformer.min_value == 'auto'
        assert transformer.max_value == 'auto'
        assert transformer.rounding is None


class TestNumericalRoundedTransformer(TestCase):
    """Unit tests for ``NumericalRoundedTransformer``."""

    def test___init__(self):
        """Caller arguments and the preset bounds/rounding end up as attributes."""
        # Run
        transformer = NumericalRoundedTransformer(dtype='int', null_column=False)

        # Assert: values supplied by the caller (``nan`` is left at its default)
        assert transformer.dtype == 'int'
        assert transformer.nan == 'mean'
        assert transformer.null_column is False

        # Assert: values fixed by the preset
        assert transformer.min_value is None
        assert transformer.max_value is None
        assert transformer.rounding == 'auto'


class TestNumericalRoundedBoundedTransformer(TestCase):
    """Unit tests for ``NumericalRoundedBoundedTransformer``."""

    def test___init__(self):
        """Caller arguments and the preset bounds/rounding end up as attributes."""
        # Run
        transformer = NumericalRoundedBoundedTransformer(dtype='int', null_column=False)

        # Assert: values supplied by the caller (``nan`` is left at its default)
        assert transformer.dtype == 'int'
        assert transformer.nan == 'mean'
        assert transformer.null_column is False

        # Assert: values fixed by the preset
        assert transformer.min_value == 'auto'
        assert transformer.max_value == 'auto'
        assert transformer.rounding == 'auto'


class TestGaussianCopulaTransformer:

def test___init__super_attrs(self):
Expand Down

0 comments on commit 9507870

Please sign in to comment.