Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete unnecessary transformers #379

Merged
merged 3 commits into from
Feb 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 0 additions & 28 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,34 +256,6 @@ def _reverse_transform(self, data):
return self._reverse_transform_by_row(data)


class CategoricalFuzzyTransformer(CategoricalTransformer):
    """Categorical transformer with the fuzzy (noisy) mode always enabled.

    Every category found in the fit data is given a float representative.
    To pick it, the categories are sorted by relative frequency, the
    ``[0, 1]`` interval is divided according to those frequencies, and each
    category receives the middle point of its own interval.

    When transforming, a category is replaced by its representative plus
    some added gaussian noise. When reversing the transformation, each value
    is assigned the category whose interval it falls in.

    Null values are treated as just another category.

    Equivalent to ``CategoricalTransformer`` created with ``fuzzy=True``.

    Args:
        clip (bool):
            If ``True``, values are clipped to [0, 1]. Otherwise they are
            normalized using modulo 1. Defaults to ``False``.
    """

    def __init__(self, clip=False):
        # ``fuzzy`` is fixed to ``True``; only ``clip`` is left to the caller.
        super().__init__(
            clip=clip,
            fuzzy=True,
        )


class OneHotEncodingTransformer(BaseTransformer):
"""OneHotEncoding for categorical data.
Expand Down
103 changes: 0 additions & 103 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,109 +191,6 @@ def _reverse_transform(self, data):
return data.astype(self._dtype)


class NumericalRoundedBoundedTransformer(NumericalTransformer):
    """Numerical transformer that bounds and rounds based on the fitted data.

    Integer values are replaced with their float equivalent and bounded by
    the minimum and maximum values seen while fitting. All values are also
    rounded to the maximum number of decimal places detected in the fitted
    data. Non-null float values are not modified.

    Equivalent to ``NumericalTransformer`` created with ``min_value='auto'``,
    ``max_value='auto'`` and ``rounding='auto'``.

    Args:
        dtype (data type):
            Data type of the data to transform, used when reversing the
            transformation. If not provided, the dtype of the fit data is
            used. Defaults to ``None``.
        nan (int, str or None):
            What to do with null values. An integer replaces them with that
            value; the strings ``'mean'`` or ``'mode'`` replace them with the
            corresponding aggregation; ``None`` leaves them untouched.
            Defaults to ``'mean'``.
        null_column (bool or None):
            Whether to create a new column flagging which values were null.
            ``None`` creates it only when the data contains null values,
            ``True`` always creates it, ``False`` never creates it.
            Defaults to ``None``.
    """

    def __init__(self, dtype=None, nan='mean', null_column=None):
        # Bounds and rounding are pinned to 'auto'; the rest is passed through.
        super().__init__(
            dtype=dtype,
            nan=nan,
            null_column=null_column,
            min_value='auto',
            max_value='auto',
            rounding='auto',
        )


class NumericalBoundedTransformer(NumericalTransformer):
    """Numerical transformer that bounds values based on the fitted data.

    Integer values are replaced with their float equivalent and bounded by
    the minimum and maximum values seen while fitting. Non-null float values
    are not modified.

    Equivalent to ``NumericalTransformer`` created with ``min_value='auto'``,
    ``max_value='auto'`` and ``rounding=None``.

    Args:
        dtype (data type):
            Data type of the data to transform, used when reversing the
            transformation. If not provided, the dtype of the fit data is
            used. Defaults to ``None``.
        nan (int, str or None):
            What to do with null values. An integer replaces them with that
            value; the strings ``'mean'`` or ``'mode'`` replace them with the
            corresponding aggregation; ``None`` leaves them untouched.
            Defaults to ``'mean'``.
        null_column (bool or None):
            Whether to create a new column flagging which values were null.
            ``None`` creates it only when the data contains null values,
            ``True`` always creates it, ``False`` never creates it.
            Defaults to ``None``.
    """

    def __init__(self, dtype=None, nan='mean', null_column=None):
        # Bounds are pinned to 'auto' and rounding is disabled.
        super().__init__(
            dtype=dtype,
            nan=nan,
            null_column=null_column,
            min_value='auto',
            max_value='auto',
            rounding=None,
        )


class NumericalRoundedTransformer(NumericalTransformer):
    """Numerical transformer that rounds values based on the fitted data.

    Integer values are replaced with their float equivalent, and all values
    are rounded to the maximum number of decimal places detected in the
    fitted data. Non-null float values are not modified.

    Equivalent to ``NumericalTransformer`` created with ``min_value=None``,
    ``max_value=None`` and ``rounding='auto'``.

    Args:
        dtype (data type):
            Data type of the data to transform, used when reversing the
            transformation. If not provided, the dtype of the fit data is
            used. Defaults to ``None``.
        nan (int, str or None):
            What to do with null values. An integer replaces them with that
            value; the strings ``'mean'`` or ``'mode'`` replace them with the
            corresponding aggregation; ``None`` leaves them untouched.
            Defaults to ``'mean'``.
        null_column (bool or None):
            Whether to create a new column flagging which values were null.
            ``None`` creates it only when the data contains null values,
            ``True`` always creates it, ``False`` never creates it.
            Defaults to ``None``.
    """

    def __init__(self, dtype=None, nan='mean', null_column=None):
        # No bounds are applied; rounding is pinned to 'auto'.
        super().__init__(
            dtype=dtype,
            nan=nan,
            null_column=null_column,
            min_value=None,
            max_value=None,
            rounding='auto',
        )


class GaussianCopulaTransformer(NumericalTransformer):
r"""Transformer for numerical data based on copulas transformation.
Expand Down
2 changes: 1 addition & 1 deletion tests/quality/test_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from tests.quality.utils import download_single_table

R2_THRESHOLD = 0.2
TEST_THRESHOLD = 0.35
TEST_THRESHOLD = 0.3
MAX_SIZE = 5000000
TYPES_TO_SKIP = {'numerical', 'float', 'integer', 'id', None}

Expand Down
14 changes: 1 addition & 13 deletions tests/unit/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pytest

from rdt.transformers.categorical import (
CategoricalFuzzyTransformer, CategoricalTransformer, LabelEncodingTransformer,
OneHotEncodingTransformer)
CategoricalTransformer, LabelEncodingTransformer, OneHotEncodingTransformer)

RE_SSN = re.compile(r'\d\d\d-\d\d-\d\d\d\d')

Expand Down Expand Up @@ -1582,14 +1581,3 @@ def test__reverse_transform_clips_values(self):

# Assert
pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c']))


class TestCategoricalFuzzyTransformer:
    """Unit tests for ``CategoricalFuzzyTransformer``."""

    def test___init__(self):
        """A default-constructed instance has a truthy ``fuzzy`` attribute."""
        # Setup
        instance = CategoricalFuzzyTransformer()

        # Assert
        assert instance.fuzzy
51 changes: 1 addition & 50 deletions tests/unit/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@

from rdt.transformers.null import NullTransformer
from rdt.transformers.numerical import (
BayesGMMTransformer, GaussianCopulaTransformer, NumericalBoundedTransformer,
NumericalRoundedBoundedTransformer, NumericalRoundedTransformer, NumericalTransformer)
BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer)


class TestNumericalTransformer(TestCase):
Expand Down Expand Up @@ -838,54 +837,6 @@ def test__reverse_transform_min_an_max_with_nulls(self):
np.testing.assert_array_equal(result, expected_data)


class TestNumericalBoundedTransformer(TestCase):
    """Unit tests for ``NumericalBoundedTransformer``."""

    def test___init__(self):
        """Constructor arguments and the fixed parent settings end up as attributes."""
        # Run
        transformer = NumericalBoundedTransformer(dtype='int', null_column=False)

        # Assert: values supplied by the caller, plus the untouched ``nan`` default
        assert transformer.dtype == 'int'
        assert transformer.nan == 'mean'
        assert transformer.null_column is False

        # Assert: values fixed by the subclass
        assert transformer.min_value == 'auto'
        assert transformer.max_value == 'auto'
        assert transformer.rounding is None


class TestNumericalRoundedTransformer(TestCase):
    """Unit tests for ``NumericalRoundedTransformer``."""

    def test___init__(self):
        """Constructor arguments and the fixed parent settings end up as attributes."""
        # Run
        transformer = NumericalRoundedTransformer(dtype='int', null_column=False)

        # Assert: values supplied by the caller, plus the untouched ``nan`` default
        assert transformer.dtype == 'int'
        assert transformer.nan == 'mean'
        assert transformer.null_column is False

        # Assert: values fixed by the subclass
        assert transformer.min_value is None
        assert transformer.max_value is None
        assert transformer.rounding == 'auto'


class TestNumericalRoundedBoundedTransformer(TestCase):
    """Unit tests for ``NumericalRoundedBoundedTransformer``."""

    def test___init__(self):
        """Constructor arguments and the fixed parent settings end up as attributes."""
        # Run
        transformer = NumericalRoundedBoundedTransformer(dtype='int', null_column=False)

        # Assert: values supplied by the caller, plus the untouched ``nan`` default
        assert transformer.dtype == 'int'
        assert transformer.nan == 'mean'
        assert transformer.null_column is False

        # Assert: values fixed by the subclass
        assert transformer.min_value == 'auto'
        assert transformer.max_value == 'auto'
        assert transformer.rounding == 'auto'


class TestGaussianCopulaTransformer:

def test___init__super_attrs(self):
Expand Down