Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix learn_rounding_scheme for more than 14 digits #591

Merged
merged 2 commits into from
Dec 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,29 @@ def __init__(self, missing_value_replacement='mean', model_missing_values=False,
@staticmethod
def _learn_rounding_digits(data):
# check if data has any decimals
name = data.name
data = np.array(data)
roundable_data = data[~(np.isinf(data) | pd.isna(data))]
if ((roundable_data % 1) != 0).any():
if (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
for decimal in range(MAX_DECIMALS + 1):
if (roundable_data == roundable_data.round(decimal)).all():
return decimal

# Doesn't contain numbers
if len(roundable_data) == 0:
return None

# Doesn't contain decimal digits
if ((roundable_data % 1) == 0).all():
return 0

# Try to round to fewer digits
if (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
for decimal in range(MAX_DECIMALS + 1):
if (roundable_data == roundable_data.round(decimal)).all():
return decimal

# Can't round, not equal after MAX_DECIMALS digits of precision
warnings.warn(
f"No rounding scheme detected for column '{name}'."
' Synthetic data will not be rounded.'
)
return None

def _raise_out_of_bounds_error(self, value, name, bound_type, min_bound, max_bound):
Expand Down Expand Up @@ -178,8 +193,10 @@ def _reverse_transform(self, data):
data = data.clip(min_bound, max_bound)

is_integer = np.dtype(self._dtype).kind == 'i'
if self.learn_rounding_scheme or is_integer:
data = data.round(self._rounding_digits or 0)
if self.learn_rounding_scheme and self._rounding_digits is not None:
data = data.round(self._rounding_digits)
elif is_integer:
data = data.round(0)

if pd.isna(data).any() and is_integer:
return data
Expand Down
54 changes: 25 additions & 29 deletions tests/unit/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,17 @@ def test___init__super_attrs(self):
def test__learn_rounding_digits_more_than_15_decimals(self):
"""Test the _learn_rounding_digits method with more than 15 decimals.

If the data has more than 15 decimals, None should be returned.

Input:
- An array that contains floats with more than 15 decimals.
Output:
- None
If the data has more than 15 decimals, return None and emit a warning.
"""
data = np.random.random(size=10).round(20)
# Setup
data = pd.Series(np.random.random(size=10).round(20), name='col')

output = FloatFormatter._learn_rounding_digits(data)
# Run and Assert
warn_msg = (
"No rounding scheme detected for column 'col'. Synthetic data will not be rounded."
)
with pytest.warns(UserWarning, match=warn_msg):
output = FloatFormatter._learn_rounding_digits(data)

assert output is None

Expand All @@ -52,7 +53,7 @@ def test__learn_rounding_digits_less_than_15_decimals(self):
Output:
- 3
"""
data = np.array([10, 0., 0.1, 0.12, 0.123, np.nan])
data = pd.Series(np.array([10, 0., 0.1, 0.12, 0.123, np.nan]))

output = FloatFormatter._learn_rounding_digits(data)

Expand All @@ -61,36 +62,31 @@ def test__learn_rounding_digits_less_than_15_decimals(self):
def test__learn_rounding_digits_negative_decimals_float(self):
"""Test the _learn_rounding_digits method with floats multiples of powers of 10.

If the data has all multiples of 10 the output should be None.
If all values in the data are multiples of 10, the output should be 0.

Input:
- An array that contains floats that are multiples of powers of 10, 100 and 1000
and a NaN.
Output:
- None
- An array that contains floats that are multiples of 10, 100 and 1000, plus a NaN.
"""
data = np.array([1230., 12300., 123000., np.nan])
data = pd.Series(np.array([1230., 12300., 123000., np.nan]))

output = FloatFormatter._learn_rounding_digits(data)

assert output is None
assert output == 0

def test__learn_rounding_digits_negative_decimals_integer(self):
"""Test the _learn_rounding_digits method with integers multiples of powers of 10.

If the data has all multiples of 10 the output should be None.
If all values in the data are multiples of 10, the output should be 0.

Input:
- An array that contains integers that are multiples of powers of 10, 100 and 1000
and a NaN.
Output:
- None
"""
data = np.array([1230, 12300, 123000, np.nan])
data = pd.Series(np.array([1230, 12300, 123000, np.nan]))

output = FloatFormatter._learn_rounding_digits(data)

assert output is None
assert output == 0

def test__learn_rounding_digits_all_missing_value_replacements(self):
"""Test the _learn_rounding_digits method with data that is all NaNs.
Expand All @@ -102,7 +98,7 @@ def test__learn_rounding_digits_all_missing_value_replacements(self):
Output:
- None
"""
data = np.array([np.nan, np.nan, np.nan, np.nan])
data = pd.Series(np.array([np.nan, np.nan, np.nan, np.nan]))

output = FloatFormatter._learn_rounding_digits(data)

Expand Down Expand Up @@ -298,7 +294,7 @@ def test__fit_learn_rounding_scheme_true_max_decimals(self):
Input:
- Series with a value that has 15 decimals
Side Effect:
- ``_rounding_digits`` is set to ``None``
- ``_rounding_digits`` is set to None
"""
# Setup
data = pd.Series([0.000000000000001])
Expand All @@ -318,13 +314,13 @@ def test__fit_learn_rounding_scheme_true_inf(self):

If the ``learn_rounding_scheme`` parameter is set to ``True``, and the data
contains only integers or infinite values, ``_fit`` should learn
``_rounding_digits`` to be None.
``_rounding_digits`` to be 0.


Input:
- Series with ``np.inf`` as a value
Side Effect:
- ``_rounding_digits`` is set to None
- ``_rounding_digits`` is set to 0
"""
# Setup
data = pd.Series([15000, 4000, 60000, np.inf])
Expand All @@ -337,18 +333,18 @@ def test__fit_learn_rounding_scheme_true_inf(self):
transformer._fit(data)

# Asserts
assert transformer._rounding_digits is None
assert transformer._rounding_digits == 0

def test__fit_learn_rounding_scheme_true_max_zero(self):
"""Test ``_fit`` with ``learn_rounding_scheme`` set to ``True``.

If the ``learn_rounding_scheme`` parameter is set to ``True``, and the max
in the data is 0, ``_fit`` should learn the ``_rounding_digits`` to be None.
in the data is 0, ``_fit`` should learn the ``_rounding_digits`` to be 0.

Input:
- Series with 0 as max value
Side Effect:
- ``_rounding_digits`` is set to None
- ``_rounding_digits`` is set to 0
"""
# Setup
data = pd.Series([0, 0, 0])
Expand All @@ -361,7 +357,7 @@ def test__fit_learn_rounding_scheme_true_max_zero(self):
transformer._fit(data)

# Asserts
assert transformer._rounding_digits is None
assert transformer._rounding_digits == 0

def test__fit_enforce_min_max_values_false(self):
"""Test ``_fit`` with ``enforce_min_max_values`` set to ``False``.
Expand Down