Skip to content

Commit

Permalink
Remove psutil dependency and deprecate FrequencyEncoder (#711)
Browse files Browse the repository at this point in the history
  • Loading branch information
rwedge authored Sep 18, 2023
1 parent b3f441d commit 46d523a
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 169 deletions.
25 changes: 5 additions & 20 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import numpy as np
import pandas as pd
import psutil
from scipy.stats import norm

from rdt.errors import TransformerInputError
Expand Down Expand Up @@ -320,6 +319,11 @@ def __setstate__(self, state):
self.__dict__ = state

def __init__(self, add_noise=False):
warnings.warn(
"The 'FrequencyEncoder' transformer will no longer be supported in future versions "
"of the RDT library. Please use the 'UniformEncoder' transformer instead.",
FutureWarning
)
super().__init__()
self.add_noise = add_noise

Expand Down Expand Up @@ -475,19 +479,6 @@ def _transform(self, data):

return self._transform_by_row(data)

def _reverse_transform_by_matrix(self, data):
"""Reverse transform the data with matrix operations."""
num_rows = len(data)
num_categories = len(self.starts)

data = np.broadcast_to(data, (num_categories, num_rows)).T
starts = np.broadcast_to(self.starts.index, (num_rows, num_categories))
is_data_greater_than_starts = (data >= starts)[:, ::-1]
interval_indexes = num_categories - np.argmax(is_data_greater_than_starts, axis=1) - 1

get_category_from_index = list(self.starts.category).__getitem__
return pd.Series(interval_indexes).apply(get_category_from_index).astype(self.dtype)

def _reverse_transform_by_category(self, data):
"""Reverse transform the data by iterating over all the categories."""
result = np.empty(shape=(len(data), ), dtype=self.dtype)
Expand Down Expand Up @@ -522,12 +513,6 @@ def _reverse_transform(self, data):
num_rows = len(data)
num_categories = len(self.means)

# total shape * float size * number of matrices needed
needed_memory = num_rows * num_categories * 8 * 3
available_memory = psutil.virtual_memory().available
if available_memory > needed_memory:
return self._reverse_transform_by_matrix(data)

if num_rows > num_categories:
return self._reverse_transform_by_category(data)

Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
"scipy>=1.9.2,<2;python_version>='3.10'",
"scikit-learn>=0.24,<2;python_version<'3.10'",
"scikit-learn>=1.1.3,<2;python_version>='3.10'",
'psutil>=5.7,<6',
'Faker>=10',
]

Expand Down
44 changes: 4 additions & 40 deletions tests/integration/transformers/test_categorical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pickle
from io import BytesIO
from unittest.mock import Mock, patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -360,43 +359,12 @@ def test_frequency_encoder_mixed():
pd.testing.assert_frame_equal(data, reverse)


@patch('psutil.virtual_memory')
def test_frequency_encoder_mixed_low_virtual_memory(psutil_mock):
"""Test the FrequencyEncoder on mixed type data with low virtual memory.
Ensure that the FrequencyEncoder can fit, transform, and reverse
transform on mixed type data, when there is low virtual memory. Expect that the
reverse transformed data is the same as the input.
Input:
- 4 rows of mixed data
Output:
- The reverse transformed data
"""
# setup
data = pd.DataFrame([True, 'a', 1, None], columns=['column_name'])
column = 'column_name'
transformer = FrequencyEncoder()

virtual_memory = Mock()
virtual_memory.available = 1
psutil_mock.return_value = virtual_memory

# run
transformer.fit(data, column)
reverse = transformer.reverse_transform(transformer.transform(data))

# assert
pd.testing.assert_frame_equal(data, reverse)


@patch('psutil.virtual_memory')
def test_frequency_encoder_mixed_more_rows(psutil_mock):
"""Test the FrequencyEncoder on mixed type data with low virtual memory.
def test_frequency_encoder_mixed_more_rows():
"""Test the FrequencyEncoder on mixed type data.
Ensure that the FrequencyEncoder can fit, transform, and reverse
transform on mixed type data, when there is low virtual memory and a larger
number of rows. Expect that the reverse transformed data is the same as the input.
transform on mixed type data, when there is a larger number of rows.
Expect that the reverse transformed data is the same as the input.
Input:
- 4 rows of mixed data
Expand All @@ -409,10 +377,6 @@ def test_frequency_encoder_mixed_more_rows(psutil_mock):
transform_data = pd.DataFrame(['a', 1, None, 'a', True, 1], columns=['column_name'])
transformer = FrequencyEncoder()

virtual_memory = Mock()
virtual_memory.available = 1
psutil_mock.return_value = virtual_memory

# run
transformer.fit(data, column)
transformed = transformer.transform(transform_data)
Expand Down
129 changes: 21 additions & 108 deletions tests/unit/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,12 @@ def test___setstate__(self):
def test___init__(self):
"""Passed arguments must be stored as attributes."""
# Run
transformer = FrequencyEncoder(add_noise='add_noise_value')
warn_message = (
"The 'FrequencyEncoder' transformer will no longer be supported in future "
"versions of the RDT library. Please use the 'UniformEncoder' transformer instead."
)
with pytest.warns(FutureWarning, match=warn_message):
transformer = FrequencyEncoder(add_noise='add_noise_value')

# Asserts
assert transformer.add_noise == 'add_noise_value'
Expand Down Expand Up @@ -700,11 +705,11 @@ def test__get_value_add_noise_true(self, norm_mock):
# Asserts
assert result == 0.2745

def test__reverse_transform_array(self):
"""Test reverse_transform a numpy.array"""
def test__reverse_transform_series(self):
"""Test reverse_transform a pandas Series"""
# Setup
data = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
rt_data = np.array([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2])
rt_data = pd.Series([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2])
transformer = FrequencyEncoder()

# Run
Expand Down Expand Up @@ -987,83 +992,14 @@ def test__transform_by_row(self):
expected = np.array([0.875, 0.625, 0.375, 0.125])
assert (transformed == expected).all()

@patch('psutil.virtual_memory')
def test__reverse_transform_by_matrix_called(self, psutil_mock):
"""Test that the `_reverse_transform_by_matrix` method is called.
When there is enough virtual memory, expect that the
`_reverse_transform_by_matrix` method is called.
Setup:
The categorical transformer is instantiated with 4 categories. Also patch the
`psutil.virtual_memory` function to return a large enough `available_memory`.
Input:
- numerical data with 4 rows
Output:
- the output of `_reverse_transform_by_matrix`
Side effects:
- `_reverse_transform_by_matrix` will be called once
"""
# Setup
data = pd.Series([1, 2, 3, 4])

categorical_transformer_mock = Mock()
categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875])

virtual_memory = Mock()
virtual_memory.available = 4 * 4 * 8 * 3 + 1
psutil_mock.return_value = virtual_memory

# Run
reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, data)

# Asserts
reverse_arg = categorical_transformer_mock._reverse_transform_by_matrix.call_args[0][0]
np.testing.assert_array_equal(reverse_arg, data.clip(0, 1))
assert reverse == categorical_transformer_mock._reverse_transform_by_matrix.return_value

@patch('psutil.virtual_memory')
def test__reverse_transform_by_matrix(self, psutil_mock):
"""Test the _reverse_transform_by_matrix method with numerical data.
Expect that the transformed data is correctly reverse transformed.
Setup:
The categorical transformer is instantiated with 4 categories and means. Also patch
the ``psutil.virtual_memory`` function to return a large enough ``available_memory``.
Input:
- transformed data with 4 rows
Output:
- the original data
"""
# Setup
data = pd.Series([1, 2, 3, 4])
transformed = pd.Series([0.875, 0.625, 0.375, 0.125])

transformer = FrequencyEncoder()
transformer.starts = pd.DataFrame({'category': [4, 3, 2, 1]}, index=[0., 0.25, 0.5, 0.75])
transformer.dtype = data.dtype

virtual_memory = Mock()
virtual_memory.available = 4 * 4 * 8 * 3 + 1
psutil_mock.return_value = virtual_memory

# Run
reverse = transformer._reverse_transform_by_matrix(transformed)

# Assert
pd.testing.assert_series_equal(data, reverse)

@patch('psutil.virtual_memory')
def test__reverse_transform_by_category_called(self, psutil_mock):
def test__reverse_transform_by_category_called(self):
"""Test that the `_reverse_transform_by_category` method is called.
When there is not enough virtual memory and the number of rows is greater than the
number of categories, expect that the `_reverse_transform_by_category` method is called.
When the number of rows is greater than the number of categories, expect
that the `_reverse_transform_by_category` method is called.
Setup:
The categorical transformer is instantiated with 4 categories. Also patch the
`psutil.virtual_memory` function to return an `available_memory` of 1.
The categorical transformer is instantiated with 4 categories.
Input:
- numerical data with 5 rows
Output:
Expand All @@ -1077,10 +1013,6 @@ def test__reverse_transform_by_category_called(self, psutil_mock):
categorical_transformer_mock = Mock()
categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875])

virtual_memory = Mock()
virtual_memory.available = 1
psutil_mock.return_value = virtual_memory

# Run
reverse = FrequencyEncoder._reverse_transform(
categorical_transformer_mock, transform_data)
Expand All @@ -1090,16 +1022,14 @@ def test__reverse_transform_by_category_called(self, psutil_mock):
np.testing.assert_array_equal(reverse_arg, transform_data.clip(0, 1))
assert reverse == categorical_transformer_mock._reverse_transform_by_category.return_value

@patch('psutil.virtual_memory')
def test__reverse_transform_by_category(self, psutil_mock):
def test__reverse_transform_by_category(self):
"""Test the _reverse_transform_by_category method with numerical data.
Expect that the transformed data is correctly reverse transformed.
Setup:
The categorical transformer is instantiated with 4 categories, and the means
and intervals are set for those categories. Also patch the `psutil.virtual_memory`
function to return an `available_memory` of 1.
and intervals are set for those categories.
Input:
- transformed data with 5 rows
Output:
Expand All @@ -1118,10 +1048,6 @@ def test__reverse_transform_by_category(self, psutil_mock):
}
transformer.dtype = data.dtype

virtual_memory = Mock()
virtual_memory.available = 1
psutil_mock.return_value = virtual_memory

reverse = transformer._reverse_transform_by_category(transformed)

pd.testing.assert_series_equal(data, reverse)
Expand Down Expand Up @@ -1152,17 +1078,14 @@ def test__get_category_from_start(self):
# Assert
assert category == 'c'

@patch('psutil.virtual_memory')
def test__reverse_transform_by_row_called(self, psutil_mock):
def test__reverse_transform_by_row_called(self):
"""Test that the `_reverse_transform_by_row` method is called.
When there is not enough virtual memory and the number of rows is less than or equal
to the number of categories, expect that the `_reverse_transform_by_row` method
is called.
When the number of rows is less than or equal to the number of categories,
expect that the `_reverse_transform_by_row` method is called.
Setup:
The categorical transformer is instantiated with 4 categories. Also patch the
`psutil.virtual_memory` function to return an `available_memory` of 1.
The categorical transformer is instantiated with 4 categories.
Input:
- numerical data with 4 rows
Output:
Expand All @@ -1179,10 +1102,6 @@ def test__reverse_transform_by_row_called(self, psutil_mock):
[0., 0.25, 0.5, 0.75], index=[4, 3, 2, 1], columns=['category'])
categorical_transformer_mock._normalize.return_value = data

virtual_memory = Mock()
virtual_memory.available = 1
psutil_mock.return_value = virtual_memory

# Run
reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, data)

Expand All @@ -1191,16 +1110,14 @@ def test__reverse_transform_by_row_called(self, psutil_mock):
np.testing.assert_array_equal(reverse_arg, data.clip(0, 1))
assert reverse == categorical_transformer_mock._reverse_transform_by_row.return_value

@patch('psutil.virtual_memory')
def test__reverse_transform_by_row(self, psutil_mock):
def test__reverse_transform_by_row(self):
"""Test the _reverse_transform_by_row method with numerical data.
Expect that the transformed data is correctly reverse transformed.
Setup:
The categorical transformer is instantiated with 4 categories, and the means, starts,
and intervals are set for those categories. Also patch the `psutil.virtual_memory`
function to return an `available_memory` of 1.
and intervals are set for those categories.
Input:
- transformed data with 4 rows
Output:
Expand All @@ -1222,10 +1139,6 @@ def test__reverse_transform_by_row(self, psutil_mock):
}
transformer.dtype = data.dtype

virtual_memory = Mock()
virtual_memory.available = 1
psutil_mock.return_value = virtual_memory

# Run
reverse = transformer._reverse_transform(transformed)

Expand Down

0 comments on commit 46d523a

Please sign in to comment.