From 46d523aeaa3ce2104f073156b004e258e6d4b27e Mon Sep 17 00:00:00 2001 From: Roy Wedge Date: Mon, 18 Sep 2023 11:05:57 -0400 Subject: [PATCH] Remove psutil dependency and deprecate FrequencyEncoder (#711) --- rdt/transformers/categorical.py | 25 +--- setup.py | 1 - .../transformers/test_categorical.py | 44 +----- tests/unit/transformers/test_categorical.py | 129 +++--------------- 4 files changed, 30 insertions(+), 169 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index d04da5998..7d36259c3 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -import psutil from scipy.stats import norm from rdt.errors import TransformerInputError @@ -320,6 +319,11 @@ def __setstate__(self, state): self.__dict__ = state def __init__(self, add_noise=False): + warnings.warn( + "The 'FrequencyEncoder' transformer will no longer be supported in future versions " + "of the RDT library. Please use the 'UniformEncoder' transformer instead.", + FutureWarning + ) super().__init__() self.add_noise = add_noise @@ -475,19 +479,6 @@ def _transform(self, data): return self._transform_by_row(data) - def _reverse_transform_by_matrix(self, data): - """Reverse transform the data with matrix operations.""" - num_rows = len(data) - num_categories = len(self.starts) - - data = np.broadcast_to(data, (num_categories, num_rows)).T - starts = np.broadcast_to(self.starts.index, (num_rows, num_categories)) - is_data_greater_than_starts = (data >= starts)[:, ::-1] - interval_indexes = num_categories - np.argmax(is_data_greater_than_starts, axis=1) - 1 - - get_category_from_index = list(self.starts.category).__getitem__ - return pd.Series(interval_indexes).apply(get_category_from_index).astype(self.dtype) - def _reverse_transform_by_category(self, data): """Reverse transform the data by iterating over all the categories.""" result = np.empty(shape=(len(data), ), dtype=self.dtype) @@ -522,12 +513,6 @@ def _reverse_transform(self, data): num_rows = len(data) num_categories = len(self.means) - # total shape * float size * number of matrices needed - needed_memory = num_rows * num_categories * 8 * 3 - available_memory = psutil.virtual_memory().available - if available_memory > needed_memory: - return self._reverse_transform_by_matrix(data) - if num_rows > num_categories: return self._reverse_transform_by_category(data) diff --git a/setup.py b/setup.py index 7a7d2c6a0..d94dc7e3f 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,6 @@ "scipy>=1.9.2,<2;python_version>='3.10'", "scikit-learn>=0.24,<2;python_version<'3.10'", "scikit-learn>=1.1.3,<2;python_version>='3.10'", - 'psutil>=5.7,<6', 'Faker>=10', ] diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index 35cefba65..a1499534a 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -1,6 +1,5 @@ import pickle from io import BytesIO -from unittest.mock import Mock, patch import numpy as np import pandas as pd @@ -360,43 +359,12 @@ def test_frequency_encoder_mixed(): pd.testing.assert_frame_equal(data, reverse) -@patch('psutil.virtual_memory') -def test_frequency_encoder_mixed_low_virtual_memory(psutil_mock): - """Test the FrequencyEncoder on mixed type data with low virtual memory. - - Ensure that the FrequencyEncoder can fit, transform, and reverse - transform on mixed type data, when there is low virtual memory. Expect that the - reverse transformed data is the same as the input. - - Input: - - 4 rows of mixed data - Output: - - The reverse transformed data - """ - # setup - data = pd.DataFrame([True, 'a', 1, None], columns=['column_name']) - column = 'column_name' - transformer = FrequencyEncoder() - - virtual_memory = Mock() - virtual_memory.available = 1 - psutil_mock.return_value = virtual_memory - - # run - transformer.fit(data, column) - reverse = transformer.reverse_transform(transformer.transform(data)) - - # assert - pd.testing.assert_frame_equal(data, reverse) - - -@patch('psutil.virtual_memory') -def test_frequency_encoder_mixed_more_rows(psutil_mock): - """Test the FrequencyEncoder on mixed type data with low virtual memory. +def test_frequency_encoder_mixed_more_rows(): + """Test the FrequencyEncoder on mixed type data. Ensure that the FrequencyEncoder can fit, transform, and reverse - transform on mixed type data, when there is low virtual memory and a larger - number of rows. Expect that the reverse transformed data is the same as the input. + transform on mixed type data, when there is a larger number of rows. + Expect that the reverse transformed data is the same as the input. Input: - 4 rows of mixed data @@ -409,10 +377,6 @@ def test_frequency_encoder_mixed_more_rows(psutil_mock): transform_data = pd.DataFrame(['a', 1, None, 'a', True, 1], columns=['column_name']) transformer = FrequencyEncoder() - virtual_memory = Mock() - virtual_memory.available = 1 - psutil_mock.return_value = virtual_memory - # run transformer.fit(data, column) transformed = transformer.transform(transform_data) diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index e1d249779..98ebe62b7 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -512,7 +512,12 @@ def test___setstate__(self): def test___init__(self): """Passed arguments must be stored as attributes.""" # Run - transformer = FrequencyEncoder(add_noise='add_noise_value') + warn_message = ( + "The 'FrequencyEncoder' transformer will no longer be supported in future " + "versions of the RDT library. Please use the 'UniformEncoder' transformer instead." + ) + with pytest.warns(FutureWarning, match=warn_message): + transformer = FrequencyEncoder(add_noise='add_noise_value') # Asserts assert transformer.add_noise == 'add_noise_value' @@ -700,11 +705,11 @@ def test__get_value_add_noise_true(self, norm_mock): # Asserts assert result == 0.2745 - def test__reverse_transform_array(self): - """Test reverse_transform a numpy.array""" + def test__reverse_transform_series(self): + """Test reverse_transform a pandas Series""" # Setup data = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar']) - rt_data = np.array([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2]) + rt_data = pd.Series([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2]) transformer = FrequencyEncoder() # Run @@ -987,83 +992,14 @@ def test__transform_by_row(self): expected = np.array([0.875, 0.625, 0.375, 0.125]) assert (transformed == expected).all() - @patch('psutil.virtual_memory') - def test__reverse_transform_by_matrix_called(self, psutil_mock): - """Test that the `_reverse_transform_by_matrix` method is called. - - When there is enough virtual memory, expect that the - `_reverse_transform_by_matrix` method is called. - - Setup: - The categorical transformer is instantiated with 4 categories. Also patch the - `psutil.virtual_memory` function to return a large enough `available_memory`. - Input: - - numerical data with 4 rows - Output: - - the output of `_reverse_transform_by_matrix` - Side effects: - - `_reverse_transform_by_matrix` will be called once - """ - # Setup - data = pd.Series([1, 2, 3, 4]) - - categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) - - virtual_memory = Mock() - virtual_memory.available = 4 * 4 * 8 * 3 + 1 - psutil_mock.return_value = virtual_memory - - # Run - reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, data) - - # Asserts - reverse_arg = categorical_transformer_mock._reverse_transform_by_matrix.call_args[0][0] - np.testing.assert_array_equal(reverse_arg, data.clip(0, 1)) - assert reverse == categorical_transformer_mock._reverse_transform_by_matrix.return_value - - @patch('psutil.virtual_memory') - def test__reverse_transform_by_matrix(self, psutil_mock): - """Test the _reverse_transform_by_matrix method with numerical data. - - Expect that the transformed data is correctly reverse transformed. - - Setup: - The categorical transformer is instantiated with 4 categories and means. Also patch - the ``psutil.virtual_memory`` function to return a large enough ``available_memory``. - Input: - - transformed data with 4 rows - Ouptut: - - the original data - """ - # Setup - data = pd.Series([1, 2, 3, 4]) - transformed = pd.Series([0.875, 0.625, 0.375, 0.125]) - - transformer = FrequencyEncoder() - transformer.starts = pd.DataFrame({'category': [4, 3, 2, 1]}, index=[0., 0.25, 0.5, 0.75]) - transformer.dtype = data.dtype - - virtual_memory = Mock() - virtual_memory.available = 4 * 4 * 8 * 3 + 1 - psutil_mock.return_value = virtual_memory - - # Run - reverse = transformer._reverse_transform_by_matrix(transformed) - - # Assert - pd.testing.assert_series_equal(data, reverse) - - @patch('psutil.virtual_memory') - def test__reverse_transform_by_category_called(self, psutil_mock): + def test__reverse_transform_by_category_called(self): """Test that the `_reverse_transform_by_category` method is called. - When there is not enough virtual memory and the number of rows is greater than the - number of categories, expect that the `_reverse_transform_by_category` method is called. + When the number of rows is greater than the number of categories, expect + that the `_reverse_transform_by_category` method is called. Setup: - The categorical transformer is instantiated with 4 categories. Also patch the - `psutil.virtual_memory` function to return an `available_memory` of 1. + The categorical transformer is instantiated with 4 categories. Input: - numerical data with 5 rows Output: @@ -1077,10 +1013,6 @@ def test__reverse_transform_by_category_called(self, psutil_mock): categorical_transformer_mock = Mock() categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) - virtual_memory = Mock() - virtual_memory.available = 1 - psutil_mock.return_value = virtual_memory - # Run reverse = FrequencyEncoder._reverse_transform( categorical_transformer_mock, transform_data) @@ -1090,16 +1022,14 @@ def test__reverse_transform_by_category_called(self, psutil_mock): np.testing.assert_array_equal(reverse_arg, transform_data.clip(0, 1)) assert reverse == categorical_transformer_mock._reverse_transform_by_category.return_value - @patch('psutil.virtual_memory') - def test__reverse_transform_by_category(self, psutil_mock): + def test__reverse_transform_by_category(self): """Test the _reverse_transform_by_category method with numerical data. Expect that the transformed data is correctly reverse transformed. Setup: The categorical transformer is instantiated with 4 categories, and the means - and intervals are set for those categories. Also patch the `psutil.virtual_memory` - function to return an `available_memory` of 1. + and intervals are set for those categories. Input: - transformed data with 5 rows Ouptut: @@ -1118,10 +1048,6 @@ def test__reverse_transform_by_category(self, psutil_mock): } transformer.dtype = data.dtype - virtual_memory = Mock() - virtual_memory.available = 1 - psutil_mock.return_value = virtual_memory - reverse = transformer._reverse_transform_by_category(transformed) pd.testing.assert_series_equal(data, reverse) @@ -1152,17 +1078,14 @@ def test__get_category_from_start(self): # Assert assert category == 'c' - @patch('psutil.virtual_memory') - def test__reverse_transform_by_row_called(self, psutil_mock): + def test__reverse_transform_by_row_called(self): """Test that the `_reverse_transform_by_row` method is called. - When there is not enough virtual memory and the number of rows is less than or equal - to the number of categories, expect that the `_reverse_transform_by_row` method - is called. + When the number of rows is less than or equal to the number of categories, + expect that the `_reverse_transform_by_row` method is called. Setup: - The categorical transformer is instantiated with 4 categories. Also patch the - `psutil.virtual_memory` function to return an `available_memory` of 1. + The categorical transformer is instantiated with 4 categories. Input: - numerical data with 4 rows Output: @@ -1179,10 +1102,6 @@ def test__reverse_transform_by_row_called(self, psutil_mock): [0., 0.25, 0.5, 0.75], index=[4, 3, 2, 1], columns=['category']) categorical_transformer_mock._normalize.return_value = data - virtual_memory = Mock() - virtual_memory.available = 1 - psutil_mock.return_value = virtual_memory - # Run reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, data) @@ -1191,16 +1110,14 @@ def test__reverse_transform_by_row_called(self, psutil_mock): np.testing.assert_array_equal(reverse_arg, data.clip(0, 1)) assert reverse == categorical_transformer_mock._reverse_transform_by_row.return_value - @patch('psutil.virtual_memory') - def test__reverse_transform_by_row(self, psutil_mock): + def test__reverse_transform_by_row(self): """Test the _reverse_transform_by_row method with numerical data. Expect that the transformed data is correctly reverse transformed. Setup: The categorical transformer is instantiated with 4 categories, and the means, starts, - and intervals are set for those categories. Also patch the `psutil.virtual_memory` - function to return an `available_memory` of 1. + and intervals are set for those categories. Input: - transformed data with 4 rows Ouptut: @@ -1222,10 +1139,6 @@ def test__reverse_transform_by_row(self, psutil_mock): } transformer.dtype = data.dtype - virtual_memory = Mock() - virtual_memory.available = 1 - psutil_mock.return_value = virtual_memory - # Run reverse = transformer._reverse_transform(transformed)