From d69acd519808175639b543e29bc5f9e4c99472cf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 21:31:28 +0200 Subject: [PATCH] ENH add categorical_encoder to SMOTEN (#1001) --- doc/whats_new/v0.11.rst | 6 +++ imblearn/over_sampling/_smote/base.py | 43 +++++++++++++++++-- .../over_sampling/_smote/tests/test_smoten.py | 21 +++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index 2ae7268e9..5c303d0bf 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -33,3 +33,9 @@ Enhancements allowing to specify a :class:`~sklearn.preprocessing.OneHotEncoder` with custom parameters. :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. + +- :class:`~imblearn.over_sampling.SMOTEN` now accepts a parameter `categorical_encoder` + allowing to specify a :class:`~sklearn.preprocessing.OrdinalEncoder` with custom + parameters. A new fitted parameter `categorical_encoder_` is exposed to access the + fitted encoder. + :pr:`1001` by :user:`Guillaume Lemaitre <glemaitre>`. diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 08bd96407..e4ea9d2d8 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -764,6 +764,10 @@ class SMOTEN(SMOTE): Parameters ---------- + categorical_encoder : estimator, default=None + Ordinal encoder used to encode the categorical features. If `None`, a + :class:`~sklearn.preprocessing.OrdinalEncoder` is used with `dtype=np.int32`. + {sampling_strategy} {random_state} @@ -791,6 +795,9 @@ class SMOTEN(SMOTE): Attributes ---------- + categorical_encoder_ : estimator + The encoder used to encode the categorical features. + sampling_strategy_ : dict Dictionary containing the information to sample the dataset. 
The keys corresponds to the class labels from which to sample and the values @@ -853,6 +860,31 @@ class SMOTEN(SMOTE): Class counts after resampling Counter({{0: 40, 1: 40}}) """ + _parameter_constraints: dict = { + **SMOTE._parameter_constraints, + "categorical_encoder": [ + HasMethods(["fit_transform", "inverse_transform"]), + None, + ], + } + + def __init__( + self, + categorical_encoder=None, + *, + sampling_strategy="auto", + random_state=None, + k_neighbors=5, + n_jobs=None, + ): + super().__init__( + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + ) + self.categorical_encoder = categorical_encoder + def _check_X_y(self, X, y): """Check should accept strings and not sparse matrices.""" y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -900,11 +932,14 @@ def _fit_resample(self, X, y): X_resampled = [X.copy()] y_resampled = [y.copy()] - encoder = OrdinalEncoder(dtype=np.int32) - X_encoded = encoder.fit_transform(X) + if self.categorical_encoder is None: + self.categorical_encoder_ = OrdinalEncoder(dtype=np.int32) + else: + self.categorical_encoder_ = clone(self.categorical_encoder) + X_encoded = self.categorical_encoder_.fit_transform(X) vdm = ValueDifferenceMetric( - n_categories=[len(cat) for cat in encoder.categories_] + n_categories=[len(cat) for cat in self.categorical_encoder_.categories_] ).fit(X_encoded, y) for class_sample, n_samples in self.sampling_strategy_.items(): @@ -922,7 +957,7 @@ def _fit_resample(self, X, y): X_class, class_sample, y.dtype, nn_indices, n_samples ) - X_new = encoder.inverse_transform(X_new) + X_new = self.categorical_encoder_.inverse_transform(X_new) X_resampled.append(X_new) y_resampled.append(y_new) diff --git a/imblearn/over_sampling/_smote/tests/test_smoten.py b/imblearn/over_sampling/_smote/tests/test_smoten.py index 774ad9963..6bd9d8356 100644 --- a/imblearn/over_sampling/_smote/tests/test_smoten.py +++ 
b/imblearn/over_sampling/_smote/tests/test_smoten.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from sklearn.preprocessing import OrdinalEncoder from imblearn.over_sampling import SMOTEN @@ -27,6 +28,7 @@ def test_smoten(data): assert X_res.shape == (80, 3) assert y_res.shape == (80,) + assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) def test_smoten_resampling(): @@ -52,3 +54,22 @@ def test_smoten_resampling(): X_generated, y_generated = X_res[X.shape[0] :], y_res[X.shape[0] :] np.testing.assert_array_equal(X_generated, "blue") np.testing.assert_array_equal(y_generated, "not apple") + + +def test_smoten_categorical_encoder(data): + """Check that `categorical_encoder` is used when provided.""" + + X, y = data + sampler = SMOTEN(random_state=0) + sampler.fit_resample(X, y) + + assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) + assert sampler.categorical_encoder_.dtype == np.int32 + + encoder = OrdinalEncoder(dtype=np.int64) + sampler.set_params(categorical_encoder=encoder).fit_resample(X, y) + + assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) + assert sampler.categorical_encoder is encoder + assert sampler.categorical_encoder_ is not encoder + assert sampler.categorical_encoder_.dtype == np.int64