From 2f6b1f68c5d234800117b05e0d404475454e4c3e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 10 Jul 2023 15:53:22 +0200 Subject: [PATCH] FIX divide by sqrt(2) the median entry in SMOTENC (#1014) --- doc/over_sampling.rst | 10 +++++----- doc/whats_new/v0.11.rst | 8 ++++++++ imblearn/over_sampling/_smote/base.py | 9 +++++++-- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index dcb5af980..3c3ca80b7 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -203,11 +203,11 @@ or relying on `dtype` inference if the columns are using the >>> print(sorted(Counter(y_resampled).items())) [(0, 30), (1, 30)] >>> print(X_resampled[-5:]) - [['A' 0.5246469549655818 2] - ['B' -0.3657680728116921 2] - ['B' 0.9344237230779993 2] - ['B' 0.3710891618824609 2] - ['B' 0.3327240726719727 2]] + [['A' 0.52... 2] + ['B' -0.36... 2] + ['B' 0.93... 2] + ['B' 0.37... 2] + ['B' 0.33... 2]] Therefore, it can be seen that the samples generated in the first and last columns are belonging to the same categories originally presented without any diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index 0e3ab26a2..387eb1ed3 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -6,6 +6,14 @@ Version 0.11.1 Changelog --------- +Bug fixes +......... + +- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the entries of the + one-hot encoding should be divided by `sqrt(2)` and not `2`, taking into account that + they are plugged into a Euclidean distance computation. + :pr:`1014` by :user:`Guillaume Lemaitre <glemaitre>`. 
+ Version 0.11.0 ============== diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 3bbe5f3b0..4627d52be 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -671,13 +671,18 @@ def _fit_resample(self, X, y): # In the edge case where the median of the std is equal to 0, the 1s # entries will be also nullified. In this case, we store the original - # categorical encoding which will be later used for inversing the OHE + # categorical encoding which will be later used for inverting the OHE if math.isclose(self.median_std_, 0): self._X_categorical_minority_encoded = _safe_indexing( X_ohe.toarray(), np.flatnonzero(y == class_minority) ) - X_ohe.data = np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2 + # With one-hot encoding, the median will be repeated twice. We need to divide + # by sqrt(2) such that we only have one median value contributing to the + # Euclidean distance + X_ohe.data = ( + np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / np.sqrt(2) + ) X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr") X_resampled, y_resampled = super()._fit_resample(X_encoded, y)