Skip to content

Commit

Permalink
FIX compute the median of std dev for each class to over-sample in SM…
Browse files Browse the repository at this point in the history
  • Loading branch information
glemaitre authored Jul 10, 2023
1 parent 2f6b1f6 commit a8e44ae
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 46 deletions.
4 changes: 2 additions & 2 deletions doc/over_sampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,9 @@ or relying on `dtype` inference if the columns are using the
>>> print(sorted(Counter(y_resampled).items()))
[(0, 30), (1, 30)]
>>> print(X_resampled[-5:])
[['A' 0.52... 2]
[['A' 0.19... 2]
['B' -0.36... 2]
['B' 0.93... 2]
['B' 0.87... 2]
['B' 0.37... 2]
['B' 0.33... 2]]

Expand Down
9 changes: 9 additions & 0 deletions doc/whats_new/v0.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ Bug fixes
they are plugged into an Euclidean distance computation.
:pr:`1014` by :user:`Guillaume Lemaitre <glemaitre>`.

- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the median of standard
deviation of the continuous features was only computed on the minority class. Now,
we are computing this statistic for each class that is up-sampled.
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.

- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` such that the case where
the median of standard deviation of the continuous features is null is handled
in the multiclass case as well.
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.

Version 0.11.0
==============
Expand Down
97 changes: 55 additions & 42 deletions imblearn/over_sampling/_smote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import math
import numbers
import warnings
from collections import Counter

import numpy as np
from scipy import sparse
Expand All @@ -23,7 +22,6 @@
check_random_state,
)
from sklearn.utils.sparsefuncs_fast import (
csc_mean_variance_axis0,
csr_mean_variance_axis0,
)
from sklearn.utils.validation import _num_features
Expand Down Expand Up @@ -116,11 +114,11 @@ def _make_samples(
rows = np.floor_divide(samples_indices, nn_num.shape[1])
cols = np.mod(samples_indices, nn_num.shape[1])

X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps)
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
return X_new, y_new

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
r"""Generate a synthetic sample.
The rule for the generation is:
Expand Down Expand Up @@ -155,6 +153,9 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
steps : ndarray of shape (n_samples,), dtype=float
Step sizes for new samples.
y_type : None
Unused parameter. Only for compatibility reason with SMOTE-NC.
Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Expand Down Expand Up @@ -465,8 +466,9 @@ class SMOTENC(SMOTE):
continuous_features_ : ndarray of shape (n_cont_features,), dtype=np.int64
Indices of the continuous features.
median_std_ : float
Median of the standard deviation of the continuous features.
median_std_ : dict of int -> float
Median of the standard deviation of the continuous features for each
class to be over-sampled.
n_features_ : int
Number of features observed at `fit`.
Expand Down Expand Up @@ -627,23 +629,8 @@ def _fit_resample(self, X, y):
self._validate_column_types(X)
self._validate_estimator()

# compute the median of the standard deviation of the minority class
target_stats = Counter(y)
class_minority = min(target_stats, key=target_stats.get)

X_continuous = _safe_indexing(X, self.continuous_features_, axis=1)
X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority))

if sparse.issparse(X):
if X.format == "csr":
_, var = csr_mean_variance_axis0(X_minority)
else:
_, var = csc_mean_variance_axis0(X_minority)
else:
var = X_minority.var(axis=0)
self.median_std_ = np.median(np.sqrt(var))

X_categorical = _safe_indexing(X, self.categorical_features_, axis=1)
if X_continuous.dtype.name != "object":
dtype_ohe = X_continuous.dtype
Expand All @@ -664,28 +651,54 @@ def _fit_resample(self, X, y):
if not sparse.issparse(X_ohe):
X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe)

# we can replace the 1 entries of the categorical features with the
# median of the standard deviation. It will ensure that whenever
# distance is computed between 2 samples, the difference will be equal
# to the median of the standard deviation as in the original paper.

# In the edge case where the median of the std is equal to 0, the 1s
# entries will be also nullified. In this case, we store the original
# categorical encoding which will be later used for inverting the OHE
if math.isclose(self.median_std_, 0):
self._X_categorical_minority_encoded = _safe_indexing(
X_ohe.toarray(), np.flatnonzero(y == class_minority)
X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr", dtype=dtype_ohe)
X_resampled = [X_encoded.copy()]
y_resampled = [y.copy()]

# SMOTE resampling starts here
self.median_std_ = {}
for class_sample, n_samples in self.sampling_strategy_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
X_class = _safe_indexing(X_encoded, target_class_indices)

_, var = csr_mean_variance_axis0(
X_class[:, : self.continuous_features_.size]
)
self.median_std_[class_sample] = np.median(np.sqrt(var))

# In the edge case where the median of the std is equal to 0, the 1s
# entries will be also nullified. In this case, we store the original
# categorical encoding which will be later used for inverting the OHE
if math.isclose(self.median_std_[class_sample], 0):
# This variable will be used when generating data
self._X_categorical_minority_encoded = X_class[
:, self.continuous_features_.size :
].toarray()

# we can replace the 1 entries of the categorical features with the
# median of the standard deviation. It will ensure that whenever
# distance is computed between 2 samples, the difference will be equal
# to the median of the standard deviation as in the original paper.
X_class_categorical = X_class[:, self.continuous_features_.size :]
# With one-hot encoding, the median will be repeated twice. We need
# to divide by sqrt(2) such that we only have one median value
# contributing to the Euclidean distance
X_class_categorical.data[:] = self.median_std_[class_sample] / np.sqrt(2)
X_class[:, self.continuous_features_.size :] = X_class_categorical

# With one-hot encoding, the median will be repeated twice. We need to divide
# by sqrt(2) such that we only have one median value contributing to the
# Euclidean distance
X_ohe.data = (
np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / np.sqrt(2)
)
X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")
self.nn_k_.fit(X_class)
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
X_new, y_new = self._make_samples(
X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
)
X_resampled.append(X_new)
y_resampled.append(y_new)

X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
X_resampled = sparse.vstack(X_resampled, format=X_encoded.format)
y_resampled = np.hstack(y_resampled)
# SMOTE resampling ends here

# reverse the encoding of the categorical features
X_res_cat = X_resampled[:, self.continuous_features_.size :]
Expand Down Expand Up @@ -723,7 +736,7 @@ def _fit_resample(self, X, y):

return X_resampled, y_resampled

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
"""Generate a synthetic sample with an additional steps for the
categorical features.
Expand All @@ -741,7 +754,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):

# In the case that the median std was equal to zeros, we have to
# create non-null entry based on the encoded of OHE
if math.isclose(self.median_std_, 0):
if math.isclose(self.median_std_[y_type], 0):
nn_data[
:, self.continuous_features_.size :
] = self._X_categorical_minority_encoded
Expand Down
23 changes: 21 additions & 2 deletions imblearn/over_sampling/_smote/tests/test_smote_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ def test_smotenc(data):
assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx])
assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype

assert isinstance(smote.median_std_, dict)


# part of the common test which apply to SMOTE-NC even if it is not default
# constructible
Expand Down Expand Up @@ -193,6 +195,7 @@ def test_smotenc_pandas():
X_res, y_res = smote.fit_resample(X, y)
assert_array_equal(X_res_pd.to_numpy(), X_res)
assert_allclose(y_res_pd, y_res)
assert set(smote.median_std_.keys()) == {0, 1}


def test_smotenc_preserve_dtype():
Expand Down Expand Up @@ -234,20 +237,36 @@ def test_smote_nc_with_null_median_std():
[
[1, 2, 1, "A"],
[2, 1, 2, "A"],
[2, 1, 2, "A"],
[1, 2, 3, "B"],
[1, 2, 4, "C"],
[1, 2, 5, "C"],
[1, 2, 4, "C"],
[1, 2, 4, "C"],
[1, 2, 4, "C"],
],
dtype="object",
)
labels = np.array(
["class_1", "class_1", "class_1", "class_2", "class_2"], dtype=object
[
"class_1",
"class_1",
"class_1",
"class_1",
"class_2",
"class_2",
"class_3",
"class_3",
"class_3",
],
dtype=object,
)
smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0)
X_res, y_res = smote.fit_resample(data, labels)
# check that the categorical feature is not random but correspond to the
# categories seen in the minority class samples
assert X_res[-1, -1] == "C"
assert_array_equal(X_res[-3:, -1], np.array(["C", "C", "C"], dtype=object))
assert smote.median_std_ == {"class_2": 0.0, "class_3": 0.0}


def test_smotenc_categorical_encoder():
Expand Down

0 comments on commit a8e44ae

Please sign in to comment.