Skip to content

Commit

Permalink
FIX multiply by random number < 0.5 for BorderlineSMOTE-2 (scikit-learn-contrib#1027)
Browse files Browse the repository at this point in the history

Co-authored-by: Soledad Galli <solegalli@protonmail.com>
  • Loading branch information
glemaitre and solegalli authored Jul 11, 2023
1 parent d597b05 commit ec27259
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 6 deletions.
29 changes: 23 additions & 6 deletions imblearn/over_sampling/_smote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _validate_estimator(self):
)

def _make_samples(
self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0
self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None
):
"""A support function that returns artificial samples constructed along
the line connecting nearest neighbours.
Expand Down Expand Up @@ -98,6 +98,10 @@ def _make_samples(
step_size : float, default=1.0
The step size to create samples.
y : ndarray of shape (n_samples_all,), default=None
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
weight the distances in the sample generation process.
Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features)
Expand All @@ -114,11 +118,13 @@ def _make_samples(
rows = np.floor_divide(samples_indices, nn_num.shape[1])
cols = np.mod(samples_indices, nn_num.shape[1])

X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
return X_new, y_new

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
def _generate_samples(
self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None
):
r"""Generate a synthetic sample.
The rule for the generation is:
Expand Down Expand Up @@ -153,15 +159,26 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
steps : ndarray of shape (n_samples,), dtype=float
Step sizes for new samples.
y_type : None
Unused parameter. Only for compatibility reason with SMOTE-NC.
y_type : str, int or None, default=None
Class label of the current target class for which we want to generate
samples.
y : ndarray of shape (n_samples_all,), default=None
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
weight the distances in the sample generation process.
Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Synthetically generated samples.
"""
diffs = nn_data[nn_num[rows, cols]] - X[rows]
if y is not None: # only entering for BorderlineSMOTE-2
random_state = check_random_state(self.random_state)
mask_pair_samples = y[nn_num[rows, cols]] != y_type
diffs[mask_pair_samples] *= random_state.uniform(
low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1)
)

if sparse.issparse(X):
sparse_func = type(X).__name__
Expand Down Expand Up @@ -736,7 +753,7 @@ def _fit_resample(self, X, y):

return X_resampled, y_resampled

def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type, y=None):
"""Generate a synthetic sample with an additional step for the
categorical features.
Expand Down
3 changes: 3 additions & 0 deletions imblearn/over_sampling/_smote/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,10 @@ def _fit_resample(self, X, y):

if self.kind == "borderline-1":
X_to_sample_from = X_class # consider the positive class only
y_to_check_neighbors = None
else: # self.kind == "borderline-2"
X_to_sample_from = X # consider the whole dataset
y_to_check_neighbors = y

self.nn_k_.fit(X_to_sample_from)
nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
Expand All @@ -236,6 +238,7 @@ def _fit_resample(self, X, y):
X_to_sample_from,
nns,
n_samples,
y=y_to_check_neighbors,
)
if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
Expand Down

0 comments on commit ec27259

Please sign in to comment.