Commit

FIX BorderlineSMOTE-2 use the full dataset to generate new sample (sc…
glemaitre authored Jul 10, 2023
1 parent d431b9d commit 2859cb0
Showing 3 changed files with 131 additions and 97 deletions.
4 changes: 4 additions & 0 deletions doc/whats_new/v0.11.rst
@@ -28,6 +28,10 @@ Bug fixes
   in the multiclass case as well.
   :pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Fix a bug in :class:`~imblearn.over_sampling.BorderlineSMOTE` version 2 where samples
+  should be generated from the whole dataset and not only from the minority class.
+  :pr:`1023` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Version 0.11.0
 ==============

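The user-facing API is unchanged by this fix; what changes is the pool of samples from which the "borderline-2" variant draws its nearest neighbours during interpolation. A minimal usage sketch (the toy dataset and parameter values below are illustrative, not part of this commit, and assume the minority class has at least a few borderline samples):

import numpy as np
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(
    n_samples=500, n_features=2, n_informative=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.1, 0.9], random_state=0,
)
# With kind="borderline-2", the neighbours used to interpolate new points are now
# searched in the whole dataset (all classes), not only within the minority class.
smote = BorderlineSMOTE(kind="borderline-2", random_state=0)
X_res, y_res = smote.fit_resample(X, y)
print(np.bincount(y_res))  # minority class oversampled up to the majority count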
85 changes: 29 additions & 56 deletions imblearn/over_sampling/_smote/filter.py
@@ -95,6 +95,11 @@ class BorderlineSMOTE(BaseSMOTE):
     nn_m_ : estimator object
         Validated m-nearest neighbours created from the `m_neighbors` parameter.
 
+    in_danger_indices : dict of ndarray
+        Dictionary containing the indices of the samples considered in danger that
+        are used to generate new synthetic samples. The keys correspond to the class
+        label.
+
     n_features_in_ : int
         Number of features in the input dataset.
@@ -201,74 +206,42 @@ def _fit_resample(self, X, y):
         X_resampled = X.copy()
         y_resampled = y.copy()
 
+        self.in_danger_indices = {}
         for class_sample, n_samples in self.sampling_strategy_.items():
             if n_samples == 0:
                 continue
             target_class_indices = np.flatnonzero(y == class_sample)
             X_class = _safe_indexing(X, target_class_indices)
 
             self.nn_m_.fit(X)
-            danger_index = self._in_danger_noise(
+            mask_danger = self._in_danger_noise(
                 self.nn_m_, X_class, class_sample, y, kind="danger"
             )
-            if not any(danger_index):
+            if not any(mask_danger):
                 continue
+            X_danger = _safe_indexing(X_class, mask_danger)
+            self.in_danger_indices[class_sample] = target_class_indices[mask_danger]
 
-            self.nn_k_.fit(X_class)
-            nns = self.nn_k_.kneighbors(
-                _safe_indexing(X_class, danger_index), return_distance=False
-            )[:, 1:]
-
-            # divergence between borderline-1 and borderline-2
             if self.kind == "borderline-1":
-                # Create synthetic samples for borderline points.
-                X_new, y_new = self._make_samples(
-                    _safe_indexing(X_class, danger_index),
-                    y.dtype,
-                    class_sample,
-                    X_class,
-                    nns,
-                    n_samples,
-                )
-                if sparse.issparse(X_new):
-                    X_resampled = sparse.vstack([X_resampled, X_new])
-                else:
-                    X_resampled = np.vstack((X_resampled, X_new))
-                y_resampled = np.hstack((y_resampled, y_new))
-
-            elif self.kind == "borderline-2":
-                random_state = check_random_state(self.random_state)
-                fractions = random_state.beta(10, 10)
-
-                # only minority
-                X_new_1, y_new_1 = self._make_samples(
-                    _safe_indexing(X_class, danger_index),
-                    y.dtype,
-                    class_sample,
-                    X_class,
-                    nns,
-                    int(fractions * (n_samples + 1)),
-                    step_size=1.0,
-                )
-
-                # we use a one-vs-rest policy to handle the multiclass in which
-                # new samples will be created considering not only the majority
-                # class but all over classes.
-                X_new_2, y_new_2 = self._make_samples(
-                    _safe_indexing(X_class, danger_index),
-                    y.dtype,
-                    class_sample,
-                    _safe_indexing(X, np.flatnonzero(y != class_sample)),
-                    nns,
-                    int((1 - fractions) * n_samples),
-                    step_size=0.5,
-                )
-
-                if sparse.issparse(X_resampled):
-                    X_resampled = sparse.vstack([X_resampled, X_new_1, X_new_2])
-                else:
-                    X_resampled = np.vstack((X_resampled, X_new_1, X_new_2))
-                y_resampled = np.hstack((y_resampled, y_new_1, y_new_2))
+                X_to_sample_from = X_class  # consider the positive class only
+            else:  # self.kind == "borderline-2"
+                X_to_sample_from = X  # consider the whole dataset
+
+            self.nn_k_.fit(X_to_sample_from)
+            nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
+            X_new, y_new = self._make_samples(
+                X_danger,
+                y.dtype,
+                class_sample,
+                X_to_sample_from,
+                nns,
+                n_samples,
+            )
+            if sparse.issparse(X_new):
+                X_resampled = sparse.vstack([X_resampled, X_new])
+            else:
+                X_resampled = np.vstack((X_resampled, X_new))
+            y_resampled = np.hstack((y_resampled, y_new))
 
         return X_resampled, y_resampled

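With this refactoring both variants share a single sampling path; the only difference is the set of samples on which `nn_k_` is fitted and from which neighbours are drawn (`X_class` for "borderline-1", the full `X` for "borderline-2"). The new `in_danger_indices` attribute records which original samples were treated as borderline. A small sketch of how it can be inspected after resampling (the dataset mirrors the one used in the new tests and is only illustrative):

from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(
    n_samples=500, n_features=2, n_informative=2, n_redundant=0,
    n_repeated=0, n_clusters_per_class=1, n_classes=3,
    weights=[0.1, 0.2, 0.7], class_sep=0.8, random_state=1,
)
smote = BorderlineSMOTE(kind="borderline-2", m_neighbors=9, k_neighbors=5, random_state=0)
X_res, y_res = smote.fit_resample(X, y)
# Keys are the oversampled class labels; values are the indices (into X) of the
# samples flagged as "in danger" and used as anchors for the new synthetic points.
for class_label, indices in smote.in_danger_indices.items():
    print(class_label, indices.shape)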
139 changes: 98 additions & 41 deletions imblearn/over_sampling/_smote/tests/test_borderline_smote.py
@@ -1,53 +1,110 @@
 import numpy as np
+from collections import Counter
+
 import pytest
-from sklearn.neighbors import NearestNeighbors
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
 from sklearn.utils._testing import assert_allclose, assert_array_equal
 
 from imblearn.over_sampling import BorderlineSMOTE
 
 
-@pytest.fixture
-def data():
-    X = np.array(
-        [
-            [0.11622591, -0.0317206],
-            [0.77481731, 0.60935141],
-            [1.25192108, -0.22367336],
-            [0.53366841, -0.30312976],
-            [1.52091956, -0.49283504],
-            [-0.28162401, -2.10400981],
-            [0.83680821, 1.72827342],
-            [0.3084254, 0.33299982],
-            [0.70472253, -0.73309052],
-            [0.28893132, -0.38761769],
-            [1.15514042, 0.0129463],
-            [0.88407872, 0.35454207],
-            [1.31301027, -0.92648734],
-            [-1.11515198, -0.93689695],
-            [-0.18410027, -0.45194484],
-            [0.9281014, 0.53085498],
-            [-0.14374509, 0.27370049],
-            [-0.41635887, -0.38299653],
-            [0.08711622, 0.93259929],
-            [1.70580611, -0.11219234],
-        ]
+@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
+def test_borderline_smote_no_in_danger_samples(kind):
+    """Check that the algorithm behaves properly even on a dataset without any sample
+    in danger.
+    """
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=1.5,
+        random_state=1,
     )
-    y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
-    return X, y
+    smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0)
+    X_res, y_res = smote.fit_resample(X, y)
 
+    assert_allclose(X, X_res)
+    assert_allclose(y, y_res)
+    assert not smote.in_danger_indices
 
-@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
-def test_borderline_smote(kind, data):
-    bsmote = BorderlineSMOTE(kind=kind, random_state=42)
-    bsmote_nn = BorderlineSMOTE(
-        kind=kind,
-        random_state=42,
-        k_neighbors=NearestNeighbors(n_neighbors=6),
-        m_neighbors=NearestNeighbors(n_neighbors=11),
-    )
+
+def test_borderline_smote_kind():
+    """Check the behaviour of the `kind` parameter.
+    In short, "borderline-2" generates samples closer to the decision boundary than
+    "borderline-1". We generate an example where a logistic regression will perform
+    worse on "borderline-2" than on "borderline-1".
+    """
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=1.0,
+        random_state=1,
+    )
+    smote = BorderlineSMOTE(
+        kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0
+    )
+    X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y)
+    smote.set_params(kind="borderline-2")
+    X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y)
 
-    X_res_1, y_res_1 = bsmote.fit_resample(*data)
-    X_res_2, y_res_2 = bsmote_nn.fit_resample(*data)
+    score_borderline_1 = (
+        LogisticRegression()
+        .fit(X_res_borderline_1, y_res_borderline_1)
+        .score(X_res_borderline_1, y_res_borderline_1)
+    )
+    score_borderline_2 = (
+        LogisticRegression()
+        .fit(X_res_borderline_2, y_res_borderline_2)
+        .score(X_res_borderline_2, y_res_borderline_2)
+    )
+    assert score_borderline_1 > score_borderline_2
+
+
+def test_borderline_smote_in_danger():
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=0.8,
+        random_state=1,
+    )
+    smote = BorderlineSMOTE(
+        kind="borderline-1",
+        m_neighbors=9,
+        k_neighbors=5,
+        random_state=0,
+    )
+    _, y_res_1 = smote.fit_resample(X, y)
+    in_danger_indices_borderline_1 = smote.in_danger_indices
+    smote.set_params(kind="borderline-2")
+    _, y_res_2 = smote.fit_resample(X, y)
+    in_danger_indices_borderline_2 = smote.in_danger_indices
 
-    assert_allclose(X_res_1, X_res_2)
-    assert_array_equal(y_res_1, y_res_2)
+    for key1, key2 in zip(
+        in_danger_indices_borderline_1, in_danger_indices_borderline_2
+    ):
+        assert_array_equal(
+            in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2]
+        )
+    assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2)
+    counter = Counter(y_res_1)
+    assert counter[0] == counter[1] == counter[2]
+    counter = Counter(y_res_2)
+    assert counter[0] == counter[1] == counter[2]

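The behavioural difference exercised by `test_borderline_smote_kind` can also be inspected directly: because "borderline-2" now draws its interpolation neighbours from the whole dataset, its synthetic minority samples tend to land closer to the decision boundary, i.e. further from the minority-class centroid. A rough, illustrative check (not part of the test suite; it assumes the minority class, labelled 0 here, receives some synthetic samples):

import numpy as np
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(
    n_samples=500, n_features=2, n_informative=2, n_redundant=0,
    n_repeated=0, n_clusters_per_class=1, n_classes=3,
    weights=[0.1, 0.2, 0.7], class_sep=1.0, random_state=1,
)
centroid = X[y == 0].mean(axis=0)  # centroid of the minority class
for kind in ("borderline-1", "borderline-2"):
    smote = BorderlineSMOTE(kind=kind, m_neighbors=9, k_neighbors=5, random_state=0)
    X_res, y_res = smote.fit_resample(X, y)
    # fit_resample appends the synthetic samples after the original ones
    X_new, y_new = X_res[len(X):], y_res[len(X):]
    mean_dist = np.linalg.norm(X_new[y_new == 0] - centroid, axis=1).mean()
    print(kind, round(float(mean_dist), 3))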