[MRG] EHN add voting parameter for ClusterCentroids #318

Merged (30 commits) on Aug 24, 2017

Commits (the diff below shows changes from 1 commit):
a68e8eb
EHN POC sparse handling for RandomUnderSampler
glemaitre Aug 12, 2017
0062d6d
EHN support sparse ENN
glemaitre Aug 12, 2017
6197d80
iter
glemaitre Aug 12, 2017
f669843
EHN sparse indexing IHT
glemaitre Aug 12, 2017
4adc6db
EHN sparse support nearmiss
glemaitre Aug 12, 2017
9c93dab
Merge branch 'master' into is/158
glemaitre Aug 13, 2017
bba7835
EHN support sparse matrices for NCR
glemaitre Aug 13, 2017
9cd917b
EHN support sparse Tomek and OSS
glemaitre Aug 13, 2017
c3ba307
EHN support sparsity for CNN
glemaitre Aug 13, 2017
d195868
EHN support sparse for SMOTE
glemaitre Aug 13, 2017
bcf44ab
EHN support sparse adasyn
glemaitre Aug 13, 2017
c405aa9
EHN support sparsity for sombine methods
glemaitre Aug 13, 2017
79637d7
EHN support sparsity BC
glemaitre Aug 13, 2017
c199af9
DOC update docstring
glemaitre Aug 14, 2017
425928f
DOC fix example topic classification
glemaitre Aug 14, 2017
4ba8c4e
FIX fix test and class clustercentroids
glemaitre Aug 14, 2017
8298fdc
TST add common test
glemaitre Aug 14, 2017
e4c6ebb
TST add ensemble
glemaitre Aug 14, 2017
1226a91
TST use allclose
glemaitre Aug 14, 2017
68b16b5
TST install conda with ubuntu container
glemaitre Aug 14, 2017
35c638b
TST increase tolerance
glemaitre Aug 14, 2017
004f920
TST increase tolerance
glemaitre Aug 14, 2017
d3ceb5a
TST test all versions NearMiss and SMOTE
glemaitre Aug 14, 2017
d9c4e55
TST set the algorithm of KMeans
glemaitre Aug 14, 2017
b469747
DOC add entry in user guide
glemaitre Aug 14, 2017
c05d0ba
DOC add entry sparse for CC
glemaitre Aug 14, 2017
1625879
DOC whatsnew entry
glemaitre Aug 14, 2017
72a605d
EHN add voting paramter for ClusterCentroids
glemaitre Aug 14, 2017
e1ffb13
TST fix common test fixing voting
glemaitre Aug 14, 2017
6c34e56
Merge remote-tracking branch 'origin/master' into is/317
glemaitre Aug 24, 2017
Changes from commit d195868e3139b89a0327318eb44403cb03bd1ea9 (EHN support sparse for SMOTE)
glemaitre committed Aug 13, 2017
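The net effect of this commit is that the SMOTE variants accept and return scipy.sparse matrices instead of requiring dense arrays. A minimal sketch of the intended behaviour (hypothetical data; assumes the imblearn 0.3-era `fit_sample` API):

```python
# Sketch: SMOTE on a sparse, imbalanced dataset (hypothetical data;
# fit_sample is the resampling entry point in this era of imblearn).
import numpy as np
from scipy import sparse
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_samples=200, n_features=20,
                           weights=[0.9, 0.1], random_state=42)
X_sparse = sparse.csr_matrix(X)  # CSR input instead of a dense array

X_res, y_res = SMOTE(random_state=42).fit_sample(X_sparse, y)
print(type(X_res), np.bincount(y_res))  # sparse output, balanced classes
```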
80 changes: 40 additions & 40 deletions imblearn/over_sampling/base.py
@@ -19,58 +19,58 @@ class BaseOverSampler(BaseSampler):

     _sampling_type = 'over-sampling'

-    def fit(self, X, y):
-        """Find the classes statistics before to perform sampling.
+    # def fit(self, X, y):
+    #     """Find the classes statistics before to perform sampling.

-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Matrix containing the data which have to be sampled.
+    #     Parameters
+    #     ----------
+    #     X : array-like, shape (n_samples, n_features)
+    #         Matrix containing the data which have to be sampled.

-        y : array-like, shape (n_samples,)
-            Corresponding label for each sample in X.
+    #     y : array-like, shape (n_samples,)
+    #         Corresponding label for each sample in X.

-        Returns
-        -------
-        self : object,
-            Return self.
+    #     Returns
+    #     -------
+    #     self : object,
+    #         Return self.

-        Notes
-        -----
-        Over-samplers do not accept sparse matrices.
+    #     Notes
+    #     -----
+    #     Over-samplers do not accept sparse matrices.

-        """
-        # over-sampling method does not handle sparse matrix
-        X, y = check_X_y(X, y)
+    #     """
+    #     # over-sampling method does not handle sparse matrix
+    #     X, y = check_X_y(X, y)

-        return super(BaseOverSampler, self).fit(X, y)
+    #     return super(BaseOverSampler, self).fit(X, y)

-    def sample(self, X, y):
-        """Resample the dataset.
+    # def sample(self, X, y):
+    #     """Resample the dataset.

-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Matrix containing the data which have to be sampled.
+    #     Parameters
+    #     ----------
+    #     X : array-like, shape (n_samples, n_features)
+    #         Matrix containing the data which have to be sampled.

-        y : array-like, shape (n_samples,)
-            Corresponding label for each sample in X.
+    #     y : array-like, shape (n_samples,)
+    #         Corresponding label for each sample in X.

-        Returns
-        -------
-        X_resampled : array-like, shape (n_samples_new, n_features)
-            The array containing the resampled data.
+    #     Returns
+    #     -------
+    #     X_resampled : array-like, shape (n_samples_new, n_features)
+    #         The array containing the resampled data.

-        y_resampled : array-like, shape (n_samples_new,)
-            The corresponding label of `X_resampled`
+    #     y_resampled : array-like, shape (n_samples_new,)
+    #         The corresponding label of `X_resampled`

-        Notes
-        -----
-        Over-samplers do not accept sparse matrices.
+    #     Notes
+    #     -----
+    #     Over-samplers do not accept sparse matrices.

-        """
+    #     """

-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
+    #     # Check the consistency of X and y
+    #     X, y = check_X_y(X, y)

-        return super(BaseOverSampler, self).sample(X, y)
+    #     return super(BaseOverSampler, self).sample(X, y)
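Commenting out these overrides removes the dense-only `check_X_y` guard, so over-samplers fall back on the shared validation in `BaseSampler`, which can be configured to accept sparse input. For reference, a hedged sketch of how scikit-learn's validation helper admits sparse formats (standard `check_X_y` API, not code from this PR):

```python
# Sketch: by default check_X_y rejects sparse input; passing
# accept_sparse lets the listed formats through (standard sklearn API).
from scipy import sparse
from sklearn.utils import check_X_y

X = sparse.csr_matrix([[0., 1.], [1., 0.], [0., 2.]])
y = [0, 1, 0]

# check_X_y(X, y) would raise TypeError here: dense data required.
X_checked, y_checked = check_X_y(X, y, accept_sparse=['csr', 'csc'])
```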
20 changes: 8 additions & 12 deletions imblearn/over_sampling/random_over_sampler.py
@@ -8,7 +8,7 @@
 from collections import Counter

 import numpy as np
-from sklearn.utils import check_random_state
+from sklearn.utils import check_random_state, safe_indexing

 from .base import BaseOverSampler

@@ -102,19 +102,15 @@ def _sample(self, X, y):
         random_state = check_random_state(self.random_state)
         target_stats = Counter(y)

-        X_resampled = X.copy()
-        y_resampled = y.copy()
+        sample_indices = range(X.shape[0])

         for class_sample, num_samples in self.ratio_.items():
-            index_samples = random_state.randint(
+            target_class_indices = np.flatnonzero(y == class_sample)
+            indices = random_state.randint(
                 low=0, high=target_stats[class_sample], size=num_samples)

-            X_resampled = np.concatenate((X_resampled,
-                                          X[y == class_sample][index_samples]),
-                                         axis=0)
+            sample_indices = np.append(sample_indices,
+                                       target_class_indices[indices])

-            y_resampled = np.concatenate((y_resampled,
-                                          y[y == class_sample][index_samples]),
-                                         axis=0)
-
-        return X_resampled, y_resampled
+        return (safe_indexing(X, sample_indices),
+                safe_indexing(y, sample_indices))
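The rewrite replaces explicit array copies and concatenations with a list of row indices gathered once at the end through `safe_indexing`, which dispatches on the container type; this is what lets `RandomOverSampler` work unchanged on dense arrays and sparse matrices. A small sketch of the idea (toy data; `safe_indexing` lived in `sklearn.utils` at the time of this PR and was deprecated in later scikit-learn releases):

```python
# Sketch: index-based resampling; only the final gather touches the
# container, so sparse input stays sparse (toy data for illustration).
import numpy as np
from scipy import sparse
from sklearn.utils import safe_indexing  # deprecated in newer sklearn

X = sparse.csr_matrix(np.arange(12.).reshape(4, 3))
y = np.array([0, 0, 1, 1])

# Keep every original row, then repeat row 2 twice to oversample.
sample_indices = np.array([0, 1, 2, 3, 2, 2])
X_res = safe_indexing(X, sample_indices)  # still a CSR matrix
y_res = safe_indexing(y, sample_indices)
```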
128 changes: 90 additions & 38 deletions imblearn/over_sampling/smote.py
@@ -8,8 +8,11 @@
 from __future__ import division

 import numpy as np
+
+from scipy import sparse
+
 from sklearn.svm import SVC
-from sklearn.utils import check_random_state
+from sklearn.utils import check_random_state, safe_indexing

 from .base import BaseOverSampler
 from ..exceptions import raise_isinstance_error

@@ -253,18 +256,34 @@ def _make_samples(self,

         """
         random_state = check_random_state(self.random_state)
-        X_new = np.zeros((n_samples, X.shape[1]))
-        samples = random_state.randint(
+        samples_indices = random_state.randint(
             low=0, high=len(nn_num.flatten()), size=n_samples)
         steps = step_size * random_state.uniform(size=n_samples)
-        rows = np.floor_divide(samples, nn_num.shape[1])
-        cols = np.mod(samples, nn_num.shape[1])
-        for i, (sample, row, col, step) in enumerate(zip(samples, rows,
-                                                         cols, steps)):
-            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])
-        y_new = np.array([y_type] * len(X_new))
+        rows = np.floor_divide(samples_indices, nn_num.shape[1])
+        cols = np.mod(samples_indices, nn_num.shape[1])
+
+        if sparse.issparse(X):
+            row_indices, col_indices, samples = [], [], []
+            for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+                if X[row].nnz:
+                    sample = X[row] - step * (X[row] -
+                                              nn_data[nn_num[row, col]])
+                    row_indices += [i] * len(sample.indices)
+                    col_indices += sample.indices.tolist()
+                    samples += sample.data.tolist()
+        else:
+            X_new = np.zeros((n_samples, X.shape[1]))
+            for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+                X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

-        return X_new, y_new
+        y_new = np.array([y_type] * len(samples_indices))
+
+        if sparse.issparse(X):
+            return (sparse.csr_matrix((samples, (row_indices, col_indices)),
+                                      [len(samples_indices), X.shape[1]]),
+                    y_new)
+        else:
+            return X_new, y_new
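Both branches implement the same interpolation, x_new = x_row - step * (x_row - x_neighbour) with step drawn uniformly from [0, step_size); the sparse branch merely accumulates the nonzeros of each interpolated row as (data, (row, col)) triplets and assembles a CSR matrix once at the end (rows whose source has no nonzeros, skipped by the `nnz` guard, simply stay implicit zeros). A sketch of that assembly pattern with made-up triplets:

```python
# Sketch: building a CSR matrix from accumulated (data, (row, col))
# triplets, the pattern used by the sparse branch above (toy values).
from scipy import sparse

row_indices = [0, 0, 1]    # synthetic-sample index of each nonzero
col_indices = [2, 5, 2]    # feature index of each nonzero
samples = [0.7, 1.2, 0.4]  # interpolated nonzero values

X_new = sparse.csr_matrix((samples, (row_indices, col_indices)),
                          shape=(2, 8))  # n_synthetic x n_features
```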

     def _validate_estimator(self):
         """Create the necessary objects for SMOTE."""

@@ -326,21 +345,26 @@ def _sample_regular(self, X, y):
             intelligence research, 321-357, 2002.

         """

         X_resampled = X.copy()
         y_resampled = y.copy()

         for class_sample, n_samples in self.ratio_.items():
             if n_samples == 0:
                 continue
-            X_class = X[y == class_sample]
+            target_class_indices = np.flatnonzero(y == class_sample)
+            X_class = safe_indexing(X, target_class_indices)

             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
             X_new, y_new = self._make_samples(X_class, class_sample, X_class,
                                               nns, n_samples, 1.0)

-            X_resampled = np.concatenate((X_resampled, X_new), axis=0)
-            y_resampled = np.concatenate((y_resampled, y_new), axis=0)
+            if sparse.issparse(X_new):
+                X_resampled = sparse.vstack([X_resampled, X_new])
+            else:
+                X_resampled = np.vstack((X_resampled, X_new))
+            y_resampled = np.hstack((y_resampled, y_new))

         return X_resampled, y_resampled
@@ -381,7 +405,8 @@ def _sample_borderline(self, X, y):
         for class_sample, n_samples in self.ratio_.items():
             if n_samples == 0:
                 continue
-            X_class = X[y == class_sample]
+            target_class_indices = np.flatnonzero(y == class_sample)
+            X_class = safe_indexing(X, target_class_indices)

             self.nn_m_.fit(X)
             danger_index = self._in_danger_noise(X_class, class_sample, y,
@@ -391,39 +416,48 @@ def _sample_borderline(self, X, y):

             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(
-                X_class[danger_index], return_distance=False)[:, 1:]
+                safe_indexing(X_class, danger_index),
+                return_distance=False)[:, 1:]

             # divergence between borderline-1 and borderline-2
             if self.kind == 'borderline1':
                 # Create synthetic samples for borderline points.
-                X_new, y_new = self._make_samples(X_class[danger_index],
+                X_new, y_new = self._make_samples(safe_indexing(X_class,
+                                                                danger_index),
                                                   class_sample, X_class,
                                                   nns, n_samples)
-                X_resampled = np.concatenate((X_resampled, X_new), axis=0)
-                y_resampled = np.concatenate((y_resampled, y_new), axis=0)
+                if sparse.issparse(X_new):
+                    X_resampled = sparse.vstack([X_resampled, X_new])
+                else:
+                    X_resampled = np.vstack((X_resampled, X_new))
+                y_resampled = np.hstack((y_resampled, y_new))

             else:
                 random_state = check_random_state(self.random_state)
                 fractions = random_state.beta(10, 10)

                 # only minority
                 X_new_1, y_new_1 = self._make_samples(
-                    X_class[danger_index], class_sample, X_class, nns,
+                    safe_indexing(X_class, danger_index), class_sample,
+                    X_class, nns,
                     int(fractions * (n_samples + 1)), step_size=1.)

                 # we use a one-vs-rest policy to handle the multiclass in which
                 # new samples will be created considering not only the majority
                 # class but all over classes.
                 X_new_2, y_new_2 = self._make_samples(
-                    X_class[danger_index], class_sample, X[y != class_sample],
+                    safe_indexing(X_class, danger_index), class_sample,
+                    safe_indexing(X, np.flatnonzero(y != class_sample)),
                     nns, int((1 - fractions) * n_samples), step_size=0.5)

                 # Concatenate the newly generated samples to the original
                 # data set
-                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
-                                             axis=0)
-                y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
-                                             axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled,
+                                                 X_new_1, X_new_2])
+                else:
+                    X_resampled = np.vstack((X_resampled,
+                                             X_new_1, X_new_2))
+                y_resampled = np.hstack((y_resampled,
+                                         y_new_1, y_new_2))

         return X_resampled, y_resampled
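As the comments note, the two variants diverge after the danger samples are found: borderline-1 interpolates only toward same-class neighbours with step_size=1.0, while borderline-2 also spends part of the budget (split by a Beta(10, 10) draw) interpolating toward samples of the other classes with step_size=0.5, keeping those synthetic points closer to the minority seed. A minimal sketch of selecting the variants (constructor arguments as of imblearn 0.3):

```python
# Sketch: choosing a borderline variant via the `kind` parameter.
from imblearn.over_sampling import SMOTE

smote_b1 = SMOTE(kind='borderline1', random_state=0)
smote_b2 = SMOTE(kind='borderline2', random_state=0)
```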

@@ -463,51 +497,69 @@ def _sample_svm(self, X, y):
         for class_sample, n_samples in self.ratio_.items():
             if n_samples == 0:
                 continue
-            X_class = X[y == class_sample]
+            target_class_indices = np.flatnonzero(y == class_sample)
+            X_class = safe_indexing(X, target_class_indices)

             self.svm_estimator_.fit(X, y)
             support_index = self.svm_estimator_.support_[
                 y[self.svm_estimator_.support_] == class_sample]
-            support_vector = X[support_index]
+            support_vector = safe_indexing(X, support_index)

             self.nn_m_.fit(X)
             noise_bool = self._in_danger_noise(support_vector, class_sample, y,
                                                kind='noise')
-            support_vector = support_vector[np.logical_not(noise_bool)]
+            support_vector = safe_indexing(
+                support_vector,
+                np.flatnonzero(np.logical_not(noise_bool)))
             danger_bool = self._in_danger_noise(support_vector, class_sample,
                                                 y, kind='danger')
             safety_bool = np.logical_not(danger_bool)

             self.nn_k_.fit(X_class)
             fractions = random_state.beta(10, 10)
             if np.count_nonzero(danger_bool) > 0:
-                nns = self.nn_k_.kneighbors(support_vector[danger_bool],
+                nns = self.nn_k_.kneighbors(safe_indexing(
+                    support_vector,
+                    np.flatnonzero(danger_bool)),
                                             return_distance=False)[:, 1:]

                 X_new_1, y_new_1 = self._make_samples(
-                    support_vector[danger_bool], class_sample, X_class,
+                    safe_indexing(support_vector, np.flatnonzero(danger_bool)),
+                    class_sample, X_class,
                     nns, int(fractions * (n_samples + 1)), step_size=1.)

             if np.count_nonzero(safety_bool) > 0:
-                nns = self.nn_k_.kneighbors(support_vector[safety_bool],
-                                            return_distance=False)[:, 1:]
+                nns = self.nn_k_.kneighbors(
+                    safe_indexing(support_vector, np.flatnonzero(safety_bool)),
+                    return_distance=False)[:, 1:]

                 X_new_2, y_new_2 = self._make_samples(
-                    support_vector[safety_bool], class_sample, X_class,
+                    safe_indexing(support_vector, np.flatnonzero(safety_bool)),
+                    class_sample, X_class,
                     nns, int((1 - fractions) * n_samples),
                     step_size=-self.out_step)

             if (np.count_nonzero(danger_bool) > 0 and
                     np.count_nonzero(safety_bool) > 0):
-                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
-                                             axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled,
+                                                 X_new_1, X_new_2])
+                else:
+                    X_resampled = np.vstack((X_resampled,
+                                             X_new_1, X_new_2))
                 y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
                                              axis=0)
             elif np.count_nonzero(danger_bool) == 0:
-                X_resampled = np.concatenate((X_resampled, X_new_2), axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled, X_new_2])
+                else:
+                    X_resampled = np.vstack((X_resampled, X_new_2))
                 y_resampled = np.concatenate((y_resampled, y_new_2), axis=0)
             elif np.count_nonzero(safety_bool) == 0:
-                X_resampled = np.concatenate((X_resampled, X_new_1), axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled, X_new_1])
+                else:
+                    X_resampled = np.vstack((X_resampled, X_new_1))
                 y_resampled = np.concatenate((y_resampled, y_new_1), axis=0)

         return X_resampled, y_resampled
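SVM-SMOTE seeds generation from the support vectors of an SVC fitted on the full dataset: support vectors in danger are interpolated inward (step_size=1.), safe ones are extrapolated outward past the class boundary via the negative step -self.out_step, and the same sparse-aware stacking closes the loop. A minimal usage sketch (toy data; `kind` and `out_step` are SMOTE constructor parameters as of imblearn 0.3):

```python
# Sketch: SVM-SMOTE on sparse input (toy data for illustration).
from scipy import sparse
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
smote_svm = SMOTE(kind='svm', out_step=0.5, random_state=0)
X_res, y_res = smote_svm.fit_sample(sparse.csr_matrix(X), y)
```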