From a68e8eb00e52a5698b2c61804fec015d143a3c54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 13:01:11 +0200 Subject: [PATCH] EHN POC sparse handling for RandomUnderSampler --- imblearn/base.py | 21 +++---- imblearn/over_sampling/base.py | 58 +++++++++++++++++++ .../random_under_sampler.py | 25 +++----- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index af3d0536d..08b1b6adf 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -38,24 +38,25 @@ def sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {array-like, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : array-like, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'ratio_') self._check_X_y(X, y) @@ -70,7 +71,7 @@ def fit_sample(self, X, y): X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : ndarray, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -78,7 +79,7 @@ def fit_sample(self, X, y): X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ @@ -138,10 +139,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -150,7 +151,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.X_hash_, self.y_hash_ = hash_X_y(X, y) # self.sampling_type is already checked in check_ratio diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 9c1f6d51b..175ee4e47 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -5,6 +5,8 @@ # Christos Aridas # License: MIT +from sklearn.utils import check_X_y + from ..base import BaseSampler @@ -16,3 +18,59 @@ class BaseOverSampler(BaseSampler): """ _sampling_type = 'over-sampling' + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + Notes + ----- + Over-samplers do not accept sparse matrices. + + """ + # over-sampling method does not handle sparse matrix + X, y = check_X_y(X, y) + + return super(BaseOverSampler, self).fit(X, y) + + def sample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : array-like, shape (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : array-like, shape (n_samples_new,) + The corresponding label of `X_resampled` + + Notes + ----- + Over-samplers do not accept sparse matrices. + + """ + + # Check the consistency of X and y + X, y = check_X_y(X, y) + + return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 9fa242363..5adfb8055 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -7,7 +7,7 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseUnderSampler @@ -110,10 +110,7 @@ def _sample(self, X, y): """ random_state = check_random_state(self.random_state) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -125,18 +122,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under)