From a68e8eb00e52a5698b2c61804fec015d143a3c54 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 12 Aug 2017 13:01:11 +0200
Subject: [PATCH] ENH POC sparse handling for RandomUnderSampler

---
 imblearn/base.py                              | 21 +++----
 imblearn/over_sampling/base.py                | 58 +++++++++++++++++++
 .../random_under_sampler.py                   | 25 +++-----
 3 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index af3d0536d..08b1b6adf 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -38,24 +38,25 @@ def sample(self, X, y):
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Matrix containing the data which have to be sampled.
 
-        y : ndarray, shape (n_samples, )
+        y : array-like, shape (n_samples,)
             Corresponding label for each sample in X.
 
         Returns
         -------
-        X_resampled : ndarray, shape (n_samples_new, n_features)
+        X_resampled : {array-like, sparse matrix}, shape \
+(n_samples_new, n_features)
             The array containing the resampled data.
 
-        y_resampled : ndarray, shape (n_samples_new)
+        y_resampled : array-like, shape (n_samples_new,)
             The corresponding label of `X_resampled`
 
         """
 
         # Check the consistency of X and y
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
 
         check_is_fitted(self, 'ratio_')
         self._check_X_y(X, y)
@@ -70,7 +71,7 @@ def fit_sample(self, X, y):
         X : ndarray, shape (n_samples, n_features)
             Matrix containing the data which have to be sampled.
 
-        y : ndarray, shape (n_samples, )
+        y : ndarray, shape (n_samples,)
             Corresponding label for each sample in X.
 
         Returns
@@ -78,7 +79,7 @@ def fit_sample(self, X, y):
         X_resampled : ndarray, shape (n_samples_new, n_features)
             The array containing the resampled data.
 
-        y_resampled : ndarray, shape (n_samples_new)
+        y_resampled : ndarray, shape (n_samples_new,)
             The corresponding label of `X_resampled`
 
         """
@@ -138,10 +139,10 @@ def fit(self, X, y):
 
         Parameters
         ----------
-        X : ndarray, shape (n_samples, n_features)
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Matrix containing the data which have to be sampled.
 
-        y : ndarray, shape (n_samples, )
+        y : array-like, shape (n_samples,)
             Corresponding label for each sample in X.
 
         Returns
@@ -150,7 +151,7 @@ def fit(self, X, y):
             Return self.
 
         """
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         y = check_target_type(y)
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)
         # self.sampling_type is already checked in check_ratio
diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py
index 9c1f6d51b..175ee4e47 100644
--- a/imblearn/over_sampling/base.py
+++ b/imblearn/over_sampling/base.py
@@ -5,6 +5,8 @@
 #          Christos Aridas
 # License: MIT
 
+from sklearn.utils import check_X_y
+
 from ..base import BaseSampler
 
 
@@ -16,3 +18,59 @@ class BaseOverSampler(BaseSampler):
     """
 
     _sampling_type = 'over-sampling'
+
+    def fit(self, X, y):
+        """Find the class statistics before performing sampling.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : array-like, shape (n_samples,)
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        self : object
+            Return self.
+
+        Notes
+        -----
+        Over-samplers do not accept sparse matrices.
+
+        """
+        # Over-sampling methods do not handle sparse matrices.
+        X, y = check_X_y(X, y)
+
+        return super(BaseOverSampler, self).fit(X, y)
+
+    def sample(self, X, y):
+        """Resample the dataset.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : array-like, shape (n_samples,)
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        X_resampled : array-like, shape (n_samples_new, n_features)
+            The array containing the resampled data.
+
+        y_resampled : array-like, shape (n_samples_new,)
+            The corresponding label of `X_resampled`
+
+        Notes
+        -----
+        Over-samplers do not accept sparse matrices.
+
+        """
+
+        # Check the consistency of X and y
+        X, y = check_X_y(X, y)
+
+        return super(BaseOverSampler, self).sample(X, y)
diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
index 9fa242363..5adfb8055 100644
--- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py
+++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
@@ -7,7 +7,7 @@
 from __future__ import division
 
 import numpy as np
-from sklearn.utils import check_random_state
+from sklearn.utils import check_random_state, safe_indexing
 
 from ..base import BaseUnderSampler
 
@@ -110,10 +110,7 @@ def _sample(self, X, y):
         """
         random_state = check_random_state(self.random_state)
 
-        X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype)
-        y_resampled = np.empty((0, ), dtype=y.dtype)
-        if self.return_indices:
-            idx_under = np.empty((0, ), dtype=int)
+        idx_under = np.empty((0, ), dtype=int)
 
         for target_class in np.unique(y):
             if target_class in self.ratio_.keys():
@@ -125,18 +122,12 @@ def _sample(self, X, y):
             else:
                 index_target_class = slice(None)
 
-            X_resampled = np.concatenate(
-                (X_resampled, X[y == target_class][index_target_class]),
-                axis=0)
-            y_resampled = np.concatenate(
-                (y_resampled, y[y == target_class][index_target_class]),
-                axis=0)
-            if self.return_indices:
-                idx_under = np.concatenate(
-                    (idx_under, np.flatnonzero(y == target_class)[
-                        index_target_class]), axis=0)
+            idx_under = np.concatenate(
+                (idx_under, np.flatnonzero(y == target_class)[
+                    index_target_class]), axis=0)
 
         if self.return_indices:
-            return X_resampled, y_resampled, idx_under
+            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
+                    idx_under)
         else:
-            return X_resampled, y_resampled
+            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)