[MRG] EHN add voting parameter for ClusterCentroids #318

Merged (30 commits) on Aug 24, 2017

Commits (the diff below shows changes from 1 commit):
a68e8eb
EHN POC sparse handling for RandomUnderSampler
glemaitre Aug 12, 2017
0062d6d
EHN support sparse ENN
glemaitre Aug 12, 2017
6197d80
iter
glemaitre Aug 12, 2017
f669843
EHN sparse indexing IHT
glemaitre Aug 12, 2017
4adc6db
EHN sparse support nearmiss
glemaitre Aug 12, 2017
9c93dab
Merge branch 'master' into is/158
glemaitre Aug 13, 2017
bba7835
EHN support sparse matrices for NCR
glemaitre Aug 13, 2017
9cd917b
EHN support sparse Tomek and OSS
glemaitre Aug 13, 2017
c3ba307
EHN support sparsity for CNN
glemaitre Aug 13, 2017
d195868
EHN support sparse for SMOTE
glemaitre Aug 13, 2017
bcf44ab
EHN support sparse adasyn
glemaitre Aug 13, 2017
c405aa9
EHN support sparsity for sombine methods
glemaitre Aug 13, 2017
79637d7
EHN support sparsity BC
glemaitre Aug 13, 2017
c199af9
DOC update docstring
glemaitre Aug 14, 2017
425928f
DOC fix example topic classification
glemaitre Aug 14, 2017
4ba8c4e
FIX fix test and class clustercentroids
glemaitre Aug 14, 2017
8298fdc
TST add common test
glemaitre Aug 14, 2017
e4c6ebb
TST add ensemble
glemaitre Aug 14, 2017
1226a91
TST use allclose
glemaitre Aug 14, 2017
68b16b5
TST install conda with ubuntu container
glemaitre Aug 14, 2017
35c638b
TST increase tolerance
glemaitre Aug 14, 2017
004f920
TST increase tolerance
glemaitre Aug 14, 2017
d3ceb5a
TST test all versions NearMiss and SMOTE
glemaitre Aug 14, 2017
d9c4e55
TST set the algorithm of KMeans
glemaitre Aug 14, 2017
b469747
DOC add entry in user guide
glemaitre Aug 14, 2017
c05d0ba
DOC add entry sparse for CC
glemaitre Aug 14, 2017
1625879
DOC whatsnew entry
glemaitre Aug 14, 2017
72a605d
EHN add voting paramter for ClusterCentroids
glemaitre Aug 14, 2017
e1ffb13
TST fix common test fixing voting
glemaitre Aug 14, 2017
6c34e56
Merge remote-tracking branch 'origin/master' into is/317
glemaitre Aug 24, 2017
Changes from commit d195868e3139b89a0327318eb44403cb03bd1ea9 (EHN support sparse for SMOTE)
glemaitre committed Aug 13, 2017
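The net effect of this commit is that the SMOTE variants accept and return scipy.sparse matrices instead of requiring dense arrays. A minimal sketch of the intended behaviour (hypothetical data; assumes the imblearn 0.3-era `fit_sample` API):

```python
# Sketch: SMOTE on a sparse, imbalanced dataset (hypothetical data;
# fit_sample is the resampling entry point in this era of imblearn).
import numpy as np
from scipy import sparse
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_samples=200, n_features=20,
                           weights=[0.9, 0.1], random_state=42)
X_sparse = sparse.csr_matrix(X)  # CSR input instead of a dense array

X_res, y_res = SMOTE(random_state=42).fit_sample(X_sparse, y)
print(type(X_res), np.bincount(y_res))  # sparse output, balanced classes
```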
80 changes: 40 additions & 40 deletions imblearn/over_sampling/base.py
@@ -19,58 +19,58 @@ class BaseOverSampler(BaseSampler):

     _sampling_type = 'over-sampling'

-    def fit(self, X, y):
-        """Find the classes statistics before to perform sampling.
+    # def fit(self, X, y):
+    #     """Find the classes statistics before to perform sampling.

-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Matrix containing the data which have to be sampled.
+    #     Parameters
+    #     ----------
+    #     X : array-like, shape (n_samples, n_features)
+    #         Matrix containing the data which have to be sampled.

-        y : array-like, shape (n_samples,)
-            Corresponding label for each sample in X.
+    #     y : array-like, shape (n_samples,)
+    #         Corresponding label for each sample in X.

-        Returns
-        -------
-        self : object,
-            Return self.
+    #     Returns
+    #     -------
+    #     self : object,
+    #         Return self.

-        Notes
-        -----
-        Over-samplers do not accept sparse matrices.
+    #     Notes
+    #     -----
+    #     Over-samplers do not accept sparse matrices.

-        """
-        # over-sampling method does not handle sparse matrix
-        X, y = check_X_y(X, y)
+    #     """
+    #     # over-sampling method does not handle sparse matrix
+    #     X, y = check_X_y(X, y)

-        return super(BaseOverSampler, self).fit(X, y)
+    #     return super(BaseOverSampler, self).fit(X, y)

-    def sample(self, X, y):
-        """Resample the dataset.
+    # def sample(self, X, y):
+    #     """Resample the dataset.

-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Matrix containing the data which have to be sampled.
+    #     Parameters
+    #     ----------
+    #     X : array-like, shape (n_samples, n_features)
+    #         Matrix containing the data which have to be sampled.

-        y : array-like, shape (n_samples,)
-            Corresponding label for each sample in X.
+    #     y : array-like, shape (n_samples,)
+    #         Corresponding label for each sample in X.

-        Returns
-        -------
-        X_resampled : array-like, shape (n_samples_new, n_features)
-            The array containing the resampled data.
+    #     Returns
+    #     -------
+    #     X_resampled : array-like, shape (n_samples_new, n_features)
+    #         The array containing the resampled data.

-        y_resampled : array-like, shape (n_samples_new,)
-            The corresponding label of `X_resampled`
+    #     y_resampled : array-like, shape (n_samples_new,)
+    #         The corresponding label of `X_resampled`

-        Notes
-        -----
-        Over-samplers do not accept sparse matrices.
+    #     Notes
+    #     -----
+    #     Over-samplers do not accept sparse matrices.

-        """
+    #     """

-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
+    #     # Check the consistency of X and y
+    #     X, y = check_X_y(X, y)

-        return super(BaseOverSampler, self).sample(X, y)
+    #     return super(BaseOverSampler, self).sample(X, y)
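Commenting out these overrides removes the dense-only `check_X_y` guard, so over-samplers fall back on the shared validation in `BaseSampler`, which can be configured to accept sparse input. For reference, a hedged sketch of how scikit-learn's validation helper admits sparse formats (standard `check_X_y` API, not code from this PR):

```python
# Sketch: by default check_X_y rejects sparse input; passing
# accept_sparse lets the listed formats through (standard sklearn API).
from scipy import sparse
from sklearn.utils import check_X_y

X = sparse.csr_matrix([[0., 1.], [1., 0.], [0., 2.]])
y = [0, 1, 0]

# check_X_y(X, y) would raise TypeError here: dense data required.
X_checked, y_checked = check_X_y(X, y, accept_sparse=['csr', 'csc'])
```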
20 changes: 8 additions & 12 deletions imblearn/over_sampling/random_over_sampler.py
@@ -8,7 +8,7 @@
 from collections import Counter

 import numpy as np
-from sklearn.utils import check_random_state
+from sklearn.utils import check_random_state, safe_indexing

 from .base import BaseOverSampler

@@ -102,19 +102,15 @@ def _sample(self, X, y):
         random_state = check_random_state(self.random_state)
         target_stats = Counter(y)

-        X_resampled = X.copy()
-        y_resampled = y.copy()
+        sample_indices = range(X.shape[0])

         for class_sample, num_samples in self.ratio_.items():
-            index_samples = random_state.randint(
+            target_class_indices = np.flatnonzero(y == class_sample)
+            indices = random_state.randint(
                 low=0, high=target_stats[class_sample], size=num_samples)

-            X_resampled = np.concatenate((X_resampled,
-                                          X[y == class_sample][index_samples]),
-                                         axis=0)
+            sample_indices = np.append(sample_indices,
+                                       target_class_indices[indices])

-            y_resampled = np.concatenate((y_resampled,
-                                          y[y == class_sample][index_samples]),
-                                         axis=0)
-
-        return X_resampled, y_resampled
+        return (safe_indexing(X, sample_indices),
+                safe_indexing(y, sample_indices))
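The rewrite replaces explicit array copies and concatenations with a list of row indices gathered once at the end through `safe_indexing`, which dispatches on the container type; this is what lets `RandomOverSampler` work unchanged on dense arrays and sparse matrices. A small sketch of the idea (toy data; `safe_indexing` lived in `sklearn.utils` at the time of this PR and was deprecated in later scikit-learn releases):

```python
# Sketch: index-based resampling; only the final gather touches the
# container, so sparse input stays sparse (toy data for illustration).
import numpy as np
from scipy import sparse
from sklearn.utils import safe_indexing  # deprecated in newer sklearn

X = sparse.csr_matrix(np.arange(12.).reshape(4, 3))
y = np.array([0, 0, 1, 1])

# Keep every original row, then repeat row 2 twice to oversample.
sample_indices = np.array([0, 1, 2, 3, 2, 2])
X_res = safe_indexing(X, sample_indices)  # still a CSR matrix
y_res = safe_indexing(y, sample_indices)
```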
128 changes: 90 additions & 38 deletions imblearn/over_sampling/smote.py
@@ -8,8 +8,11 @@
 from __future__ import division

 import numpy as np
+
+from scipy import sparse
+
 from sklearn.svm import SVC
-from sklearn.utils import check_random_state
+from sklearn.utils import check_random_state, safe_indexing

 from .base import BaseOverSampler
 from ..exceptions import raise_isinstance_error

@@ -253,18 +256,34 @@ def _make_samples(self,

         """
         random_state = check_random_state(self.random_state)
-        X_new = np.zeros((n_samples, X.shape[1]))
-        samples = random_state.randint(
+        samples_indices = random_state.randint(
             low=0, high=len(nn_num.flatten()), size=n_samples)
         steps = step_size * random_state.uniform(size=n_samples)
-        rows = np.floor_divide(samples, nn_num.shape[1])
-        cols = np.mod(samples, nn_num.shape[1])
-        for i, (sample, row, col, step) in enumerate(zip(samples, rows,
-                                                         cols, steps)):
-            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])
-        y_new = np.array([y_type] * len(X_new))
+        rows = np.floor_divide(samples_indices, nn_num.shape[1])
+        cols = np.mod(samples_indices, nn_num.shape[1])
+
+        if sparse.issparse(X):
+            row_indices, col_indices, samples = [], [], []
+            for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+                if X[row].nnz:
+                    sample = X[row] - step * (X[row] -
+                                              nn_data[nn_num[row, col]])
+                    row_indices += [i] * len(sample.indices)
+                    col_indices += sample.indices.tolist()
+                    samples += sample.data.tolist()
+        else:
+            X_new = np.zeros((n_samples, X.shape[1]))
+            for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+                X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

-        return X_new, y_new
+        y_new = np.array([y_type] * len(samples_indices))
+
+        if sparse.issparse(X):
+            return (sparse.csr_matrix((samples, (row_indices, col_indices)),
+                                      [len(samples_indices), X.shape[1]]),
+                    y_new)
+        else:
+            return X_new, y_new
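Both branches implement the same interpolation, x_new = x_row - step * (x_row - x_neighbour) with step drawn uniformly from [0, step_size); the sparse branch merely accumulates the nonzeros of each interpolated row as (data, (row, col)) triplets and assembles a CSR matrix once at the end (rows whose source has no nonzeros, skipped by the `nnz` guard, simply stay implicit zeros). A sketch of that assembly pattern with made-up triplets:

```python
# Sketch: building a CSR matrix from accumulated (data, (row, col))
# triplets, the pattern used by the sparse branch above (toy values).
from scipy import sparse

row_indices = [0, 0, 1]    # synthetic-sample index of each nonzero
col_indices = [2, 5, 2]    # feature index of each nonzero
samples = [0.7, 1.2, 0.4]  # interpolated nonzero values

X_new = sparse.csr_matrix((samples, (row_indices, col_indices)),
                          shape=(2, 8))  # n_synthetic x n_features
```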

     def _validate_estimator(self):
         """Create the necessary objects for SMOTE."""

@@ -326,21 +345,26 @@ def _sample_regular(self, X, y):
             intelligence research, 321-357, 2002.

         """

         X_resampled = X.copy()
         y_resampled = y.copy()

         for class_sample, n_samples in self.ratio_.items():
             if n_samples == 0:
                 continue
-            X_class = X[y == class_sample]
+            target_class_indices = np.flatnonzero(y == class_sample)
+            X_class = safe_indexing(X, target_class_indices)

             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
             X_new, y_new = self._make_samples(X_class, class_sample, X_class,
                                               nns, n_samples, 1.0)

-            X_resampled = np.concatenate((X_resampled, X_new), axis=0)
-            y_resampled = np.concatenate((y_resampled, y_new), axis=0)
+            if sparse.issparse(X_new):
+                X_resampled = sparse.vstack([X_resampled, X_new])
+            else:
+                X_resampled = np.vstack((X_resampled, X_new))
+            y_resampled = np.hstack((y_resampled, y_new))

         return X_resampled, y_resampled
@@ -381,7 +405,8 @@ def _sample_borderline(self, X, y):
         for class_sample, n_samples in self.ratio_.items():
             if n_samples == 0:
                 continue
-            X_class = X[y == class_sample]
+            target_class_indices = np.flatnonzero(y == class_sample)
+            X_class = safe_indexing(X, target_class_indices)

             self.nn_m_.fit(X)
             danger_index = self._in_danger_noise(X_class, class_sample, y,
@@ -391,39 +416,48 @@ def _sample_borderline(self, X, y):

             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(
-                X_class[danger_index], return_distance=False)[:, 1:]
+                safe_indexing(X_class, danger_index),
+                return_distance=False)[:, 1:]

             # divergence between borderline-1 and borderline-2
             if self.kind == 'borderline1':
                 # Create synthetic samples for borderline points.
-                X_new, y_new = self._make_samples(X_class[danger_index],
+                X_new, y_new = self._make_samples(safe_indexing(X_class,
+                                                                danger_index),
                                                   class_sample, X_class,
                                                   nns, n_samples)
-                X_resampled = np.concatenate((X_resampled, X_new), axis=0)
-                y_resampled = np.concatenate((y_resampled, y_new), axis=0)
+                if sparse.issparse(X_new):
+                    X_resampled = sparse.vstack([X_resampled, X_new])
+                else:
+                    X_resampled = np.vstack((X_resampled, X_new))
+                y_resampled = np.hstack((y_resampled, y_new))

             else:
                 random_state = check_random_state(self.random_state)
                 fractions = random_state.beta(10, 10)

                 # only minority
                 X_new_1, y_new_1 = self._make_samples(
-                    X_class[danger_index], class_sample, X_class, nns,
+                    safe_indexing(X_class, danger_index), class_sample,
+                    X_class, nns,
                     int(fractions * (n_samples + 1)), step_size=1.)

                 # we use a one-vs-rest policy to handle the multiclass in which
                 # new samples will be created considering not only the majority
                 # class but all over classes.
                 X_new_2, y_new_2 = self._make_samples(
-                    X_class[danger_index], class_sample, X[y != class_sample],
+                    safe_indexing(X_class, danger_index), class_sample,
+                    safe_indexing(X, np.flatnonzero(y != class_sample)),
                     nns, int((1 - fractions) * n_samples), step_size=0.5)

                 # Concatenate the newly generated samples to the original
                 # data set
-                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
-                                             axis=0)
-                y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
-                                             axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled,
+                                                 X_new_1, X_new_2])
+                else:
+                    X_resampled = np.vstack((X_resampled,
+                                             X_new_1, X_new_2))
+                y_resampled = np.hstack((y_resampled,
+                                         y_new_1, y_new_2))

         return X_resampled, y_resampled
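As the comments note, the two variants diverge after the danger samples are found: borderline-1 interpolates only toward same-class neighbours with step_size=1.0, while borderline-2 also spends part of the budget (split by a Beta(10, 10) draw) interpolating toward samples of the other classes with step_size=0.5, keeping those synthetic points closer to the minority seed. A minimal sketch of selecting the variants (constructor arguments as of imblearn 0.3):

```python
# Sketch: choosing a borderline variant via the `kind` parameter.
from imblearn.over_sampling import SMOTE

smote_b1 = SMOTE(kind='borderline1', random_state=0)
smote_b2 = SMOTE(kind='borderline2', random_state=0)
```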

@@ -463,51 +497,69 @@ def _sample_svm(self, X, y):
         for class_sample, n_samples in self.ratio_.items():
             if n_samples == 0:
                 continue
-            X_class = X[y == class_sample]
+            target_class_indices = np.flatnonzero(y == class_sample)
+            X_class = safe_indexing(X, target_class_indices)

             self.svm_estimator_.fit(X, y)
             support_index = self.svm_estimator_.support_[
                 y[self.svm_estimator_.support_] == class_sample]
-            support_vector = X[support_index]
+            support_vector = safe_indexing(X, support_index)

             self.nn_m_.fit(X)
             noise_bool = self._in_danger_noise(support_vector, class_sample, y,
                                                kind='noise')
-            support_vector = support_vector[np.logical_not(noise_bool)]
+            support_vector = safe_indexing(
+                support_vector,
+                np.flatnonzero(np.logical_not(noise_bool)))
             danger_bool = self._in_danger_noise(support_vector, class_sample,
                                                 y, kind='danger')
             safety_bool = np.logical_not(danger_bool)

             self.nn_k_.fit(X_class)
             fractions = random_state.beta(10, 10)
             if np.count_nonzero(danger_bool) > 0:
-                nns = self.nn_k_.kneighbors(support_vector[danger_bool],
+                nns = self.nn_k_.kneighbors(safe_indexing(
+                    support_vector,
+                    np.flatnonzero(danger_bool)),
                                             return_distance=False)[:, 1:]

                 X_new_1, y_new_1 = self._make_samples(
-                    support_vector[danger_bool], class_sample, X_class,
+                    safe_indexing(support_vector, np.flatnonzero(danger_bool)),
+                    class_sample, X_class,
                     nns, int(fractions * (n_samples + 1)), step_size=1.)

             if np.count_nonzero(safety_bool) > 0:
-                nns = self.nn_k_.kneighbors(support_vector[safety_bool],
-                                            return_distance=False)[:, 1:]
+                nns = self.nn_k_.kneighbors(
+                    safe_indexing(support_vector, np.flatnonzero(safety_bool)),
+                    return_distance=False)[:, 1:]

                 X_new_2, y_new_2 = self._make_samples(
-                    support_vector[safety_bool], class_sample, X_class,
+                    safe_indexing(support_vector, np.flatnonzero(safety_bool)),
+                    class_sample, X_class,
                     nns, int((1 - fractions) * n_samples),
                     step_size=-self.out_step)

             if (np.count_nonzero(danger_bool) > 0 and
                     np.count_nonzero(safety_bool) > 0):
-                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
-                                             axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled,
+                                                 X_new_1, X_new_2])
+                else:
+                    X_resampled = np.vstack((X_resampled,
+                                             X_new_1, X_new_2))
                 y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
                                              axis=0)
             elif np.count_nonzero(danger_bool) == 0:
-                X_resampled = np.concatenate((X_resampled, X_new_2), axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled, X_new_2])
+                else:
+                    X_resampled = np.vstack((X_resampled, X_new_2))
                 y_resampled = np.concatenate((y_resampled, y_new_2), axis=0)
             elif np.count_nonzero(safety_bool) == 0:
-                X_resampled = np.concatenate((X_resampled, X_new_1), axis=0)
+                if sparse.issparse(X_resampled):
+                    X_resampled = sparse.vstack([X_resampled, X_new_1])
+                else:
+                    X_resampled = np.vstack((X_resampled, X_new_1))
                 y_resampled = np.concatenate((y_resampled, y_new_1), axis=0)

         return X_resampled, y_resampled
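SVM-SMOTE seeds generation from the support vectors of an SVC fitted on the full dataset: support vectors in danger are interpolated inward (step_size=1.), safe ones are extrapolated outward past the class boundary via the negative step -self.out_step, and the same sparse-aware stacking closes the loop. A minimal usage sketch (toy data; `kind` and `out_step` are SMOTE constructor parameters as of imblearn 0.3):

```python
# Sketch: SVM-SMOTE on sparse input (toy data for illustration).
from scipy import sparse
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
smote_svm = SMOTE(kind='svm', out_step=0.5, random_state=0)
X_res, y_res = smote_svm.fit_sample(sparse.csr_matrix(X), y)
```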