MAINT: deprecate return_indices in favor of attribute sample_indices_ (scikit-learn-contrib#474)
hamedmx · Sep 13, 2018 · a19d398 · a19d398
1 parent 7f93dfc
commit a19d398
Show file tree

Hide file tree

Showing 30 changed files with 337 additions and 176 deletions.
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
@@ -147,3 +147,8 @@ Deprecation
 
 - Deprecate :class:`imblearn.ensemble.BalanceCascade`.
   :issue:`472` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Deprecate ``return_indices`` in all samplers. Instead, an attribute
+  ``sample_indices_`` is created whenever the sampler is selecting a subset of
+  the original samples.
+  :issue:`474` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py
@@ -121,14 +121,13 @@ def _fit_resample(self, X, y):
         for _ in range(self.n_subsets):
             rus = RandomUnderSampler(
                 sampling_strategy=self.sampling_strategy_,
-                return_indices=True,
                 random_state=random_state.randint(MAX_INT),
                 replacement=self.replacement)
-            sel_x, sel_y, sel_idx = rus.fit_resample(X, y)
+            sel_x, sel_y = rus.fit_resample(X, y)
             X_resampled.append(sel_x)
             y_resampled.append(sel_y)
             if self.return_indices:
-                idx_under.append(sel_idx)
+                idx_under.append(rus.sample_indices_)
 
         if self.return_indices:
             return (np.array(X_resampled), np.array(y_resampled),

diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
@@ -36,9 +36,9 @@ def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight,
                                 tree_idx, n_trees, verbose=0,
                                 class_weight=None):
     # resample before to fit the tree
-    X_resampled, y_resampled, selected_idx = sampler.fit_sample(X, y)
+    X_resampled, y_resampled = sampler.fit_sample(X, y)
     if sample_weight is not None:
-        sample_weight = safe_indexing(sample_weight, selected_idx)
+        sample_weight = safe_indexing(sample_weight, sampler.sample_indices_)
     tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled,
                                  sample_weight, tree_idx, n_trees,
                                  verbose=verbose, class_weight=class_weight)
@@ -306,8 +306,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
 
         self.base_sampler_ = RandomUnderSampler(
             sampling_strategy=self.sampling_strategy,
-            replacement=self.replacement,
-            return_indices=True)
+            replacement=self.replacement)
 
     def _make_sampler_estimator(self, random_state=None):
         """Make and configure a copy of the `base_estimator_` attribute.
@@ -450,9 +449,6 @@ def fit(self, X, y, sample_weight=None):
             # Create pipeline with the fitted samplers and trees
             self.pipelines_.extend([make_pipeline(deepcopy(s), deepcopy(t))
                                     for s, t in zip(samplers, trees)])
-            for idx in range(len(self.pipelines_)):
-                self.pipelines_[idx].named_steps[
-                    'randomundersampler'].set_params(return_indices=False)
 
         if self.oob_score:
             self._set_oob_score(X, y)

diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py
@@ -169,8 +169,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
 
         self.base_sampler_ = RandomUnderSampler(
             sampling_strategy=self.sampling_strategy,
-            replacement=self.replacement,
-            return_indices=True)
+            replacement=self.replacement)
 
     def _make_sampler_estimator(self, append=True, random_state=None):
         """Make and configure a copy of the `base_estimator_` attribute.
@@ -191,9 +190,6 @@ def _make_sampler_estimator(self, append=True, random_state=None):
             self.samplers_.append(sampler)
             self.pipelines_.append(make_pipeline(deepcopy(sampler),
                                                  deepcopy(estimator)))
-            # do not return the indices within a pipeline
-            self.pipelines_[-1].named_steps['randomundersampler'].set_params(
-                return_indices=False)
 
         return estimator, sampler
 
@@ -202,8 +198,9 @@ def _boost_real(self, iboost, X, y, sample_weight, random_state):
         estimator, sampler = self._make_sampler_estimator(
             random_state=random_state)
 
-        X_res, y_res, idx_res = sampler.fit_resample(X, y)
-        sample_weight_res = safe_indexing(sample_weight, idx_res)
+        X_res, y_res = sampler.fit_resample(X, y)
+        sample_weight_res = safe_indexing(sample_weight,
+                                          sampler.sample_indices_)
         estimator.fit(X_res, y_res, sample_weight=sample_weight_res)
 
         y_predict_proba = estimator.predict_proba(X)
@@ -263,8 +260,9 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state):
         estimator, sampler = self._make_sampler_estimator(
             random_state=random_state)
 
-        X_res, y_res, idx_res = sampler.fit_resample(X, y)
-        sample_weight_res = safe_indexing(sample_weight, idx_res)
+        X_res, y_res = sampler.fit_resample(X, y)
+        sample_weight_res = safe_indexing(sample_weight,
+                                          sampler.sample_indices_)
         estimator.fit(X_res, y_res, sample_weight=sample_weight_res)
 
         y_predict = estimator.predict(X)

diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py
@@ -67,7 +67,7 @@ def test_balanced_random_forest_attributes(imbalanced_dataset):
     brf.fit(X, y)
 
     for idx in range(n_estimators):
-        X_res, y_res, _ = brf.samplers_[idx].fit_resample(X, y)
+        X_res, y_res = brf.samplers_[idx].fit_resample(X, y)
         X_res_2, y_res_2 = brf.pipelines_[idx].named_steps[
             'randomundersampler'].fit_resample(X, y)
         assert_allclose(X_res, X_res_2)

diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
@@ -35,7 +35,7 @@ class BalancedBatchGenerator(ParentClass):
     Create a keras ``Sequence`` which is given to ``fit_generator``. The
     sampler defines the sampling strategy used to balance the dataset ahead of
     creating the batch. The sampler should have an attribute
-    ``return_indices``.
+    ``sample_indices_``.
 
     Parameters
     ----------
@@ -49,7 +49,7 @@ class BalancedBatchGenerator(ParentClass):
         Sample weight.
 
     sampler : object or None, optional (default=RandomUnderSampler)
-        A sampler instance which has an attribute ``return_indices``.
+        A sampler instance which has an attribute ``sample_indices_``.
         By default, the sampler used is a
         :class:`imblearn.under_sampling.RandomUnderSampler`.
 
@@ -118,20 +118,18 @@ def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32,
     def _sample(self):
         random_state = check_random_state(self.random_state)
         if self.sampler is None:
-            self.sampler_ = RandomUnderSampler(return_indices=True,
-                                               random_state=random_state)
+            self.sampler_ = RandomUnderSampler(random_state=random_state)
         else:
-            if not hasattr(self.sampler, 'return_indices'):
-                raise ValueError("'sampler' needs to return the indices of "
-                                 "the samples selected. Provide a sampler "
-                                 "which has an attribute 'return_indices'.")
             self.sampler_ = clone(self.sampler)
-            self.sampler_.set_params(return_indices=True)
             # FIXME: Remove in 0.6
             if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
                 set_random_state(self.sampler_, random_state)
 
-        _, _, self.indices_ = self.sampler_.fit_resample(self.X, self.y)
+        self.sampler_.fit_resample(self.X, self.y)
+        if not hasattr(self.sampler_, 'sample_indices_'):
+            raise ValueError("'sampler' needs to have an attribute "
+                             "'sample_indices_'.")
+        self.indices_ = self.sampler_.sample_indices_
         # shuffle the indices since the sampler are packing them by class
         random_state.shuffle(self.indices_)
 
@@ -168,7 +166,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
     Returns a generator --- as well as the number of step per epoch --- which
     is given to ``fit_generator``. The sampler defines the sampling strategy
     used to balance the dataset ahead of creating the batch. The sampler should
-    have an attribute ``return_indices``.
+    have an attribute ``sample_indices_``.
 
     Parameters
     ----------
@@ -182,7 +180,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
         Sample weight.
 
     sampler : object or None, optional (default=RandomUnderSampler)
-        A sampler instance which has an attribute ``return_indices``.
+        A sampler instance which has an attribute ``sample_indices_``.
         By default, the sampler used is a
         :class:`imblearn.under_sampling.RandomUnderSampler`.
 

diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py
@@ -36,7 +36,7 @@ def _build_keras_model(n_classes, n_features):
 
 
 def test_balanced_batch_generator_class_no_return_indices(data):
-    with pytest.raises(ValueError, match='needs to return the indices'):
+    with pytest.raises(ValueError, match='needs to have an attribute'):
         BalancedBatchGenerator(*data, sampler=ClusterCentroids(), batch_size=10)
 
 
@@ -75,7 +75,7 @@ def test_balanced_batch_generator_class_sparse(data, keep_sparse):
 
 
 def test_balanced_batch_generator_function_no_return_indices(data):
-    with pytest.raises(ValueError, match='needs to return the indices'):
+    with pytest.raises(ValueError, match='needs to have an attribute'):
         balanced_batch_generator(
             *data, sampler=ClusterCentroids(), batch_size=10, random_state=42)
 

diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py
@@ -13,6 +13,7 @@
 from .base import BaseOverSampler
 from ..utils import check_target_type
 from ..utils import Substitution
+from ..utils.deprecation import deprecate_parameter
 from ..utils._docstring import _random_state_docstring
 
 
@@ -37,11 +38,23 @@ class RandomOverSampler(BaseOverSampler):
         Whether or not to return the indices of the samples randomly selected
         in the corresponding classes.
 
+        .. deprecated:: 0.4
+           ``return_indices`` is deprecated. Use the attribute
+           ``sample_indices_`` instead.
+
     ratio : str, dict, or callable
         .. deprecated:: 0.4
            Use the parameter ``sampling_strategy`` instead. It will be removed
            in 0.6.
 
+    Attributes
+    ----------
+    sample_indices_ : ndarray, shape (n_new_samples)
+        Indices of the samples selected.
+
+        .. versionadded:: 0.4
+           ``sample_indices_`` used instead of ``return_indices=True``.
+
     Notes
     -----
     Supports multi-class resampling by sampling each class independently.
@@ -83,6 +96,10 @@ def _check_X_y(X, y):
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
+        if self.return_indices:
+            deprecate_parameter(self, '0.4', 'return_indices',
+                                'sample_indices_')
+
         random_state = check_random_state(self.random_state)
         target_stats = Counter(y)
 
@@ -95,10 +112,10 @@ def _fit_resample(self, X, y):
 
             sample_indices = np.append(sample_indices,
                                        target_class_indices[indices])
+        self.sample_indices_ = np.array(sample_indices)
 
         if self.return_indices:
-            return (safe_indexing(X, sample_indices), safe_indexing(
-                    y, sample_indices), sample_indices)
-        else:
-            return (safe_indexing(X, sample_indices), safe_indexing(
-                    y, sample_indices))
+            return (safe_indexing(X, sample_indices),
+                    safe_indexing(y, sample_indices), sample_indices)
+        return (safe_indexing(X, sample_indices),
+                safe_indexing(y, sample_indices))
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -5,6 +5,7 @@
 
 from collections import Counter
 
+import pytest
 import numpy as np
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_equal
@@ -59,6 +60,7 @@ def test_ros_fit_resample_half():
     assert_array_equal(y_resampled, y_gt)
 
 
+@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4")
 def test_random_over_sampling_return_indices():
     ros = RandomOverSampler(return_indices=True, random_state=RND_SEED)
     X_resampled, y_resampled, sample_indices = ros.fit_resample(X, Y)

diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py
@@ -27,7 +27,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
     Returns a generator --- as well as the number of step per epoch --- which
     is given to ``fit_generator``. The sampler defines the sampling strategy
     used to balance the dataset ahead of creating the batch. The sampler should
-    have an attribute ``return_indices``.
+    have an attribute ``sample_indices_``.
 
     Parameters
     ----------
@@ -41,7 +41,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
         Sample weight.
 
     sampler : object or None, optional (default=RandomUnderSampler)
-        A sampler instance which has an attribute ``return_indices``.
+        A sampler instance which has an attribute ``sample_indices_``.
         By default, the sampler used is a
         :class:`imblearn.under_sampling.RandomUnderSampler`.
 
@@ -122,20 +122,17 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
 
     random_state = check_random_state(random_state)
     if sampler is None:
-        sampler_ = RandomUnderSampler(return_indices=True,
-                                      random_state=random_state)
+        sampler_ = RandomUnderSampler(random_state=random_state)
     else:
-        if not hasattr(sampler, 'return_indices'):
-            raise ValueError("'sampler' needs to return the indices of "
-                             "the samples selected. Provide a sampler "
-                             "which has an attribute 'return_indices'.")
         sampler_ = clone(sampler)
-        sampler_.set_params(return_indices=True)
         # FIXME: Remove in 0.6
         if sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
             set_random_state(sampler_, random_state)
-
-    _, _, indices = sampler_.fit_resample(X, y)
+    sampler_.fit_resample(X, y)
+    if not hasattr(sampler_, 'sample_indices_'):
+        raise ValueError("'sampler' needs to have an attribute "
+                         "'sample_indices_'.")
+    indices = sampler_.sample_indices_
     # shuffle the indices since the sampler are packing them by class
     random_state.shuffle(indices)
 

diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py
@@ -62,6 +62,7 @@ def _generate_checks_per_estimator(check_generator, estimators):
 @pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and')
 @pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and')
 @pytest.mark.filterwarnings("ignore:'y' should be of types")
+@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4")
 @pytest.mark.parametrize(
     'name, Estimator, check',
     _generate_checks_per_estimator(_yield_all_checks,

diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
@@ -19,6 +19,7 @@
 
 from ..base import BaseCleaningSampler
 from ...utils import Substitution
+from ...utils.deprecation import deprecate_parameter
 from ...utils._docstring import _random_state_docstring
 
 
@@ -37,7 +38,11 @@ class CondensedNearestNeighbour(BaseCleaningSampler):
 
     return_indices : bool, optional (default=False)
         Whether or not to return the indices of the samples randomly
-        selected from the majority class.
+        selected.
+
+        .. deprecated:: 0.4
+           ``return_indices`` is deprecated. Use the attribute
+           ``sample_indices_`` instead.
 
     {random_state}
 
@@ -59,6 +64,14 @@ class CondensedNearestNeighbour(BaseCleaningSampler):
            Use the parameter ``sampling_strategy`` instead. It will be removed
            in 0.6.
 
+    Attributes
+    ----------
+    sample_indices_ : ndarray, shape (n_new_samples)
+        Indices of the samples selected.
+
+        .. versionadded:: 0.4
+           ``sample_indices_`` used instead of ``return_indices=True``.
+
     Notes
     -----
     The method is based on [1]_.
@@ -126,6 +139,9 @@ def _validate_estimator(self):
                              ' Got {} instead.'.format(type(self.n_neighbors)))
 
     def _fit_resample(self, X, y):
+        if self.return_indices:
+            deprecate_parameter(self, '0.4', 'return_indices',
+                                'sample_indices_')
         self._validate_estimator()
 
         random_state = check_random_state(self.random_state)
@@ -198,8 +214,9 @@ def _fit_resample(self, X, y):
                 idx_under = np.concatenate(
                     (idx_under, np.flatnonzero(y == target_class)), axis=0)
 
+        self.sample_indices_ = idx_under
+
         if self.return_indices:
             return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                     idx_under)
-        else:
-            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
+        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)