Skip to content

Commit

Permalink
MAINT: deprecate return_indices in favor of attribute sample_indices_ (
Browse files Browse the repository at this point in the history
  • Loading branch information
glemaitre authored Sep 13, 2018
1 parent 7f93dfc commit a19d398
Show file tree
Hide file tree
Showing 30 changed files with 337 additions and 176 deletions.
5 changes: 5 additions & 0 deletions doc/whats_new/v0.0.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,8 @@ Deprecation

- Deprecate :class:`imblearn.ensemble.BalanceCascade`.
:issue:`472` by :user:`Guillaume Lemaitre <glemaitre>`.

- Deprecate ``return_indices`` in all samplers. Instead, an attribute
``sample_indices_`` is created whenever the sampler is selecting a subset of
the original samples.
:issue:`474` by :user:`Guillaume Lemaitre <glemaitre>`.
5 changes: 2 additions & 3 deletions imblearn/ensemble/_easy_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,14 +121,13 @@ def _fit_resample(self, X, y):
for _ in range(self.n_subsets):
rus = RandomUnderSampler(
sampling_strategy=self.sampling_strategy_,
return_indices=True,
random_state=random_state.randint(MAX_INT),
replacement=self.replacement)
sel_x, sel_y, sel_idx = rus.fit_resample(X, y)
sel_x, sel_y = rus.fit_resample(X, y)
X_resampled.append(sel_x)
y_resampled.append(sel_y)
if self.return_indices:
idx_under.append(sel_idx)
idx_under.append(rus.sample_indices_)

if self.return_indices:
return (np.array(X_resampled), np.array(y_resampled),
Expand Down
10 changes: 3 additions & 7 deletions imblearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight,
tree_idx, n_trees, verbose=0,
class_weight=None):
# resample before fitting the tree
X_resampled, y_resampled, selected_idx = sampler.fit_sample(X, y)
X_resampled, y_resampled = sampler.fit_sample(X, y)
if sample_weight is not None:
sample_weight = safe_indexing(sample_weight, selected_idx)
sample_weight = safe_indexing(sample_weight, sampler.sample_indices_)
tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled,
sample_weight, tree_idx, n_trees,
verbose=verbose, class_weight=class_weight)
Expand Down Expand Up @@ -306,8 +306,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):

self.base_sampler_ = RandomUnderSampler(
sampling_strategy=self.sampling_strategy,
replacement=self.replacement,
return_indices=True)
replacement=self.replacement)

def _make_sampler_estimator(self, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
Expand Down Expand Up @@ -450,9 +449,6 @@ def fit(self, X, y, sample_weight=None):
# Create pipeline with the fitted samplers and trees
self.pipelines_.extend([make_pipeline(deepcopy(s), deepcopy(t))
for s, t in zip(samplers, trees)])
for idx in range(len(self.pipelines_)):
self.pipelines_[idx].named_steps[
'randomundersampler'].set_params(return_indices=False)

if self.oob_score:
self._set_oob_score(X, y)
Expand Down
16 changes: 7 additions & 9 deletions imblearn/ensemble/_weight_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):

self.base_sampler_ = RandomUnderSampler(
sampling_strategy=self.sampling_strategy,
replacement=self.replacement,
return_indices=True)
replacement=self.replacement)

def _make_sampler_estimator(self, append=True, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
Expand All @@ -191,9 +190,6 @@ def _make_sampler_estimator(self, append=True, random_state=None):
self.samplers_.append(sampler)
self.pipelines_.append(make_pipeline(deepcopy(sampler),
deepcopy(estimator)))
# do not return the indices within a pipeline
self.pipelines_[-1].named_steps['randomundersampler'].set_params(
return_indices=False)

return estimator, sampler

Expand All @@ -202,8 +198,9 @@ def _boost_real(self, iboost, X, y, sample_weight, random_state):
estimator, sampler = self._make_sampler_estimator(
random_state=random_state)

X_res, y_res, idx_res = sampler.fit_resample(X, y)
sample_weight_res = safe_indexing(sample_weight, idx_res)
X_res, y_res = sampler.fit_resample(X, y)
sample_weight_res = safe_indexing(sample_weight,
sampler.sample_indices_)
estimator.fit(X_res, y_res, sample_weight=sample_weight_res)

y_predict_proba = estimator.predict_proba(X)
Expand Down Expand Up @@ -263,8 +260,9 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state):
estimator, sampler = self._make_sampler_estimator(
random_state=random_state)

X_res, y_res, idx_res = sampler.fit_resample(X, y)
sample_weight_res = safe_indexing(sample_weight, idx_res)
X_res, y_res = sampler.fit_resample(X, y)
sample_weight_res = safe_indexing(sample_weight,
sampler.sample_indices_)
estimator.fit(X_res, y_res, sample_weight=sample_weight_res)

y_predict = estimator.predict(X)
Expand Down
2 changes: 1 addition & 1 deletion imblearn/ensemble/tests/test_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_balanced_random_forest_attributes(imbalanced_dataset):
brf.fit(X, y)

for idx in range(n_estimators):
X_res, y_res, _ = brf.samplers_[idx].fit_resample(X, y)
X_res, y_res = brf.samplers_[idx].fit_resample(X, y)
X_res_2, y_res_2 = brf.pipelines_[idx].named_steps[
'randomundersampler'].fit_resample(X, y)
assert_allclose(X_res, X_res_2)
Expand Down
22 changes: 10 additions & 12 deletions imblearn/keras/_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class BalancedBatchGenerator(ParentClass):
Create a keras ``Sequence`` which is given to ``fit_generator``. The
sampler defines the sampling strategy used to balance the dataset ahead of
creating the batch. The sampler should have an attribute
``return_indices``.
``sample_indices_``.
Parameters
----------
Expand All @@ -49,7 +49,7 @@ class BalancedBatchGenerator(ParentClass):
Sample weight.
sampler : object or None, optional (default=RandomUnderSampler)
A sampler instance which has an attribute ``return_indices``.
A sampler instance which has an attribute ``sample_indices_``.
By default, the sampler used is a
:class:`imblearn.under_sampling.RandomUnderSampler`.
Expand Down Expand Up @@ -118,20 +118,18 @@ def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32,
def _sample(self):
random_state = check_random_state(self.random_state)
if self.sampler is None:
self.sampler_ = RandomUnderSampler(return_indices=True,
random_state=random_state)
self.sampler_ = RandomUnderSampler(random_state=random_state)
else:
if not hasattr(self.sampler, 'return_indices'):
raise ValueError("'sampler' needs to return the indices of "
"the samples selected. Provide a sampler "
"which has an attribute 'return_indices'.")
self.sampler_ = clone(self.sampler)
self.sampler_.set_params(return_indices=True)
# FIXME: Remove in 0.6
if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
set_random_state(self.sampler_, random_state)

_, _, self.indices_ = self.sampler_.fit_resample(self.X, self.y)
self.sampler_.fit_resample(self.X, self.y)
if not hasattr(self.sampler_, 'sample_indices_'):
raise ValueError("'sampler' needs to have an attribute "
"'sample_indices_'.")
self.indices_ = self.sampler_.sample_indices_
# shuffle the indices since the sampler packs them by class
random_state.shuffle(self.indices_)

Expand Down Expand Up @@ -168,7 +166,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
Returns a generator --- as well as the number of steps per epoch --- which
is given to ``fit_generator``. The sampler defines the sampling strategy
used to balance the dataset ahead of creating the batch. The sampler should
have an attribute ``return_indices``.
have an attribute ``sample_indices_``.
Parameters
----------
Expand All @@ -182,7 +180,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
Sample weight.
sampler : object or None, optional (default=RandomUnderSampler)
A sampler instance which has an attribute ``return_indices``.
A sampler instance which has an attribute ``sample_indices_``.
By default, the sampler used is a
:class:`imblearn.under_sampling.RandomUnderSampler`.
Expand Down
4 changes: 2 additions & 2 deletions imblearn/keras/tests/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _build_keras_model(n_classes, n_features):


def test_balanced_batch_generator_class_no_return_indices(data):
with pytest.raises(ValueError, match='needs to return the indices'):
with pytest.raises(ValueError, match='needs to have an attribute'):
BalancedBatchGenerator(*data, sampler=ClusterCentroids(), batch_size=10)


Expand Down Expand Up @@ -75,7 +75,7 @@ def test_balanced_batch_generator_class_sparse(data, keep_sparse):


def test_balanced_batch_generator_function_no_return_indices(data):
with pytest.raises(ValueError, match='needs to return the indices'):
with pytest.raises(ValueError, match='needs to have an attribute'):
balanced_batch_generator(
*data, sampler=ClusterCentroids(), batch_size=10, random_state=42)

Expand Down
27 changes: 22 additions & 5 deletions imblearn/over_sampling/_random_over_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .base import BaseOverSampler
from ..utils import check_target_type
from ..utils import Substitution
from ..utils.deprecation import deprecate_parameter
from ..utils._docstring import _random_state_docstring


Expand All @@ -37,11 +38,23 @@ class RandomOverSampler(BaseOverSampler):
Whether or not to return the indices of the samples randomly selected
in the corresponding classes.
.. deprecated:: 0.4
``return_indices`` is deprecated. Use the attribute
``sample_indices_`` instead.
ratio : str, dict, or callable
.. deprecated:: 0.4
Use the parameter ``sampling_strategy`` instead. It will be removed
in 0.6.
Attributes
----------
sample_indices_ : ndarray, shape (n_new_samples)
Indices of the samples selected.
.. versionadded:: 0.4
``sample_indices_`` used instead of ``return_indices=True``.
Notes
-----
Supports multi-class resampling by sampling each class independently.
Expand Down Expand Up @@ -83,6 +96,10 @@ def _check_X_y(X, y):
return X, y, binarize_y

def _fit_resample(self, X, y):
if self.return_indices:
deprecate_parameter(self, '0.4', 'return_indices',
'sample_indices_')

random_state = check_random_state(self.random_state)
target_stats = Counter(y)

Expand All @@ -95,10 +112,10 @@ def _fit_resample(self, X, y):

sample_indices = np.append(sample_indices,
target_class_indices[indices])
self.sample_indices_ = np.array(sample_indices)

if self.return_indices:
return (safe_indexing(X, sample_indices), safe_indexing(
y, sample_indices), sample_indices)
else:
return (safe_indexing(X, sample_indices), safe_indexing(
y, sample_indices))
return (safe_indexing(X, sample_indices),
safe_indexing(y, sample_indices), sample_indices)
return (safe_indexing(X, sample_indices),
safe_indexing(y, sample_indices))
2 changes: 2 additions & 0 deletions imblearn/over_sampling/tests/test_random_over_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from collections import Counter

import pytest
import numpy as np
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_array_equal
Expand Down Expand Up @@ -59,6 +60,7 @@ def test_ros_fit_resample_half():
assert_array_equal(y_resampled, y_gt)


@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4")
def test_random_over_sampling_return_indices():
ros = RandomOverSampler(return_indices=True, random_state=RND_SEED)
X_resampled, y_resampled, sample_indices = ros.fit_resample(X, Y)
Expand Down
19 changes: 8 additions & 11 deletions imblearn/tensorflow/_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
Returns a generator --- as well as the number of steps per epoch --- which
is given to ``fit_generator``. The sampler defines the sampling strategy
used to balance the dataset ahead of creating the batch. The sampler should
have an attribute ``return_indices``.
have an attribute ``sample_indices_``.
Parameters
----------
Expand All @@ -41,7 +41,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
Sample weight.
sampler : object or None, optional (default=RandomUnderSampler)
A sampler instance which has an attribute ``return_indices``.
A sampler instance which has an attribute ``sample_indices_``.
By default, the sampler used is a
:class:`imblearn.under_sampling.RandomUnderSampler`.
Expand Down Expand Up @@ -122,20 +122,17 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,

random_state = check_random_state(random_state)
if sampler is None:
sampler_ = RandomUnderSampler(return_indices=True,
random_state=random_state)
sampler_ = RandomUnderSampler(random_state=random_state)
else:
if not hasattr(sampler, 'return_indices'):
raise ValueError("'sampler' needs to return the indices of "
"the samples selected. Provide a sampler "
"which has an attribute 'return_indices'.")
sampler_ = clone(sampler)
sampler_.set_params(return_indices=True)
# FIXME: Remove in 0.6
if sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
set_random_state(sampler_, random_state)

_, _, indices = sampler_.fit_resample(X, y)
sampler_.fit_resample(X, y)
if not hasattr(sampler_, 'sample_indices_'):
raise ValueError("'sampler' needs to have an attribute "
"'sample_indices_'.")
indices = sampler_.sample_indices_
# shuffle the indices since the sampler packs them by class
random_state.shuffle(indices)

Expand Down
1 change: 1 addition & 0 deletions imblearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def _generate_checks_per_estimator(check_generator, estimators):
@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and')
@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and')
@pytest.mark.filterwarnings("ignore:'y' should be of types")
@pytest.mark.filterwarnings("ignore:'return_indices' is deprecated from 0.4")
@pytest.mark.parametrize(
'name, Estimator, check',
_generate_checks_per_estimator(_yield_all_checks,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from ..base import BaseCleaningSampler
from ...utils import Substitution
from ...utils.deprecation import deprecate_parameter
from ...utils._docstring import _random_state_docstring


Expand All @@ -37,7 +38,11 @@ class CondensedNearestNeighbour(BaseCleaningSampler):
return_indices : bool, optional (default=False)
Whether or not to return the indices of the samples randomly
selected from the majority class.
selected.
.. deprecated:: 0.4
``return_indices`` is deprecated. Use the attribute
``sample_indices_`` instead.
{random_state}
Expand All @@ -59,6 +64,14 @@ class CondensedNearestNeighbour(BaseCleaningSampler):
Use the parameter ``sampling_strategy`` instead. It will be removed
in 0.6.
Attributes
----------
sample_indices_ : ndarray, shape (n_new_samples)
Indices of the samples selected.
.. versionadded:: 0.4
``sample_indices_`` used instead of ``return_indices=True``.
Notes
-----
The method is based on [1]_.
Expand Down Expand Up @@ -126,6 +139,9 @@ def _validate_estimator(self):
' Got {} instead.'.format(type(self.n_neighbors)))

def _fit_resample(self, X, y):
if self.return_indices:
deprecate_parameter(self, '0.4', 'return_indices',
'sample_indices_')
self._validate_estimator()

random_state = check_random_state(self.random_state)
Expand Down Expand Up @@ -198,8 +214,9 @@ def _fit_resample(self, X, y):
idx_under = np.concatenate(
(idx_under, np.flatnonzero(y == target_class)), axis=0)

self.sample_indices_ = idx_under

if self.return_indices:
return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
idx_under)
else:
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
Loading

0 comments on commit a19d398

Please sign in to comment.