Update the ensemble method
Guillaume Lemaitre committed Jul 5, 2016
1 parent fdbc6e4 commit 6a3c5de
Showing 27 changed files with 57 additions and 294 deletions.
129 changes: 28 additions & 101 deletions imblearn/ensemble/balance_cascade.py
@@ -4,11 +4,16 @@
 import numpy as np
 
 from sklearn.utils import check_X_y
+from sklearn.utils import check_random_state
 
-from .ensemble_sampler import EnsembleSampler
+from ..base import SamplerMixin
 
 
-class BalanceCascade(EnsembleSampler):
+ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost',
+                  'gradient-boosting', 'linear-svm')
+
+
+class BalanceCascade(SamplerMixin):
     """Create an ensemble of balanced sets by iteratively under-sampling the
     imbalanced dataset using an estimator.
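The substantive change in this hunk is the base class: BalanceCascade now derives from SamplerMixin instead of the removed EnsembleSampler, so input checking lives in one place and subclasses only implement the resampling hook. A minimal sketch of that template-method pattern, assuming (it is not shown in this diff) that imblearn/base.py wires fit/sample/fit_sample around a _sample hook:

    # Illustrative sketch only -- not the code from imblearn/base.py.
    from sklearn.utils import check_X_y

    class SamplerMixinSketch(object):

        def fit(self, X, y):
            # Validate once here instead of in every subclass.
            X, y = check_X_y(X, y)
            return self

        def sample(self, X, y):
            # Public entry point: validate, then delegate to the hook.
            X, y = check_X_y(X, y)
            return self._sample(X, y)

        def fit_sample(self, X, y):
            return self.fit(X, y).sample(X, y)

        def _sample(self, X, y):
            # Subclasses override this hook with the actual resampling logic.
            raise NotImplementedError

This is why the per-class fit overrides and the explicit check_X_y calls disappear in the hunks below.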
@@ -27,8 +32,11 @@ class BalanceCascade(EnsembleSampler):
         Whether or not to return the indices of the samples randomly
         selected from the majority class.
 
-    random_state : int or None, optional (default=None)
-        Seed for random number generation.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by np.random.
 
     verbose : bool, optional (default=True)
         Whether or not to print information about the processing.
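The rewritten random_state docstring follows the scikit-learn contract implemented by sklearn.utils.check_random_state: an int seeds a fresh RandomState, an existing RandomState passes through unchanged, and None returns the global np.random generator. A short sketch of the three cases:

    import numpy as np
    from sklearn.utils import check_random_state

    rng_seeded = check_random_state(42)                       # fresh RandomState(42)
    rng_same = check_random_state(np.random.RandomState(42))  # returned unchanged
    rng_global = check_random_state(None)                     # global np.random state

    # Two generators built from the same int seed draw identical values.
    assert rng_seeded.randint(100) == check_random_state(42).randint(100)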
@@ -52,15 +60,6 @@ class BalanceCascade(EnsembleSampler):
     Attributes
     ----------
-    ratio : str or float
-        If 'auto', the ratio will be defined automatically to balance
-        the dataset. Otherwise, the ratio is defined as the number
-        of samples in the minority class over the number of samples
-        in the majority class.
-
-    random_state : int or None
-        Seed for random number generation.
-
     min_c_ : str or int
         The identifier of the minority class.
@@ -91,89 +90,16 @@ class BalanceCascade(EnsembleSampler):
     def __init__(self, ratio='auto', return_indices=False, random_state=None,
                  verbose=True, n_max_subset=None, classifier='knn',
                  bootstrap=True, **kwargs):
-        """Initialise the balance cascade object.
-
-        Parameters
-        ----------
-        ratio : str or float, optional (default='auto')
-            If 'auto', the ratio will be defined automatically to balance
-            the dataset. Otherwise, the ratio is defined as the number
-            of samples in the minority class over the number of samples
-            in the majority class.
-
-        return_indices : bool, optional (default=False)
-            Whether or not to return the indices of the samples randomly
-            selected from the majority class.
-
-        random_state : int or None, optional (default=None)
-            Seed for random number generation.
-
-        verbose : bool, optional (default=True)
-            Whether or not to print information about the processing.
-
-        n_max_subset : int or None, optional (default=None)
-            Maximum number of subsets to generate. By default, all the data
-            from the training set will be selected, which could lead to a
-            large number of subsets. This number can probably be reduced
-            empirically.
-
-        classifier : str, optional (default='knn')
-            The classifier that will be selected to confront the prediction
-            with the real labels. The choices are the following: 'knn',
-            'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting'
-            and 'linear-svm'.
-
-        bootstrap : bool, optional (default=True)
-            Whether to bootstrap the data before each iteration.
-
-        **kwargs : keywords
-            The parameters associated with the classifier provided.
-
-        Returns
-        -------
-        None
-
-        """
         super(BalanceCascade, self).__init__(ratio=ratio,
-                                             return_indices=return_indices,
-                                             verbose=verbose,
-                                             random_state=random_state)
-        # Define the classifier to use
-        list_classifier = ('knn', 'decision-tree', 'random-forest', 'adaboost',
-                           'gradient-boosting', 'linear-svm')
-        if classifier in list_classifier:
-            self.classifier = classifier
-        else:
-            raise NotImplementedError
+                                             verbose=verbose)
+        self.return_indices = return_indices
+        self.random_state = random_state
+        self.classifier = classifier
         self.n_max_subset = n_max_subset
         self.bootstrap = bootstrap
         self.kwargs = kwargs
 
-    def fit(self, X, y):
-        """Find the class statistics before performing sampling.
-
-        Parameters
-        ----------
-        X : ndarray, shape (n_samples, n_features)
-            Matrix containing the data which have to be sampled.
-
-        y : ndarray, shape (n_samples, )
-            Corresponding label for each sample in X.
-
-        Returns
-        -------
-        self : object,
-            Return self.
-
-        """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
-
-        # Call the parent function
-        super(BalanceCascade, self).fit(X, y)
-
-        return self
-
-    def sample(self, X, y):
+    def _sample(self, X, y):
         """Resample the dataset.
 
         Parameters
@@ -197,10 +123,11 @@ def sample(self, X, y):
             containing which samples have been selected.
 
         """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
-
-        super(BalanceCascade, self).sample(X, y)
+        if self.classifier not in ESTIMATOR_KIND:
+            raise NotImplementedError
+
+        random_state = check_random_state(self.random_state)
 
         # Define the classifier to use
         if self.classifier == 'knn':
@@ -210,25 +137,26 @@ def sample(self, X, y):
         elif self.classifier == 'decision-tree':
             from sklearn.tree import DecisionTreeClassifier
             classifier = DecisionTreeClassifier(
+                random_state=random_state,
                 **self.kwargs)
         elif self.classifier == 'random-forest':
             from sklearn.ensemble import RandomForestClassifier
             classifier = RandomForestClassifier(
-                random_state=self.random_state,
+                random_state=random_state,
                 **self.kwargs)
         elif self.classifier == 'adaboost':
             from sklearn.ensemble import AdaBoostClassifier
             classifier = AdaBoostClassifier(
-                random_state=self.random_state,
+                random_state=random_state,
                 **self.kwargs)
         elif self.classifier == 'gradient-boosting':
             from sklearn.ensemble import GradientBoostingClassifier
             classifier = GradientBoostingClassifier(
-                random_state=self.random_state,
+                random_state=random_state,
                 **self.kwargs)
         elif self.classifier == 'linear-svm':
             from sklearn.svm import LinearSVC
-            classifier = LinearSVC(random_state=self.random_state,
+            classifier = LinearSVC(random_state=random_state,
                                    **self.kwargs)
         else:
             raise NotImplementedError
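The string-to-estimator dispatch is unchanged in shape, but its validation now happens against the module-level ESTIMATOR_KIND tuple at sampling time rather than in __init__, and every estimator receives the shared random_state. A hedged usage sketch (dataset parameters are arbitrary, and it assumes the fit_sample entry point that SamplerMixin provides at this point in the project):

    # Illustrative usage, not part of the commit.
    from sklearn.datasets import make_classification
    from imblearn.ensemble import BalanceCascade

    X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                               random_state=0)

    bc = BalanceCascade(classifier='adaboost', random_state=0)
    X_subsets, y_subsets = bc.fit_sample(X, y)  # one balanced set per iteration

    # An unknown classifier string now fails here, not at construction:
    # BalanceCascade(classifier='typo').fit_sample(X, y)  -> NotImplementedError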
@@ -267,8 +195,7 @@ def sample(self, X, y):
             # Generate an appropriate number of indices to extract
             # from the majority class depending on the misclassification
             # rate of the previous iteration
-            np.random.seed(self.random_state)
-            idx_sel_from_maj = np.random.choice(np.nonzero(b_sel_N)[0],
+            idx_sel_from_maj = random_state.choice(np.nonzero(b_sel_N)[0],
                                                 size=num_samples,
                                                 replace=False)
             idx_sel_from_maj = np.concatenate((idx_mis_class,
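This hunk removes the last np.random.seed call. Seeding the global generator reseeds every consumer of np.random in the process, whereas drawing from the generator returned by check_random_state keeps the sampler reproducible without side effects. A small sketch of the equivalence and the difference:

    import numpy as np
    from sklearn.utils import check_random_state

    # Old pattern: mutates process-wide state.
    np.random.seed(0)
    a = np.random.choice(100, size=5, replace=False)

    # New pattern: a private, repeatable stream.
    rng = check_random_state(0)
    b = rng.choice(100, size=5, replace=False)

    assert (a == b).all()  # same draws; only `rng` is locally owned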
@@ -296,7 +223,7 @@ def sample(self, X, y):
                     self.bootstrap):
                 # Apply a bootstrap on x_data
                 curr_sample_weight = np.ones((y_data.size,), dtype=np.float64)
-                indices = np.random.randint(0, y_data.size, y_data.size)
+                indices = random_state.randint(0, y_data.size, y_data.size)
                 sample_counts = np.bincount(indices, minlength=y_data.size)
                 curr_sample_weight *= sample_counts
 
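The bootstrap itself keeps the weight trick: rather than materialising a resampled copy of x_data, it draws indices with replacement and turns each sample's multiplicity into a fit-time weight (the classifier is assumed to accept sample_weight). A standalone sketch of the idiom:

    import numpy as np
    from sklearn.utils import check_random_state

    rng = check_random_state(0)
    n_samples = 8

    # Draw a bootstrap as indices with replacement ...
    indices = rng.randint(0, n_samples, n_samples)
    # ... then weight k means "drawn k times"; zeros are out-of-bag.
    sample_weight = np.ones((n_samples,), dtype=np.float64)
    sample_weight *= np.bincount(indices, minlength=n_samples)

    assert sample_weight.sum() == n_samples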
88 changes: 11 additions & 77 deletions imblearn/ensemble/easy_ensemble.py
@@ -5,11 +5,11 @@
 
 from sklearn.utils import check_X_y
 
-from .ensemble_sampler import EnsembleSampler
+from ..base import SamplerMixin
 from ..under_sampling import RandomUnderSampler
 
 
-class EasyEnsemble(EnsembleSampler):
+class EasyEnsemble(SamplerMixin):
     """Create an ensemble of sets by iteratively applying random under-sampling.
 
     This method iteratively selects a random subset and makes an ensemble of the
@@ -27,8 +27,11 @@ class EasyEnsemble(EnsembleSampler):
         Whether or not to return the indices of the samples randomly
         selected from the majority class.
 
-    random_state : int or None, optional (default=None)
-        Seed for random number generation.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by np.random.
 
     verbose : bool, optional (default=True)
         Whether or not to print information about the processing.
@@ -41,15 +44,6 @@ class EasyEnsemble(EnsembleSampler):
     Attributes
     ----------
-    ratio : str or float
-        If 'auto', the ratio will be defined automatically to balance
-        the dataset. Otherwise, the ratio is defined as the number
-        of samples in the minority class over the number of samples
-        in the majority class.
-
-    random_state : int or None
-        Seed for random number generation.
-
     min_c_ : str or int
         The identifier of the minority class.
@@ -78,70 +72,14 @@ class EasyEnsemble(EnsembleSampler):
 
     def __init__(self, ratio='auto', return_indices=False, verbose=True,
                  random_state=None, replacement=False, n_subsets=10):
-        """Initialise the easy ensemble object.
-
-        Parameters
-        ----------
-        ratio : str or float, optional (default='auto')
-            If 'auto', the ratio will be defined automatically to balance
-            the dataset. Otherwise, the ratio is defined as the number
-            of samples in the minority class over the number of samples
-            in the majority class.
-
-        return_indices : bool, optional (default=False)
-            Whether or not to return the indices of the samples randomly
-            selected from the majority class.
-
-        random_state : int or None, optional (default=None)
-            Seed for random number generation.
-
-        verbose : bool, optional (default=True)
-            Whether or not to print information about the processing.
-
-        replacement : bool, optional (default=False)
-            Whether or not to sample randomly with replacement.
-
-        n_subsets : int, optional (default=10)
-            Number of subsets to generate.
-
-        Returns
-        -------
-        None
-
-        """
         super(EasyEnsemble, self).__init__(ratio=ratio,
-                                           return_indices=return_indices,
-                                           verbose=verbose,
-                                           random_state=random_state)
+                                           verbose=verbose)
+        self.return_indices = return_indices
+        self.random_state = random_state
         self.replacement = replacement
         self.n_subsets = n_subsets
 
-    def fit(self, X, y):
-        """Find the class statistics before performing sampling.
-
-        Parameters
-        ----------
-        X : ndarray, shape (n_samples, n_features)
-            Matrix containing the data which have to be sampled.
-
-        y : ndarray, shape (n_samples, )
-            Corresponding label for each sample in X.
-
-        Returns
-        -------
-        self : object,
-            Return self.
-
-        """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
-
-        # Call the parent function
-        super(EasyEnsemble, self).fit(X, y)
-
-        return self
-
-    def sample(self, X, y):
+    def _sample(self, X, y):
        """Resample the dataset.
 
        Parameters
Parameters
Expand All @@ -165,10 +103,6 @@ def sample(self, X, y):
containing the which samples have been selected.
"""
# Check the consistency of X and y
X, y = check_X_y(X, y)

super(EasyEnsemble, self).sample(X, y)

X_resampled = []
y_resampled = []
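With fit and the redundant checks gone, _sample reduces to looping a RandomUnderSampler n_subsets times and collecting the results. A hedged usage sketch (dataset parameters are arbitrary; it assumes the fit_sample entry point from SamplerMixin):

    # Illustrative usage, not part of the commit.
    import numpy as np
    from sklearn.datasets import make_classification
    from imblearn.ensemble import EasyEnsemble

    X, y = make_classification(n_samples=500, weights=[0.9, 0.1],
                               random_state=0)

    ee = EasyEnsemble(n_subsets=10, replacement=False, random_state=0)
    X_res, y_res = ee.fit_sample(X, y)

    # One balanced subset per iteration:
    # np.asarray(X_res).shape == (n_subsets, n_balanced_samples, n_features)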
