diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py
index 92634d398..d7cb81212 100644
--- a/imblearn/ensemble/balance_cascade.py
+++ b/imblearn/ensemble/balance_cascade.py
@@ -1,13 +1,18 @@
 """Class to perform under-sampling using balace cascade."""
 from __future__ import print_function
 
+import warnings
+
 import numpy as np
+
+from sklearn.base import ClassifierMixin
+from sklearn.neighbors import KNeighborsClassifier
 from sklearn.utils import check_random_state
+from sklearn.utils.validation import has_fit_parameter
 
-from ..base import BaseBinarySampler
+from six import string_types
 
-ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost',
-                  'gradient-boosting', 'linear-svm')
+from ..base import BaseBinarySampler
 
 
 class BalanceCascade(BaseBinarySampler):
@@ -40,18 +45,29 @@ class BalanceCascade(BaseBinarySampler):
         the training will be selected that could lead to a large number of
         subsets. We can probably deduce this number empirically.
 
-    classifier : str, optional (default='knn')
+    classifier : str, optional (default=None)
         The classifier that will be selected to confront the prediction
         with the real labels. The choices are the following: 'knn',
         'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting'
         and 'linear-svm'.
 
+        NOTE: `classifier` is deprecated since 0.2 and will be removed in
+        0.4. Use `estimator` instead.
+
+    estimator : object, optional (default=KNeighborsClassifier())
+        An estimator inherited from `sklearn.base.ClassifierMixin` and
+        having an attribute `predict`.
+
     bootstrap : bool, optional (default=True)
         Whether to bootstrap the data before each iteration.
 
     **kwargs : keywords
         The parameters associated with the classifier provided.
 
+        NOTE: `**kwargs` is deprecated since 0.2 and will be removed in
+        0.4. Pass the parameters directly to the `estimator` object
+        instead.
+
     Attributes
     ----------
     min_c_ : str or int
@@ -100,16 +116,97 @@ class BalanceCascade(BaseBinarySampler):
     """
 
     def __init__(self, ratio='auto', return_indices=False, random_state=None,
-                 n_max_subset=None, classifier='knn', bootstrap=True,
-                 **kwargs):
+                 n_max_subset=None, classifier=None, estimator=None,
+                 bootstrap=True, **kwargs):
         super(BalanceCascade, self).__init__(ratio=ratio,
                                              random_state=random_state)
         self.return_indices = return_indices
         self.classifier = classifier
+        self.estimator = estimator
         self.n_max_subset = n_max_subset
         self.bootstrap = bootstrap
         self.kwargs = kwargs
 
+    def _validate_estimator(self):
+        """Private function to create the classifier."""
+
+        if self.classifier is not None:
+            warnings.warn('`classifier` will be removed in version 0.4.'
+                          ' Use an `estimator` object instead.',
+                          DeprecationWarning)
+            self.estimator = self.classifier
+
+        if (self.estimator is not None and
+                isinstance(self.estimator, ClassifierMixin) and
+                hasattr(self.estimator, 'predict')):
+            self.estimator_ = self.estimator
+        elif self.estimator is None:
+            self.estimator_ = KNeighborsClassifier()
+        # To be removed in 0.4
+        elif (self.estimator is not None and
+              isinstance(self.estimator, string_types)):
+            warnings.warn('`estimator` given as a string will be removed in'
+                          ' version 0.4. Use a classifier object instead of'
+                          ' a string.',
+                          DeprecationWarning)
+            # Define the classifier to use
+            if self.estimator == 'knn':
+                self.estimator_ = KNeighborsClassifier(
+                    **self.kwargs)
+            elif self.estimator == 'decision-tree':
+                from sklearn.tree import DecisionTreeClassifier
+                self.estimator_ = DecisionTreeClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'random-forest':
+                from sklearn.ensemble import RandomForestClassifier
+                self.estimator_ = RandomForestClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'adaboost':
+                from sklearn.ensemble import AdaBoostClassifier
+                self.estimator_ = AdaBoostClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'gradient-boosting':
+                from sklearn.ensemble import GradientBoostingClassifier
+                self.estimator_ = GradientBoostingClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'linear-svm':
+                from sklearn.svm import LinearSVC
+                self.estimator_ = LinearSVC(random_state=self.random_state,
+                                            **self.kwargs)
+            else:
+                raise NotImplementedError
+        else:
+            raise ValueError('Invalid parameter `estimator`')
+
+        self.logger.debug(self.estimator_)
+
+    def fit(self, X, y):
+        """Find the class statistics before performing the sampling.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : ndarray, shape (n_samples, )
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        self : object,
+            Return self.
+
+        """
+
+        super(BalanceCascade, self).fit(X, y)
+
+        self._validate_estimator()
+
+        return self
+
     def _sample(self, X, y):
         """Resample the dataset.
 
@@ -135,42 +232,9 @@ def _sample(self, X, y):
 
         """
 
-        if self.classifier not in ESTIMATOR_KIND:
-            raise NotImplementedError
-
         random_state = check_random_state(self.random_state)
-
-        # Define the classifier to use
-        if self.classifier == 'knn':
-            from sklearn.neighbors import KNeighborsClassifier
-            classifier = KNeighborsClassifier(
-                **self.kwargs)
-        elif self.classifier == 'decision-tree':
-            from sklearn.tree import DecisionTreeClassifier
-            classifier = DecisionTreeClassifier(
-                random_state=random_state,
-                **self.kwargs)
-        elif self.classifier == 'random-forest':
-            from sklearn.ensemble import RandomForestClassifier
-            classifier = RandomForestClassifier(
-                random_state=random_state,
-                **self.kwargs)
-        elif self.classifier == 'adaboost':
-            from sklearn.ensemble import AdaBoostClassifier
-            classifier = AdaBoostClassifier(
-                random_state=random_state,
-                **self.kwargs)
-        elif self.classifier == 'gradient-boosting':
-            from sklearn.ensemble import GradientBoostingClassifier
-            classifier = GradientBoostingClassifier(
-                random_state=random_state,
-                **self.kwargs)
-        elif self.classifier == 'linear-svm':
-            from sklearn.svm import LinearSVC
-            classifier = LinearSVC(random_state=random_state,
-                                   **self.kwargs)
-        else:
-            raise NotImplementedError
+        support_sample_weight = has_fit_parameter(self.estimator_,
+                                                  "sample_weight")
 
         X_resampled = []
         y_resampled = []
@@ -185,6 +249,7 @@ def _sample(self, X, y):
         # return them later
         if self.return_indices:
             idx_min = np.flatnonzero(y == self.min_c_)
+            idx_maj = np.flatnonzero(y == self.maj_c_)
 
         # Condition to initiliase before the search
         b_subset_search = True
@@ -227,27 +292,42 @@ def _sample(self, X, y):
             X_resampled.append(x_data)
             y_resampled.append(y_data)
             if self.return_indices:
-                idx_under.append(np.concatenate((idx_min, idx_sel_from_maj),
+                idx_under.append(np.concatenate((idx_min,
+                                                 idx_maj[idx_sel_from_maj]),
                                                 axis=0))
 
-            if (not (self.classifier == 'knn' or
-                     self.classifier == 'linear-svm') and
-                    self.bootstrap):
-                # Apply a bootstrap on x_data
-                curr_sample_weight = np.ones((y_data.size,), dtype=np.float64)
+            # Get the indices of interest
+            if self.bootstrap:
                 indices = random_state.randint(0, y_data.size, y_data.size)
-                sample_counts = np.bincount(indices, minlength=y_data.size)
-                curr_sample_weight *= sample_counts
+            else:
+                indices = np.arange(y_data.size)
 
-                # Train the classifier using the current data
-                classifier.fit(x_data, y_data, curr_sample_weight)
+            # Draw samples, using sample weights, and then fit
+            if support_sample_weight:
+                self.logger.debug('Sample-weight is supported')
+                curr_sample_weight = np.ones((y_data.size,), dtype=np.float64)
+                if self.bootstrap:
+                    self.logger.debug('Go for a bootstrap')
+                    sample_counts = np.bincount(indices, minlength=y_data.size)
+                    curr_sample_weight *= sample_counts
+                else:
+                    self.logger.debug('No bootstrap')
+                    mask = np.zeros(y_data.size, dtype=np.bool)
+                    mask[indices] = True
+                    not_indices_mask = ~mask
+                    curr_sample_weight[not_indices_mask] = 0
+
+                self.estimator_.fit(x_data, y_data,
+                                    sample_weight=curr_sample_weight)
+
+            # Draw samples, using a mask, and then fit
             else:
-                # Train the classifier using the current data
-                classifier.fit(x_data, y_data)
+                self.logger.debug('Sample-weight is not supported')
+                self.estimator_.fit(x_data[indices], y_data[indices])
 
             # Predict using only the majority class
-            pred_label = classifier.predict(N_x[idx_sel_from_maj, :])
+            pred_label = self.estimator_.predict(N_x[idx_sel_from_maj, :])
 
             # Basically let's find which sample have to be retained for the
             # next round
@@ -288,9 +368,8 @@ def _sample(self, X, y):
                     X_resampled.append(x_data)
                     y_resampled.append(y_data)
                     if self.return_indices:
-                        idx_under.append(np.concatenate((idx_min,
-                                                         idx_sel_from_maj),
-                                                        axis=0))
+                        idx_under.append(np.concatenate(
+                            (idx_min, idx_maj[idx_sel_from_maj]), axis=0))
 
                     self.logger.debug('Creation of the subset #%s', n_subsets)
 
@@ -321,9 +400,8 @@ def _sample(self, X, y):
                 X_resampled.append(x_data)
                 y_resampled.append(y_data)
                 if self.return_indices:
-                    idx_under.append(np.concatenate((idx_min,
-                                                     idx_sel_from_maj),
-                                                    axis=0))
+                    idx_under.append(np.concatenate(
+                        (idx_min, idx_maj[idx_sel_from_maj]), axis=0))
                 self.logger.debug('Creation of the subset #%s', n_subsets)
 
                 # We found a new subset, increase the counter
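The fitting logic above replaces the hard-coded 'knn'/'linear-svm' special case with a capability check on the estimator. A minimal standalone sketch of that pattern with toy data (the sketch itself is not part of the diff):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.utils.validation import has_fit_parameter

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(20, 2))   # toy data for the sketch
    y = rng.randint(0, 2, size=20)
    clf = DecisionTreeClassifier(random_state=0)

    # The same bootstrap sample is drawn either way.
    indices = rng.randint(0, y.size, y.size)
    if has_fit_parameter(clf, "sample_weight"):
        # Multiplicities from the bootstrap act as integer sample weights.
        weights = np.bincount(indices, minlength=y.size).astype(np.float64)
        clf.fit(X, y, sample_weight=weights)
    else:
        # Fall back to materialising the bootstrap by indexing the rows.
        clf.fit(X[indices], y[indices])

Using sample weights avoids copying the data when the estimator supports them; row indexing remains as the generic fallback.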
diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py
index c8d1bb088..9db1d7301 100644
--- a/imblearn/ensemble/tests/test_balance_cascade.py
+++ b/imblearn/ensemble/tests/test_balance_cascade.py
@@ -7,6 +7,7 @@
 from numpy.testing import (assert_array_equal, assert_equal, assert_raises,
                            assert_warns)
 from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils.estimator_checks import check_estimator
 
 from imblearn.ensemble import BalanceCascade
@@ -142,51 +143,51 @@ def test_fit_sample_auto():
 
     # Create the sampling object
     bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
-                        return_indices=True)
+                        return_indices=True, bootstrap=False)
 
     # Get the different subset
     X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
 
     currdir = os.path.dirname(os.path.abspath(__file__))
     X_gt = np.array([np.array([[0.11622591, -0.0317206],
-                              [1.25192108, -0.22367336],
-                              [0.53366841, -0.30312976],
-                              [1.52091956, -0.49283504],
-                              [0.88407872, 0.35454207],
-                              [1.31301027, -0.92648734],
-                              [-0.41635887, -0.38299653],
-                              [1.70580611, -0.11219234],
-                              [1.15514042, 0.0129463],
-                              [0.08711622, 0.93259929],
-                              [0.70472253, -0.73309052],
-                              [-0.14374509, 0.27370049],
-                              [0.83680821, 1.72827342],
-                              [-0.18410027, -0.45194484],
-                              [-0.28162401, -2.10400981],
-                              [-1.11515198, -0.93689695]]),
-                    np.array([[0.11622591, -0.0317206],
-                              [1.25192108, -0.22367336],
-                              [0.53366841, -0.30312976],
-                              [1.52091956, -0.49283504],
-                              [0.88407872, 0.35454207],
-                              [1.31301027, -0.92648734],
-                              [-0.41635887, -0.38299653],
-                              [1.70580611, -0.11219234],
-                              [1.15514042, 0.0129463],
-                              [0.70472253, -0.73309052],
-                              [-0.18410027, -0.45194484],
-                              [0.77481731, 0.60935141],
-                              [0.3084254, 0.33299982],
-                              [0.28893132, -0.38761769],
-                              [0.9281014, 0.53085498]])], dtype=object)
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [0.08711622, 0.93259929],
+                               [0.70472253, -0.73309052],
+                               [-0.14374509, 0.27370049],
+                               [0.83680821, 1.72827342],
+                               [-0.18410027, -0.45194484],
+                               [-0.28162401, -2.10400981],
+                               [-1.11515198, -0.93689695]]),
+                     np.array([[0.11622591, -0.0317206],
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [0.70472253, -0.73309052],
+                               [-0.18410027, -0.45194484],
+                               [0.77481731, 0.60935141],
+                               [0.3084254, 0.33299982],
+                               [0.28893132, -0.38761769],
+                               [0.9281014, 0.53085498]])], dtype=object)
     y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
                                1, 1]),
                      np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
                                1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10, 2,
-                                 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 4, 8, 0, 3, 5,
-                                 9])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 8, 14, 1, 7,
+                                 9, 15])], dtype=object)
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -201,7 +202,8 @@ def test_fit_sample_half():
     ratio = 0.8
 
     # Create the sampling object
-    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
+    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
+                        bootstrap=False)
 
     # Get the different subset
     X_resampled, y_resampled = bc.fit_sample(X, Y)
@@ -264,42 +266,42 @@ def test_fit_sample_auto_decision_tree():
     X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
 
     X_gt = np.array([np.array([[0.11622591, -0.0317206],
-                              [1.25192108, -0.22367336],
-                              [0.53366841, -0.30312976],
-                              [1.52091956, -0.49283504],
-                              [0.88407872, 0.35454207],
-                              [1.31301027, -0.92648734],
-                              [-0.41635887, -0.38299653],
-                              [1.70580611, -0.11219234],
-                              [1.15514042, 0.0129463],
-                              [0.08711622, 0.93259929],
-                              [0.70472253, -0.73309052],
-                              [-0.14374509, 0.27370049],
-                              [0.83680821, 1.72827342],
-                              [-0.18410027, -0.45194484],
-                              [-0.28162401, -2.10400981],
-                              [-1.11515198, -0.93689695]]),
-                    np.array([[0.11622591, -0.0317206],
-                              [1.25192108, -0.22367336],
-                              [0.53366841, -0.30312976],
-                              [1.52091956, -0.49283504],
-                              [0.88407872, 0.35454207],
-                              [1.31301027, -0.92648734],
-                              [-0.41635887, -0.38299653],
-                              [1.70580611, -0.11219234],
-                              [-1.11515198, -0.93689695],
-                              [0.77481731, 0.60935141],
-                              [0.3084254, 0.33299982],
-                              [0.28893132, -0.38761769],
-                              [0.9281014, 0.53085498]])], dtype=object)
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [0.08711622, 0.93259929],
+                               [0.70472253, -0.73309052],
+                               [-0.14374509, 0.27370049],
+                               [0.83680821, 1.72827342],
+                               [-0.18410027, -0.45194484],
+                               [-0.28162401, -2.10400981],
+                               [-1.11515198, -0.93689695]]),
+                     np.array([[0.11622591, -0.0317206],
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [-1.11515198, -0.93689695],
+                               [0.77481731, 0.60935141],
+                               [0.3084254, 0.33299982],
+                               [0.28893132, -0.38761769],
+                               [0.9281014, 0.53085498]])], dtype=object)
     y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
                                1, 1]),
                      np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                                1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4,
-                                 10, 2, 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 7, 0, 3, 5,
-                                 9])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 13, 1, 7, 9,
+                                 15])], dtype=object)
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -346,19 +348,21 @@ def test_fit_sample_auto_random_forest():
                                [1.31301027, -0.92648734],
                                [-0.41635887, -0.38299653],
                                [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
                                [-0.14374509, 0.27370049],
+                               [-1.11515198, -0.93689695],
                                [0.77481731, 0.60935141],
                                [0.3084254, 0.33299982],
                                [0.28893132, -0.38761769],
                                [0.9281014, 0.53085498]])], dtype=object)
-    y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-                               1, 1, 1, 1]),
-                     np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-                               1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10,
-                                 2, 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 0, 3, 5,
-                                 9])], dtype=object)
+    y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                               1, 1]),
+                     np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                               1])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 16, 13, 1, 7,
+                                 9, 15])], dtype=object)
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -411,13 +415,13 @@ def test_fit_sample_auto_adaboost():
                                [0.28893132, -0.38761769],
                                [0.9281014, 0.53085498]])], dtype=object)
     y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-                              1, 1]),
+                               1, 1]),
                      np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-                              1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10, 2,
-                                 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 7, 0, 3, 5,
-                                 9])], dtype=object)
+                               1])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16,
+                                 6, 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 16, 13, 1, 7,
+                                 9, 15])], dtype=object)
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -471,13 +475,14 @@ def test_fit_sample_auto_gradient_boosting():
                                [0.28893132, -0.38761769],
                                [0.9281014, 0.53085498]])], dtype=object)
     y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-                              1, 1, 1]),
+                               1, 1, 1]),
                      np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-                              1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10,
-                                 2, 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 7, 0, 3,
-                                 5, 9])], dtype=object)
+                               1])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 16, 13, 1, 7, 9,
+                                 15])], dtype=object)
+
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -531,13 +536,14 @@ def test_fit_sample_auto_linear_svm():
                                [0.28893132, -0.38761769],
                                [0.9281014, 0.53085498]])], dtype=object)
     y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-                              1, 1]),
+                               1, 1]),
                      np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-                              1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10,
-                                 2, 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 4, 0, 3,
-                                 5, 9])], dtype=object)
+                               1])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 8, 1, 7, 9,
+                                 15])], dtype=object)
+
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -587,7 +593,8 @@ def test_fit_sample_auto_early_stop():
                       [-1.11515198, -0.93689695]]])
     y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
-    idx_gt = np.array([[0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10, 2, 8, 1, 7]])
+    idx_gt = np.array([[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14,
+                        5, 13]])
     # Check each array
     assert_array_equal(X_resampled, X_gt)
     assert_array_equal(y_resampled, y_gt)
@@ -603,7 +610,8 @@ def test_fit_sample_auto_early_stop_2():
 
     # Create the sampling object
     bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
-                        return_indices=True, n_max_subset=n_subset)
+                        return_indices=True, n_max_subset=n_subset,
+                        bootstrap=False)
 
     # Get the different subset
     X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
@@ -643,11 +651,10 @@ def test_fit_sample_auto_early_stop_2():
                                1, 1, 1]),
                      np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
                                1])], dtype=object)
-    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10,
-                                 2, 8, 1, 7]),
-                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 4, 8, 0, 3,
-                                 5, 9])], dtype=object)
-
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 8, 14, 1, 7,
+                                 9, 15])], dtype=object)
     # Check each array
     for idx in range(X_gt.size):
         assert_array_equal(X_resampled[idx], X_gt[idx])
@@ -679,3 +686,140 @@ def test_multiclass_error():
     y = np.array([0] * 3 + [1] * 2 + [2] * 15)
     bc = BalanceCascade(random_state=RND_SEED)
     assert_warns(UserWarning, bc.fit, X, y)
+
+
+def test_give_classifier_obj():
+    """Test the fit and sample routine with a classifier object."""
+
+    # Define the ratio parameter
+    ratio = 'auto'
+    classifier = RandomForestClassifier(random_state=RND_SEED)
+
+    # Create the sampling object
+    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
+                        return_indices=True, estimator=classifier)
+
+    # Get the different subset
+    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
+
+    X_gt = np.array([np.array([[0.11622591, -0.0317206],
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [0.08711622, 0.93259929],
+                               [0.70472253, -0.73309052],
+                               [-0.14374509, 0.27370049],
+                               [0.83680821, 1.72827342],
+                               [-0.18410027, -0.45194484],
+                               [-0.28162401, -2.10400981],
+                               [-1.11515198, -0.93689695]]),
+                     np.array([[0.11622591, -0.0317206],
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [-0.14374509, 0.27370049],
+                               [-1.11515198, -0.93689695],
+                               [0.77481731, 0.60935141],
+                               [0.3084254, 0.33299982],
+                               [0.28893132, -0.38761769],
+                               [0.9281014, 0.53085498]])], dtype=object)
+    y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                               1, 1]),
+                     np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                               1])], dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 16, 13, 1, 7,
+                                 9, 15])], dtype=object)
+
+    # Check each array
+    for idx in range(X_gt.size):
+        assert_array_equal(X_resampled[idx], X_gt[idx])
+        assert_array_equal(y_resampled[idx], y_gt[idx])
+        assert_array_equal(idx_under[idx], idx_gt[idx])
+
+
+def test_give_classifier_wrong_obj():
+    """Test that an error is raised when a wrong object is passed."""
+
+    # Define the ratio parameter
+    ratio = 'auto'
+    classifier = 2
+
+    # Create the sampling object
+    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
+                        return_indices=True, estimator=classifier)
+
+    # Get the different subset
+    assert_raises(ValueError, bc.fit_sample, X, Y)
+
+
+def test_rf_without_bootstrap():
+    """Test the fit and sample routine with auto ratio and a random
+    forest, without bootstrapping."""
+
+    # Define the ratio parameter
+    ratio = 'auto'
+    classifier = RandomForestClassifier(random_state=RND_SEED)
+
+    # Create the sampling object
+    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
+                        return_indices=True, estimator=classifier,
+                        bootstrap=False)
+
+    # Get the different subset
+    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
+
+    X_gt = np.array([np.array([[0.11622591, -0.0317206],
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [0.08711622, 0.93259929],
+                               [0.70472253, -0.73309052],
+                               [-0.14374509, 0.27370049],
+                               [0.83680821, 1.72827342],
+                               [-0.18410027, -0.45194484],
+                               [-0.28162401, -2.10400981],
+                               [-1.11515198, -0.93689695]]),
+                     np.array([[0.11622591, -0.0317206],
+                               [1.25192108, -0.22367336],
+                               [0.53366841, -0.30312976],
+                               [1.52091956, -0.49283504],
+                               [0.88407872, 0.35454207],
+                               [1.31301027, -0.92648734],
+                               [-0.41635887, -0.38299653],
+                               [1.70580611, -0.11219234],
+                               [1.15514042, 0.0129463],
+                               [0.77481731, 0.60935141],
+                               [0.3084254, 0.33299982],
+                               [0.28893132, -0.38761769],
+                               [0.9281014, 0.53085498]])], dtype=object)
+    y_gt = np.array([np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+                               1, 1]),
+                     np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])],
+                    dtype=object)
+    idx_gt = np.array([np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6,
+                                 14, 5, 13]),
+                       np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9,
+                                 15])], dtype=object)
+
+    # Check each array
+    for idx in range(X_gt.size):
+        assert_array_equal(X_resampled[idx], X_gt[idx])
+        assert_array_equal(y_resampled[idx], y_gt[idx])
+        assert_array_equal(idx_under[idx], idx_gt[idx])
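The tests above pin down the new public API of `BalanceCascade`. For reference, a minimal usage sketch of the call they exercise (assumes some `X`, `y` arrays such as the module-level fixtures in these tests; not part of the diff):

    from sklearn.ensemble import RandomForestClassifier
    from imblearn.ensemble import BalanceCascade

    bc = BalanceCascade(estimator=RandomForestClassifier(random_state=0),
                        return_indices=True, random_state=0)
    X_res, y_res, idx_res = bc.fit_sample(X, y)
    # idx_res now holds, for every subset, indices into the original X/y,
    # thanks to the idx_maj[idx_sel_from_maj] mapping introduced above.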
diff --git a/imblearn/under_sampling/instance_hardness_threshold.py b/imblearn/under_sampling/instance_hardness_threshold.py
index 321fd31ee..c435e952f 100644
--- a/imblearn/under_sampling/instance_hardness_threshold.py
+++ b/imblearn/under_sampling/instance_hardness_threshold.py
@@ -2,15 +2,19 @@
 threshold."""
 from __future__ import division, print_function
 
+import warnings
+
 from collections import Counter
 
 import numpy as np
+
+from sklearn.base import ClassifierMixin
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.cross_validation import StratifiedKFold
 
-from ..base import BaseBinarySampler
+from six import string_types
 
-ESTIMATOR_KIND = ('knn', 'decision-tree', 'random-forest', 'adaboost',
-                  'gradient-boosting', 'linear-svm')
+from ..base import BaseBinarySampler
 
 
 class InstanceHardnessThreshold(BaseBinarySampler):
@@ -19,11 +23,17 @@ class InstanceHardnessThreshold(BaseBinarySampler):
 
     Parameters
     ----------
-    estimator : str, optional (default='linear-svm')
-        Classifier to be used in to estimate instance hardness of the samples.
-        The choices are the following: 'knn',
+    estimator : object, optional (default=RandomForestClassifier())
+        Classifier to be used to estimate the instance hardness of the
+        samples. By default, a RandomForestClassifier will be used.
+        If str, the following choices are available: 'knn',
         'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting'
         and 'linear-svm'.
+        If object, an estimator inherited from `sklearn.base.ClassifierMixin`
+        and having an attribute `predict_proba`.
+
+        NOTE: `estimator` as a string is deprecated since 0.2 and will be
+        removed in 0.4. Use a `ClassifierMixin` object instead.
 
     ratio : str or float, optional (default='auto')
         If 'auto', the ratio will be defined automatically to balance
@@ -47,6 +57,13 @@ class InstanceHardnessThreshold(BaseBinarySampler):
     n_jobs : int, optional (default=1)
         The number of threads to open if possible.
 
+    **kwargs : keywords
+        The parameters associated with the classifier provided.
+
+        NOTE: `**kwargs` is deprecated since 0.2 and will be removed in
+        0.4. Pass the parameters directly to the `ClassifierMixin` object
+        instead.
+
     Attributes
     ----------
     min_c_ : str or int
@@ -96,17 +113,91 @@ class InstanceHardnessThreshold(BaseBinarySampler):
 
     """
 
-    def __init__(self, estimator='linear-svm', ratio='auto',
-                 return_indices=False, random_state=None, cv=5, n_jobs=1,
-                 **kwargs):
+    def __init__(self, estimator=None, ratio='auto', return_indices=False,
+                 random_state=None, cv=5, n_jobs=1, **kwargs):
         super(InstanceHardnessThreshold, self).__init__(
             ratio=ratio, random_state=random_state)
         self.estimator = estimator
         self.return_indices = return_indices
-        self.kwargs = kwargs
         self.cv = cv
         self.n_jobs = n_jobs
+        self.kwargs = kwargs
+
+    def _validate_estimator(self):
+        """Private function to create the classifier."""
+
+        if (self.estimator is not None and
+                isinstance(self.estimator, ClassifierMixin) and
+                hasattr(self.estimator, 'predict_proba')):
+            self.estimator_ = self.estimator
+        elif self.estimator is None:
+            self.estimator_ = RandomForestClassifier(
+                random_state=self.random_state, n_jobs=self.n_jobs)
+        # To be removed in 0.4
+        elif (self.estimator is not None and
+              isinstance(self.estimator, string_types)):
+            # Select the appropriate classifier
+            warnings.warn('`estimator` given as a string will be removed in'
+                          ' version 0.4. Use a classifier object instead of'
+                          ' a string.',
+                          DeprecationWarning)
+            if self.estimator == 'knn':
+                from sklearn.neighbors import KNeighborsClassifier
+                self.estimator_ = KNeighborsClassifier(
+                    **self.kwargs)
+            elif self.estimator == 'decision-tree':
+                from sklearn.tree import DecisionTreeClassifier
+                self.estimator_ = DecisionTreeClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'random-forest':
+                self.estimator_ = RandomForestClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'adaboost':
+                from sklearn.ensemble import AdaBoostClassifier
+                self.estimator_ = AdaBoostClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'gradient-boosting':
+                from sklearn.ensemble import GradientBoostingClassifier
+                self.estimator_ = GradientBoostingClassifier(
+                    random_state=self.random_state,
+                    **self.kwargs)
+            elif self.estimator == 'linear-svm':
+                from sklearn.svm import SVC
+                self.estimator_ = SVC(probability=True,
+                                      random_state=self.random_state,
+                                      kernel='linear',
+                                      **self.kwargs)
+            else:
+                raise NotImplementedError
+        else:
+            raise ValueError('Invalid parameter `estimator`')
+
+    def fit(self, X, y):
+        """Find the class statistics before performing the sampling.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : ndarray, shape (n_samples, )
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        self : object,
+            Return self.
+
+        """
+
+        super(InstanceHardnessThreshold, self).fit(X, y)
+
+        self._validate_estimator()
+
+        return self
 
     def _sample(self, X, y):
         """Resample the dataset.
@@ -133,43 +224,6 @@ def _sample(self, X, y):
 
         """
 
-        if self.estimator not in ESTIMATOR_KIND:
-            raise NotImplementedError
-
-        # Select the appropriate classifier
-        if self.estimator == 'knn':
-            from sklearn.neighbors import KNeighborsClassifier
-            estimator = KNeighborsClassifier(
-                **self.kwargs)
-        elif self.estimator == 'decision-tree':
-            from sklearn.tree import DecisionTreeClassifier
-            estimator = DecisionTreeClassifier(
-                random_state=self.random_state,
-                **self.kwargs)
-        elif self.estimator == 'random-forest':
-            from sklearn.ensemble import RandomForestClassifier
-            estimator = RandomForestClassifier(
-                random_state=self.random_state,
-                **self.kwargs)
-        elif self.estimator == 'adaboost':
-            from sklearn.ensemble import AdaBoostClassifier
-            estimator = AdaBoostClassifier(
-                random_state=self.random_state,
-                **self.kwargs)
-        elif self.estimator == 'gradient-boosting':
-            from sklearn.ensemble import GradientBoostingClassifier
-            estimator = GradientBoostingClassifier(
-                random_state=self.random_state,
-                **self.kwargs)
-        elif self.estimator == 'linear-svm':
-            from sklearn.svm import SVC
-            estimator = SVC(probability=True,
-                            random_state=self.random_state,
-                            kernel='linear',
-                            **self.kwargs)
-        else:
-            raise NotImplementedError
-
         # Create the different folds
         skf = StratifiedKFold(y, n_folds=self.cv, shuffle=False,
                               random_state=self.random_state)
@@ -180,10 +234,10 @@ def _sample(self, X, y):
             X_train, X_test = X[train_index], X[test_index]
             y_train, y_test = y[train_index], y[test_index]
 
-            estimator.fit(X_train, y_train)
+            self.estimator_.fit(X_train, y_train)
 
-            probs = estimator.predict_proba(X_test)
-            classes = estimator.classes_
+            probs = self.estimator_.predict_proba(X_test)
+            classes = self.estimator_.classes_
             probabilities[test_index] = [
                 probs[l, np.where(classes == c)[0][0]]
                 for l, c in enumerate(y_test)]
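With the classifier construction moved into `_validate_estimator`, the `_sample` body keeps only the cross-validated hardness estimation. A minimal standalone sketch of that loop, using the same pre-0.18 `sklearn.cross_validation` API as the patched module (the helper name is illustrative, not part of the patch):

    import numpy as np
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.ensemble import RandomForestClassifier

    def own_class_probabilities(X, y, estimator, cv=5):
        """Out-of-fold probability assigned to each sample's own class."""
        probabilities = np.zeros(y.shape[0], dtype=float)
        for train_index, test_index in StratifiedKFold(y, n_folds=cv):
            estimator.fit(X[train_index], y[train_index])
            probs = estimator.predict_proba(X[test_index])
            classes = estimator.classes_
            probabilities[test_index] = [
                probs[i, np.where(classes == c)[0][0]]
                for i, c in enumerate(y[test_index])]
        # A low value marks a "hard" sample; the sampler drops the hardest
        # majority-class samples until the requested ratio is reached.
        return probabilities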
diff --git a/imblearn/under_sampling/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/tests/test_instance_hardness_threshold.py
index 02a057d4f..e423e1527 100644
--- a/imblearn/under_sampling/tests/test_instance_hardness_threshold.py
+++ b/imblearn/under_sampling/tests/test_instance_hardness_threshold.py
@@ -8,6 +8,7 @@
                            assert_warns)
 from sklearn.datasets import make_classification
 from sklearn.utils.estimator_checks import check_estimator
+from sklearn.ensemble import GradientBoostingClassifier
 
 from imblearn.under_sampling import InstanceHardnessThreshold
 
@@ -386,3 +387,39 @@ def test_multiclass_error():
     y = np.array([0] * 10 + [1] * 3 + [2] *2)
     iht = InstanceHardnessThreshold(random_state=RND_SEED)
     assert_warns(UserWarning, iht.fit, X, y)
+
+
+def test_iht_fit_sample_class_obj():
+    """Test the fit and sample routine passing a ClassifierMixin object."""
+
+    # Resample the data
+    est = GradientBoostingClassifier(random_state=RND_SEED)
+    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    X_gt = np.array([[-0.3879569, 0.6894251],
+                     [-0.09322739, 1.28177189],
+                     [-0.77740357, 0.74097941],
+                     [0.91542919, -0.65453327],
+                     [-0.43877303, 1.07366684],
+                     [-0.85795321, 0.82980738],
+                     [-0.18430329, 0.52328473],
+                     [-0.65571327, 0.42412021],
+                     [-0.28305528, 0.30284991],
+                     [1.06446472, -1.09279772],
+                     [0.30543283, -0.02589502],
+                     [-0.00717161, 0.00318087]])
+    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_iht_fit_sample_wrong_class_obj():
+    """Test that an error is raised when passing a wrong classifier
+    object."""
+
+    # Resample the data
+    from sklearn.cluster import KMeans
+    est = KMeans()
+    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
+    assert_raises(ValueError, iht.fit_sample, X, Y)
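For reference, the new-style call these tests lock in, as a minimal usage sketch (assumes toy `X`, `y` arrays; any classifier exposing `predict_proba` should work, which is why the `KMeans` object above is rejected):

    from sklearn.ensemble import GradientBoostingClassifier
    from imblearn.under_sampling import InstanceHardnessThreshold

    iht = InstanceHardnessThreshold(
        estimator=GradientBoostingClassifier(random_state=0),
        random_state=0)
    X_res, y_res = iht.fit_sample(X, y)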