Commit a4947ea

[MRG] EHN add BalancedBaggingClassifier (scikit-learn-contrib#315)

* EHN add BalancedBaggingClassifier
* TST add two missing tests
* DOC add examples
* FIX not passing sample_weight at fit
* DOC add api documentation
* DOC fix docstring
* iter
* DOC fix docstring
* DOC add user guide entry and cross-referencing
* FIX mv into a new module
* FIX add missing dependency

1 parent 2e7c070 commit a4947ea

File tree: 10 files changed, +898 −7 lines changed

doc/api.rst
Lines changed: 1 addition & 0 deletions

@@ -109,6 +109,7 @@ Prototype selection
    :template: class.rst

    ensemble.BalanceCascade
+   ensemble.BalancedBaggingClassifier
    ensemble.EasyEnsemble
doc/ensemble.rst
Lines changed: 56 additions & 0 deletions

@@ -6,6 +6,11 @@ Ensemble of samplers

 .. currentmodule:: imblearn.ensemble

+.. _ensemble_samplers:
+
+Samplers
+--------
+
 An imbalanced data set can be balanced by creating several balanced
 subsets. The module :mod:`imblearn.ensemble` allows creating such sets.

@@ -54,3 +59,54 @@ parameter ``n_max_subset``, and additional bootstrapping can be activated with
 See
 :ref:`sphx_glr_auto_examples_ensemble_plot_easy_ensemble.py` and
 :ref:`sphx_glr_auto_examples_ensemble_plot_balance_cascade.py`.
+
+.. _ensemble_meta_estimators:
+
+Chaining ensemble of samplers and estimators
+--------------------------------------------
+
+In ensemble classifiers, bagging methods build several estimators on
+different randomly selected subsets of data. In scikit-learn, this
+classifier is named ``BaggingClassifier``. However, it does not allow
+balancing each subset of data. Therefore, when training on an imbalanced
+data set, this classifier will favor the majority classes::
+
+  >>> from sklearn.model_selection import train_test_split
+  >>> from sklearn.metrics import confusion_matrix
+  >>> from sklearn.ensemble import BaggingClassifier
+  >>> from sklearn.tree import DecisionTreeClassifier
+  >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+  >>> bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
+  ...                        random_state=0)
+  >>> bc.fit(X_train, y_train)  # doctest: +ELLIPSIS
+  BaggingClassifier(...)
+  >>> y_pred = bc.predict(X_test)
+  >>> confusion_matrix(y_test, y_pred)
+  array([[   0,    0,   12],
+         [   0,    0,   59],
+         [   0,    0, 1179]])
+
+:class:`BalancedBaggingClassifier` allows resampling each subset of data
+before training each estimator of the ensemble. In short, it combines the
+output of an :class:`EasyEnsemble` sampler with an ensemble of classifiers
+(i.e. ``BaggingClassifier``). Therefore, :class:`BalancedBaggingClassifier`
+takes the same parameters as the scikit-learn ``BaggingClassifier``. In
+addition, there are two extra parameters, ``ratio`` and ``replacement``, as
+in the :class:`EasyEnsemble` sampler::
+
+  >>> from imblearn.ensemble import BalancedBaggingClassifier
+  >>> bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
+  ...                                 ratio='auto',
+  ...                                 replacement=False,
+  ...                                 random_state=0)
+  >>> bbc.fit(X, y)  # doctest: +ELLIPSIS
+  BalancedBaggingClassifier(...)
+  >>> y_pred = bbc.predict(X_test)
+  >>> confusion_matrix(y_test, y_pred)
+  array([[  12,    0,    0],
+         [   0,   55,    4],
+         [  68,   53, 1058]])
+
+See
+:ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
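
The doctests above reuse ``X`` and ``y`` defined earlier in the user guide
page. (Note also that the second doctest fits on the full ``X, y`` and then
evaluates on ``X_test``, so its confusion matrix partly reflects training
data.) As a minimal sketch of a comparable setup — the ``make_classification``
parameters below are illustrative assumptions, not necessarily the guide's
own::

  # Hypothetical setup: an imbalanced 3-class problem similar in spirit
  # to the one the user guide constructs before these doctests.
  from sklearn.datasets import make_classification

  X, y = make_classification(n_samples=5000, n_classes=3,
                             n_informative=3, n_redundant=1,
                             n_clusters_per_class=1,
                             weights=[0.01, 0.05, 0.94],
                             random_state=0)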

doc/whats_new.rst
Lines changed: 4 additions & 0 deletions

@@ -53,6 +53,10 @@ New features
 Enhancement
 ~~~~~~~~~~~

+- Add :class:`ensemble.BalancedBaggingClassifier`, a meta-estimator that
+  chains the :class:`ensemble.EasyEnsemble` sampler with a classifier. By
+  `Guillaume Lemaitre`_.
+
 - All samplers accept sparse matrices, defaulting to the CSR format. By
   `Guillaume Lemaitre`_.

examples/ensemble/plot_comparison_bagging_classifier.py (new file)
Lines changed: 104 additions & 0 deletions

"""
=========================================================
Comparison of balanced and imbalanced bagging classifiers
=========================================================

This example shows the benefit of balancing the training set when using a
bagging classifier. ``BalancedBaggingClassifier`` chains a
``RandomUnderSampler`` and a given classifier, while ``BaggingClassifier``
uses the imbalanced data directly.

Balancing the data set before training the classifier improves the
classification performance. In addition, it keeps the ensemble from focusing
on the majority class, a known drawback of decision-tree classifiers.

"""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

from collections import Counter
import itertools

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier

from imblearn.metrics import classification_report_imbalanced


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix.

    Normalization can be applied by setting ``normalize=True``.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate each cell with its count, switching text color for contrast
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Build an imbalanced version of the iris data set
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 40, 2: 50},
                      random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=iris.target_names,
                      title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=iris.target_names,
                      title='Confusion matrix using BalancedBaggingClassifier')

plt.show()
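
For intuition, each estimator in the balanced ensemble is fit on a resampled
subset of the training data. A rough sketch of that behaviour follows; this
is not the library's implementation (which extends scikit-learn's
``BaggingClassifier`` machinery), the helper names are hypothetical, and
``fit_sample`` reflects the imblearn API of this era::

  # Simplified sketch, NOT the actual implementation: each base estimator
  # sees a balanced undersample of the training set, then predictions are
  # combined by majority vote. Helper names here are hypothetical.
  import numpy as np
  from sklearn.tree import DecisionTreeClassifier
  from imblearn.under_sampling import RandomUnderSampler


  def fit_balanced_bag(X, y, n_estimators=10, random_state=0):
      """Fit one tree per balanced resample of the training set."""
      rng = np.random.RandomState(random_state)
      estimators = []
      for _ in range(n_estimators):
          # Undersample the majority classes before fitting each member
          sampler = RandomUnderSampler(random_state=rng.randint(2 ** 31))
          X_res, y_res = sampler.fit_sample(X, y)
          tree = DecisionTreeClassifier(random_state=rng.randint(2 ** 31))
          estimators.append(tree.fit(X_res, y_res))
      return estimators


  def predict_majority(estimators, X):
      """Majority vote across ensemble members."""
      votes = np.stack([est.predict(X) for est in estimators])
      return np.apply_along_axis(
          lambda col: np.bincount(col).argmax(), axis=0, arr=votes)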

imblearn/ensemble/__init__.py
Lines changed: 3 additions & 1 deletion

@@ -6,4 +6,6 @@
 from .easy_ensemble import EasyEnsemble
 from .balance_cascade import BalanceCascade

-__all__ = ['EasyEnsemble', 'BalanceCascade']
+from .classifier import BalancedBaggingClassifier
+
+__all__ = ['EasyEnsemble', 'BalancedBaggingClassifier', 'BalanceCascade']
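
With this export in place, the new class is importable directly from the
subpackage; a quick smoke test, consistent with the doctests above::

  >>> from imblearn.ensemble import BalancedBaggingClassifier
  >>> BalancedBaggingClassifier(random_state=0)  # doctest: +ELLIPSIS
  BalancedBaggingClassifier(...)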

imblearn/ensemble/balance_cascade.py
Lines changed: 2 additions & 2 deletions

@@ -27,7 +27,7 @@ class BalanceCascade(BaseEnsembleSampler):
     This method iteratively selects a subset and makes an ensemble of the
     different sets. The selection is performed using a specific classifier.

-    Read more in the :ref:`User Guide <ensemble>`.
+    Read more in the :ref:`User Guide <ensemble_samplers>`.

     Parameters
     ----------
@@ -99,7 +99,7 @@ class BalanceCascade(BaseEnsembleSampler):

     See also
     --------
-    EasyEnsemble
+    BalancedBaggingClassifier, EasyEnsemble

     References
     ----------
