Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] Rename all occurrences of size_ngh to n_neighbors for consistency with scikit-learn #109

Merged
merged 5 commits into from
Oct 26, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ Changelog

Bug fixes
~~~~~~~~~

- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under-sampling for method 3. By `Guillaume Lemaitre`_.
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add additional stopping criterion to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
Expand All @@ -38,6 +37,18 @@ Enhancement
- Added support for bumpversion. By `Guillaume Lemaitre`_.
- Validate the type of target in binary samplers. A warning is raised for the moment. By `Guillaume Lemaitre`_ and `Christos Aridas`_.

New features
~~~~~~~~~~~~

- Added AllKNN under sampling technique.
- Added support for bumpversion.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `size_ngh` has been deprecated in :class:`combine.SMOTEENN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
- `size_ngh` has been deprecated in :class:`under_sampling.EditedNearestNeighbours`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.

Documentation changes
~~~~~~~~~~~~~~~~~~~~~

Expand Down
12 changes: 6 additions & 6 deletions examples/plot_unbalanced_dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -215,25 +215,25 @@
"NM3 = NearMiss(version=3)\n",
"nm3x, nm3y = NM3.fit_sample(x, y)\n",
"# 'Condensed Nearest Neighbour'\n",
"CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)\n",
"CNN = CondensedNearestNeighbour(n_neighbors=51, n_seeds_S=51)\n",
"cnnx, cnny = CNN.fit_sample(x, y)\n",
"# 'One-Sided Selection'\n",
"OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51)\n",
"OSS = OneSidedSelection(n_neighbors=51, n_seeds_S=51)\n",
"ossx, ossy = OSS.fit_sample(x, y)\n",
"# 'Neighbourhood Cleaning Rule'\n",
"NCR = NeighbourhoodCleaningRule(size_ngh=51)\n",
"NCR = NeighbourhoodCleaningRule(n_neighbors=51)\n",
"ncrx, ncry = NCR.fit_sample(x, y) \n",
"# 'Edited Nearest Neighbour'\n",
"ENN = EditedNearestNeighbours(size_ngh=51)\n",
"ENN = EditedNearestNeighbours(n_neighbors=51)\n",
"ennx, enny = ENN.fit_sample(x, y)\n",
"# 'Instance Hardness Threshold'\n",
"IHT = InstanceHardnessThreshold()\n",
"ihtx, ihty = IHT.fit_sample(x, y)\n",
"# 'Repeated Edited Nearest Neighbour'\n",
"RENN = RepeatedEditedNearestNeighbours(size_ngh=51)\n",
"RENN = RepeatedEditedNearestNeighbours(n_neighbors=51)\n",
"rennx, renny = RENN.fit_sample(x, y)\n",
"# 'AllKNN'\n",
"ALLK = AllKNN(size_ngh=51)\n",
"ALLK = AllKNN(n_neighbors=51)\n",
"allkx, allky = ALLK.fit_sample(x, y)\n",
"\n",
"# Apply PCA to be able to visualise the results\n",
Expand Down
17 changes: 16 additions & 1 deletion imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,12 @@ def fit(self, X, y):
if hasattr(self, 'ratio'):
self._validate_ratio()

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()

self.logger.info('Compute classes statistics ...')

# # Raise an error if there is only one class
# Raise an error if there is only one class
# if uniques.size == 1:
# raise RuntimeError("Only one class detected, aborting...")
# Raise a warning for the moment to be compatible with BaseEstimator
Expand Down Expand Up @@ -149,6 +152,9 @@ def sample(self, X, y):
if hasattr(self, 'ratio'):
self._validate_ratio()

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()

return self._sample(X, y)

def fit_sample(self, X, y):
Expand Down Expand Up @@ -190,6 +196,15 @@ def _validate_ratio(self):
else:
raise ValueError('Unknown parameter type for ratio.')

def _validate_size_ngh_deprecation(self):
        "Private function to warn about the deprecation of size_ngh."

# Announce deprecation if necessary
if self.size_ngh is not None:
warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
' `n_neighbors` instead.', DeprecationWarning)
self.n_neighbors = self.size_ngh

@abstractmethod
def _sample(self, X, y):
"""Resample the dataset.
Expand Down
22 changes: 20 additions & 2 deletions imblearn/combine/smote_enn.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Class to perform over-sampling using SMOTE and cleaning using ENN."""
from __future__ import division, print_function

import warnings

from ..base import BaseBinarySampler
from ..over_sampling import SMOTE
from ..under_sampling import EditedNearestNeighbours
Expand Down Expand Up @@ -40,7 +42,14 @@ class SMOTEENN(BaseBinarySampler):
The type of SMOTE algorithm to use one of the following
options: 'regular', 'borderline1', 'borderline2', 'svm'.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -103,7 +112,8 @@ class SMOTEENN(BaseBinarySampler):

def __init__(self, ratio='auto', random_state=None,
k=5, m=10, out_step=0.5, kind_smote='regular',
size_ngh=3, kind_enn='all', n_jobs=-1, **kwargs):
size_ngh=None, n_neighbors=3, kind_enn='all', n_jobs=-1,
**kwargs):

super(SMOTEENN, self).__init__(ratio=ratio)
self.random_state = random_state
Expand All @@ -112,6 +122,7 @@ def __init__(self, ratio='auto', random_state=None,
self.out_step = out_step
self.kind_smote = kind_smote
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_enn = kind_enn
self.n_jobs = n_jobs
self.kwargs = kwargs
Expand All @@ -121,6 +132,7 @@ def __init__(self, ratio='auto', random_state=None,
**self.kwargs)
self.enn = EditedNearestNeighbours(random_state=self.random_state,
size_ngh=self.size_ngh,
n_neighbors=self.n_neighbors,
kind_sel=self.kind_enn,
n_jobs=self.n_jobs)

Expand All @@ -144,6 +156,12 @@ def fit(self, X, y):

super(SMOTEENN, self).fit(X, y)

        # Announce deprecation if necessary
if self.size_ngh is not None:
warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
' `n_neighbors` instead.', DeprecationWarning)
self.n_neighbors = self.size_ngh

# Fit using SMOTE
self.sm.fit(X, y)

Expand Down
4 changes: 0 additions & 4 deletions imblearn/combine/smote_tomek.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ class SMOTETomek(BaseBinarySampler):
The type of SMOTE algorithm to use one of the following
options: 'regular', 'borderline1', 'borderline2', 'svm'

size_ngh : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

kind_sel : str, optional (default='all')
Strategy to use in order to exclude samples.

Expand Down
17 changes: 14 additions & 3 deletions imblearn/under_sampling/condensed_nearest_neighbour.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
method."""
from __future__ import division, print_function

import warnings

from collections import Counter

import numpy as np
Expand All @@ -27,7 +29,14 @@ class CondensedNearestNeighbour(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=1)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=1)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -86,12 +95,14 @@ class CondensedNearestNeighbour(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs):
size_ngh=None, n_neighbors=1, n_seeds_S=1, n_jobs=-1,
**kwargs):
super(CondensedNearestNeighbour, self).__init__()

self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.n_seeds_S = n_seeds_S
self.n_jobs = n_jobs
self.kwargs = kwargs
Expand Down Expand Up @@ -158,7 +169,7 @@ def _sample(self, X, y):
S_y = y[y == key]

# Create a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=self.size_ngh,
knn = KNeighborsClassifier(n_neighbors=self.n_neighbors,
n_jobs=self.n_jobs,
**self.kwargs)

Expand Down
47 changes: 37 additions & 10 deletions imblearn/under_sampling/edited_nearest_neighbours.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from collections import Counter

import warnings
import numpy as np
from scipy.stats import mode
from sklearn.neighbors import NearestNeighbors
Expand All @@ -29,7 +30,14 @@ class EditedNearestNeighbours(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -91,11 +99,12 @@ class EditedNearestNeighbours(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=3, kind_sel='all', n_jobs=-1):
size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1):
super(EditedNearestNeighbours, self).__init__()
self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_sel = kind_sel
self.n_jobs = n_jobs

Expand Down Expand Up @@ -140,7 +149,7 @@ def _sample(self, X, y):
idx_under = np.flatnonzero(y == self.min_c_)

# Create a k-NN to fit the whole data
nn_obj = NearestNeighbors(n_neighbors=self.size_ngh + 1,
nn_obj = NearestNeighbors(n_neighbors=self.n_neighbors + 1,
n_jobs=self.n_jobs)
# Fit the data
nn_obj.fit(X)
Expand Down Expand Up @@ -217,7 +226,14 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -283,18 +299,20 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
size_ngh=None, n_neighbors=3, max_iter=100, kind_sel='all',
n_jobs=-1):
super(RepeatedEditedNearestNeighbours, self).__init__()
self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_sel = kind_sel
self.n_jobs = n_jobs
self.max_iter = max_iter
self.enn_ = EditedNearestNeighbours(
return_indices=self.return_indices,
random_state=self.random_state,
size_ngh=self.size_ngh,
n_neighbors=self.n_neighbors,
kind_sel=self.kind_sel,
n_jobs=self.n_jobs)

Expand Down Expand Up @@ -441,7 +459,14 @@ class AllKNN(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -503,17 +528,18 @@ class AllKNN(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=3, kind_sel='all', n_jobs=-1):
size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1):
super(AllKNN, self).__init__()
self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_sel = kind_sel
self.n_jobs = n_jobs
self.enn_ = EditedNearestNeighbours(
return_indices=self.return_indices,
random_state=self.random_state,
size_ngh=self.size_ngh,
n_neighbors=self.n_neighbors,
kind_sel=self.kind_sel,
n_jobs=self.n_jobs)

Expand Down Expand Up @@ -572,10 +598,11 @@ def _sample(self, X, y):
if self.return_indices:
idx_under = np.arange(X.shape[0], dtype=int)

for curr_size_ngh in range(1, self.size_ngh + 1):
for curr_size_ngh in range(1, self.n_neighbors + 1):
self.logger.debug('Apply ENN size_ngh #%s', curr_size_ngh)
# updating ENN size_ngh
self.enn_.size_ngh = curr_size_ngh

if self.return_indices:
X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_)
else:
Expand Down
Loading