Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] Rename all occurrences of size_ngh to n_neighbors for consistency with scikit-learn #109

Merged
merged 5 commits into from
Oct 26, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ Changelog

Bug fixes
~~~~~~~~~

- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under-sampling for method 3. By `Guillaume Lemaitre`_.
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add additional stopping criterion to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
Expand All @@ -38,6 +37,18 @@ Enhancement
- Added support for bumpversion. By `Guillaume Lemaitre`_.
- Validate the type of target in binary samplers. A warning is raised for the moment. By `Guillaume Lemaitre`_ and `Christos Aridas`_.

New features
~~~~~~~~~~~~

- Added AllKNN under sampling technique.
- Added support for bumpversion.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `size_ngh` has been deprecated in :class:`combine.SMOTEENN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
- `size_ngh` has been deprecated in :class:`under_sampling.EditedNearestNeighbours`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.

Documentation changes
~~~~~~~~~~~~~~~~~~~~~

Expand Down
12 changes: 6 additions & 6 deletions examples/plot_unbalanced_dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -215,25 +215,25 @@
"NM3 = NearMiss(version=3)\n",
"nm3x, nm3y = NM3.fit_sample(x, y)\n",
"# 'Condensed Nearest Neighbour'\n",
"CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)\n",
"CNN = CondensedNearestNeighbour(n_neighbors=51, n_seeds_S=51)\n",
"cnnx, cnny = CNN.fit_sample(x, y)\n",
"# 'One-Sided Selection'\n",
"OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51)\n",
"OSS = OneSidedSelection(n_neighbors=51, n_seeds_S=51)\n",
"ossx, ossy = OSS.fit_sample(x, y)\n",
"# 'Neighbourhood Cleaning Rule'\n",
"NCR = NeighbourhoodCleaningRule(size_ngh=51)\n",
"NCR = NeighbourhoodCleaningRule(n_neighbors=51)\n",
"ncrx, ncry = NCR.fit_sample(x, y) \n",
"# 'Edited Nearest Neighbour'\n",
"ENN = EditedNearestNeighbours(size_ngh=51)\n",
"ENN = EditedNearestNeighbours(n_neighbors=51)\n",
"ennx, enny = ENN.fit_sample(x, y)\n",
"# 'Instance Hardness Threshold'\n",
"IHT = InstanceHardnessThreshold()\n",
"ihtx, ihty = IHT.fit_sample(x, y)\n",
"# 'Repeated Edited Nearest Neighbour'\n",
"RENN = RepeatedEditedNearestNeighbours(size_ngh=51)\n",
"RENN = RepeatedEditedNearestNeighbours(n_neighbors=51)\n",
"rennx, renny = RENN.fit_sample(x, y)\n",
"# 'AllKNN'\n",
"ALLK = AllKNN(size_ngh=51)\n",
"ALLK = AllKNN(n_neighbors=51)\n",
"allkx, allky = ALLK.fit_sample(x, y)\n",
"\n",
"# Apply PCA to be able to visualise the results\n",
Expand Down
17 changes: 16 additions & 1 deletion imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,12 @@ def fit(self, X, y):
if hasattr(self, 'ratio'):
self._validate_ratio()

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()

self.logger.info('Compute classes statistics ...')

# # Raise an error if there is only one class
# Raise an error if there is only one class
# if uniques.size == 1:
# raise RuntimeError("Only one class detected, aborting...")
# Raise a warning for the moment to be compatible with BaseEstimator
Expand Down Expand Up @@ -149,6 +152,9 @@ def sample(self, X, y):
if hasattr(self, 'ratio'):
self._validate_ratio()

if hasattr(self, 'size_ngh'):
self._validate_size_ngh_deprecation()

return self._sample(X, y)

def fit_sample(self, X, y):
Expand Down Expand Up @@ -190,6 +196,15 @@ def _validate_ratio(self):
else:
raise ValueError('Unknown parameter type for ratio.')

def _validate_size_ngh_deprecation(self):
        "Private function to warn about the deprecation of size_ngh."

# Announce deprecation if necessary
if self.size_ngh is not None:
warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
' `n_neighbors` instead.', DeprecationWarning)
self.n_neighbors = self.size_ngh

@abstractmethod
def _sample(self, X, y):
"""Resample the dataset.
Expand Down
22 changes: 20 additions & 2 deletions imblearn/combine/smote_enn.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Class to perform over-sampling using SMOTE and cleaning using ENN."""
from __future__ import division, print_function

import warnings

from ..base import BaseBinarySampler
from ..over_sampling import SMOTE
from ..under_sampling import EditedNearestNeighbours
Expand Down Expand Up @@ -40,7 +42,14 @@ class SMOTEENN(BaseBinarySampler):
The type of SMOTE algorithm to use one of the following
options: 'regular', 'borderline1', 'borderline2', 'svm'.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -103,7 +112,8 @@ class SMOTEENN(BaseBinarySampler):

def __init__(self, ratio='auto', random_state=None,
k=5, m=10, out_step=0.5, kind_smote='regular',
size_ngh=3, kind_enn='all', n_jobs=-1, **kwargs):
size_ngh=None, n_neighbors=3, kind_enn='all', n_jobs=-1,
**kwargs):

super(SMOTEENN, self).__init__(ratio=ratio)
self.random_state = random_state
Expand All @@ -112,6 +122,7 @@ def __init__(self, ratio='auto', random_state=None,
self.out_step = out_step
self.kind_smote = kind_smote
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_enn = kind_enn
self.n_jobs = n_jobs
self.kwargs = kwargs
Expand All @@ -121,6 +132,7 @@ def __init__(self, ratio='auto', random_state=None,
**self.kwargs)
self.enn = EditedNearestNeighbours(random_state=self.random_state,
size_ngh=self.size_ngh,
n_neighbors=self.n_neighbors,
kind_sel=self.kind_enn,
n_jobs=self.n_jobs)

Expand All @@ -144,6 +156,12 @@ def fit(self, X, y):

super(SMOTEENN, self).fit(X, y)

        # Announce deprecation if necessary
if self.size_ngh is not None:
warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
' `n_neighbors` instead.', DeprecationWarning)
self.n_neighbors = self.size_ngh

# Fit using SMOTE
self.sm.fit(X, y)

Expand Down
4 changes: 0 additions & 4 deletions imblearn/combine/smote_tomek.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ class SMOTETomek(BaseBinarySampler):
The type of SMOTE algorithm to use one of the following
options: 'regular', 'borderline1', 'borderline2', 'svm'

size_ngh : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

kind_sel : str, optional (default='all')
Strategy to use in order to exclude samples.

Expand Down
17 changes: 14 additions & 3 deletions imblearn/under_sampling/condensed_nearest_neighbour.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
method."""
from __future__ import division, print_function

import warnings

from collections import Counter

import numpy as np
Expand All @@ -27,7 +29,14 @@ class CondensedNearestNeighbour(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=1)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=1)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -86,12 +95,14 @@ class CondensedNearestNeighbour(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs):
size_ngh=None, n_neighbors=1, n_seeds_S=1, n_jobs=-1,
**kwargs):
super(CondensedNearestNeighbour, self).__init__()

self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.n_seeds_S = n_seeds_S
self.n_jobs = n_jobs
self.kwargs = kwargs
Expand Down Expand Up @@ -158,7 +169,7 @@ def _sample(self, X, y):
S_y = y[y == key]

# Create a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=self.size_ngh,
knn = KNeighborsClassifier(n_neighbors=self.n_neighbors,
n_jobs=self.n_jobs,
**self.kwargs)

Expand Down
47 changes: 37 additions & 10 deletions imblearn/under_sampling/edited_nearest_neighbours.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from collections import Counter

import warnings
import numpy as np
from scipy.stats import mode
from sklearn.neighbors import NearestNeighbors
Expand All @@ -29,7 +30,14 @@ class EditedNearestNeighbours(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -91,11 +99,12 @@ class EditedNearestNeighbours(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=3, kind_sel='all', n_jobs=-1):
size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1):
super(EditedNearestNeighbours, self).__init__()
self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_sel = kind_sel
self.n_jobs = n_jobs

Expand Down Expand Up @@ -140,7 +149,7 @@ def _sample(self, X, y):
idx_under = np.flatnonzero(y == self.min_c_)

# Create a k-NN to fit the whole data
nn_obj = NearestNeighbors(n_neighbors=self.size_ngh + 1,
nn_obj = NearestNeighbors(n_neighbors=self.n_neighbors + 1,
n_jobs=self.n_jobs)
# Fit the data
nn_obj.fit(X)
Expand Down Expand Up @@ -217,7 +226,14 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -283,18 +299,20 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
size_ngh=None, n_neighbors=3, max_iter=100, kind_sel='all',
n_jobs=-1):
super(RepeatedEditedNearestNeighbours, self).__init__()
self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_sel = kind_sel
self.n_jobs = n_jobs
self.max_iter = max_iter
self.enn_ = EditedNearestNeighbours(
return_indices=self.return_indices,
random_state=self.random_state,
size_ngh=self.size_ngh,
n_neighbors=self.n_neighbors,
kind_sel=self.kind_sel,
n_jobs=self.n_jobs)

Expand Down Expand Up @@ -441,7 +459,14 @@ class AllKNN(BaseMulticlassSampler):
If None, the random number generator is the RandomState instance used
by np.random.

size_ngh : int, optional (default=3)
size_ngh : int, optional (default=None)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
Use ``n_neighbors`` instead.

n_neighbors : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

Expand Down Expand Up @@ -503,17 +528,18 @@ class AllKNN(BaseMulticlassSampler):
"""

def __init__(self, return_indices=False, random_state=None,
size_ngh=3, kind_sel='all', n_jobs=-1):
size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1):
super(AllKNN, self).__init__()
self.return_indices = return_indices
self.random_state = random_state
self.size_ngh = size_ngh
self.n_neighbors = n_neighbors
self.kind_sel = kind_sel
self.n_jobs = n_jobs
self.enn_ = EditedNearestNeighbours(
return_indices=self.return_indices,
random_state=self.random_state,
size_ngh=self.size_ngh,
n_neighbors=self.n_neighbors,
kind_sel=self.kind_sel,
n_jobs=self.n_jobs)

Expand Down Expand Up @@ -572,10 +598,11 @@ def _sample(self, X, y):
if self.return_indices:
idx_under = np.arange(X.shape[0], dtype=int)

for curr_size_ngh in range(1, self.size_ngh + 1):
for curr_size_ngh in range(1, self.n_neighbors + 1):
self.logger.debug('Apply ENN size_ngh #%s', curr_size_ngh)
# updating ENN size_ngh
self.enn_.size_ngh = curr_size_ngh

if self.return_indices:
X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_)
else:
Expand Down
Loading