refactoring

dengemann · dengemann · commit 53dddf8f9dcf · 2017-03-09T18:05:07.000+01:00
diff --git a/stlearn/stacking.py b/stlearn/stacking.py
@@ -3,33 +3,41 @@
 Prediction stacking API
 """
 # Author: Mehdi Rahim <rahim.mehdi@gmail.com>
+#         Denis A. Engemann <denis.engemann@gmail.com>
 #
 # License: BSD 3 clause
 
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
 from sklearn.metrics import accuracy_score
-from sklearn.externals.joblib import Memory, Parallel, delayed
+from sklearn.externals.joblib import Parallel, delayed
 
 
 def stack_features(X):
     """Stack features from sources
 
-    Parameters
-    ----------
-    X : a list of 2d matrices
-
-    Returns
-    -------
-    Xstacked : shape = (n_samples, n_features*n_sources) stacked 2d matrix
-    features_indices : shape = (n_sources, ) list of indices
+    Parameters:
+    -----------
+    X : list of array-like (n_samples, n_features)
+        The data to be used as source for each estimator. The first
+        dataset corresponds to the first estimator.
+
+    Returns:
+    --------
+    X_stacked : array, (n_samples, n_features)
+        The stacked data, such that the number of features corresponds
+        to the sum of the number of features in each source.
+
+    features_indices : list of indexers
+        Index expressions to be applied on the columns of X_stacked.
+        Can be slices, lists of integers or bool.
     """
     X_stacked = np.hstack(X)
 
     features_markers = np.r_[0, np.cumsum([x.shape[1] for x in X])]
     feature_indices = [slice(features_markers[i],
-                             features_markers[i+1])
-                       for i in range(len(features_markers)-1)]
+                             features_markers[i + 1])
+                       for i in range(len(features_markers) - 1)]
 
     return X_stacked, feature_indices
 
@@ -66,23 +74,28 @@ def _predict_proba_estimator(clf, X):
 
 def _check_Xy(stacking, X, y=None):
     """check dimensions"""
-    if np.ndim(X) != 3:
-        raise ValueError(
-            'X must be 3 dimensional, your X has %d dimensions' % np.ndim(X))
-    expected_n_sources = len(stacking.estimators)
-    if expected_n_sources != np.asarray(X).shape[0]:
-        raise ValueError(
-            'The first axis of X (%d) should match the '
-            'number of estimators (%d)' % (
-                X.shape[0],
-                len(stacking.estimators)))
-    if y is not None:
-        if len(y) != np.asarray(X).shape[1]:
-            raise ValueError(
-                'The second axis of X (%d) should match the '
-                'number of samples (%d)' % (
-                    X.shape[1],
-                    len(stacking.estimators)))
+    if np.ndim(X) != 2:
+        raise ValueError('X_stacked must be a 2D array')
+
+    for ii, feat_inds in enumerate(stacking.feature_indices):
+        if not isinstance(X, np.ndarray):
+            raise ValueError('You have something else than an array in X[%d]'
+                             % ii)
+        if isinstance(feat_inds, (list, tuple, np.ndarray)):
+            this_max = np.max(feat_inds)
+            this_min = abs(np.min(feat_inds))
+            if this_max >= X.shape[1] or this_min > X.shape[1]:
+                raise ValueError('On source %s your indexer is out of bound'
+                                 % ii)
+        elif isinstance(feat_inds, slice):
+            # unset bounds get placeholders (0 / -1) that pass the check
+            start = 0 if feat_inds.start is None else feat_inds.start
+            stop = -1 if feat_inds.stop is None else feat_inds.stop
+            if (start >= X.shape[1] or abs(stop) > X.shape[1]):
+                # `raise` is required: a bare ValueError(...) expression
+                # only builds the exception and validation passes silently
+                raise ValueError(
+                    'Your slices are bad and generate empty views')
 
 
 class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
@@ -93,25 +106,29 @@ class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
     estimators : list of Estimator objects compatible with scikit-learn
         The estimators to be used with each source of inputs. Length must match
         the firt dimensions of X.
+
     stacking_estimator : Estimator objects compatible with scikit-learn
         The estimator used to integrate the predictions of the estimators.
-    memory : joblib memory object | None
-        The caching configuration. Defaults to `Memory(cachedir=None)`.
-    memory_level : int (defaults to 0)
-        The memory level used for caching.
+
+    feature_indices : list of indexers
+        Index expressions to be applied on the columns of X_stacked.
+        Can be slices, lists of integers or bool.
     """
 
-    def __init__(self, estimators=None,
-                 stacking_estimator=None,
-                 feature_indices=None,
-                 memory=Memory(cachedir=None), memory_level=0,
+    def __init__(self, estimators,
+                 stacking_estimator,
+                 feature_indices,
                  n_jobs=1):
 
+        if len(estimators) != len(feature_indices):
+            raise ValueError('The estimators and feature indices must be of '
+                             'the same length')
+
+        if len(set(estimators)) < len(estimators):
+            raise ValueError('Estimators must be independent')
         self.estimators = estimators
         self.stacking_estimator = stacking_estimator
         self.feature_indices = feature_indices
-        self.memory = memory
-        self.memory_level = memory_level
         self.n_jobs = n_jobs
 
     def fit(self, X, y):
@@ -127,8 +144,9 @@ def fit(self, X, y):
             Target vector relative to X.
         """
 
+        _check_Xy(self, X, y)
         X_list = _split_features(X, self.feature_indices)
-        _check_Xy(self, X_list, y)
+
         self.estimators = Parallel(n_jobs=self.n_jobs)(
             delayed(_fit_estimator)(clf, x, y)
             for x, clf in zip(X_list, self.estimators))
@@ -154,8 +172,8 @@ def predict(self, X):
         C : array, shape = (n_samples)
             Predicted class label per sample.
         """
+        _check_Xy(self, X)
         X_list = _split_features(X, self.feature_indices)
-        _check_Xy(self, X_list)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_proba_estimator)(clf, x)
             for x, clf in zip(X_list, self.estimators))
@@ -199,8 +217,8 @@ def predict_estimators(self, X):
         C : array, shape = (n_samples, n_estimators)
             Predicted class label per sample and estimators.
         """
+        _check_Xy(self, X)
         X_list = _split_features(X, self.feature_indices)
-        _check_Xy(self, X_list)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_estimator)(clf, x)
             for x, clf in zip(X_list, self.estimators))
diff --git a/stlearn/tests/test_stacking.py b/stlearn/tests/test_stacking.py
@@ -6,42 +6,83 @@
 
 from sklearn.datasets import make_classification
 from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import ShuffleSplit
 from stlearn import StackingClassifier
+from stlearn import stack_features
 
 n_samples = 200
-n_estimators = 2
+n_estimators = 3
 X0, y = make_classification(n_samples=200, random_state=42)
-X1 = X0 ** 2
-X = np.c_[X0, X1]
+# let's say we take some columns and make them non-linear
+X1 = X0[:, :10] ** 2
+X2 = X0[:, 10:15] ** 2
 
-ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
+X = [X0, X1, X2]
+X_stacked, feature_indices = stack_features(X)
+
+
+def test_stack_features():
+    """Test stacking features"""
+    X0 = np.array([[1, 2], [3, 4]])
+    X1 = np.array([[1, 2, 4], [3, 4, 5]])
+    X = [X0, X1]
+    X_stacked, features_indices = stack_features(X)
+    assert_equal(np.size(X_stacked),
+                 np.size(X0) + np.size(X1))
+    assert_equal(len(features_indices), len(X))
+    assert_equal(X_stacked.shape, (2, 5))
 
 
 def test_stacking_essentials():
     """Test initializaing and essential basic function"""
-    stacking = StackingClassifier(
+
+    # check inputs
+    stacking = assert_raises(
+        ValueError, StackingClassifier,
+        estimators=2 * [LogisticRegression()],
+        feature_indices=feature_indices,
+        stacking_estimator=LogisticRegression())
+
+    stacking = assert_raises(
+        ValueError, StackingClassifier,
         estimators=n_estimators * [LogisticRegression()],
+        feature_indices=feature_indices[:2],
+        stacking_estimator=LogisticRegression())
+
+    # test stacking classifier
+    stacking = StackingClassifier(
+        estimators=[LogisticRegression() for _ in range(3)],
+        feature_indices=feature_indices,
         stacking_estimator=LogisticRegression())
-    # assert_equal(getattr(stacking, 'predictions_', None), None)
+
     assert_equal(stacking.stacking_estimator.__class__,
                  LogisticRegression)
     assert_equal([ee.__class__ for ee in stacking.estimators],
                  n_estimators * [LogisticRegression])
-    assert_raises(ValueError, stacking.fit, X[0], y)
-    assert_raises(ValueError, stacking.fit, X[:1], y)
-    assert_raises(ValueError, stacking.fit, X[:, :1], y)
 
-    stacking.fit(X, y)
+    stacking.fit(X_stacked, y)
 
-    predictions = stacking.predict(X)
+    predictions = stacking.predict(X_stacked)
     assert_array_equal(np.unique(predictions), np.array([0, 1]))
 
-    score = stacking.score(X, y)
+    score = stacking.score(X_stacked, y)
     assert_true(np.isscalar(score))
 
-    predictions_estimators = stacking.predict_estimators(X)
+    predictions_estimators = stacking.predict_estimators(X_stacked)
     assert_array_equal(
         predictions_estimators.shape, (n_samples, n_estimators))
-    scores_estimators = stacking.score_estimators(X, y)
+    scores_estimators = stacking.score_estimators(X_stacked, y)
     assert_equal(len(scores_estimators), n_estimators)
+
+    assert_raises(ValueError, stacking.fit, X, y)
+    stacking = StackingClassifier(
+        estimators=[LogisticRegression() for _ in range(3)],
+        feature_indices=[np.array([-500]), np.array([1]), np.array([2])],
+        stacking_estimator=LogisticRegression())
+
+    assert_raises(ValueError, stacking.fit, X_stacked, y)
+
+    stacking = StackingClassifier(
+        estimators=[LogisticRegression() for _ in range(3)],
+        feature_indices=[slice(5000, -5000), slice(1, 10), slice(20)],
+        stacking_estimator=LogisticRegression())
+    assert_raises(ValueError, stacking.fit, X_stacked, y)