reshaping stacking

mrahim · mrahim · commit 4da9b4173073 · 2017-03-09T15:16:54.000+01:00
diff --git a/examples/fmri_stacking.py b/examples/fmri_stacking.py
@@ -1,18 +1,22 @@
+import numpy as np
 from sklearn.datasets import make_classification
 from sklearn.linear_model import LogisticRegression, RidgeClassifier
 from sklearn.svm import LinearSVC
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import ShuffleSplit
-from stlearn import StackingClassifier
+from stlearn import StackingClassifier, stack_features
 
 n = 20
 X, y = make_classification(n_samples=200, random_state=42)
 ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
 
+X_stacked, features_indices = stack_features(n*[X])
+
 stacking = StackingClassifier(estimators=n*[LogisticRegression()],
-                              stacking_estimator=LogisticRegression())
+                              stacking_estimator=LogisticRegression(),
+                              feature_indices=features_indices)
 
-for train, test in ss.split(X):
-    stacking.fit(n*[X[train]], y[train])
-    print(stacking.score(n*[X[test]], y[test]))
-    print(stacking.score_estimators(n*[X[test]], y[test]))
+for train, test in ss.split(X_stacked):
+    stacking.fit(X_stacked[train], y[train])
+    print(stacking.score(X_stacked[test], y[test]))
+    print(stacking.score_estimators(X_stacked[test], y[test]))
diff --git a/stlearn/__init__.py b/stlearn/__init__.py
@@ -1,2 +1,2 @@
-from .stacking import StackingClassifier
+from .stacking import StackingClassifier, stack_features
 from .multitask import MultiTaskEstimator
diff --git a/stlearn/stacking.py b/stlearn/stacking.py
@@ -12,6 +12,34 @@
 from sklearn.externals.joblib import Memory, Parallel, delayed
 
 
+def stack_features(X):
+    """Stack features from sources
+
+    Parameters
+    ----------
+    X : list of 2d arrays, one per source, all with n_samples rows
+
+    Returns
+    -------
+    X_stacked : 2d array, shape (n_samples, total number of features)
+        Column-wise concatenation (np.hstack) of the input matrices.
+    feature_indices : list of slices, one per source (column ranges)
+    """
+    X_stacked = np.hstack(X)
+
+    features_markers = np.r_[0, np.cumsum([x.shape[1] for x in X])]
+    feature_indices = [slice(features_markers[i],
+                             features_markers[i+1])
+                       for i in range(len(features_markers)-1)]
+
+    return X_stacked, feature_indices
+
+
+def _split_features(X, feature_indices):
+    """Return one column block of X per entry of feature_indices."""
+    return [X[:, fi] for fi in feature_indices]
+
+
 def _fit_estimator(clf, X, y):
     """Helper to fit estimator"""
     return clf.fit(X, y)
@@ -76,11 +104,13 @@ class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
 
     def __init__(self, estimators=None,
                  stacking_estimator=None,
+                 feature_indices=None,
                  memory=Memory(cachedir=None), memory_level=0,
                  n_jobs=1):
 
         self.estimators = estimators
         self.stacking_estimator = stacking_estimator
+        self.feature_indices = feature_indices
         self.memory = memory
         self.memory_level = memory_level
         self.n_jobs = n_jobs
@@ -90,22 +120,23 @@ def fit(self, X, y):
 
         Parameters
         ----------
-        X : {array-like, sparse-matrix}, shape (n_estimators, n_samples,
-                                                n_features)
+        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
             Training vector, where n_samples is the number of samples and
             n_features is the number of features.
 
         y : array-like, shape (n_samples,)
             Target vector relative to X.
         """
-        _check_Xy(self, X, y)
+
+        X_list = _split_features(X, self.feature_indices)
+        _check_Xy(self, X_list, y)
         self.estimators = Parallel(n_jobs=self.n_jobs)(
             delayed(_fit_estimator)(clf, x, y)
-            for x, clf in zip(X, self.estimators))
+            for x, clf in zip(X_list, self.estimators))
 
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_proba_estimator)(clf, x)
-            for x, clf in zip(X, self.estimators))
+            for x, clf in zip(X_list, self.estimators))
         predictions_ = np.array(predictions_).T
 
         self.stacking_estimator.fit(predictions_, y)
@@ -116,19 +147,19 @@ def predict(self, X):
 
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape = (n_estimators,
-                                                  n_samples, n_features)
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
             The multi-input samples.
 
         Returns
         -------
         C : array, shape = (n_samples)
             Predicted class label per sample.
         """
-        _check_Xy(self, X)
+        X_list = _split_features(X, self.feature_indices)
+        _check_Xy(self, X_list)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_proba_estimator)(clf, x)
-            for x, clf in zip(X, self.estimators))
+            for x, clf in zip(X_list, self.estimators))
         predictions_ = np.array(predictions_).T
 
         return self.stacking_estimator.predict(predictions_)
@@ -142,7 +173,7 @@ def score(self, X, y):
 
         Parameters
         ----------
-        X : array-like, shape = (n_estimators, n_samples, n_features)
+        X : array-like, shape = (n_samples, n_features)
             The multi-input samples.
 
         y : array-like, shape = (n_samples) or (n_samples, n_outputs)
@@ -154,27 +185,26 @@ def score(self, X, y):
         score : float
             Mean accuracy of self.predict(X) wrt. y.
         """
-        _check_Xy(self, X, y)
         return accuracy_score(y, self.predict(X))
 
     def predict_estimators(self, X):
         """Predict class labels for samples in X for each estimators.
 
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape = (n_estimators,
-                                                  n_samples, n_features)
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
             The multi-input samples.
 
         Returns
         -------
         C : array, shape = (n_samples, n_estimators)
             Predicted class label per sample and estimators.
         """
-        _check_Xy(self, X)
+        X_list = _split_features(X, self.feature_indices)
+        _check_Xy(self, X_list)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_estimator)(clf, x)
-            for x, clf in zip(X, self.estimators))
+            for x, clf in zip(X_list, self.estimators))
         return np.array(predictions_).T
 
     def score_estimators(self, X, y):
@@ -186,7 +216,7 @@ def score_estimators(self, X, y):
 
         Parameters
         ----------
-        X : array-like, shape = (n_estimators, n_samples, n_features)
+        X : array-like, shape = (n_samples, n_features)
             The multi-input samples.
 
         y : array-like, shape = (n_samples) or (n_samples, n_outputs)
@@ -197,6 +227,5 @@ def score_estimators(self, X, y):
         score : list of float, shape (n_estimators,)
             Mean accuracy of self.predict_estimators(X) wrt. y.
         """
-        _check_Xy(self, X, y)
         predictions_ = self.predict_estimators(X)
         return np.array([accuracy_score(y, p) for p in predictions_.T])
diff --git a/stlearn/tests/test_stacking.py b/stlearn/tests/test_stacking.py
@@ -13,7 +13,8 @@
 n_estimators = 2
 X0, y = make_classification(n_samples=200, random_state=42)
 X1 = X0 ** 2
-X = np.array([X0, X1])
+X = np.c_[X0, X1]
+
 ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
 
 

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-from .stacking import StackingClassifier`
	`1`	`+from .stacking import StackingClassifier, stack_features`
`2`	`2`	`from .multitask import MultiTaskEstimator`