Skip to content

Commit 4da9b41

Browse files
committed
reshaping stacking
1 parent bc988f3 commit 4da9b41

File tree

4 files changed

+59
-25
lines changed

4 files changed

+59
-25
lines changed

examples/fmri_stacking.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
1+
import numpy as np
12
from sklearn.datasets import make_classification
23
from sklearn.linear_model import LogisticRegression, RidgeClassifier
34
from sklearn.svm import LinearSVC
45
from sklearn.ensemble import RandomForestClassifier
56
from sklearn.model_selection import ShuffleSplit
6-
from stlearn import StackingClassifier
7+
from stlearn import StackingClassifier, stack_features
78

89
n = 20
910
X, y = make_classification(n_samples=200, random_state=42)
1011
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
1112

13+
X_stacked, features_indices = stack_features(n*[X])
14+
1215
stacking = StackingClassifier(estimators=n*[LogisticRegression()],
13-
stacking_estimator=LogisticRegression())
16+
stacking_estimator=LogisticRegression(),
17+
feature_indices=features_indices)
1418

15-
for train, test in ss.split(X):
16-
stacking.fit(n*[X[train]], y[train])
17-
print(stacking.score(n*[X[test]], y[test]))
18-
print(stacking.score_estimators(n*[X[test]], y[test]))
19+
for train, test in ss.split(X_stacked):
20+
stacking.fit(X_stacked[train], y[train])
21+
print(stacking.score(X_stacked[test], y[test]))
22+
print(stacking.score_estimators(X_stacked[test], y[test]))

stlearn/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .stacking import StackingClassifier
1+
from .stacking import StackingClassifier, stack_features
22
from .multitask import MultiTaskEstimator

stlearn/stacking.py

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,34 @@
1212
from sklearn.externals.joblib import Memory, Parallel, delayed
1313

1414

15+
def stack_features(X):
    """Stack the feature matrices of several sources side by side.

    Parameters
    ----------
    X : list of 2d arrays
        One matrix per source. The matrices are concatenated with
        ``np.hstack``, so they must all have the same number of rows;
        the number of columns may differ from one source to the next.

    Returns
    -------
    X_stacked : 2d array
        Column-wise concatenation of every matrix in ``X``. Its width is
        the sum of the per-source column counts.

    feature_indices : list of slice
        One slice per source, in input order. ``X_stacked[:, s]`` gives
        back the columns that came from the corresponding source.
    """
    X_stacked = np.hstack(X)

    # Running column offsets: boundaries[i] is where source i starts in
    # X_stacked and boundaries[i + 1] is where it ends.
    boundaries = np.r_[0, np.cumsum([x.shape[1] for x in X])]

    # Pair adjacent boundaries instead of indexing by position, and cast to
    # builtin ints so the slices do not carry numpy scalar types around.
    feature_indices = [slice(int(start), int(stop))
                       for start, stop in zip(boundaries[:-1],
                                              boundaries[1:])]

    return X_stacked, feature_indices
36+
37+
38+
def _split_features(X, feature_indices):
    """Cut ``X`` column-wise into one block per entry of ``feature_indices``.

    Each entry is used as a column selector in ``X[:, entry]``; the blocks
    are returned as a list in the same order as ``feature_indices``.
    ``feature_indices`` must be iterable (passing ``None`` raises
    ``TypeError``).
    """
    blocks = []
    for columns in feature_indices:
        blocks.append(X[:, columns])
    return blocks
41+
42+
1543
def _fit_estimator(clf, X, y):
    """Fit ``clf`` on ``(X, y)`` and return whatever ``clf.fit`` returns.

    Kept as a module-level function so it can be wrapped with ``delayed``
    and dispatched through ``Parallel``.
    """
    fitted = clf.fit(X, y)
    return fitted
@@ -76,11 +104,13 @@ class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
76104

77105
def __init__(self, estimators=None,
78106
stacking_estimator=None,
107+
feature_indices=None,
79108
memory=Memory(cachedir=None), memory_level=0,
80109
n_jobs=1):
81110

82111
self.estimators = estimators
83112
self.stacking_estimator = stacking_estimator
113+
self.feature_indices = feature_indices
84114
self.memory = memory
85115
self.memory_level = memory_level
86116
self.n_jobs = n_jobs
@@ -90,22 +120,23 @@ def fit(self, X, y):
90120
91121
Parameters
92122
----------
93-
X : {array-like, sparse-matrix}, shape (n_estimators, n_samples,
94-
n_features)
123+
X : {array-like, sparse-matrix}, shape (n_samples, n_features)
95124
Training vector, where n_samples is the number of samples and
96125
n_features is the number of features.
97126
98127
y : array-like, shape (n_samples,)
99128
Target vector relative to X.
100129
"""
101-
_check_Xy(self, X, y)
130+
131+
X_list = _split_features(X, self.feature_indices)
132+
_check_Xy(self, X_list, y)
102133
self.estimators = Parallel(n_jobs=self.n_jobs)(
103134
delayed(_fit_estimator)(clf, x, y)
104-
for x, clf in zip(X, self.estimators))
135+
for x, clf in zip(X_list, self.estimators))
105136

106137
predictions_ = Parallel(n_jobs=self.n_jobs)(
107138
delayed(_predict_proba_estimator)(clf, x)
108-
for x, clf in zip(X, self.estimators))
139+
for x, clf in zip(X_list, self.estimators))
109140
predictions_ = np.array(predictions_).T
110141

111142
self.stacking_estimator.fit(predictions_, y)
@@ -116,19 +147,19 @@ def predict(self, X):
116147
117148
Parameters
118149
----------
119-
X : {array-like, sparse matrix}, shape = (n_estimators,
120-
n_samples, n_features)
150+
X : {array-like, sparse matrix}, shape = (n_samples, n_features)
121151
The multi-input samples.
122152
123153
Returns
124154
-------
125155
C : array, shape = (n_samples)
126156
Predicted class label per sample.
127157
"""
128-
_check_Xy(self, X)
158+
X_list = _split_features(X, self.feature_indices)
159+
_check_Xy(self, X_list)
129160
predictions_ = Parallel(n_jobs=self.n_jobs)(
130161
delayed(_predict_proba_estimator)(clf, x)
131-
for x, clf in zip(X, self.estimators))
162+
for x, clf in zip(X_list, self.estimators))
132163
predictions_ = np.array(predictions_).T
133164

134165
return self.stacking_estimator.predict(predictions_)
@@ -142,7 +173,7 @@ def score(self, X, y):
142173
143174
Parameters
144175
----------
145-
X : array-like, shape = (n_estimators, n_samples, n_features)
176+
X : array-like, shape = (n_samples, n_features)
146177
The multi-input samples.
147178
148179
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
@@ -154,27 +185,26 @@ def score(self, X, y):
154185
score : float
155186
Mean accuracy of self.predict(X) wrt. y.
156187
"""
157-
_check_Xy(self, X, y)
158188
return accuracy_score(y, self.predict(X))
159189

160190
def predict_estimators(self, X):
161191
"""Predict class labels for samples in X for each estimators.
162192
163193
Parameters
164194
----------
165-
X : {array-like, sparse matrix}, shape = (n_estimators,
166-
n_samples, n_features)
195+
X : {array-like, sparse matrix}, shape = (n_samples, n_features)
167196
The multi-input samples.
168197
169198
Returns
170199
-------
171200
C : array, shape = (n_samples, n_estimators)
172201
Predicted class label per sample and estimators.
173202
"""
174-
_check_Xy(self, X)
203+
X_list = _split_features(X, self.feature_indices)
204+
_check_Xy(self, X_list)
175205
predictions_ = Parallel(n_jobs=self.n_jobs)(
176206
delayed(_predict_estimator)(clf, x)
177-
for x, clf in zip(X, self.estimators))
207+
for x, clf in zip(X_list, self.estimators))
178208
return np.array(predictions_).T
179209

180210
def score_estimators(self, X, y):
@@ -186,7 +216,7 @@ def score_estimators(self, X, y):
186216
187217
Parameters
188218
----------
189-
X : array-like, shape = (n_estimators, n_samples, n_features)
219+
X : array-like, shape = (n_samples, n_features)
190220
The multi-input samples.
191221
192222
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
@@ -197,6 +227,5 @@ def score_estimators(self, X, y):
197227
score : list of float, shape (n_estimators,)
198228
Mean accuracy of self.predict_estimators(X) wrt. y.
199229
"""
200-
_check_Xy(self, X, y)
201230
predictions_ = self.predict_estimators(X)
202231
return np.array([accuracy_score(y, p) for p in predictions_.T])

stlearn/tests/test_stacking.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
n_estimators = 2
1414
X0, y = make_classification(n_samples=200, random_state=42)
1515
X1 = X0 ** 2
16-
X = np.array([X0, X1])
16+
X = np.c_[X0, X1]
17+
1718
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
1819

1920

0 commit comments

Comments
 (0)