Commit ecd8409

committed
first pass on stacking docs and shape checks
1 parent 60f59a7

File tree

1 file changed: +112 -12 lines changed


stlearn/stacking.py

Lines changed: 112 additions & 12 deletions
@@ -37,27 +37,68 @@ def _predict_proba_estimator(clf, X):
     raise NotImplementedError("predict_proba not supported")
 
 
+def _check_Xy(stacking, X, y=None):
+    """Check the dimensions of X and y."""
+    if np.ndim(X) != 3:
+        raise ValueError(
+            'X must be 3 dimensional, your X has %d dimensions' % np.ndim(X))
+    expected_n_sources = len(stacking.estimators)
+    if expected_n_sources != np.asarray(X).shape[0]:
+        raise ValueError(
+            'The first axis of X (%d) should match the '
+            'number of estimators (%d)' % (
+                np.asarray(X).shape[0],
+                len(stacking.estimators)))
+    if y is not None:
+        if len(y) != np.asarray(X).shape[1]:
+            raise ValueError(
+                'The second axis of X (%d) should match the '
+                'number of samples (%d)' % (
+                    np.asarray(X).shape[1],
+                    len(y)))
+
+
 class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
-    """Meta-classifier of 3D X matrix with labels
+    """Stacking meta-classifier of a 3D X matrix with labels.
+
+    Parameters
+    ----------
+    estimators : list of Estimator objects compatible with scikit-learn
+        The estimators to be used with each source of inputs. The length
+        must match the first dimension of X.
+    stacking_estimator : Estimator object compatible with scikit-learn
+        The estimator used to integrate the predictions of the estimators.
+    memory : joblib Memory object | None
+        The caching configuration. Defaults to `Memory(cachedir=None)`.
+    memory_level : int (defaults to 0)
+        The memory level used for caching.
     """
 
     def __init__(self, estimators=None,
                  stacking_estimator=None,
                  memory=Memory(cachedir=None), memory_level=0,
                  n_jobs=1):
-        """ initialization
-        """
+
         self.estimators = estimators
         self.stacking_estimator = stacking_estimator
         self.memory = memory
         self.memory_level = memory_level
         self.n_jobs = n_jobs
 
     def fit(self, X, y):
-        """ stacking model fitting
-        X is 3D matrix
-        """
+        """Fit all estimators according to the given training data.
 
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_estimators, n_samples,
+                n_features)
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target vector relative to X.
+        """
+        _check_Xy(self, X, y)
         self.estimators = Parallel(n_jobs=self.n_jobs)(
             delayed(_fit_estimator)(clf, x, y)
             for x, clf in zip(X, self.estimators))
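
Aside (not part of the diff): the shape contract enforced by the new _check_Xy helper can be sketched in a few lines. This is illustrative only and assumes nothing beyond numpy.

    import numpy as np

    n_estimators, n_samples, n_features = 3, 100, 5
    X = np.random.randn(n_estimators, n_samples, n_features)  # one 2D block per source
    y = np.random.randint(0, 2, n_samples)                     # one label per sample

    assert np.ndim(X) == 3             # X must be 3 dimensional
    assert X.shape[0] == n_estimators  # first axis matches len(stacking.estimators)
    assert X.shape[1] == len(y)        # second axis matches the number of samples
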
@@ -71,10 +112,20 @@ def fit(self, X, y):
         return self
 
     def predict(self, X):
-        """ stacking model prediction
-        X is 3D matrix
+        """Predict class labels for samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_estimators,
+                n_samples, n_features)
+            The multi-input samples.
+
+        Returns
+        -------
+        C : array, shape = (n_samples)
+            Predicted class label per sample.
         """
-
+        _check_Xy(self, X)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_proba_estimator)(clf, x)
             for x, clf in zip(X, self.estimators))
@@ -83,20 +134,69 @@ def predict(self, X):
         return self.stacking_estimator.predict(predictions_)
 
     def score(self, X, y):
-        """ stacking model accuracy
+        """Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy,
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_estimators, n_samples, n_features)
+            The multi-input samples.
+
+        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
+            True labels for X.
+
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) wrt. y.
         """
+        _check_Xy(self, X, y)
         return accuracy_score(y, self.predict(X))
 
     def predict_estimators(self, X):
-        """ prediction from separate estimators
+        """Predict class labels for samples in X for each estimator.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_estimators,
+                n_samples, n_features)
+            The multi-input samples.
+
+        Returns
+        -------
+        C : array, shape = (n_samples, n_estimators)
+            Predicted class label per sample and estimator.
         """
+        _check_Xy(self, X)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_estimator)(clf, x)
             for x, clf in zip(X, self.estimators))
         return np.array(predictions_).T
 
     def score_estimators(self, X, y):
-        """ accuracy from separate estimators
+        """Return the mean accuracy for each estimator.
+
+        In multi-label classification, this is the subset accuracy,
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_estimators, n_samples, n_features)
+            The multi-input samples.
+
+        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
+            True labels for X.
+
+        Returns
+        -------
+        score : list of float, shape (n_estimators,)
+            Mean accuracy of self.predict_estimators(X) wrt. y.
         """
+        _check_Xy(self, X, y)
         predictions_ = self.predict_estimators(X)
         return np.array([accuracy_score(y, p) for p in predictions_.T])
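
For context, a minimal usage sketch of the API documented in this commit; the choice of LogisticRegression (and the random data) is an assumption for illustration, not taken from the diff.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from stlearn.stacking import StackingClassifier  # assumed import path

    n_estimators, n_samples, n_features = 3, 100, 5
    X = np.random.randn(n_estimators, n_samples, n_features)
    y = np.random.randint(0, 2, n_samples)

    stacking = StackingClassifier(
        estimators=[LogisticRegression() for _ in range(n_estimators)],
        stacking_estimator=LogisticRegression())
    stacking.fit(X, y)

    print(stacking.predict(X).shape)             # (n_samples,)
    print(stacking.score(X, y))                  # mean accuracy of the stacked model
    print(stacking.predict_estimators(X).shape)  # (n_samples, n_estimators)
    print(stacking.score_estimators(X, y))       # one accuracy per source estimator

Each entry of estimators is fit on its own 2D slice of X, and the stacking_estimator then integrates the per-source predictions into a single label per sample.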
