3
3
Prediction stacking API
4
4
"""
5
5
# Author: Mehdi Rahim <rahim.mehdi@gmail.com>
6
+ # Denis A. Engemann <denis.engemann@gmail.com>
6
7
#
7
8
# License: BSD 3 clause
8
9
9
10
import numpy as np
10
11
from sklearn .base import BaseEstimator , TransformerMixin , ClassifierMixin
11
12
from sklearn .metrics import accuracy_score
12
- from sklearn .externals .joblib import Memory , Parallel , delayed
13
+ from sklearn .externals .joblib import Parallel , delayed
13
14
14
15
15
16
def stack_features(X):
    """Stack features from several sources into one 2D array.

    Parameters
    ----------
    X : list of array-like, each of shape (n_samples, n_features_i)
        The data to be used as source for each estimator. The first
        dataset corresponds to the first estimator. Every entry must be
        two-dimensional.

    Returns
    -------
    X_stacked : ndarray, shape (n_samples, sum of n_features_i)
        The sources concatenated column-wise, such that the number of
        features is the sum of the number of features in each source.

    feature_indices : list of slice
        One slice per source, in input order. ``X_stacked[:, s]`` gives
        back the columns contributed by the corresponding source.
    """
    # Convert once so that plain nested lists expose ``.shape`` as well.
    sources = [np.asarray(x) for x in X]
    X_stacked = np.hstack(sources)

    # Cumulative column counts mark where each source starts and stops.
    features_markers = np.r_[0, np.cumsum([x.shape[1] for x in sources])]
    feature_indices = [
        slice(start, stop)
        for start, stop in zip(features_markers[:-1], features_markers[1:])]

    return X_stacked, feature_indices
35
43
@@ -66,23 +74,28 @@ def _predict_proba_estimator(clf, X):
66
74
67
75
def _check_Xy(stacking, X, y=None):
    """Check the stacked input against the stacking object's indexers.

    Parameters
    ----------
    stacking : object
        Object exposing a ``feature_indices`` attribute, a list of
        column indexers (slices, or lists / tuples / arrays of integers
        or booleans).
    X : ndarray, 2D
        The stacked data whose columns the indexers address.
    y : array-like | None
        Accepted so callers can pass the target; it is not inspected.

    Raises
    ------
    ValueError
        If X is not a 2D numpy array, or if an indexer is empty or points
        outside the columns of X.
    """
    if np.ndim(X) != 2:
        raise ValueError('X_stacked must be a 2D array')
    # The check concerns X as a whole, so do it once up front.
    if not isinstance(X, np.ndarray):
        raise ValueError('X_stacked must be a numpy array, got %s'
                         % type(X).__name__)

    n_features = X.shape[1]
    for ii, feat_inds in enumerate(stacking.feature_indices):
        if isinstance(feat_inds, (list, tuple, np.ndarray)):
            inds = np.asarray(feat_inds)
            if inds.size == 0:
                raise ValueError('On source %s your indexer is empty' % ii)
            if inds.dtype == bool:
                # A mask selects by position, so its length (not its
                # values) has to agree with the number of columns.
                out_of_bound = inds.shape[0] != n_features
            else:
                this_max = np.max(inds)
                this_min = abs(np.min(inds))
                out_of_bound = (this_max >= n_features or
                                this_min > n_features)
            if out_of_bound:
                raise ValueError('On source %s your indexer is out of bound'
                                 % ii)
        elif isinstance(feat_inds, slice):
            start = 0 if feat_inds.start is None else feat_inds.start
            stop = -1 if feat_inds.stop is None else feat_inds.stop
            if start >= n_features or abs(stop) > n_features:
                raise ValueError(
                    'Your slices are bad and generate empty views')
86
99
87
100
88
101
class StackingClassifier (BaseEstimator , ClassifierMixin , TransformerMixin ):
@@ -93,25 +106,29 @@ class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
93
106
estimators : list of Estimator objects compatible with scikit-learn
94
107
The estimators to be used with each source of inputs. Length must match
95
108
the length of feature_indices.
109
+
96
110
stacking_estimator : Estimator objects compatible with scikit-learn
97
111
The estimator used to integrate the predictions of the estimators.
98
- memory : joblib memory object | None
99
- The caching configuration. Defaults to `Memory(cachedir=None)`.
100
- memory_level : int (defaults to 0)
101
- The memory level used for caching .
112
+
113
+ feature_indices : list of indexers
114
+ Index expressions to be applied on the columns of X_stacked.
115
+ Can be slices, lists of integers or boolean masks.
102
116
"""
103
117
104
def __init__(self, estimators,
             stacking_estimator,
             feature_indices,
             n_jobs=1):
    """Store the configuration after two consistency checks.

    Parameters
    ----------
    estimators : list of estimator objects
        One estimator per source of inputs.
    stacking_estimator : estimator object
        The estimator used to integrate the predictions.
    feature_indices : list of indexers
        One column indexer per estimator.
    n_jobs : int (defaults to 1)
        Stored as-is on the instance.

    Raises
    ------
    ValueError
        If `estimators` and `feature_indices` differ in length, or if
        the same estimator appears more than once in `estimators`.
    """
    if len(estimators) != len(feature_indices):
        raise ValueError('The estimators and feature indices must be of '
                         'the same lengths')

    # Duplicates collapse in a set; a shorter set means an estimator
    # was listed more than once.
    if len(set(estimators)) < len(estimators):
        raise ValueError('Estimators must be independent')

    self.estimators = estimators
    self.stacking_estimator = stacking_estimator
    self.feature_indices = feature_indices
    self.n_jobs = n_jobs
116
133
117
134
def fit (self , X , y ):
@@ -127,8 +144,9 @@ def fit(self, X, y):
127
144
Target vector relative to X.
128
145
"""
129
146
147
+ _check_Xy (self , X , y )
130
148
X_list = _split_features (X , self .feature_indices )
131
- _check_Xy ( self , X_list , y )
149
+
132
150
self .estimators = Parallel (n_jobs = self .n_jobs )(
133
151
delayed (_fit_estimator )(clf , x , y )
134
152
for x , clf in zip (X_list , self .estimators ))
@@ -154,8 +172,8 @@ def predict(self, X):
154
172
C : array, shape = (n_samples)
155
173
Predicted class label per sample.
156
174
"""
175
+ _check_Xy (self , X )
157
176
X_list = _split_features (X , self .feature_indices )
158
- _check_Xy (self , X_list )
159
177
predictions_ = Parallel (n_jobs = self .n_jobs )(
160
178
delayed (_predict_proba_estimator )(clf , x )
161
179
for x , clf in zip (X_list , self .estimators ))
@@ -199,8 +217,8 @@ def predict_estimators(self, X):
199
217
C : array, shape = (n_samples, n_estimators)
200
218
Predicted class label per sample and estimators.
201
219
"""
220
+ _check_Xy (self , X )
202
221
X_list = _split_features (X , self .feature_indices )
203
- _check_Xy (self , X_list )
204
222
predictions_ = Parallel (n_jobs = self .n_jobs )(
205
223
delayed (_predict_estimator )(clf , x )
206
224
for x , clf in zip (X_list , self .estimators ))
0 commit comments