Skip to content

Commit 53dddf8

Browse files
committed
refactoring
1 parent a1f9df4 commit 53dddf8

File tree

2 files changed

+115
-56
lines changed

2 files changed

+115
-56
lines changed

stlearn/stacking.py

Lines changed: 59 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,41 @@
33
Prediction stacking API
44
"""
55
# Author: Mehdi Rahim <rahim.mehdi@gmail.com>
6+
# Denis A. Engemann <denis.engemann@gmail.com>
67
#
78
# License: BSD 3 clause
89

910
import numpy as np
1011
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
1112
from sklearn.metrics import accuracy_score
12-
from sklearn.externals.joblib import Memory, Parallel, delayed
13+
from sklearn.externals.joblib import Parallel, delayed
1314

1415

1516
def stack_features(X):
    """Stack features from several sources into a single 2D array.

    Parameters
    ----------
    X : list of array-like, each of shape (n_samples, n_features_i)
        The data to be used as source for each estimator. The first
        dataset corresponds to the first estimator. Every source must be
        two-dimensional and have the same number of rows.

    Returns
    -------
    X_stacked : array, shape (n_samples, sum of n_features_i)
        The sources concatenated column-wise, in input order.

    feature_indices : list of slice
        One slice per source, in input order. Applying the i-th slice to
        the columns of ``X_stacked`` gives back the i-th source.

    Raises
    ------
    ValueError
        If a source is not two-dimensional, or if ``np.hstack`` rejects the
        input (empty list, mismatching number of rows).
    """
    # Convert up front so that plain nested lists are accepted as well and
    # ``.shape`` / ``.ndim`` are always available.
    sources = [np.asarray(x) for x in X]
    if any(source.ndim != 2 for source in sources):
        raise ValueError('Each source in X must be a 2D array')

    X_stacked = np.hstack(sources)

    # Column boundaries of the sources inside X_stacked: 0, w0, w0 + w1, ...
    widths = [source.shape[1] for source in sources]
    features_markers = [0] + np.cumsum(widths).tolist()
    feature_indices = [slice(start, stop)
                       for start, stop in zip(features_markers[:-1],
                                              features_markers[1:])]

    return X_stacked, feature_indices
3543

@@ -66,23 +74,28 @@ def _predict_proba_estimator(clf, X):
6674

6775
def _check_Xy(stacking, X, y=None):
    """Validate the stacked input against the per-source column indexers.

    Parameters
    ----------
    stacking : object
        Object exposing ``feature_indices``: an iterable holding one column
        indexer per source. Slices and lists / tuples / arrays of integers
        or booleans are checked; other indexer types are not inspected.
    X : ndarray, shape (n_samples, n_features)
        The stacked data.
    y : array-like | None
        Accepted for signature compatibility; it is not inspected here.

    Raises
    ------
    ValueError
        If ``X`` is not a 2D numpy array, or if an indexer is empty, out of
        bounds, or (boolean mask) not as long as the number of columns.
    """
    if np.ndim(X) != 2:
        raise ValueError('X_stacked must be a 2D array')
    # Checked once: the answer does not depend on the indexer.
    if not isinstance(X, np.ndarray):
        raise ValueError('X_stacked must be a numpy array')

    n_features = X.shape[1]
    for ii, feat_inds in enumerate(stacking.feature_indices):
        if isinstance(feat_inds, slice):
            start = 0 if feat_inds.start is None else feat_inds.start
            stop = -1 if feat_inds.stop is None else feat_inds.stop
            if start >= n_features or abs(stop) > n_features:
                # The exception has to be raised, not merely instantiated.
                raise ValueError(
                    'Your slices are bad and generate empty views')
        elif isinstance(feat_inds, (list, tuple, np.ndarray)):
            inds = np.asarray(feat_inds)
            if inds.size == 0:
                raise ValueError('On source %s your indexer is empty' % ii)
            if inds.dtype.kind == 'b':
                # A boolean mask selects columns by position, so it has to
                # cover every column; min / max bounds are meaningless here.
                if inds.size != n_features:
                    raise ValueError(
                        'On source %s your boolean indexer does not match '
                        'the number of columns' % ii)
                continue
            # Negative indices count from the end, hence -n_features is the
            # smallest valid value and n_features - 1 the largest.
            if inds.max() >= n_features or abs(inds.min()) > n_features:
                raise ValueError('On source %s your indexer is out of bound'
                                 % ii)
8699

87100

88101
class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
@@ -93,25 +106,29 @@ class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
93106
estimators : list of Estimator objects compatible with scikit-learn
94107
The estimators to be used with each source of inputs. Length must match
95108
the first dimension of X.
109+
96110
stacking_estimator : Estimator objects compatible with scikit-learn
97111
The estimator used to integrate the predictions of the estimators.
98-
memory : joblib memory object | None
99-
The caching configuration. Defaults to `Memory(cachedir=None)`.
100-
memory_level : int (defaults to 0)
101-
The memory level used for caching.
112+
113+
feature_indices : list of indexers
114+
Index expressions to be applied on the columns of X_stacked.
115+
Can be slices, lists of integers or bool.
102116
"""
103117

104-
def __init__(self, estimators=None,
105-
stacking_estimator=None,
106-
feature_indices=None,
107-
memory=Memory(cachedir=None), memory_level=0,
118+
def __init__(self, estimators,
             stacking_estimator,
             feature_indices,
             n_jobs=1):
    """Check the configuration and store it on the instance.

    Parameters
    ----------
    estimators : list of Estimator objects
        One estimator per source. Each entry must be a distinct object.
    stacking_estimator : Estimator object
        Stored unchanged as ``self.stacking_estimator``.
    feature_indices : list of indexers
        One column indexer per estimator.
    n_jobs : int (defaults to 1)
        Stored unchanged as ``self.n_jobs``.

    Raises
    ------
    ValueError
        If ``estimators`` and ``feature_indices`` differ in length, or if
        the same estimator object appears more than once.
    """
    if len(estimators) != len(feature_indices):
        raise ValueError('The estimators and feature indices must be of '
                         'the same length')

    # Independence is about object identity (the same instance listed
    # twice), so compare ids; this also works for unhashable estimators.
    if len({id(estimator) for estimator in estimators}) < len(estimators):
        raise ValueError('Estimators must be independent')

    self.estimators = estimators
    self.stacking_estimator = stacking_estimator
    self.feature_indices = feature_indices
    self.n_jobs = n_jobs
116133

117134
def fit(self, X, y):
@@ -127,8 +144,9 @@ def fit(self, X, y):
127144
Target vector relative to X.
128145
"""
129146

147+
_check_Xy(self, X, y)
130148
X_list = _split_features(X, self.feature_indices)
131-
_check_Xy(self, X_list, y)
149+
132150
self.estimators = Parallel(n_jobs=self.n_jobs)(
133151
delayed(_fit_estimator)(clf, x, y)
134152
for x, clf in zip(X_list, self.estimators))
@@ -154,8 +172,8 @@ def predict(self, X):
154172
C : array, shape = (n_samples)
155173
Predicted class label per sample.
156174
"""
175+
_check_Xy(self, X)
157176
X_list = _split_features(X, self.feature_indices)
158-
_check_Xy(self, X_list)
159177
predictions_ = Parallel(n_jobs=self.n_jobs)(
160178
delayed(_predict_proba_estimator)(clf, x)
161179
for x, clf in zip(X_list, self.estimators))
@@ -199,8 +217,8 @@ def predict_estimators(self, X):
199217
C : array, shape = (n_samples, n_estimators)
200218
Predicted class label per sample and estimators.
201219
"""
220+
_check_Xy(self, X)
202221
X_list = _split_features(X, self.feature_indices)
203-
_check_Xy(self, X_list)
204222
predictions_ = Parallel(n_jobs=self.n_jobs)(
205223
delayed(_predict_estimator)(clf, x)
206224
for x, clf in zip(X_list, self.estimators))

stlearn/tests/test_stacking.py

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,42 +6,83 @@
66

77
from sklearn.datasets import make_classification
88
from sklearn.linear_model import LogisticRegression
9-
from sklearn.model_selection import ShuffleSplit
109
from stlearn import StackingClassifier
10+
from stlearn import stack_features
1111

1212
n_samples = 200
n_estimators = 3
# Reuse n_samples so the shape assertions in the tests below cannot drift
# away from the generated data.
X0, y = make_classification(n_samples=n_samples, random_state=42)
# Build two additional sources by squaring subsets of the columns of X0,
# i.e. non-linear transforms of the first source.
X1 = X0[:, :10] ** 2
X2 = X0[:, 10:15] ** 2

X = [X0, X1, X2]
X_stacked, feature_indices = stack_features(X)
21+
22+
23+
def test_stack_features():
    """Test stacking features"""
    X0 = np.array([[1, 2], [3, 4]])
    X1 = np.array([[1, 2, 4], [3, 4, 5]])
    X = [X0, X1]
    X_stacked, features_indices = stack_features(X)
    assert_equal(np.size(X_stacked),
                 np.size(X0) + np.size(X1))
    assert_equal(len(features_indices), len(X))
    assert_equal(X_stacked.shape, (2, 5))
    # Sizes alone do not prove the mapping: every indexer must give back
    # exactly the columns of the source it was created for.
    for source, indexer in zip(X, features_indices):
        assert_array_equal(X_stacked[:, indexer], source)
1933

2034

2135
def test_stacking_essentials():
    """Test initializing and essential basic function"""

    def make_estimators(n):
        # Distinct objects: the classifier rejects repeated instances.
        return [LogisticRegression() for _ in range(n)]

    # check inputs: fewer estimators than feature indices
    assert_raises(
        ValueError, StackingClassifier,
        estimators=make_estimators(n_estimators - 1),
        feature_indices=feature_indices,
        stacking_estimator=LogisticRegression())

    # check inputs: fewer feature indices than estimators
    assert_raises(
        ValueError, StackingClassifier,
        estimators=make_estimators(n_estimators),
        feature_indices=feature_indices[:2],
        stacking_estimator=LogisticRegression())

    # check inputs: matching lengths, but the same estimator object repeated
    assert_raises(
        ValueError, StackingClassifier,
        estimators=n_estimators * [LogisticRegression()],
        feature_indices=feature_indices,
        stacking_estimator=LogisticRegression())

    # test stacking classifier
    stacking = StackingClassifier(
        estimators=make_estimators(n_estimators),
        feature_indices=feature_indices,
        stacking_estimator=LogisticRegression())

    assert_equal(stacking.stacking_estimator.__class__,
                 LogisticRegression)
    assert_equal([ee.__class__ for ee in stacking.estimators],
                 n_estimators * [LogisticRegression])

    stacking.fit(X_stacked, y)

    predictions = stacking.predict(X_stacked)
    assert_array_equal(np.unique(predictions), np.array([0, 1]))

    score = stacking.score(X_stacked, y)
    assert_true(np.isscalar(score))

    predictions_estimators = stacking.predict_estimators(X_stacked)
    assert_array_equal(
        predictions_estimators.shape, (n_samples, n_estimators))
    scores_estimators = stacking.score_estimators(X_stacked, y)
    assert_equal(len(scores_estimators), n_estimators)

    # the un-stacked list of sources is not a valid input
    assert_raises(ValueError, stacking.fit, X, y)

    # integer indexer far out of bounds
    stacking = StackingClassifier(
        estimators=make_estimators(n_estimators),
        feature_indices=[np.array([-500]), np.array([1]), np.array([2])],
        stacking_estimator=LogisticRegression())
    assert_raises(ValueError, stacking.fit, X_stacked, y)

    # slice far out of bounds
    stacking = StackingClassifier(
        estimators=make_estimators(n_estimators),
        feature_indices=[slice(5000, -5000), slice(1, 10), slice(20)],
        stacking_estimator=LogisticRegression())
    assert_raises(ValueError, stacking.fit, X_stacked, y)

0 commit comments

Comments
 (0)