Commit ecd8409

committed
first pass on stacking docs and shape checks
1 parent 60f59a7

File tree

1 file changed: +112 -12 lines changed


stlearn/stacking.py

Lines changed: 112 additions & 12 deletions
@@ -37,27 +37,68 @@ def _predict_proba_estimator(clf, X):
     raise NotImplementedError("predict_proba not supported")
 
 
+def _check_Xy(stacking, X, y=None):
+    """Check the dimensions of X and y."""
+    if np.ndim(X) != 3:
+        raise ValueError(
+            'X must be 3 dimensional, your X has %d dimensions' % np.ndim(X))
+    expected_n_sources = len(stacking.estimators)
+    if expected_n_sources != np.asarray(X).shape[0]:
+        raise ValueError(
+            'The first axis of X (%d) should match the '
+            'number of estimators (%d)' % (
+                np.asarray(X).shape[0],
+                len(stacking.estimators)))
+    if y is not None:
+        if len(y) != np.asarray(X).shape[1]:
+            raise ValueError(
+                'The second axis of X (%d) should match the '
+                'number of samples (%d)' % (
+                    np.asarray(X).shape[1],
+                    len(y)))
+
+
 class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
-    """Meta-classifier of 3D X matrix with labels
+    """Stacking meta-classifier of a 3D X matrix with labels.
+
+    Parameters
+    ----------
+    estimators : list of Estimator objects compatible with scikit-learn
+        The estimators to be used with each source of inputs. The length
+        must match the first dimension of X.
+    stacking_estimator : Estimator object compatible with scikit-learn
+        The estimator used to integrate the predictions of the estimators.
+    memory : joblib Memory object | None
+        The caching configuration. Defaults to `Memory(cachedir=None)`.
+    memory_level : int (defaults to 0)
+        The memory level used for caching.
     """
 
     def __init__(self, estimators=None,
                  stacking_estimator=None,
                  memory=Memory(cachedir=None), memory_level=0,
                  n_jobs=1):
-        """ initialization
-        """
+
         self.estimators = estimators
         self.stacking_estimator = stacking_estimator
         self.memory = memory
         self.memory_level = memory_level
         self.n_jobs = n_jobs
 
     def fit(self, X, y):
-        """ stacking model fitting
-        X is 3D matrix
-        """
+        """Fit all estimators according to the given training data.
 
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_estimators, n_samples,
+                n_features)
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target vector relative to X.
+        """
+        _check_Xy(self, X, y)
         self.estimators = Parallel(n_jobs=self.n_jobs)(
             delayed(_fit_estimator)(clf, x, y)
             for x, clf in zip(X, self.estimators))
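
Aside (not part of the diff): the shape contract enforced by the new _check_Xy helper can be sketched in a few lines. This is illustrative only and assumes nothing beyond numpy.

    import numpy as np

    n_estimators, n_samples, n_features = 3, 100, 5
    X = np.random.randn(n_estimators, n_samples, n_features)  # one 2D block per source
    y = np.random.randint(0, 2, n_samples)                     # one label per sample

    assert np.ndim(X) == 3             # X must be 3 dimensional
    assert X.shape[0] == n_estimators  # first axis matches len(stacking.estimators)
    assert X.shape[1] == len(y)        # second axis matches the number of samples
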
@@ -71,10 +112,20 @@ def fit(self, X, y):
         return self
 
     def predict(self, X):
-        """ stacking model prediction
-        X is 3D matrix
+        """Predict class labels for samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_estimators,
+                n_samples, n_features)
+            The multi-input samples.
+
+        Returns
+        -------
+        C : array, shape = (n_samples)
+            Predicted class label per sample.
         """
-
+        _check_Xy(self, X)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_proba_estimator)(clf, x)
             for x, clf in zip(X, self.estimators))
@@ -83,20 +134,69 @@ def predict(self, X):
         return self.stacking_estimator.predict(predictions_)
 
     def score(self, X, y):
-        """ stacking model accuracy
+        """Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy,
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_estimators, n_samples, n_features)
+            The multi-input samples.
+
+        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
+            True labels for X.
+
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) wrt. y.
         """
+        _check_Xy(self, X, y)
         return accuracy_score(y, self.predict(X))
 
     def predict_estimators(self, X):
-        """ prediction from separate estimators
+        """Predict class labels for samples in X for each estimator.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_estimators,
+                n_samples, n_features)
+            The multi-input samples.
+
+        Returns
+        -------
+        C : array, shape = (n_samples, n_estimators)
+            Predicted class label per sample and estimator.
         """
+        _check_Xy(self, X)
         predictions_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_predict_estimator)(clf, x)
             for x, clf in zip(X, self.estimators))
         return np.array(predictions_).T
 
     def score_estimators(self, X, y):
-        """ accuracy from separate estimators
+        """Return the mean accuracy for each estimator.
+
+        In multi-label classification, this is the subset accuracy,
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_estimators, n_samples, n_features)
+            The multi-input samples.
+
+        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
+            True labels for X.
+
+        Returns
+        -------
+        score : list of float, shape (n_estimators,)
+            Mean accuracy of self.predict_estimators(X) wrt. y.
         """
+        _check_Xy(self, X, y)
         predictions_ = self.predict_estimators(X)
         return np.array([accuracy_score(y, p) for p in predictions_.T])
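
For context, a minimal usage sketch of the API documented in this commit; the choice of LogisticRegression (and the random data) is an assumption for illustration, not taken from the diff.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from stlearn.stacking import StackingClassifier  # assumed import path

    n_estimators, n_samples, n_features = 3, 100, 5
    X = np.random.randn(n_estimators, n_samples, n_features)
    y = np.random.randint(0, 2, n_samples)

    stacking = StackingClassifier(
        estimators=[LogisticRegression() for _ in range(n_estimators)],
        stacking_estimator=LogisticRegression())
    stacking.fit(X, y)

    print(stacking.predict(X).shape)             # (n_samples,)
    print(stacking.score(X, y))                  # mean accuracy of the stacked model
    print(stacking.predict_estimators(X).shape)  # (n_samples, n_estimators)
    print(stacking.score_estimators(X, y))       # one accuracy per source estimator

Each entry of estimators is fit on its own 2D slice of X, and the stacking_estimator then integrates the per-source predictions into a single label per sample.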
