This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Add observation level feature contributions to Pipeline and BasePredictor #196

Merged
merged 2 commits on Jul 19, 2019
1 change: 1 addition & 0 deletions src/python/nimbusml.pyproj
@@ -177,6 +177,7 @@
<Compile Include="nimbusml\examples\PipelineWithGridSearchCV2.py" />
<Compile Include="nimbusml\examples\PipelineWithGridSearchCV1.py" />
<Compile Include="nimbusml\examples\pipeline.py" />
<Compile Include="nimbusml\examples\PipelineWithFeatureContributions.py" />
<Compile Include="nimbusml\examples\Poisson.py" />
<Compile Include="nimbusml\examples\PoissonRegressionRegressor.py" />
<Compile Include="nimbusml\examples\RangeFilter.py" />
4 changes: 4 additions & 0 deletions src/python/nimbusml/base_predictor.py
@@ -97,6 +97,10 @@ def _invoke_inference_method(self, method, X, **params):
data = getattr(pipeline, method)(X, **params)
return data

@trace
def get_feature_contributions(self, X, **params):
return self._invoke_inference_method('get_feature_contributions', X, **params)

@trace
def predict(self, X, **params):
"""
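The new BasePredictor method forwards to the shared pipeline implementation, so any fitted estimator exposes feature contributions directly. A minimal usage sketch, mirroring the tests added below (train, test, and label are assumed to be prepared as in the test module):

    from nimbusml.linear_model import FastLinearBinaryClassifier

    model = FastLinearBinaryClassifier(feature=['age', 'education-num'])
    model.fit(train, label)
    fc = model.get_feature_contributions(test)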
86 changes: 86 additions & 0 deletions src/python/nimbusml/examples/PipelineWithFeatureContributions.py
@@ -0,0 +1,86 @@
###############################################################################
# Pipeline with observation level feature contributions

# Scoring a dataset with a trained model produces a score, or prediction, for
# each example. To understand and explain these predictions, it can be useful
# to inspect which features influenced them most significantly. This function
# computes a model-specific list of per-feature contributions to the score for
# each example. These contributions can be positive (they make the score
# higher) or negative (they make the score lower).

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('uciadult_train').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
# label workclass education ... capital-loss hours-per-week
# 0 0 Private 11th ... 0 40
# 1 0 Private HS-grad ... 0 50
# 2 1 Local-gov Assoc-acdm ... 0 40
# 3 1 Private Some-college ... 0 40
# 4 0 ? Some-college ... 0 30

# define the training pipeline with a linear model
lr_pipeline = Pipeline([LogisticRegressionBinaryClassifier(
feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
lr_model = lr_pipeline.fit(data)

# For linear models, the contribution of a given feature is equal to the
# product of the feature value and the corresponding weight. Similarly, for
# Generalized Additive Models (GAM), the contribution of a feature is equal to
# the shape function for the given feature evaluated at the feature value.
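# A minimal sketch of that formula; the weight below is hypothetical and
# chosen only for illustration (real weights come from the trained model):
hypothetical_weight_age = 0.03
example_age = 40
# contribution of 'age' for this example = weight * feature value = 1.2
example_age_contribution = hypothetical_weight_age * example_age
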
lr_feature_contributions = lr_model.get_feature_contributions(data)

# Print predictions with feature contributions, which give a relative measure
# of how much each feature impacted the Score.
print("========== Feature Contributions for Linear Model ==========")
print(lr_feature_contributions.head())
# label ... PredictedLabel Score ... FeatureContributions.hours-per-week
# 0 0 ... 0 -2.010687 ... 0.833069
# 1 0 ... 0 -1.216163 ... 0.809928
# 2 1 ... 0 -1.248412 ... 0.485957
# 3 1 ... 0 -1.132419 ... 0.583148
# 4 0 ... 0 -1.969522 ... 0.437361

# define the training pipeline with a tree model
tree_pipeline = Pipeline([FastTreesBinaryClassifier(
feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
tree_model = tree_pipeline.fit(data)

# For tree-based models, the calculation of feature contribution essentially
# consists in determining which splits in the tree have the most impact on the
# final score and assigning the value of the impact to the features determining
# the split. More precisely, the contribution of a feature is equal to the
# change in score produced by exploring the opposite sub-tree every time a
# decision node for the given feature is encountered.
#
# Consider a simple case with a single decision tree that has a decision node
# for the binary feature F1. Given an example that has feature F1 equal to
# true, we can calculate the score it would have obtained if we chose the
# subtree corresponding to the feature F1 being equal to false while keeping
# the other features constant. The contribution of feature F1 for the given
# example is the difference between the original score and the score obtained
# by taking the opposite decision at the node corresponding to feature F1. This
# algorithm extends naturally to models with many decision trees.
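# A minimal numeric sketch of that procedure for a single decision stump on
# the boolean feature F1 (the scores are hypothetical, for illustration only):
score_if_f1_true = 2.0
score_if_f1_false = -1.0
# For an example with F1 == True, the contribution of F1 is the original
# score minus the score from taking the opposite branch: 2.0 - (-1.0) = 3.0
f1_contribution = score_if_f1_true - score_if_f1_false
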
tree_feature_contributions = tree_model.get_feature_contributions(data)

# Print predictions with feature contributions, which give a relative measure
# of how much each feature impacted the Score.
print("========== Feature Contributions for Tree Model ==========")
print(tree_feature_contributions.head())
# label ... PredictedLabel Score ... FeatureContributions.hours-per-week
# 0 0 ... 0 -16.717360 ... -0.608664
# 1 0 ... 0 -7.688200 ... -0.541213
# 2 1 ... 1 1.571164 ... 0.032862
# 3 1 ... 1 2.115638 ... 0.537077
# 4 0 ... 0 -23.038410 ... -0.682764
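
# The number of contributions reported per example can be capped with the top
# and bottom parameters of get_feature_contributions (documented on Pipeline);
# for instance, keep only the two largest positive and two largest negative
# contributions:
limited_contributions = tree_model.get_feature_contributions(data, top=2, bottom=2)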

14 changes: 14 additions & 0 deletions src/python/nimbusml/internal/core/base_pipeline_item.py
@@ -15,6 +15,7 @@
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from itertools import chain
from shutil import copyfile
from textwrap import wrap

import six
@@ -447,6 +448,19 @@ def get_roles_params(self):
res["columns"] = pars
return res

@trace
def save_model(self, dst):
"""
Save model to file. For more details, please refer to
`load/save model </nimbusml/loadsavemodels>`_

:param dst: destination filename for the saved model

"""
if self.model_ is not None:
if os.path.isfile(self.model_):
copyfile(self.model_, dst)

def __getitem__(self, cols):
"""
Returns a View on this element restricted to the selected column.
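The save_model added here copies the estimator's trained model file to the destination path, so an individual predictor can be persisted without wrapping it in a Pipeline. A minimal sketch, following the usage in the tests below:

    model = FastLinearBinaryClassifier(feature=['age', 'education-num'])
    model.fit(train, label)
    model.save_model('nimbusml_model.zip')  # copies the underlying model file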
118 changes: 117 additions & 1 deletion src/python/nimbusml/pipeline.py
@@ -41,6 +41,8 @@
transforms_datasetscorer
from .internal.entrypoints.transforms_featurecombiner import \
transforms_featurecombiner
from .internal.entrypoints.transforms_featurecontributioncalculationtransformer import \
transforms_featurecontributioncalculationtransformer
from .internal.entrypoints.transforms_labelcolumnkeybooleanconverter \
import \
transforms_labelcolumnkeybooleanconverter
@@ -1694,6 +1696,120 @@ def getn(n):
"only fit(X) is allowed or the training becomes "
"ambiguous.")

@trace
def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
as_binary_data_stream=False, **params):
"""
Calculates observation level feature contributions. Returns a dataframe
with the raw data, predictions, and feature contributions for each
prediction. Feature contributions are not supported for transforms, so
make sure that the last step in a pipeline is a model. Feature
contributions are supported for the following models:

* Regression:

* OrdinaryLeastSquaresRegressor
* FastLinearRegressor
* OnlineGradientDescentRegressor
* PoissonRegressionRegressor
* GamRegressor
* LightGbmRegressor
* FastTreesRegressor
* FastForestRegressor
* FastTreesTweedieRegressor

* Binary Classification:

* AveragedPerceptronBinaryClassifier
* LinearSvmBinaryClassifier
* LogisticRegressionBinaryClassifier
* FastLinearBinaryClassifier
* SgdBinaryClassifier
* SymSgdBinaryClassifier
* GamBinaryClassifier
* FastForestBinaryClassifier
* FastTreesBinaryClassifier
* LightGbmBinaryClassifier

* Ranking:

* LightGbmRanker

:param X: {array-like [n_samples, n_features],
:py:class:`nimbusml.FileDataStream` }
:param top: The number of positive contributions with highest magnitude
to report.
:param bottom: The number of negative contributions with highest
magnitude to report.
:return: dataframe containing the raw data, predicted label, score,
probabilities, and feature contributions.
"""
self.verbose = verbose

if not self._is_fitted:
raise ValueError(
"Model is not fitted. Train or load a model before test().")

if len(self.steps) > 0:
last_node = self.last_node
if last_node.type == 'transform':
raise ValueError(
"Pipeline needs a trainer as last step for test()")

X, y_temp, columns_renamed, feature_columns, label_column, \
schema, weights, weight_column = self._preprocess_X_y(X)

all_nodes = []
inputs = dict([('data', ''), ('predictor_model', self.model)])
if isinstance(X, FileDataStream):
importtext_node = data_customtextloader(
input_file="$file",
data="$data",
custom_schema=schema.to_string(
add_sep=True))
all_nodes = [importtext_node]
inputs = dict([('file', ''), ('predictor_model', self.model)])

score_node = transforms_datasetscorer(
data="$data",
predictor_model="$predictor_model",
scored_data="$scoredvectordata")

fcc_node = transforms_featurecontributioncalculationtransformer(
data="$scoredvectordata",
predictor_model="$predictor_model",
output_data="$output_data",
top=top,
bottom=bottom,
normalize=True)

all_nodes.extend([score_node, fcc_node])

outputs = dict(output_data="")

graph = Graph(
inputs,
outputs,
as_binary_data_stream,
*all_nodes)

class_name = type(self).__name__
method_name = inspect.currentframe().f_code.co_name
telemetry_info = ".".join([class_name, method_name])

try:
(out_model, out_data, out_metrics) = graph.run(
X=X,
random_state=self.random_state,
model=self.model,
verbose=verbose,
telemetry_info=telemetry_info,
**params)
except RuntimeError as e:
raise e

return out_data

@trace
def _predict(self, X, y=None,
evaltype='auto', group_id=None,
@@ -1943,7 +2059,7 @@ def test(
otherwise None
in the returned tuple.
:return: tuple (dataframe of evaluation metrics, dataframe of
scores). Is scores are
scores). If scores are
required, set `output_scores`=True, otherwise None is
returned by default.
"""
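As the docstring notes, feature contributions require the last pipeline step to be a trained model; a transform-only pipeline raises a ValueError. A minimal sketch (OneHotVectorizer is chosen here only for illustration):

    from nimbusml import Pipeline
    from nimbusml.feature_extraction.categorical import OneHotVectorizer

    transform_only = Pipeline([OneHotVectorizer(columns=['education'])])
    transform_only.fit(data)
    # raises ValueError: Pipeline needs a trainer as last step for test()
    transform_only.get_feature_contributions(data)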
103 changes: 103 additions & 0 deletions src/python/nimbusml/tests/pipeline/test_load_save.py
@@ -222,6 +222,109 @@ def test_unfitted_pickled_pipeline_can_be_fit(self):
metrics_pickle.sum().sum(),
decimal=2)

def test_unpickled_pipeline_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = Pipeline(
steps=[FastLinearBinaryClassifier(feature=features)])
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save with pickle
pickle_filename = 'nimbusml_model.p'
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
# Unpickle model
with open(pickle_filename, "rb") as f:
model_nimbusml_pickle = pickle.load(f)

fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

# every feature should have a contributions column, and its values should
# match those from the model before pickling
assert all('FeatureContributions.' + feature in fc_pickle.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_pickle['FeatureContributions.' + feature])
for feature in features)

os.remove(pickle_filename)

def test_unpickled_predictor_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = FastLinearBinaryClassifier(feature=features)
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save with pickle
pickle_filename = 'nimbusml_model.p'
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
# Unpickle model
with open(pickle_filename, "rb") as f:
model_nimbusml_pickle = pickle.load(f)

fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

assert all('FeatureContributions.' + feature in fc_pickle.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_pickle['FeatureContributions.' + feature])
for feature in features)

os.remove(pickle_filename)

def test_pipeline_loaded_from_zip_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = Pipeline(
steps=[FastLinearBinaryClassifier(feature=features)])
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save the model to zip
model_filename = 'nimbusml_model.zip'
model_nimbusml.save_model(model_filename)
# Load the model from zip
model_nimbusml_zip = Pipeline()
model_nimbusml_zip.load_model(model_filename)

fc_zip = model_nimbusml_zip.get_feature_contributions(test)

assert all('FeatureContributions.' + feature in fc_zip.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_zip['FeatureContributions.' + feature])
for feature in features)

os.remove(model_filename)

def test_predictor_loaded_from_zip_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = FastLinearBinaryClassifier(feature=features)
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save the model to zip
model_filename = 'nimbusml_model.zip'
model_nimbusml.save_model(model_filename)
# Load the model from zip
model_nimbusml_zip = Pipeline()
model_nimbusml_zip.load_model(model_filename)

fc_zip = model_nimbusml_zip.get_feature_contributions(test)

assert all('FeatureContributions.' + feature in fc_zip.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_zip['FeatureContributions.' + feature])
for feature in features)

os.remove(model_filename)

if __name__ == '__main__':
unittest.main()