This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Add observation level feature contributions to Pipeline and BasePredictor #196

Merged
merged 2 commits on Jul 19, 2019
1 change: 1 addition & 0 deletions src/python/nimbusml.pyproj
@@ -177,6 +177,7 @@
<Compile Include="nimbusml\examples\PipelineWithGridSearchCV2.py" />
<Compile Include="nimbusml\examples\PipelineWithGridSearchCV1.py" />
<Compile Include="nimbusml\examples\pipeline.py" />
<Compile Include="nimbusml\examples\PipelineWithFeatureContributions.py" />
<Compile Include="nimbusml\examples\Poisson.py" />
<Compile Include="nimbusml\examples\PoissonRegressionRegressor.py" />
<Compile Include="nimbusml\examples\RangeFilter.py" />
4 changes: 4 additions & 0 deletions src/python/nimbusml/base_predictor.py
@@ -97,6 +97,10 @@ def _invoke_inference_method(self, method, X, **params):
data = getattr(pipeline, method)(X, **params)
return data

@trace
def get_feature_contributions(self, X, **params):
return self._invoke_inference_method('get_feature_contributions', X, **params)

@trace
def predict(self, X, **params):
"""
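The new BasePredictor method forwards to the shared pipeline implementation, so any fitted estimator exposes feature contributions directly. A minimal usage sketch, mirroring the tests added below (train, test, and label are assumed to be prepared as in the test module):

    from nimbusml.linear_model import FastLinearBinaryClassifier

    model = FastLinearBinaryClassifier(feature=['age', 'education-num'])
    model.fit(train, label)
    fc = model.get_feature_contributions(test)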
86 changes: 86 additions & 0 deletions src/python/nimbusml/examples/PipelineWithFeatureContributions.py
@@ -0,0 +1,86 @@
###############################################################################
# Pipeline with observation level feature contributions

# Scoring a dataset with a trained model produces a score, or prediction, for
# each example. To understand and explain these predictions, it can be useful
# to inspect which features influenced them most significantly. This function
# computes a model-specific list of per-feature contributions to the score for
# each example. These contributions can be positive (they make the score
# higher) or negative (they make the score lower).

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('uciadult_train').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
# label workclass education ... capital-loss hours-per-week
# 0 0 Private 11th ... 0 40
# 1 0 Private HS-grad ... 0 50
# 2 1 Local-gov Assoc-acdm ... 0 40
# 3 1 Private Some-college ... 0 40
# 4 0 ? Some-college ... 0 30

# define the training pipeline with a linear model
lr_pipeline = Pipeline([LogisticRegressionBinaryClassifier(
feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
lr_model = lr_pipeline.fit(data)

# For linear models, the contribution of a given feature is equal to the
# product of the feature value and the corresponding weight. Similarly, for
# Generalized Additive Models (GAM), the contribution of a feature is equal to
# the shape function for the given feature evaluated at the feature value.
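# A minimal sketch of that formula; the weight below is hypothetical and
# chosen only for illustration (real weights come from the trained model):
hypothetical_weight_age = 0.03
example_age = 40
# contribution of 'age' for this example = weight * feature value = 1.2
example_age_contribution = hypothetical_weight_age * example_age
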
lr_feature_contributions = lr_model.get_feature_contributions(data)

# Print predictions with feature contributions, which give a relative measure
# of how much each feature impacted the Score.
print("========== Feature Contributions for Linear Model ==========")
print(lr_feature_contributions.head())
# label ... PredictedLabel Score ... FeatureContributions.hours-per-week
# 0 0 ... 0 -2.010687 ... 0.833069
# 1 0 ... 0 -1.216163 ... 0.809928
# 2 1 ... 0 -1.248412 ... 0.485957
# 3 1 ... 0 -1.132419 ... 0.583148
# 4 0 ... 0 -1.969522 ... 0.437361

# define the training pipeline with a tree model
tree_pipeline = Pipeline([FastTreesBinaryClassifier(
feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
tree_model = tree_pipeline.fit(data)

# For tree-based models, the calculation of feature contribution essentially
# consists in determining which splits in the tree have the most impact on the
# final score and assigning the value of the impact to the features determining
# the split. More precisely, the contribution of a feature is equal to the
# change in score produced by exploring the opposite sub-tree every time a
# decision node for the given feature is encountered.
#
# Consider a simple case with a single decision tree that has a decision node
# for the binary feature F1. Given an example that has feature F1 equal to
# true, we can calculate the score it would have obtained if we chose the
# subtree corresponding to the feature F1 being equal to false while keeping
# the other features constant. The contribution of feature F1 for the given
# example is the difference between the original score and the score obtained
# by taking the opposite decision at the node corresponding to feature F1. This
# algorithm extends naturally to models with many decision trees.
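# A minimal numeric sketch of that procedure for a single decision stump on
# the boolean feature F1 (the scores are hypothetical, for illustration only):
score_if_f1_true = 2.0
score_if_f1_false = -1.0
# For an example with F1 == True, the contribution of F1 is the original
# score minus the score from taking the opposite branch: 2.0 - (-1.0) = 3.0
f1_contribution = score_if_f1_true - score_if_f1_false
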
tree_feature_contributions = tree_model.get_feature_contributions(data)

# Print predictions with feature contributions, which give a relative measure
# of how much each feature impacted the Score.
print("========== Feature Contributions for Tree Model ==========")
print(tree_feature_contributions.head())
# label ... PredictedLabel Score ... FeatureContributions.hours-per-week
# 0 0 ... 0 -16.717360 ... -0.608664
# 1 0 ... 0 -7.688200 ... -0.541213
# 2 1 ... 1 1.571164 ... 0.032862
# 3 1 ... 1 2.115638 ... 0.537077
# 4 0 ... 0 -23.038410 ... -0.682764
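
# The number of contributions reported per example can be capped with the top
# and bottom parameters of get_feature_contributions (documented on Pipeline);
# for instance, keep only the two largest positive and two largest negative
# contributions:
limited_contributions = tree_model.get_feature_contributions(data, top=2, bottom=2)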

14 changes: 14 additions & 0 deletions src/python/nimbusml/internal/core/base_pipeline_item.py
@@ -15,6 +15,7 @@
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from itertools import chain
from shutil import copyfile
from textwrap import wrap

import six
@@ -447,6 +448,19 @@ def get_roles_params(self):
res["columns"] = pars
return res

@trace
def save_model(self, dst):
"""
Save model to file. For more details, please refer to
`load/save model </nimbusml/loadsavemodels>`_

:param dst: destination filename for the saved model

"""
if self.model_ is not None:
if os.path.isfile(self.model_):
copyfile(self.model_, dst)

def __getitem__(self, cols):
"""
Returns a View on this element restricted to the selected column.
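The save_model added here copies the estimator's trained model file to the destination path, so an individual predictor can be persisted without wrapping it in a Pipeline. A minimal sketch, following the usage in the tests below:

    model = FastLinearBinaryClassifier(feature=['age', 'education-num'])
    model.fit(train, label)
    model.save_model('nimbusml_model.zip')  # copies the underlying model file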
118 changes: 117 additions & 1 deletion src/python/nimbusml/pipeline.py
@@ -41,6 +41,8 @@
transforms_datasetscorer
from .internal.entrypoints.transforms_featurecombiner import \
transforms_featurecombiner
from .internal.entrypoints.transforms_featurecontributioncalculationtransformer import \
transforms_featurecontributioncalculationtransformer
from .internal.entrypoints.transforms_labelcolumnkeybooleanconverter \
import \
transforms_labelcolumnkeybooleanconverter
@@ -1694,6 +1696,120 @@ def getn(n):
"only fit(X) is allowed or the training becomes "
"ambiguous.")

@trace
def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
as_binary_data_stream=False, **params):
"""
Calculates observation level feature contributions. Returns a dataframe
with the raw data, predictions, and feature contributions for each
prediction. Feature contributions are not supported for transforms, so
make sure that the last step in a pipeline is a model. Feature
contributions are supported for the following models:

* Regression:

* OrdinaryLeastSquaresRegressor
* FastLinearRegressor
* OnlineGradientDescentRegressor
* PoissonRegressionRegressor
* GamRegressor
* LightGbmRegressor
* FastTreesRegressor
* FastForestRegressor
* FastTreesTweedieRegressor

* Binary Classification:

* AveragedPerceptronBinaryClassifier
* LinearSvmBinaryClassifier
* LogisticRegressionBinaryClassifier
* FastLinearBinaryClassifier
* SgdBinaryClassifier
* SymSgdBinaryClassifier
* GamBinaryClassifier
* FastForestBinaryClassifier
* FastTreesBinaryClassifier
* LightGbmBinaryClassifier

* Ranking:

* LightGbmRanker

:param X: {array-like [n_samples, n_features],
:py:class:`nimbusml.FileDataStream` }
:param top: The number of positive contributions with highest magnitude
to report.
:param bottom: The number of negative contributions with highest
magnitude to report.
:return: dataframe containing the raw data, predicted label, score,
probabilities, and feature contributions.
"""
self.verbose = verbose

if not self._is_fitted:
raise ValueError(
"Model is not fitted. Train or load a model before test().")

if len(self.steps) > 0:
last_node = self.last_node
if last_node.type == 'transform':
raise ValueError(
"Pipeline needs a trainer as last step for test()")

X, y_temp, columns_renamed, feature_columns, label_column, \
schema, weights, weight_column = self._preprocess_X_y(X)

all_nodes = []
inputs = dict([('data', ''), ('predictor_model', self.model)])
if isinstance(X, FileDataStream):
importtext_node = data_customtextloader(
input_file="$file",
data="$data",
custom_schema=schema.to_string(
add_sep=True))
all_nodes = [importtext_node]
inputs = dict([('file', ''), ('predictor_model', self.model)])

score_node = transforms_datasetscorer(
data="$data",
predictor_model="$predictor_model",
scored_data="$scoredvectordata")

fcc_node = transforms_featurecontributioncalculationtransformer(
data="$scoredvectordata",
predictor_model="$predictor_model",
output_data="$output_data",
top=top,
bottom=bottom,
normalize=True)

all_nodes.extend([score_node, fcc_node])

outputs = dict(output_data="")

graph = Graph(
inputs,
outputs,
as_binary_data_stream,
*all_nodes)

class_name = type(self).__name__
method_name = inspect.currentframe().f_code.co_name
telemetry_info = ".".join([class_name, method_name])

try:
(out_model, out_data, out_metrics) = graph.run(
X=X,
random_state=self.random_state,
model=self.model,
verbose=verbose,
telemetry_info=telemetry_info,
**params)
except RuntimeError as e:
raise e

return out_data

@trace
def _predict(self, X, y=None,
evaltype='auto', group_id=None,
@@ -1943,7 +2059,7 @@ def test(
otherwise None
in the returned tuple.
:return: tuple (dataframe of evaluation metrics, dataframe of
scores). Is scores are
scores). If scores are
required, set `output_scores`=True, otherwise None is
returned by default.
"""
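As the docstring notes, feature contributions require the last pipeline step to be a trained model; a transform-only pipeline raises a ValueError. A minimal sketch (OneHotVectorizer is chosen here only for illustration):

    from nimbusml import Pipeline
    from nimbusml.feature_extraction.categorical import OneHotVectorizer

    transform_only = Pipeline([OneHotVectorizer(columns=['education'])])
    transform_only.fit(data)
    # raises ValueError: Pipeline needs a trainer as last step for test()
    transform_only.get_feature_contributions(data)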
103 changes: 103 additions & 0 deletions src/python/nimbusml/tests/pipeline/test_load_save.py
@@ -222,6 +222,109 @@ def test_unfitted_pickled_pipeline_can_be_fit(self):
metrics_pickle.sum().sum(),
decimal=2)

def test_unpickled_pipeline_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = Pipeline(
steps=[FastLinearBinaryClassifier(feature=features)])
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save with pickle
pickle_filename = 'nimbusml_model.p'
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
# Unpickle model
with open(pickle_filename, "rb") as f:
model_nimbusml_pickle = pickle.load(f)

fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

# every feature should have a contributions column, and its values should
# match those from the model before pickling
assert all('FeatureContributions.' + feature in fc_pickle.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_pickle['FeatureContributions.' + feature])
for feature in features)

os.remove(pickle_filename)

def test_unpickled_predictor_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = FastLinearBinaryClassifier(feature=features)
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save with pickle
pickle_filename = 'nimbusml_model.p'
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
# Unpickle model
with open(pickle_filename, "rb") as f:
model_nimbusml_pickle = pickle.load(f)

fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

assert all('FeatureContributions.' + feature in fc_pickle.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_pickle['FeatureContributions.' + feature])
for feature in features)

os.remove(pickle_filename)

def test_pipeline_loaded_from_zip_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = Pipeline(
steps=[FastLinearBinaryClassifier(feature=features)])
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save the model to zip
model_filename = 'nimbusml_model.zip'
model_nimbusml.save_model(model_filename)
# Load the model from zip
model_nimbusml_zip = Pipeline()
model_nimbusml_zip.load_model(model_filename)

fc_zip = model_nimbusml_zip.get_feature_contributions(test)

assert all('FeatureContributions.' + feature in fc_zip.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_zip['FeatureContributions.' + feature])
for feature in features)

os.remove(model_filename)

def test_predictor_loaded_from_zip_has_feature_contributions(self):
features = ['age', 'education-num', 'hours-per-week']

model_nimbusml = FastLinearBinaryClassifier(feature=features)
model_nimbusml.fit(train, label)
fc = model_nimbusml.get_feature_contributions(test)

# Save the model to zip
model_filename = 'nimbusml_model.zip'
model_nimbusml.save_model(model_filename)
# Load the model from zip
model_nimbusml_zip = Pipeline()
model_nimbusml_zip.load_model(model_filename)

fc_zip = model_nimbusml_zip.get_feature_contributions(test)

assert all('FeatureContributions.' + feature in fc_zip.columns
for feature in features)

assert all(fc['FeatureContributions.' + feature].equals(
fc_zip['FeatureContributions.' + feature])
for feature in features)

os.remove(model_filename)

if __name__ == '__main__':
unittest.main()