
Uplift tree/forest: add feature importance and parallelize forest #220

Merged (4 commits) on Jul 29, 2020
44 changes: 34 additions & 10 deletions causalml/inference/tree/models.py
@@ -19,7 +19,9 @@
import scipy.stats as stats
import pandas as pd
from sklearn.utils.testing import ignore_warnings

from collections import defaultdict
from joblib import Parallel, delayed
import multiprocessing as mp

class DecisionTree:
""" Tree Node Class
@@ -176,14 +178,19 @@ def fit(self, X, treatment, y):
    assert len(X) == len(y) and len(X) == len(treatment), 'Data length must be equal for X, treatment, and y.'

    self.treatment_group = list(set(treatment))
    self.feature_imp_dict = defaultdict(float)

    self.fitted_uplift_tree = self.growDecisionTreeFrom(
        X, treatment, y, evaluationFunction=self.evaluationFunction,
        max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
        depth=1, min_samples_treatment=self.min_samples_treatment,
        n_reg=self.n_reg, parentNodeSummary=None
    )

    self.feature_importances_ = np.zeros(X.shape[1])
    for col, imp in self.feature_imp_dict.items():
        self.feature_importances_[col] = imp
    self.feature_importances_ /= self.feature_importances_.sum()  # normalize to sum to 1

    return self

# Prune Trees
def prune(self, X, treatment, y, minGain=0.0001, rule='maxAbsDiff'):
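A side note on how the new attribute behaves: fit accumulates each split's gain per feature column in feature_imp_dict, then normalizes so the importances sum to one, much like scikit-learn's impurity-based importances. A minimal standalone sketch, with hypothetical column indices and gain values:

from collections import defaultdict
import numpy as np

# Hypothetical gains accumulated during tree growth (feature index -> total gain).
feature_imp_dict = defaultdict(float)
feature_imp_dict[2] += 0.8  # a split on feature 2
feature_imp_dict[0] += 0.2  # a split on feature 0

n_features = 4  # assumed width of X
feature_importances = np.zeros(n_features)
for col, imp in feature_imp_dict.items():
    feature_importances[col] = imp
feature_importances /= feature_importances.sum()
print(feature_importances)  # [0.2 0.  0.8 0. ]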
@@ -976,12 +983,14 @@ def growDecisionTreeFrom(self, X, treatment, y, evaluationFunction, max_depth=10
        leftScore1 = evaluationFunction(leftNodeSummary)
        rightScore2 = evaluationFunction(rightNodeSummary)
        gain = (currentScore - p * leftScore1 - (1 - p) * rightScore2)
        gain_for_imp = (len(X) * currentScore - len(X_l) * leftScore1 - len(X_r) * rightScore2)
    else:
        if (self.control_name in leftNodeSummary and
                self.control_name in rightNodeSummary):
            leftScore1 = evaluationFunction(leftNodeSummary, control_name=self.control_name)
            rightScore2 = evaluationFunction(rightNodeSummary, control_name=self.control_name)
            gain = (p * leftScore1 + (1 - p) * rightScore2 - currentScore)
            gain_for_imp = (len(X_l) * leftScore1 + len(X_r) * rightScore2 - len(X) * currentScore)
            if self.normalization:
                norm_factor = self.normI(currentNodeSummary,
                                         leftNodeSummary,
@@ -998,6 +1007,7 @@
            bestAttribute = (col, value)
            best_set_left = [X_l, w_l, y_l]
            best_set_right = [X_r, w_r, y_r]
            self.feature_imp_dict[bestAttribute[0]] += gain_for_imp

    dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
    # Add treatment size
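For intuition, gain_for_imp weights the impurity change by node sizes, in the spirit of CART's N_parent * impurity_parent - N_left * impurity_left - N_right * impurity_right, so splits near the root contribute more importance. A toy calculation with made-up numbers for the first branch above:

# Hypothetical node sizes and scores for the first branch shown above.
n, n_l, n_r = 100, 60, 40
current_score, left_score, right_score = 0.50, 0.30, 0.45
gain_for_imp = n * current_score - n_l * left_score - n_r * right_score
print(gain_for_imp)  # 50.0 - 18.0 - 18.0 = 14.0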
@@ -1200,7 +1210,8 @@ def __init__(self,
             n_reg=10,
             evaluationFunction=None,
             control_name=None,
             normalization=True,
             n_jobs=-1):
    """
    Initialize the UpliftRandomForestClassifier class.
    """
@@ -1214,6 +1225,7 @@
    self.n_reg = n_reg
    self.evaluationFunction = evaluationFunction
    self.control_name = control_name
    self.n_jobs = n_jobs

    # Create forest
    self.uplift_forest = []
@@ -1229,6 +1241,9 @@
        self.uplift_forest.append(uplift_tree)

    if self.n_jobs == -1:
        self.n_jobs = mp.cpu_count()
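A usage sketch for the new n_jobs parameter, assuming the package's public import path at the time (the fit call and data are illustrative only):

from causalml.inference.tree import UpliftRandomForestClassifier

# n_jobs=-1 resolves to mp.cpu_count(), i.e., one worker per core.
forest = UpliftRandomForestClassifier(control_name='control', n_jobs=-1)
# forest.fit(X, treatment, y)          # treatment holds group labels such as 'control'
# print(forest.feature_importances_)   # new attribute introduced in this PR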

def fit(self, X, treatment, y):
    """
    Fit the UpliftRandomForestClassifier.
@@ -1254,13 +1269,22 @@
    for i, treatment_group_key in enumerate(treatment_group_keys):
        self.classes_[treatment_group_key] = i

    # Bootstrap (removed: the old sequential loop)
    for tree_i in range(len(self.uplift_forest)):
        bt_index = np.random.choice(len(X), len(X))
        x_train_bt = X[bt_index]
        y_train_bt = y[bt_index]
        treatment_train_bt = treatment[bt_index]
        self.uplift_forest[tree_i].fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)

    # Bootstrap (added: fit the trees in parallel with joblib)
    self.uplift_forest = (
        Parallel(n_jobs=self.n_jobs)
        (delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
    )

    all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
    self.feature_importances_ = np.mean(all_importances, axis=0)
    self.feature_importances_ /= self.feature_importances_.sum()  # normalize to sum to 1
Collaborator:
Do we need a self.feature_importances_.sum() > 0 check here, as in your compute_feature_importances() reference, to avoid dividing by zero (e.g., when the root is pure)? This might be an extreme edge case, though.

Contributor (author):
Good callout. I left it out because it should be a rare, extreme case (also, if the root is pure, the user is not using uplift trees correctly).

Contributor (author):
But yeah, if we want to prevent an error from being raised, we can add the condition.

Collaborator:
Agreed with your point that it's minor and more of a user error. Thanks for confirming!


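For reference, the guard discussed in this thread could look like the sketch below; it is not part of the merged diff:

import numpy as np

importances = np.zeros(4)  # e.g., a pure root: no splits, so every accumulated gain is zero
total = importances.sum()
if total > 0:
    importances /= total  # normalize only when there is something to scale
# otherwise importances stays all-zero instead of producing NaNs from 0/0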
def bootstrap(self, X, treatment, y, tree):
    bt_index = np.random.choice(len(X), len(X))
    x_train_bt = X[bt_index]
    y_train_bt = y[bt_index]
    treatment_train_bt = treatment[bt_index]
    tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
    return tree

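The per-tree bootstrap maps directly onto joblib's Parallel/delayed pattern. Below is a self-contained sketch with a toy fit function (all names hypothetical); passing an explicit seed per task is one way to keep bootstrap samples distinct across workers:

import numpy as np
from joblib import Parallel, delayed

def fit_one_tree(X, y, seed):
    # Draw a bootstrap sample and return a stand-in "fitted model" (here, column means).
    rng = np.random.RandomState(seed)
    idx = rng.choice(len(X), len(X))
    return X[idx].mean(axis=0)

X = np.random.rand(1000, 5)
y = np.random.randint(2, size=1000)

# One delayed call per tree; n_jobs=-1 uses all cores, matching the forest's default.
fitted = Parallel(n_jobs=-1)(
    delayed(fit_one_tree)(X, y, seed) for seed in range(10)
)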
@ignore_warnings(category=FutureWarning)
def predict(self, X, full_output=False):
1,248 changes: 716 additions & 532 deletions examples/feature_interpretations_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
setuptools>=41.0.0
pip>=10.0
numpy>=0.16.0          # removed
numpy>=0.16.0,<1.19.0  # added
scipy==1.4.1
matplotlib
pandas>=0.24.1