Skip to content

Commit

Permalink
Merge pull request #220 from uber/uplift_tree
Browse files Browse the repository at this point in the history
Uplift tree/forest: add feature importance and parallelize forest
  • Loading branch information
yungmsh authored Jul 29, 2020
2 parents f14c290 + b281cb1 commit b55d70e
Show file tree
Hide file tree
Showing 3 changed files with 751 additions and 543 deletions.
44 changes: 34 additions & 10 deletions causalml/inference/tree/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
import scipy.stats as stats
import pandas as pd
from sklearn.utils.testing import ignore_warnings

from collections import defaultdict
from joblib import Parallel, delayed
import multiprocessing as mp

class DecisionTree:
""" Tree Node Class
Expand Down Expand Up @@ -176,14 +178,19 @@ def fit(self, X, treatment, y):
assert len(X) == len(y) and len(X) == len(treatment), 'Data length must be equal for X, treatment, and y.'

self.treatment_group = list(set(treatment))
self.feature_imp_dict = defaultdict(float)

self.fitted_uplift_tree = self.growDecisionTreeFrom(
X, treatment, y, evaluationFunction=self.evaluationFunction,
max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
depth=1, min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg, parentNodeSummary=None
)
return self

self.feature_importances_ = np.zeros(X.shape[1])
for col, imp in self.feature_imp_dict.items():
self.feature_importances_[col] = imp
self.feature_importances_ /= self.feature_importances_.sum() # normalize to add to 1

# Prune Trees
def prune(self, X, treatment, y, minGain=0.0001, rule='maxAbsDiff'):
Expand Down Expand Up @@ -976,12 +983,14 @@ def growDecisionTreeFrom(self, X, treatment, y, evaluationFunction, max_depth=10
leftScore1 = evaluationFunction(leftNodeSummary)
rightScore2 = evaluationFunction(rightNodeSummary)
gain = (currentScore - p * leftScore1 - (1 - p) * rightScore2)
gain_for_imp = (len(X) * currentScore - len(X_l) * leftScore1 - len(X_r) * rightScore2)
else:
if (self.control_name in leftNodeSummary and
self.control_name in rightNodeSummary):
leftScore1 = evaluationFunction(leftNodeSummary, control_name=self.control_name)
rightScore2 = evaluationFunction(rightNodeSummary, control_name=self.control_name)
gain = (p * leftScore1 + (1 - p) * rightScore2 - currentScore)
gain_for_imp = (len(X_l) * leftScore1 + len(X_r) * rightScore2 - len(X) * currentScore)
if self.normalization:
norm_factor = self.normI(currentNodeSummary,
leftNodeSummary,
Expand All @@ -998,6 +1007,7 @@ def growDecisionTreeFrom(self, X, treatment, y, evaluationFunction, max_depth=10
bestAttribute = (col, value)
best_set_left = [X_l, w_l, y_l]
best_set_right = [X_r, w_r, y_r]
self.feature_imp_dict[bestAttribute[0]] += gain_for_imp

dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
# Add treatment size
Expand Down Expand Up @@ -1200,7 +1210,8 @@ def __init__(self,
n_reg=10,
evaluationFunction=None,
control_name=None,
normalization=True):
normalization=True,
n_jobs=-1):
"""
Initialize the UpliftRandomForestClassifier class.
"""
Expand All @@ -1214,6 +1225,7 @@ def __init__(self,
self.n_reg = n_reg
self.evaluationFunction = evaluationFunction
self.control_name = control_name
self.n_jobs = n_jobs

# Create forest
self.uplift_forest = []
Expand All @@ -1229,6 +1241,9 @@ def __init__(self,

self.uplift_forest.append(uplift_tree)

if self.n_jobs == -1:
self.n_jobs = mp.cpu_count()

def fit(self, X, treatment, y):
"""
Fit the UpliftRandomForestClassifier.
Expand All @@ -1254,13 +1269,22 @@ def fit(self, X, treatment, y):
for i, treatment_group_key in enumerate(treatment_group_keys):
self.classes_[treatment_group_key] = i

# Bootstrap
for tree_i in range(len(self.uplift_forest)):
bt_index = np.random.choice(len(X), len(X))
x_train_bt = X[bt_index]
y_train_bt = y[bt_index]
treatment_train_bt = treatment[bt_index]
self.uplift_forest[tree_i].fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
self.uplift_forest = (
Parallel(n_jobs=self.n_jobs)
(delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
)

all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
self.feature_importances_ = np.mean(all_importances, axis=0)
self.feature_importances_ /= self.feature_importances_.sum() # normalize to add to 1

def bootstrap(self, X, treatment, y, tree):
    """Fit ``tree`` on one bootstrap resample of the data and return it.

    A resample of ``len(X)`` row positions is drawn with replacement from
    NumPy's global random state, and the same positions are applied to
    ``X``, ``treatment`` and ``y`` so the three stay row-aligned.

    Args:
        X: feature rows; must support ``len()`` and indexing by an integer
            array (presumably a NumPy array -- confirm against callers).
        treatment: per-row treatment labels, indexable the same way as X.
        y: per-row outcomes, indexable the same way as X.
        tree: object exposing ``fit(X=..., treatment=..., y=...)``.

    Returns:
        The ``tree`` object that was passed in, after ``fit`` was called.
    """
    n_rows = len(X)
    # Same single draw as before: n_rows positions in [0, n_rows),
    # sampled with replacement (the default of np.random.choice).
    sampled_rows = np.random.choice(n_rows, n_rows)
    tree.fit(
        X=X[sampled_rows],
        treatment=treatment[sampled_rows],
        y=y[sampled_rows],
    )
    # Returned (not just fitted in place) so a caller that runs this in
    # another process can collect the fitted tree from the result.
    return tree

@ignore_warnings(category=FutureWarning)
def predict(self, X, full_output=False):
Expand Down
1,248 changes: 716 additions & 532 deletions examples/feature_interpretations_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
setuptools>=41.0.0
pip>=10.0
numpy>=0.16.0
numpy>=0.16.0,<1.19.0
scipy==1.4.1
matplotlib
pandas>=0.24.1
Expand Down

0 comments on commit b55d70e

Please sign in to comment.