Merge pull request #220 from uber/uplift_tree

Uplift tree/forest: add feature importance and parallelize forest
uber · Jul 29, 2020 · b55d70e
2 parents f14c290 + b281cb1
commit b55d70e
Show file tree

Hide file tree

Showing 3 changed files with 751 additions and 543 deletions.
diff --git a/causalml/inference/tree/models.py b/causalml/inference/tree/models.py
@@ -19,7 +19,9 @@
 import scipy.stats as stats
 import pandas as pd
 from sklearn.utils.testing import ignore_warnings
-
+from collections import defaultdict
+from joblib import Parallel, delayed
+import multiprocessing as mp
 
 class DecisionTree:
     """ Tree Node Class
@@ -176,14 +178,19 @@ def fit(self, X, treatment, y):
         assert len(X) == len(y) and len(X) == len(treatment), 'Data length must be equal for X, treatment, and y.'
 
         self.treatment_group = list(set(treatment))
+        self.feature_imp_dict = defaultdict(float)
 
         self.fitted_uplift_tree = self.growDecisionTreeFrom(
             X, treatment, y, evaluationFunction=self.evaluationFunction,
             max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
             depth=1, min_samples_treatment=self.min_samples_treatment,
             n_reg=self.n_reg, parentNodeSummary=None
         )
-        return self
+
+        self.feature_importances_ = np.zeros(X.shape[1])
+        for col, imp in self.feature_imp_dict.items():
+            self.feature_importances_[col] = imp
+        self.feature_importances_ /= self.feature_importances_.sum()  # normalize to add to 1
 
     # Prune Trees
     def prune(self, X, treatment, y, minGain=0.0001, rule='maxAbsDiff'):
@@ -976,12 +983,14 @@ def growDecisionTreeFrom(self, X, treatment, y, evaluationFunction, max_depth=10
                     leftScore1 = evaluationFunction(leftNodeSummary)
                     rightScore2 = evaluationFunction(rightNodeSummary)
                     gain = (currentScore - p * leftScore1 - (1 - p) * rightScore2)
+                    gain_for_imp = (len(X) * currentScore - len(X_l) * leftScore1 - len(X_r) * rightScore2)
                 else:
                     if (self.control_name in leftNodeSummary and
                         self.control_name in rightNodeSummary):
                         leftScore1 = evaluationFunction(leftNodeSummary, control_name=self.control_name)
                         rightScore2 = evaluationFunction(rightNodeSummary, control_name=self.control_name)
                         gain = (p * leftScore1 + (1 - p) * rightScore2 - currentScore)
+                        gain_for_imp = (len(X_l) * leftScore1 + len(X_r) * rightScore2 - len(X) * currentScore)
                         if self.normalization:
                             norm_factor = self.normI(currentNodeSummary,
                                                      leftNodeSummary,
@@ -998,6 +1007,7 @@ def growDecisionTreeFrom(self, X, treatment, y, evaluationFunction, max_depth=10
                     bestAttribute = (col, value)
                     best_set_left = [X_l, w_l, y_l]
                     best_set_right = [X_r, w_r, y_r]
+                    self.feature_imp_dict[bestAttribute[0]] += gain_for_imp
 
         dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
         # Add treatment size
@@ -1200,7 +1210,8 @@ def __init__(self,
                  n_reg=10,
                  evaluationFunction=None,
                  control_name=None,
-                 normalization=True):
+                 normalization=True,
+                 n_jobs=-1):
         """
         Initialize the UpliftRandomForestClassifier class.
         """
@@ -1214,6 +1225,7 @@ def __init__(self,
         self.n_reg = n_reg
         self.evaluationFunction = evaluationFunction
         self.control_name = control_name
+        self.n_jobs = n_jobs
 
         # Create forest
         self.uplift_forest = []
@@ -1229,6 +1241,9 @@ def __init__(self,
 
             self.uplift_forest.append(uplift_tree)
 
+        if self.n_jobs == -1:
+            self.n_jobs = mp.cpu_count()
+
     def fit(self, X, treatment, y):
         """
         Fit the UpliftRandomForestClassifier.
@@ -1254,13 +1269,22 @@ def fit(self, X, treatment, y):
         for i, treatment_group_key in enumerate(treatment_group_keys):
             self.classes_[treatment_group_key] = i
 
-        # Bootstrap
-        for tree_i in range(len(self.uplift_forest)):
-            bt_index = np.random.choice(len(X), len(X))
-            x_train_bt = X[bt_index]
-            y_train_bt = y[bt_index]
-            treatment_train_bt = treatment[bt_index]
-            self.uplift_forest[tree_i].fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
+        self.uplift_forest = (
+            Parallel(n_jobs=self.n_jobs)
+            (delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
+        )
+
+        all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
+        self.feature_importances_ = np.mean(all_importances, axis=0)
+        self.feature_importances_ /= self.feature_importances_.sum()  # normalize to add to 1
+
+    def bootstrap(self, X, treatment, y, tree):  # fit one tree on a bootstrap resample of (X, treatment, y)
+        bt_index = np.random.choice(len(X), len(X))  # len(X) row indices, with replacement (np.random.choice default); NOTE(review): uses the global NumPy RNG - confirm seeding in joblib workers
+        x_train_bt = X[bt_index]
+        y_train_bt = y[bt_index]
+        treatment_train_bt = treatment[bt_index]  # the same indices keep X, y and treatment rows aligned
+        tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
+        return tree  # the fitted tree is returned so the caller can collect results from Parallel
 
     @ignore_warnings(category=FutureWarning)
     def predict(self, X, full_output=False):

diff --git a/examples/feature_interpretations_example.ipynb b/examples/feature_interpretations_example.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 setuptools>=41.0.0
 pip>=10.0
-numpy>=0.16.0
+numpy>=0.16.0,<1.19.0
 scipy==1.4.1
 matplotlib
 pandas>=0.24.1