Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions autoum/approaches/uplift_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from datetime import datetime

import numpy as np
import pandas as pd
from causalml.inference.tree import UpliftRandomForestClassifier

from autoum.approaches.utils import ApproachParameters, DataSetsHelper, Helper
Expand Down Expand Up @@ -57,9 +58,13 @@ def __init__(self, parameters: dict, approach_parameters: ApproachParameters, ev
self.feature_importance = approach_parameters.feature_importance
self.save = approach_parameters.save
self.path = approach_parameters.path
self.post_prune = approach_parameters.post_prune
self.split_number = approach_parameters.split_number
self.log = logging.getLogger(type(self).__name__)

if eval_function not in ["ED", "KL", "CHI"]:
self.post_prune = False

def analyze(self, data_set_helper: DataSetsHelper) -> dict:
"""
Calculate the score (ITE/Uplift/CATE) for each sample using uplift random forest
Expand All @@ -78,6 +83,10 @@ def analyze(self, data_set_helper: DataSetsHelper) -> dict:

urf.fit(X=data_set_helper.x_train, treatment=experiment_groups_col, y=data_set_helper.y_train)

if self.post_prune:
for list_id, tree in enumerate(urf.uplift_forest):
tree.prune(data_set_helper.x_train, experiment_groups_col, data_set_helper.y_train)

self.log.debug(urf)

if self.save:
Expand Down
5 changes: 3 additions & 2 deletions autoum/approaches/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def add_treatment_group_key(df: pd.DataFrame) -> np.array:
"""

experiment_groups_col = ["c" if x == 0 else "t" for x in df.treatment]
experiment_groups_col = np.array(experiment_groups_col)
experiment_groups_col = np.array(experiment_groups_col, dtype=object)

return experiment_groups_col

Expand Down Expand Up @@ -108,7 +108,7 @@ class ApproachParameters:
Utility class that encompasses all parameters needed to create an approach instance.
"""

def __init__(self, cost_sensitive: bool, feature_importance: bool, path: str, save: bool, split_number: int):
def __init__(self, cost_sensitive: bool, feature_importance: bool, path: str, post_prune: bool, save: bool, split_number: int):
"""
Utility class that encompasses all parameters needed to create an approach instance.

Expand All @@ -122,5 +122,6 @@ def __init__(self, cost_sensitive: bool, feature_importance: bool, path: str, sa
self.cost_sensitive = cost_sensitive
self.feature_importance = feature_importance
self.path = path
self.post_prune = post_prune
self.save = save
self.split_number = split_number
28 changes: 18 additions & 10 deletions autoum/pipelines/pipeline_rw.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(self,
plot_uqc: bool = True,
plot_save_figures: bool = False,
pool_capacity: int = 40,
post_prune: bool = False,
rlearner: bool = False,
run_name: str = "RUN",
run_id: int = 1,
Expand Down Expand Up @@ -109,6 +110,7 @@ def __init__(self,
:param plot_uqc: True if the UQC value for a curve should be included in the plot legend. False otherwise. Default: True
:param plot_save_figures: True if the resulting qini figures shall be saved. False otherwise. Default: False
:param pool_capacity: Set this to the maximum number of free kernels for the calculation. Default 40
:param post_prune: Prune the uplift models after training; applies to URF_CHI, URF_ED and URF_KL. Default: False
:param rlearner: True, if R-Learner should be applied. False otherwise. Default: False
:param run_id: Id of the run (For logging and saving purposes). Default: 1
:param run_name: Name of the run (For logging and saving purposes). Default: "RUN"
Expand Down Expand Up @@ -152,6 +154,7 @@ def __init__(self,
self.plot_uqc = plot_uqc
self.plot_save_figures = plot_save_figures
self.pool_capacity = pool_capacity
self.post_prune = post_prune
self.rlearner = rlearner
self.random_seed = random_seed
self.save_models = save_models
Expand Down Expand Up @@ -216,26 +219,31 @@ def sanity_checks(self):
assert 0.1 <= self.validation_size <= 0.9, "Please select 0.1 <= validation_size <= 0.9"
assert self.n_estimators % 4 == 0, "Please select a multiple of 4 as n_estimators"

def analyze_dataset(self, data: pd.DataFrame):
def analyze_dataset(self, data: pd.DataFrame, test_data: pd.DataFrame = None):
"""
Apply, compare, and evaluate various uplift modeling approaches on the given data set.

:param data: Dataset to be analyzed
:param test_data: (optional) Test Dataset, which the pipeline will use for the test metrics
"""

if not isinstance(data, pd.DataFrame):
return

start = time.time()
logging.info("Starting analyzing dataset ... ")

try:
df_train, df_test = train_test_split(data, test_size=self.test_size, shuffle=True, stratify=data[['response', 'treatment']], random_state=self.random_seed)
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
except ValueError:
logging.error("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string())
raise ValueError("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string())
if test_data is not None:
assert data.columns.equals(test_data.columns), "The train and test dataset columns are not identical"
df_train, df_test = data.sample(frac=1.0, random_state=self.random_seed), test_data
else:
try:
df_train, df_test = train_test_split(data, test_size=self.test_size, shuffle=True, stratify=data[['response', 'treatment']], random_state=self.random_seed)
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
except ValueError:
logging.error("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string())
raise ValueError("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string())

# Get feature names
feature_names = list(df_train.drop(['response', 'treatment'], axis=1).columns.values)
Expand Down Expand Up @@ -436,7 +444,7 @@ def train_eval_splits(self, args):

scores_dict = HelperPipeline.apply_uplift_approaches(df_train=df_train, df_valid=df_valid, df_test=df_test, parameters=self.parameters, approach=[approach_name],
split_number=i, cost_sensitive=self.cost_sensitive, feature_importance=self.feature_importance,
save_models=self.save_models)
save_models=self.save_models, post_prune=self.post_prune)

logging.info("Start Evaluation. Split number {}".format(i))

Expand Down
6 changes: 4 additions & 2 deletions autoum/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def apply_uplift_approaches(df_train: pd.DataFrame,
split_number: int,
cost_sensitive: bool = False,
feature_importance: bool = False,
save_models: bool = False) -> dict:
save_models: bool = False,
post_prune: bool = False) -> dict:
"""
Apply given uplift modeling approaches on the given dataframes and return the scores

Expand All @@ -95,6 +96,7 @@ def apply_uplift_approaches(df_train: pd.DataFrame,
:param cost_sensitive: Set this to true for cost sensitive learning.
:param feature_importance: Set this to True to return the feature importances of the classifiers
:param save_models: True if the models generated during training shall be saved. False otherwise.
:param post_prune: Set this to true to prune the trees of the URF approaches after training
:return: Dictionary with the following keys: df_scores_train, df_scores_test, df_train, df_test, feature_importances (empty dictionary if not used)
"""

Expand All @@ -110,7 +112,7 @@ def apply_uplift_approaches(df_train: pd.DataFrame,
ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
# ApproachParameters contains all parameters necessary to initialize an approach classifier
root = f"{get_data_home()}/models/"
approach_params = ApproachParameters(cost_sensitive=cost_sensitive, feature_importance=feature_importance, path=root, save=save_models, split_number=split_number)
approach_params = ApproachParameters(cost_sensitive=cost_sensitive, feature_importance=feature_importance, path=root, post_prune=post_prune, save=save_models, split_number=split_number)

# This dictionary is used as wrapper for passing all parameters at once for apply_approach
apply_params = {
Expand Down
2 changes: 1 addition & 1 deletion tests/test_bayesian_causal_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_class_variable_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_generalized_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_lais_generalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
4 changes: 2 additions & 2 deletions tests/test_pipeline_rw.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,10 @@ def test_plotting(self):

def test_create_approach_tuples(self):
cv_number_splits = 10
pipeline = PipelineRW(cv_number_splits=cv_number_splits, urf_ddp=False, two_model=False)
pipeline = PipelineRW(cv_number_splits=cv_number_splits, slearner=True, two_model=True)
dataframe_pairs = pipeline.create_k_splits(df_train=self.df_train, df_test=self.df_test)
tuple_list = pipeline.create_approach_tuples(dataframe_pairs)
self.assertEqual(len(tuple_list), 15 * cv_number_splits)
self.assertEqual(len(tuple_list), 2 * cv_number_splits)
for _tuple in tuple_list:
self.assertEqual(len(_tuple), 5)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_r_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_s_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_traditional.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_treatment_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_two_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
2 changes: 1 addition & 1 deletion tests/test_uplift_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down
10 changes: 6 additions & 4 deletions tests/test_utils_pipelines.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
import unittest
from unittest.mock import MagicMock, patch

Expand Down Expand Up @@ -31,7 +32,7 @@ def setUp(self):
self.df_train, self.df_valid = train_test_split(data, test_size=0.2, shuffle=True, stratify=data[['response', 'treatment']], random_state=123)

self.ds_helper = DataSetsHelper(df_train=self.df_train, df_valid=self.df_valid, df_test=self.df_test)
self.approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=None, save=False, split_number=0)
self.approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=None, post_prune=False, save=False, split_number=0)
self.apply_params = {
"data_set_helper": self.ds_helper,
"feature_importance": False,
Expand All @@ -56,7 +57,8 @@ def setUp(self):
"n_jobs": n_jobs,
"control_name": "c",
"normalization": True,
"honesty": False
"honesty": False,
"post_prune": True
}

s_learner_parameters = {
Expand Down Expand Up @@ -240,7 +242,7 @@ def test_apply_uplift_approaches(self, m_apply_approach):

if i == "TWO_MODEL":
self.assertTrue(TwoModel.__instancecheck__(m_apply_approach.call_args[0][0]))
elif "URF" in i:
elif i == "URF":
self.assertTrue(UpliftRandomForest.__instancecheck__(m_apply_approach.call_args[0][0]))
elif i == "TRADITIONAL":
self.assertTrue(Traditional.__instancecheck__(m_apply_approach.call_args[0][0]))
Expand Down Expand Up @@ -343,7 +345,7 @@ def test_cast_to_dataframe(self):
df_uplift = helper.cast_to_dataframe(list_dict)

# Check if type equals pd.DataFrame
self.assertEqual(type(df_uplift), pd.DataFrame)
self.assertEqual(df_uplift.__class__, pd.DataFrame)

# Check if the DataFrame contains 55 columns (11 columns for each approach)
self.assertEqual(df_uplift.shape[1], 22)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_x_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUp(self):

ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test)
root = f"{get_data_home()}/testing/models/"
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0)
approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0)
self.ds_helper = ds_helper
self.approach_params = approach_params

Expand Down