
Commit 13c1aac

Merge branch 'master' into master
2 parents a1d32b4 + cdaa5c7

File tree

4 files changed (+14, -96 lines)


autoviml/Auto_ViML.py

Lines changed: 8 additions & 92 deletions

@@ -1366,6 +1366,14 @@ def warn():
     #### Do binning only when there are numeric features ####
     #### When we Bin the first time, we set the entropy_binning flag to False so
     #### no numeric variables are removed. But next time, we will remove them later!
+    pdb.set_trace()
+    # Optionally, select top n variables based on their predictive power
+    # This step is useful if you want to bin only the most informative variables
+    entropy_binner = EntropyBinningTransformer(replace_vars=False, modeltype=modeltype, top_n_vars=None)
+
+    # Fit the transformer to the training data
+    entropy_binner.fit_transform(X_train, y_train)
+
     part_train, num_vars, important_features, part_cv = add_entropy_binning(part_train,
                                                             each_target, saved_num_vars,
                                                             saved_important_features, part_cv,
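Two things are worth noting in this hunk. First, the added pdb.set_trace() is a live debugger breakpoint, so execution will pause here every time this binning path runs. Second, the EntropyBinningTransformer it instantiates is defined elsewhere in Auto_ViML.py (and re-exported from the package root; see the __init__.py diff below), so its definition is not part of this diff. For orientation, here is a minimal sketch of the scikit-learn-style interface the call site assumes: the constructor arguments come from the hunk above, while the method bodies, defaults, and internal names are assumptions, not the committed implementation.

    # Hypothetical sketch only: the real class in Auto_ViML.py may differ.
    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

    class EntropyBinningTransformer(BaseEstimator, TransformerMixin):
        """Bin continuous columns at the split points of a shallow decision tree."""
        def __init__(self, replace_vars=False, modeltype='Classification', top_n_vars=None):
            self.replace_vars = replace_vars  # drop original columns after binning?
            self.modeltype = modeltype        # 'Regression' or a classification type
            self.top_n_vars = top_n_vars      # None = consider every numeric column

        def fit(self, X, y):
            self.thresholds_ = {}
            cols = X.select_dtypes(include=np.number).columns
            if self.top_n_vars is not None:
                cols = cols[:self.top_n_vars]
            for col in cols:
                if self.modeltype == 'Regression':
                    tree = DecisionTreeRegressor(max_depth=3, min_samples_leaf=2,
                                                 random_state=99)
                else:
                    tree = DecisionTreeClassifier(criterion='entropy', max_depth=3,
                                                  min_samples_leaf=2, random_state=99)
                tree.fit(X[[col]], y)
                # Leaf nodes hold the sentinel threshold -2; keep real splits only.
                splits = tree.tree_.threshold[tree.tree_.threshold > -2]
                self.thresholds_[col] = np.sort(splits)
            return self

        def transform(self, X):
            X = X.copy()
            for col, splits in self.thresholds_.items():
                X[col + '_bin'] = np.digitize(X[col].values, splits)
                if self.replace_vars:
                    X = X.drop(columns=col)
            return X

With this shape, fit_transform(X_train, y_train) comes for free from TransformerMixin, matching the call in the hunk.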
@@ -5493,95 +5501,6 @@ def remove_highly_correlated_vars_fast(df, corr_limit=0.70):


 #####################################################################################
-def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test,
-                        modeltype, entropy_binning, verbose=0):
-    """
-    ###### This is where we do ENTROPY BINNING OF CONTINUOUS VARS ###########
-    #### It is best to do Binning on ONLY on the top most variables from Important_Features!
-    #### Make sure that the Top 2-10 vars are all CONTINUOUS VARS! Otherwise Binning is Waste!
-    #### This method ensures you get the Best Results by generalizing on the top numeric vars!
-    """
-    temp_train = copy.deepcopy(temp_train)
-    temp_test = copy.deepcopy(temp_test)
-    max_depth = 10
-    seed = 99
-    num_vars = copy.deepcopy(num_vars)
-    continuous_vars = copy.deepcopy(num_vars)
-    important_features = copy.deepcopy(important_features)
-    print('Determining which of %d continuous variables should be Entropy Binned...' % len(continuous_vars))
-    if 0 < len(continuous_vars) <= 2:
-        max_depth = 2
-        continuous_vars = continuous_vars[:]
-    elif 2 < len(continuous_vars) <= 5:
-        max_depth = len(continuous_vars) - 2
-        continuous_vars = continuous_vars[:2]
-    elif 5 < len(continuous_vars) <= 10:
-        max_depth = 5
-        continuous_vars = continuous_vars[:5]
-    elif 10 < len(continuous_vars) <= 50:
-        max_depth = max_depth
-        continuous_vars = continuous_vars[:10]
-    else:
-        max_depth = max_depth
-        continuous_vars = continuous_vars[:50]
-    new_bincols = []
-    ### This is an Awesome Entropy Based Binning Method for Continuous Variables ###########
-    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
-    if modeltype == 'Regression':
-        clf = DecisionTreeRegressor(criterion='mse', min_samples_leaf=2,
-                                    max_depth=max_depth,
-                                    random_state=seed)
-    else:
-        clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2,
-                                     max_depth=max_depth,
-                                     random_state=seed)
-    ####### This is where we bin each variable through a method known as Entropy Binning ##############
-    for each_num in continuous_vars:
-        try:
-            clf.fit(temp_train[each_num].values.reshape(-1, 1), temp_train[targ].values)
-            entropy_threshold = clf.tree_.threshold[clf.tree_.threshold > -2]
-            entropy_threshold = np.sort(entropy_threshold)
-            if isinstance(each_num, str):
-                bincol = each_num + '_bin'
-                temp_train[bincol] = np.digitize(temp_train[each_num].values, entropy_threshold)
-            else:
-                bincol = 'bin_' + str(each_num)
-                temp_train[bincol] = np.digitize(temp_train[each_num].values, entropy_threshold)
-            #### We Drop the original continuous variable after you have created the bin when Flag is true
-            ### We Don't drop these original numeric vars since they will be used later for full train binning
-            if type(temp_test) != str:
-                if isinstance(each_num, str):
-                    bincol = each_num + '_bin'
-                    temp_test[bincol] = np.digitize(temp_test[each_num].values, entropy_threshold)
-                else:
-                    bincol = 'bin_' + str(each_num)
-                    temp_test[bincol] = np.digitize(temp_test[each_num].values, entropy_threshold)
-            #### We Drop the original continuous variable after you have created the bin when Flag is true
-            ### We Don't drop these original numeric vars since they will be used later for full train binning
-            if entropy_binning:
-                ### In the second time, we don't repeat adding binned vars since they have already been added!
-                #### we also make sure that the orig num vars which have now been binned are removed!
-                temp_train.drop(each_num, axis=1, inplace=True)
-                if type(temp_test) != str:
-                    temp_test.drop(each_num, axis=1, inplace=True)
-            else:
-                #### In the first time, we add binned vars to important_features ###
-                ### In the second time, we don't repeat that since they have already been added!
-                important_features.append(bincol)
-                num_vars.append(bincol)
-                important_features.remove(each_num)
-                #### Drop these original continuous variable from further consideration that's all! ###
-                num_vars.remove(each_num)
-            new_bincols.append(bincol)
-        except:
-            print('Error in %s during Entropy Binning' % each_num)
-    print('    Selected and binned only top %s continuous variables.' % (len(new_bincols)))
-    if verbose and len(new_bincols) <= 30:
-        print('        %s' % new_bincols)
-    return temp_train, num_vars, important_features, temp_test
-
-
-#############################################################################
 from imblearn.over_sampling import SMOTE, SVMSMOTE
 from imblearn.over_sampling import ADASYN, SMOTENC
 from sklearn.cluster import KMeans
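The add_entropy_binning helper deleted above is what the new EntropyBinningTransformer replaces, and the underlying technique is unchanged: fit a shallow decision tree on a single continuous feature, harvest its internal split thresholds, and use them as bin edges for np.digitize. A self-contained illustration on synthetic data (not part of the commit):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(99)
    x = rng.normal(size=500)                        # one continuous feature
    y = (x + rng.normal(scale=0.5, size=500)) > 0   # noisy binary target

    # Fit a shallow tree on the single feature, reshaped to one column.
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=3,
                                 min_samples_leaf=2, random_state=99)
    clf.fit(x.reshape(-1, 1), y)

    # Internal nodes store real split points; leaves store the sentinel -2.
    thresholds = np.sort(clf.tree_.threshold[clf.tree_.threshold > -2])

    # np.digitize maps each value to the index of the bin it falls into.
    x_binned = np.digitize(x, thresholds)
    print(thresholds)           # learned bin edges
    print(np.unique(x_binned))  # integer bin labels 0..len(thresholds)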
@@ -5701,7 +5620,6 @@ def marthas_columns(data, verbose=0):
                 ))
     print('--------------------------------------------------------------------')

-
 ##################################################################################
 def get_size(input_bytes, suffix="B"):
     """
@@ -5742,5 +5660,3 @@ def print_system_info():
     print(f"Total: {get_size(svmem.total)}")
     print(f"Available: {get_size(svmem.available)}")
     print(f"Used: {get_size(svmem.used)}")
-    print("=" * 18, "System Information End", "=" * 18)
-#####################################################################################
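The last two hunks above only trim blank lines and a trailing banner around get_size and print_system_info. The body of get_size falls outside the context shown; it is a conventional humanize-bytes helper, and a plausible implementation (an assumption, not the committed code) would be:

    def get_size(input_bytes, suffix="B"):
        """Format a byte count with a binary unit, e.g. 1253656 -> '1.20MB'."""
        # Assumed body: the real implementation is outside this diff's context lines.
        factor = 1024
        for unit in ["", "K", "M", "G", "T", "P"]:
            if input_bytes < factor:
                return f"{input_bytes:.2f}{unit}{suffix}"
            input_bytes /= factor
        return f"{input_bytes:.2f}E{suffix}"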

autoviml/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,8 @@
 ################################################################################
 # Version
 from .__version__ import __version__, __nlp_version__
+from .Auto_ViML import Auto_ViML, EntropyBinningTransformer
+from .Auto_NLP import Auto_NLP
 if __name__ == "__main__":
     module_type = 'Running'
 else:
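With these two added imports, the package root now re-exports the main entry points alongside the new transformer, so user code can write:

    # Available from the package root after this commit:
    from autoviml import Auto_ViML, Auto_NLP, EntropyBinningTransformer
    from autoviml import __version__
    print(__version__)   # '0.1.720'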

autoviml/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 __author__ = "Ram Seshadri"
 __description__ = "Automatically Build Multiple Interpretable ML Models in Single Line of Code"
 __url__ = "https://github.com/AutoViML/Auto_ViML.git"
-__version__ = "0.1.717"
+__version__ = "0.1.720"
 __nlp_version__ = "0.1.01"
 __license__ = "Apache License 2.0"
 __copyright__ = "2020-21 Google"

setup.py

Lines changed: 3 additions & 3 deletions

@@ -5,7 +5,7 @@

 setuptools.setup(
     name="autoviml",
-    version="0.1.717",
+    version="0.1.720",
     author="Ram Seshadri",
     # author_email="author@example.com",
     description="Automatically Build Variant Interpretable ML models fast - now with CatBoost!",
@@ -28,12 +28,12 @@
         "textblob",
         "nltk",
         "regex",
-        "scikit-learn<1.2",
-        "xgboost<=1.5.2",
+        "xgboost<=1.6.2",
         "vaderSentiment",
         "imbalanced-learn>=0.10.1",
         "shap>=0.36.0",
         "imbalanced_ensemble>=0.2.0",
+        "scikit-learn>=0.24,<=1.2.2",
         "lightgbm>=3.0.0",
     ],
     classifiers=[
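The dependency changes give scikit-learn an explicit floor and a slightly higher ceiling (previously <1.2, now >=0.24,<=1.2.2) and raise the xgboost ceiling from 1.5.2 to 1.6.2. A quick way to confirm an installed environment satisfies the new pins:

    import sklearn, xgboost
    # Expected under this commit's constraints:
    #   scikit-learn >= 0.24, <= 1.2.2
    #   xgboost      <= 1.6.2
    print(sklearn.__version__)
    print(xgboost.__version__)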
