@@ -1366,6 +1366,14 @@ def warn():
#### Do binning only when there are numeric features ####
#### When we Bin the first time, we set the entropy_binning flag to False so
#### no numeric variables are removed. But next time, we will remove them later!
+ # Optionally, select the top n variables based on their predictive power.
+ # This step is useful if you want to bin only the most informative variables.
+ entropy_binner = EntropyBinningTransformer(replace_vars=False, modeltype=modeltype, top_n_vars=None)
+
+ # Fit the transformer to the training data
+ entropy_binner.fit_transform(X_train, y_train)
+
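+ # A minimal usage sketch (an assumption, since EntropyBinningTransformer's API is
+ # not shown in this diff): if it follows the scikit-learn transformer convention,
+ # fit_transform would return the binned training frame and transform would apply
+ # the fitted bin edges to a held-out split such as part_cv:
+ #   part_train_binned = entropy_binner.fit_transform(X_train, y_train)
+ #   part_cv_binned = entropy_binner.transform(part_cv)
+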
part_train, num_vars, important_features, part_cv = add_entropy_binning(part_train,
                                                    each_target, saved_num_vars,
                                                    saved_important_features, part_cv,
@@ -5493,95 +5501,6 @@ def remove_highly_correlated_vars_fast(df, corr_limit=0.70):
#####################################################################################
- def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test,
-                         modeltype, entropy_binning, verbose=0):
-     """
-     ###### This is where we do ENTROPY BINNING OF CONTINUOUS VARS ###########
-     #### It is best to do Binning ONLY on the topmost variables from Important_Features!
-     #### Make sure that the Top 2-10 vars are all CONTINUOUS VARS! Otherwise Binning is a Waste!
-     #### This method ensures you get the Best Results by generalizing on the top numeric vars!
-     """
-     temp_train = copy.deepcopy(temp_train)
-     temp_test = copy.deepcopy(temp_test)
-     max_depth = 10
-     seed = 99
-     num_vars = copy.deepcopy(num_vars)
-     continuous_vars = copy.deepcopy(num_vars)
-     important_features = copy.deepcopy(important_features)
-     print('Determining which of %d continuous variables should be Entropy Binned...' % len(continuous_vars))
-     if 0 < len(continuous_vars) <= 2:
-         max_depth = 2
-         continuous_vars = continuous_vars[:]
-     elif 2 < len(continuous_vars) <= 5:
-         max_depth = len(continuous_vars) - 2
-         continuous_vars = continuous_vars[:2]
-     elif 5 < len(continuous_vars) <= 10:
-         max_depth = 5
-         continuous_vars = continuous_vars[:5]
-     elif 10 < len(continuous_vars) <= 50:
-         continuous_vars = continuous_vars[:10]
-     else:
-         continuous_vars = continuous_vars[:50]
-     new_bincols = []
-     ### This is an Awesome Entropy-Based Binning Method for Continuous Variables ###########
-     from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
-     if modeltype == 'Regression':
-         clf = DecisionTreeRegressor(criterion='mse', min_samples_leaf=2,
-                                     max_depth=max_depth,
-                                     random_state=seed)
-     else:
-         clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2,
-                                      max_depth=max_depth,
-                                      random_state=seed)
-     ####### This is where we bin each variable through a method known as Entropy Binning ##############
-     for each_num in continuous_vars:
-         try:
-             clf.fit(temp_train[each_num].values.reshape(-1, 1), temp_train[targ].values)
-             entropy_threshold = clf.tree_.threshold[clf.tree_.threshold > -2]
-             entropy_threshold = np.sort(entropy_threshold)
-             if isinstance(each_num, str):
-                 bincol = each_num + '_bin'
-             else:
-                 bincol = 'bin_' + str(each_num)
-             temp_train[bincol] = np.digitize(temp_train[each_num].values, entropy_threshold)
-             if type(temp_test) != str:
-                 temp_test[bincol] = np.digitize(temp_test[each_num].values, entropy_threshold)
-             #### We drop the original continuous variable after the bin is created only when the flag is True;
-             ### otherwise we keep the original numeric vars since they will be used later for full train binning.
-             if entropy_binning:
-                 ### The second time around, we don't repeat adding binned vars since they have already been added!
-                 #### We also make sure that the orig num vars which have now been binned are removed!
-                 temp_train.drop(each_num, axis=1, inplace=True)
-                 if type(temp_test) != str:
-                     temp_test.drop(each_num, axis=1, inplace=True)
-             else:
-                 #### The first time through, we add the binned vars to important_features ###
-                 ### The second time, we don't repeat that since they have already been added!
-                 important_features.append(bincol)
-                 num_vars.append(bincol)
-                 important_features.remove(each_num)
-                 #### Drop these original continuous variables from further consideration, that's all! ###
-                 num_vars.remove(each_num)
-             new_bincols.append(bincol)
-         except:
-             print('Error in %s during Entropy Binning' % each_num)
-     print('    Selected and binned only top %s continuous variables.' % (len(new_bincols)))
-     if verbose and len(new_bincols) <= 30:
-         print('    %s' % new_bincols)
-     return temp_train, num_vars, important_features, temp_test
-
-
- #############################################################################
from imblearn.over_sampling import SMOTE, SVMSMOTE
from imblearn.over_sampling import ADASYN, SMOTENC
from sklearn.cluster import KMeans
@@ -5701,7 +5620,6 @@ def marthas_columns(data, verbose=0):
        ))
    print('--------------------------------------------------------------------')
-
##################################################################################
def get_size(input_bytes, suffix="B"):
    """
@@ -5742,5 +5660,3 @@ def print_system_info():
    print(f"Total: {get_size(svmem.total)}")
    print(f"Available: {get_size(svmem.available)}")
    print(f"Used: {get_size(svmem.used)}")
- print("=" * 18, "System Information End", "=" * 18)
- #####################################################################################