# Feature Relevance based Unsupervised Feature Selection
from sklearn.base import TransformerMixin, BaseEstimator
from tqdm import tqdm
import contextlib
import joblib
import numpy as np
from joblib import Parallel, delayed
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import seaborn as sns
import matplotlib.pyplot as plt
# Hook tqdm progress reporting into joblib
# Adapted from frenzykryger's answer at https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/58936697#58936697
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()
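
# A minimal usage sketch of tqdm_joblib (illustrative only; the workload and
# sizes below are arbitrary, not part of this module):
#
#     import math
#     with tqdm_joblib(tqdm(desc="Square roots", total=10)):
#         Parallel(n_jobs=2)(delayed(math.sqrt)(i) for i in range(10))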

# Define the class for our unsupervised feature selection algorithm
class FRUFS(TransformerMixin, BaseEstimator):
    def __init__(self, model_r=None, model_c=None, k=1.0, n_jobs=1, verbose=0, categorical_features=None, random_state=None):
        # Define the random state
        self.random_state = random_state
        # By default, the regression model is going to be a decision tree
        if model_r is not None:
            self.model_r = model_r
        else:
            self.model_r = DecisionTreeRegressor(random_state=self.random_state)
        # By default, the classification model is going to be a decision tree
        # Classification is required for categorical variables
        if model_c is not None:
            self.model_c = model_c
        else:
            self.model_c = DecisionTreeClassifier(random_state=self.random_state)
        # Number of features to select: either a fraction in (0, 1] or an integer in (0, number of features]
        self.k = k
        # Number of parallel jobs to run
        self.n_jobs = n_jobs
        # The verbosity level for FRUFS; 0 gives a nice progress bar
        self.verbose = verbose
        # A list denoting the categorical features, if any; None when there are no categorical variables
        self.categorical_features = categorical_features
        # Attributes populated by fit()
        self.columns_ = None
        self.feat_imps_ = None

    # Compute the importance of all other features in predicting the feature at position `index`
    def cal_feat_imp_(self, index, X, model):
        # This will hold the feature importance scores
        feat_imp = np.zeros(X.shape[1])
        # Remove the current feature from the dataset
        x_train = np.concatenate((X[:, :index], X[:, index+1:]), axis=1)
        # The current feature is now our target variable
        y_train = X[:, index]
        # Train
        model.fit(x_train, y_train)
        # Indices used to place the feature importances back in their correct positions
        inds = np.concatenate((np.arange(index), np.arange(index+1, X.shape[1])))
        # Save the feature importance of each feature; linear models expose coef_
        # rather than feature_importances_
        try:
            feat_imp[inds] += model.feature_importances_
        except AttributeError:
            feat_imp[inds] += model.coef_
        # Return the computed feature importance scores
        return feat_imp
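
    # A worked illustration of the bookkeeping in cal_feat_imp_ (hypothetical
    # 4-feature case): for index=1, x_train stacks columns [0, 2, 3], y_train is
    # column 1, and inds == [0, 2, 3], so each importance score lands back in
    # its original feature slot.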

    # The actual feature selection happens here
    def fit(self, X):
        # If a pandas DataFrame is provided, convert it to a numpy matrix and save the column names
        if isinstance(X, pd.DataFrame):
            self.columns_ = X.columns
            X = X.values
            # With a DataFrame, categorical features (if provided) arrive as a list of
            # column-name strings; convert them to a list of integer indices
            if self.categorical_features is not None:
                self.categorical_features = [i for i in range(X.shape[1]) if self.columns_[i] in self.categorical_features]
        else:
            # A numpy matrix carries no column names, so generate integer ones
            self.columns_ = np.arange(X.shape[1])
            # Categorical features (if provided) are already a list of integer indices
        # Instantiate the parallel processing object
        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        # One job per feature: predict it from all the others, using the classification
        # model for categorical features and the regression model otherwise
        jobs = (
            delayed(self.cal_feat_imp_)(
                i, X,
                self.model_c if self.categorical_features is not None and i in self.categorical_features else self.model_r
            )
            for i in range(X.shape[1])
        )
        # If verbosity is set to 0, show a nice progress bar
        if self.verbose == 0:
            with tqdm_joblib(tqdm(desc="Progress bar", total=X.shape[1])):
                feat_imps = parallel(jobs)
        else:
            feat_imps = parallel(jobs)
        # Parallel processing returns one score array per target feature; average them so
        # each feature's final score is its mean importance across all per-feature models
        self.feat_imps_ = np.average(np.asarray(feat_imps), axis=0)
        # Remove unnecessary variables
        del feat_imps
        # Sort the columns by decreasing absolute importance
        inds = np.argsort(np.absolute(self.feat_imps_))[::-1]
        self.columns_ = self.columns_[inds]
        self.feat_imps_ = self.feat_imps_[inds]
        # If k is provided as a float, select round(k * number of features) many features
        if isinstance(self.k, float):
            self.k = round(self.k * X.shape[1])
        return self

    # Prune the dataset
    def transform(self, X):
        # Take only the top k number of features
        cols = self.columns_[:self.k]
        # If data is a pandas dataframe then pruning is different
        if isinstance(X, pd.DataFrame):
            return X[cols]
        else:
            return X[:, cols]

    # Fit and then prune
    def fit_transform(self, X):
        # First run fit
        self.fit(X)
        # Then run transform
        return self.transform(X)

    # Plot the feature importance scores
    def feature_importance(self, top_x_feats=None):
        if top_x_feats is not None:
            y_axis = np.arange(top_x_feats)
            x_axis = self.feat_imps_[:top_x_feats]
        else:
            y_axis = np.arange(len(self.columns_))
            x_axis = self.feat_imps_
        # The dashed line marks the cut-off point: all features past it have been pruned.
        # Bars sit at integer positions 0..n-1, so the boundary between the k kept
        # features and the rest lies at self.k - 0.5
        sns.lineplot(x=x_axis, y=[self.k - 0.5 for _ in range(len(y_axis))], linestyle='--')
        # The feature importance scores in the form of a horizontal bar chart
        sns.barplot(x=x_axis, y=y_axis, orient="h")
        # Use feature names if provided, or generate synthetic ones
        if isinstance(self.columns_[0], str):
            plt.yticks(y_axis, self.columns_[:len(y_axis)], size='small')
        else:
            plt.yticks(y_axis, ["Feature " + str(i) for i in self.columns_[:len(y_axis)]], size='small')
        plt.xlabel("Importance Scores")
        plt.ylabel("Features")
        sns.despine()
        plt.show()
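

# A minimal usage sketch (illustrative, not part of the module): random data with
# synthetic column names, keeping the top half of the features by FRUFS score
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.random((100, 8)), columns=[f"feat_{i}" for i in range(8)])
    selector = FRUFS(k=0.5, n_jobs=1, random_state=0)
    df_pruned = selector.fit_transform(df)  # keeps round(0.5 * 8) == 4 columns
    print(df_pruned.columns.tolist())
    selector.feature_importance()  # bar chart of the sorted importance scores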