Skip to content

Commit e0e9dbc

Browse files
committed
ML based on bank marketing data set(UCI) V1.0
1 parent c5fb29d commit e0e9dbc

File tree

10 files changed

+86776
-0
lines changed

10 files changed

+86776
-0
lines changed
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Sat Jun 10 22:17:20 2017
4+
5+
@author: liulo
6+
"""
7+
8+
from datetime import datetime
9+
import pandas as pd
10+
11+
from sklearn.ensemble import RandomForestClassifier
12+
from sklearn.utils import shuffle
13+
from sklearn import preprocessing
14+
15+
16+
def feature_scaling(data, numeric_attrs):
    """Standardize each numeric column to zero mean and unit variance.

    Columns whose standard deviation is zero carry no information, so they
    are dropped instead of scaled (which would divide by zero).

    Args:
        data: DataFrame to transform (modified column-wise in place).
        numeric_attrs: names of the numeric columns to standardize.

    Returns:
        The DataFrame with scaled columns (zero-variance columns removed).
    """
    for attr in numeric_attrs:
        deviation = data[attr].std()
        if deviation == 0:
            data = data.drop(attr, axis=1)
        else:
            data[attr] = (data[attr] - data[attr].mean()) / deviation
    return data
24+
25+
26+
def encode_cate_attrs(data, cate_attrs):
    """One-hot encode the categorical columns; 'education' is ordinal-encoded.

    'education' has a natural order, so it is mapped to integer levels by
    encode_edu_attrs; every other column in cate_attrs is expanded into
    dummy columns named '<attr>_<value>' and the original column dropped.

    Args:
        data: DataFrame containing the categorical columns.
        cate_attrs: names of the categorical columns (may include 'education').

    Returns:
        The encoded DataFrame.
    """
    data = encode_edu_attrs(data)
    # BUGFIX: the original called cate_attrs.remove('education'), mutating the
    # caller's list as a hidden side effect. Filter into a new list instead.
    remaining = [attr for attr in cate_attrs if attr != 'education']
    for attr in remaining:
        dummies_df = pd.get_dummies(data[attr])
        # Bind attr as a default arg so the rename prefix is explicit.
        dummies_df = dummies_df.rename(columns=lambda x, a=attr: a + '_' + str(x))
        data = pd.concat([data, dummies_df], axis=1)
        data = data.drop(attr, axis=1)
    return data
35+
36+
37+
def encode_bin_attrs(data, bin_attrs):
    """Map binary 'no'/'yes' answers to 0/1.

    Other values (e.g. 'unknown') are left untouched for later imputation.

    Args:
        data: DataFrame containing the binary columns.
        bin_attrs: names of the yes/no columns to encode.

    Returns:
        The DataFrame with encoded binary columns.
    """
    codes = (('no', 0), ('yes', 1))
    for attr in bin_attrs:
        for answer, code in codes:
            data.loc[data[attr] == answer, attr] = code
    return data
42+
43+
44+
def encode_edu_attrs(data):
    """Replace education level names with ordinal codes 1..7.

    'illiterate' maps to 1 and 'university.degree' to 7; values not in the
    known list (e.g. 'unknown') are left unchanged.

    Args:
        data: DataFrame with an 'education' column.

    Returns:
        The DataFrame with 'education' ordinal-encoded.
    """
    ordered_levels = ("illiterate", "basic.4y", "basic.6y", "basic.9y",
                      "high.school", "professional.course", "university.degree")
    for level, name in enumerate(ordered_levels, start=1):
        data.loc[data['education'] == name, 'education'] = level
    return data
52+
53+
54+
def trans_num_attrs(data, numeric_attrs):
    """Bin 'age' into deciles, then standardize every numeric column.

    'age' is first discretized into 10 equal-frequency bins whose interval
    labels are replaced by integer codes 1..10; afterwards each column in
    numeric_attrs (including the binned 'age') is z-score standardized.

    Args:
        data: DataFrame containing the numeric columns.
        numeric_attrs: names of the numeric columns to standardize.

    Returns:
        The transformed DataFrame.
    """
    bining_num = 10
    bining_attr = 'age'
    # Equal-frequency binning; factorize turns interval labels into 0-based
    # codes, shifted to start at 1.
    data[bining_attr] = pd.qcut(data[bining_attr], bining_num)
    data[bining_attr] = pd.factorize(data[bining_attr])[0] + 1

    for i in numeric_attrs:
        scaler = preprocessing.StandardScaler()
        # BUGFIX: StandardScaler requires a 2-D array; passing the bare 1-D
        # Series (as the original did) raises ValueError in modern
        # scikit-learn. Reshape to a single-feature column vector.
        data[i] = scaler.fit_transform(data[i].values.reshape(-1, 1))
    return data
64+
65+
66+
def fill_unknown(data, bin_attrs, cate_attrs, numeric_attrs):
    """Handle 'unknown' answers, then fully encode the data set.

    Columns with few (< 500) 'unknown' rows simply have those rows dropped;
    columns with many keep them, and after encoding those missing values are
    imputed by a random forest trained on the rows where the value is known.

    Args:
        data: raw DataFrame with a 'no'/'yes' target column 'y'.
        bin_attrs: yes/no columns.
        cate_attrs: categorical columns (including 'education').
        numeric_attrs: numeric columns.

    Returns:
        Fully encoded DataFrame with 'unknown' values removed or imputed.
    """
    # fill_attrs = ['education', 'default', 'housing', 'loan']
    fill_attrs = []
    for i in bin_attrs+cate_attrs:
        if data[data[i] == 'unknown']['y'].count() < 500:
            # Few unknowns: drop the affected ROWS (the original comment said
            # "col", but rows are what is removed here).
            data = data[data[i] != 'unknown']
        else:
            # Many unknowns: keep the rows and impute this column below.
            fill_attrs.append(i)

    data = encode_cate_attrs(data, cate_attrs)
    data = encode_bin_attrs(data, bin_attrs)
    data = trans_num_attrs(data, numeric_attrs)
    data['y'] = data['y'].map({'no': 0, 'yes': 1}).astype(int)
    for i in fill_attrs:
        # Rows still 'unknown' in column i form the prediction set; every
        # column in fill_attrs is excluded from the feature matrix so later
        # targets never leak in as (partially unknown) features.
        test_data = data[data[i] == 'unknown']
        testX = test_data.drop(fill_attrs, axis=1)
        train_data = data[data[i] != 'unknown']
        trainY = train_data[i]
        trainX = train_data.drop(fill_attrs, axis=1)
        # NOTE(review): assigning into test_data (a slice of data) triggers
        # pandas' SettingWithCopy warning; correctness relies on the concat
        # below picking up the modified copy — confirm under newer pandas.
        test_data[i] = train_predict_unknown(trainX, trainY, testX)
        data = pd.concat([train_data, test_data])

    return data
90+
91+
92+
def train_predict_unknown(trainX, trainY, testX):
    """Fit a 100-tree random forest and predict labels for testX.

    Args:
        trainX: feature matrix of rows with known labels.
        trainY: the known labels.
        testX: feature matrix of rows to impute.

    Returns:
        DataFrame of integer predictions, indexed like testX so the values
        can be assigned straight back into the original frame.
    """
    model = RandomForestClassifier(n_estimators=100).fit(trainX, trainY)
    predictions = model.predict(testX).astype(int)
    return pd.DataFrame(predictions, index=testX.index)
97+
98+
99+
def preprocess_data():
    """Load the raw UCI bank-marketing CSV, clean/encode it, and save it.

    Reads ../data/bank-additional/bank-additional-full.csv (semicolon
    separated), shuffles the rows, runs the full fill/encode pipeline, and
    writes the result to ../processed_data/bank-additional-full.csv.
    """
    input_data_path = "../data/bank-additional/bank-additional-full.csv"
    processed_data_path = '../processed_data/bank-additional-full.csv'
    print("Loading data...")
    frame = pd.read_csv(input_data_path, sep=';')
    print("Preprocessing data...")

    numeric_attrs = ['age', 'duration', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                     'euribor3m', 'nr.employed']
    bin_attrs = ['default', 'housing', 'loan']
    cate_attrs = ['poutcome', 'education', 'job', 'marital',
                  'contact', 'month', 'day_of_week']

    frame = fill_unknown(shuffle(frame), bin_attrs, cate_attrs, numeric_attrs)
    frame.to_csv(processed_data_path, index=False)
115+
116+
117+
if __name__ == "__main__":
    # Guard the heavy preprocessing run behind __main__ so importing this
    # module (e.g. to reuse the encoders) no longer triggers a full run.
    start_time = datetime.now()
    preprocess_data()
    end_time = datetime.now()
    # .seconds is the seconds component of the delta; fine for runs < 1 day.
    delta_seconds = (end_time - start_time).seconds
    print("Cost time: {}s".format(delta_seconds))
122+
123+
124+
125+
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import random
2+
from sklearn.neighbors import NearestNeighbors
3+
import numpy as np
4+
class Smote:
    """Minimal SMOTE oversampler for minority-class samples.

    For each input sample, generates N synthetic samples by interpolating a
    random fraction of the way towards a randomly chosen member of the
    sample's k nearest neighbors.
    """

    def __init__(self, samples, N=10, k=3):
        # samples: 2-D numpy array of shape (n_samples, n_attrs).
        self.n_samples, self.n_attrs = samples.shape
        self.N = N          # synthetic samples generated per original sample
        self.k = k          # number of nearest neighbors to interpolate towards
        self.samples = samples
        self.newindex = 0   # write cursor into self.synthetic

    def over_sampling(self):
        """Generate and return an (n_samples * N, n_attrs) synthetic array."""
        N = int(self.N)  # original computed int(self.N / 1) — a no-op division
        self.synthetic = np.zeros((self.n_samples * N, self.n_attrs))
        neighbors = NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        for i in range(len(self.samples)):
            # Indices of the k nearest neighbors of sample i.
            # NOTE(review): the fitted set contains sample i itself, so the
            # neighbor list normally includes i; some synthetic points can
            # therefore duplicate the original sample — confirm intended.
            nnarray = neighbors.kneighbors(self.samples[i].reshape(1, -1),
                                           return_distance=False)[0]
            self._populate(N, i, nnarray)
        return self.synthetic

    # For each minority sample, choose N of its k nearest neighbors and
    # generate N synthetic samples along the connecting segments.
    def _populate(self, N, i, nnarray):
        for j in range(N):
            nn = random.randint(0, self.k - 1)
            dif = self.samples[nnarray[nn]] - self.samples[i]
            gap = random.random()  # uniform in [0, 1): position on the segment
            self.synthetic[self.newindex] = self.samples[i] + gap * dif
            self.newindex += 1
32+
33+
34+
"""
35+
a=np.array([[1,2,3],[4,5,6],[2,3,1],[2,1,2],[2,3,4],[2,3,4]])
36+
s=Smote(a,N=1)
37+
print s.over_sampling()
38+
"""
1.51 KB
Binary file not shown.
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Wed Jun 14 14:37:24 2017
4+
5+
@author: liulo
6+
"""
7+
from datetime import datetime
8+
9+
import pandas as pd
10+
import numpy as np
11+
from matplotlib import pylab
12+
13+
from sklearn.ensemble import RandomForestClassifier
14+
from sklearn import metrics
15+
from sklearn.utils import shuffle
16+
from sklearn.linear_model import LogisticRegression
17+
from sklearn.metrics import classification_report
18+
19+
import smote
20+
21+
22+
def split_data(data):
    """Split the frame into train/cv/test partitions by a 60/20/20 row split.

    Args:
        data: DataFrame with a 'y' column (used only to count rows).

    Returns:
        Tuple (train_data, cv_data, test_data) of row slices in order.
    """
    total = data['y'].count()
    first_cut = int(total * 0.6)
    second_cut = int(total * 0.8)
    return (data[:first_cut],
            data[first_cut:second_cut],
            data[second_cut:])
31+
32+
33+
def resample_train_data(train_data, n, frac):
    """Rebalance the training set: SMOTE-oversample positives, subsample negatives.

    Args:
        train_data: encoded training frame with 0/1 target column 'y'.
        n: SMOTE multiplier — each positive row yields n synthetic rows
           (n == 0 disables oversampling entirely).
        frac: fraction of negative rows kept (random subsample).

    Returns:
        Shuffled concatenation of synthetic positives (if any), subsampled
        negatives, and the original positives.
    """
    # Only these columns are interpolated by SMOTE; the remaining (one-hot /
    # ordinal) columns are copied from the originals by replication below.
    numeric_attrs = ['age', 'duration', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                     'euribor3m', 'nr.employed',]
    #numeric_attrs = train_data.drop('y',axis=1).columns
    pos_train_data_original = train_data[train_data['y'] == 1]
    pos_train_data = train_data[train_data['y'] == 1]
    new_count = n * pos_train_data['y'].count()
    neg_train_data = train_data[train_data['y'] == 0].sample(frac=frac)
    train_list = []
    if n != 0:
        pos_train_X = pos_train_data[numeric_attrs]
        # Replicate the non-numeric columns n times and reindex 0..new_count-1
        # so they align row-for-row with the synthetic numeric block below.
        pos_train_X2 = pd.concat([pos_train_data.drop(numeric_attrs, axis=1)] * n)
        pos_train_X2.index = range(new_count)

        s = smote.Smote(pos_train_X.values, N=n, k=3)
        pos_train_X = s.over_sampling()
        pos_train_X = pd.DataFrame(pos_train_X, columns=numeric_attrs,
                                   index=range(new_count))
        pos_train_data = pd.concat([pos_train_X, pos_train_X2], axis=1)
        # Restore the original column order after the side-by-side concat.
        pos_train_data = pd.DataFrame(pos_train_data, columns=pos_train_data_original.columns)
        train_list = [pos_train_data, neg_train_data, pos_train_data_original]
    else:
        train_list = [neg_train_data, pos_train_data_original]
    print("Size of positive train data: {} * {}".format(pos_train_data_original['y'].count(), n+1))
    print("Size of negative train data: {} * {}".format(neg_train_data['y'].count(), frac))
    train_data = pd.concat(train_list, axis=0)
    return shuffle(train_data)
61+
62+
63+
def evaluate(test_predictY, test_y):
    """Print (and return) accuracy, precision, recall and F1 for binary labels.

    Args:
        test_predictY: predicted 0/1 labels (array-like).
        test_y: true 0/1 labels (array-like; a pandas Series is accepted).

    Returns:
        Tuple (accuracy, precision, recall, f1) as floats. Precision, recall
        and F1 are reported as 0.0 when their denominator is zero instead of
        raising ZeroDivisionError as the original did.
    """
    # BUGFIX: the original indexed test_y[i] positionally, which breaks for a
    # pandas Series whose index does not start at 0 (exactly what the
    # split_data slices produce). Convert both inputs to plain arrays.
    predicted = np.asarray(test_predictY)
    actual = np.asarray(test_y)
    test_len = actual.shape[0]

    # Same branch structure as the original loop, vectorized:
    # predicted==1 -> pos branch, otherwise actual==0 counts as true negative.
    true_pos = int(np.sum((predicted == 1) & (actual == 1)))
    false_pos = int(np.sum((predicted == 1) & (actual != 1)))
    true_neg = int(np.sum((predicted != 1) & (actual == 0)))
    false_neg = int(np.sum((predicted != 1) & (actual != 0)))

    accuracy = 1.0 * (true_pos + true_neg) / test_len
    pred_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = 1.0 * true_pos / pred_pos if pred_pos else 0.0
    recall = 1.0 * true_pos / actual_pos if actual_pos else 0.0
    f1Score = (2 * precision * recall / (precision + recall)
               if precision + recall else 0.0)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1 Score: {}".format(f1Score))
    return accuracy, precision, recall, f1Score
89+
90+
91+
def plot_pr(auc_score, precision, recall, label=None):
    """Show a filled precision/recall curve titled with the AUC and label.

    Args:
        auc_score: ROC AUC value displayed in the plot title.
        precision: precision values (y-axis), as from precision_recall_curve.
        recall: recall values (x-axis), same length as precision.
        label: text appended to the title (e.g. the positive class name).
    """
    pylab.figure(num=None, figsize=(6, 5))
    pylab.xlim([0.0, 1.0])
    pylab.ylim([0.0, 1.0])
    pylab.xlabel('Recall')
    pylab.ylabel('Precision')
    pylab.title('P/R (AUC=%0.2f) / %s' % (auc_score, label))
    # Shade the area under the curve before drawing the line itself.
    pylab.fill_between(recall, precision, alpha=0.2)
    pylab.grid(True, linestyle='-', color='0.75')
    pylab.plot(recall, precision, lw=1)
    pylab.show()
102+
103+
104+
105+
def plot_roc(auc_score, fpr, tpr, label=None):
    """Show a filled ROC curve titled with the AUC and label.

    Args:
        auc_score: ROC AUC value displayed in the plot title.
        fpr: false positive rates (x-axis), as from metrics.roc_curve.
        tpr: true positive rates (y-axis), same length as fpr.
        label: text appended to the title (e.g. the positive class name).
    """
    pylab.figure(num=None, figsize=(6, 5))
    pylab.xlim([0.0, 1.0])
    pylab.ylim([0.0, 1.0])
    pylab.xlabel('False positive rate')
    pylab.ylabel('True positive rate')
    pylab.title('ROC (AUC=%0.2f) / %s' % (auc_score, label))
    # Shade the area under the curve before drawing the line itself.
    pylab.fill_between(fpr, tpr, alpha=0.2)
    pylab.grid(True, linestyle='-', color='0.75')
    pylab.plot(fpr, tpr, lw=1)
    pylab.show()
116+
117+
118+
def train_evaluate(train_data, test_data, classifier, n=1, frac=1.0, threshold = 0.5):
    """Resample the training data, fit the classifier, and report test metrics.

    Args:
        train_data: encoded training frame with 0/1 column 'y'.
        test_data: encoded evaluation frame with 0/1 column 'y'.
        classifier: unfitted scikit-learn estimator with predict_proba.
        n: SMOTE multiplier passed to resample_train_data.
        frac: negative-class subsample fraction passed to resample_train_data.
        threshold: probability cutoff for predicting the positive class.

    Returns:
        The thresholded 0/1 prediction array for test_data.

    Side effects: prints accuracy and a classification report, and shows a
    precision/recall plot via plot_pr.
    """
    train_data = resample_train_data(train_data, n, frac)
    train_X = train_data.drop('y',axis=1)
    train_y = train_data['y']
    test_X = test_data.drop('y', axis=1)
    test_y = test_data['y']

    classifier = classifier.fit(train_X, train_y)
    # Probability of the positive class; thresholded manually instead of
    # relying on the estimator's default 0.5 decision rule.
    prodict_prob_y = classifier.predict_proba(test_X)[:,1]
    report = classification_report(test_y, prodict_prob_y > threshold,
                                   target_names = ['no', 'yes'])
    prodict_y = (prodict_prob_y > threshold).astype(int)
    accuracy = np.mean(test_y.values == prodict_y)
    print("Accuracy: {}".format(accuracy))
    print(report)
    fpr, tpr, thresholds = metrics.roc_curve(test_y, prodict_prob_y)
    precision, recall, thresholds = metrics.precision_recall_curve(test_y, prodict_prob_y)
    test_auc = metrics.auc(fpr, tpr)
    plot_pr(test_auc, precision, recall, "yes")

    return prodict_y
#    print("AUC: {}".format(test_auc))
#    plot_roc(test_auc, fpr, tpr, "yes")
141+
142+
143+
def select_model(train_data, cv_data):
    """Model-selection hook: train and evaluate candidate models on CV data.

    Currently runs a single fixed random-forest configuration; the loop and
    the commented prints are left as a scaffold for sweeping hyperparameters.
    """
    for i in range(1):
        # print("n_estimators: {}".format(i))
        # print("threshold: {}".format(i/50.0))
        # print("n: {}".format(i))
        candidate = RandomForestClassifier(n_estimators=400, oob_score=True)
        # lr = LogisticRegression(max_iter=100, C=1, random_state=0)
        train_evaluate(train_data, cv_data, candidate,
                       n=7, frac=1.0, threshold=0.4)
151+
152+
153+
def find_key_attrs(forest):
    """Print and bar-plot features whose importance exceeds 5% of the maximum.

    Args:
        forest: a fitted estimator exposing feature_importances_.

    Relies on the module-level globals ``features_list`` (feature column
    names) and ``plt`` (matplotlib.pyplot, imported in the script body).
    """
    feature_importance = forest.feature_importances_
    # Rescale so the most important feature scores 100.
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    fi_threshold = 5
    important_idx = np.where(feature_importance > fi_threshold)[0]
    important_features = features_list[important_idx]
    # BUGFIX: the original used a Python-2 print statement here, a syntax
    # error under the Python 3 print() calls used everywhere else in the file.
    print("\n", important_features.shape[0], "Important features(>",
          fi_threshold, "% of max importance)...\n")
    # important_features
    sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
    # Bar chart of the important features, most important at the top.
    pos = np.arange(sorted_idx.shape[0]) + .5
    # plt.subplot(1, 2, 2)
    plt.title('Feature Importance')
    plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]],
             color='r', align='center')
    plt.yticks(pos, important_features[sorted_idx[::-1]])
    plt.xlabel('Relative Importance')
    plt.draw()
    plt.show()
173+
174+
175+
# --- Script body: load the processed data, select a model, train and time it. ---
processed_data = '../processed_data/bank-additional-full.csv'
data = pd.read_csv(processed_data)
train_data, cv_data, test_data = split_data(data)

# Module-level global consumed by find_key_attrs for feature names.
features_list = train_data.drop('y',axis=1).columns
select_model(train_data, cv_data)
start_time = datetime.now()
# NOTE(review): matplotlib is imported mid-script, after select_model has
# already drawn plots via pylab; consider hoisting this to the top imports.
import matplotlib.pyplot as plt
print('Training...')
forest = RandomForestClassifier(n_estimators=400, oob_score=True)
prodict_y = train_evaluate(train_data, test_data, forest, n=7, frac=1, threshold=0.40)
# find_key_attrs(forest)

end_time = datetime.now()
# NOTE(review): the timer starts after select_model, so its (full training)
# cost is excluded from the reported time — confirm that is intended.
delta_seconds = (end_time - start_time).seconds
print("Cost time: {}s".format(delta_seconds))
192+
6 KB
Binary file not shown.

0 commit comments

Comments
 (0)