|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Created on Wed Jun 14 14:37:24 2017 |
| 4 | +
|
| 5 | +@author: liulo |
| 6 | +""" |
| 7 | +from datetime import datetime |
| 8 | + |
| 9 | +import pandas as pd |
| 10 | +import numpy as np |
| 11 | +from matplotlib import pylab |
| 12 | + |
| 13 | +from sklearn.ensemble import RandomForestClassifier |
| 14 | +from sklearn import metrics |
| 15 | +from sklearn.utils import shuffle |
| 16 | +from sklearn.linear_model import LogisticRegression |
| 17 | +from sklearn.metrics import classification_report |
| 18 | + |
| 19 | +import smote |
| 20 | + |
| 21 | + |
def split_data(data):
    """Partition the frame sequentially into train / cv / test splits.

    The first 60% of rows become the training set, the next 20% the
    cross-validation set, and the final 20% the test set (no shuffling).
    """
    total = data['y'].count()
    first_cut = int(total * 0.6)
    second_cut = int(total * 0.8)
    return (
        data[:first_cut],
        data[first_cut:second_cut],
        data[second_cut:],
    )
| 31 | + |
| 32 | + |
def resample_train_data(train_data, n, frac):
    """Rebalance the training set for the rare positive ('y' == 1) class.

    Oversamples positive rows with SMOTE (numeric attributes only; the
    remaining columns are duplicated n times and aligned by reindexing)
    and downsamples negative rows to a fraction `frac`.

    Parameters:
        train_data: DataFrame with a binary 'y' label column.
        n: SMOTE oversampling multiplier; 0 disables oversampling.
        frac: fraction of negative rows to keep (DataFrame.sample frac).

    Returns:
        A shuffled DataFrame concatenating the synthetic positives (if
        n != 0), the sampled negatives, and the original positives.
    """
    # Only these numeric columns are fed to SMOTE for interpolation.
    numeric_attrs = ['age', 'duration', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                     'euribor3m', 'nr.employed',]
    #numeric_attrs = train_data.drop('y',axis=1).columns
    pos_train_data_original = train_data[train_data['y'] == 1]
    pos_train_data = train_data[train_data['y'] == 1]
    # NOTE(review): assumes smote.Smote(..., N=n).over_sampling() yields
    # exactly n * (number of positive rows) samples -- confirm in smote.py.
    new_count = n * pos_train_data['y'].count()
    neg_train_data = train_data[train_data['y'] == 0].sample(frac=frac)
    train_list = []
    if n != 0:
        pos_train_X = pos_train_data[numeric_attrs]
        # Non-numeric columns: replicate the originals n times and reindex
        # 0..new_count-1 so they align row-for-row with the synthetic rows.
        pos_train_X2 = pd.concat([pos_train_data.drop(numeric_attrs, axis=1)] * n)
        pos_train_X2.index = range(new_count)

        s = smote.Smote(pos_train_X.values, N=n, k=3)
        pos_train_X = s.over_sampling()
        pos_train_X = pd.DataFrame(pos_train_X, columns=numeric_attrs,
                                   index=range(new_count))
        pos_train_data = pd.concat([pos_train_X, pos_train_X2], axis=1)
        # Restore the original column order after the column-wise concat.
        pos_train_data = pd.DataFrame(pos_train_data, columns=pos_train_data_original.columns)
        train_list = [pos_train_data, neg_train_data, pos_train_data_original]
    else:
        train_list = [neg_train_data, pos_train_data_original]
    print("Size of positive train data: {} * {}".format(pos_train_data_original['y'].count(), n+1))
    print("Size of negative train data: {} * {}".format(neg_train_data['y'].count(), frac))
    train_data = pd.concat(train_list, axis=0)
    return shuffle(train_data)
| 61 | + |
| 62 | + |
def evaluate(test_predictY, test_y):
    """Print and return accuracy/precision/recall/F1 for binary labels.

    Compares element-wise the 0/1 predictions in `test_predictY` against
    the 0/1 ground truth in `test_y` (both indexable, same length).

    Fix over the original: each ratio guards its denominator, so the
    function no longer raises ZeroDivisionError when there are no
    predicted positives (precision), no actual positives (recall), or
    both precision and recall are zero (F1) -- those metrics report 0.0.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1'.
        (The original returned None; callers that ignore the return
        value are unaffected.)
    """
    test_len = test_y.shape[0]
    true_pos = false_pos = true_neg = false_neg = 0
    for i in range(test_len):
        if test_predictY[i] == 1:
            if test_y[i] == 1:
                true_pos += 1
            else:
                false_pos += 1
        elif test_y[i] == 0:
            true_neg += 1
        else:
            false_neg += 1

    accuracy = 1.0 * (true_pos + true_neg) / test_len if test_len else 0.0
    pred_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = 1.0 * true_pos / pred_pos if pred_pos else 0.0
    recall = 1.0 * true_pos / actual_pos if actual_pos else 0.0
    f1Score = (2 * precision * recall / (precision + recall)
               if (precision + recall) else 0.0)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1 Score: {}".format(f1Score))
    return {'accuracy': accuracy, 'precision': precision,
            'recall': recall, 'f1': f1Score}
| 89 | + |
| 90 | + |
def plot_pr(auc_score, precision, recall, label=None):
    """Draw a shaded precision/recall curve; the title reports the AUC."""
    pylab.figure(num=None, figsize=(6, 5))
    for setter, value in (
        (pylab.xlim, [0.0, 1.0]),
        (pylab.ylim, [0.0, 1.0]),
        (pylab.xlabel, 'Recall'),
        (pylab.ylabel, 'Precision'),
    ):
        setter(value)
    pylab.title('P/R (AUC=%0.2f) / %s' % (auc_score, label))
    pylab.fill_between(recall, precision, alpha=0.2)
    pylab.grid(True, linestyle='-', color='0.75')
    pylab.plot(recall, precision, lw=1)
    pylab.show()
| 102 | + |
| 103 | + |
| 104 | + |
def plot_roc(auc_score, fpr, tpr, label=None):
    """Draw a shaded ROC curve; the title reports the AUC."""
    pylab.figure(num=None, figsize=(6, 5))
    for setter, value in (
        (pylab.xlim, [0.0, 1.0]),
        (pylab.ylim, [0.0, 1.0]),
        (pylab.xlabel, 'False positive rate'),
        (pylab.ylabel, 'True positive rate'),
    ):
        setter(value)
    pylab.title('ROC (AUC=%0.2f) / %s' % (auc_score, label))
    pylab.fill_between(fpr, tpr, alpha=0.2)
    pylab.grid(True, linestyle='-', color='0.75')
    pylab.plot(fpr, tpr, lw=1)
    pylab.show()
| 116 | + |
| 117 | + |
def train_evaluate(train_data, test_data, classifier, n=1, frac=1.0, threshold = 0.5):
    """Resample the training set, fit the classifier, report metrics on
    the test set, plot the P/R curve, and return thresholded predictions.

    The positive-class probability is compared against `threshold` to
    produce 0/1 labels.
    """
    resampled = resample_train_data(train_data, n, frac)
    X_train = resampled.drop('y', axis=1)
    y_train = resampled['y']
    X_test = test_data.drop('y', axis=1)
    y_test = test_data['y']

    fitted = classifier.fit(X_train, y_train)
    prob_pos = fitted.predict_proba(X_test)[:, 1]
    above = prob_pos > threshold
    report = classification_report(y_test, above,
                                   target_names = ['no', 'yes'])
    labels = above.astype(int)
    accuracy = np.mean(y_test.values == labels)
    print("Accuracy: {}".format(accuracy))
    print(report)
    fpr, tpr, _ = metrics.roc_curve(y_test, prob_pos)
    precision, recall, _ = metrics.precision_recall_curve(y_test, prob_pos)
    test_auc = metrics.auc(fpr, tpr)
    plot_pr(test_auc, precision, recall, "yes")

    return labels
    # print("AUC: {}".format(test_auc))
    # plot_roc(test_auc, fpr, tpr, "yes")
| 141 | + |
| 142 | + |
def select_model(train_data, cv_data):
    """Fit candidate models on train_data and score them on cv_data.

    Currently evaluates a single random-forest configuration; the
    single-iteration loop is a leftover hook for hyper-parameter sweeps
    (n_estimators / threshold / n were varied here previously).
    """
    for _ in range(1):
        model = RandomForestClassifier(n_estimators=400, oob_score=True)
        #lr = LogisticRegression(max_iter=100, C=1, random_state=0)
        train_evaluate(train_data, cv_data, model, n=7, frac=1.0, threshold=0.4)
| 151 | + |
| 152 | + |
def find_key_attrs(forest):
    """Bar-plot the relative importance of the features a fitted forest
    considers significant.

    Fix over the original: the summary line used a Python 2 ``print``
    statement, which is a SyntaxError under Python 3 (the rest of this
    file uses ``print()``).

    Relies on two module-level globals set in the script body below:
    ``features_list`` (feature column names) and ``plt``
    (matplotlib.pyplot).

    Parameters:
        forest: fitted estimator exposing ``feature_importances_``.
    """
    feature_importance = forest.feature_importances_
    # Normalize so the most important feature scores 100.
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    fi_threshold = 5  # keep features above 5% of the max importance
    important_idx = np.where(feature_importance > fi_threshold)[0]
    important_features = features_list[important_idx]
    print("\n", important_features.shape[0], "Important features(>",
          fi_threshold, "% of max importance)...\n")
    sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
    # Horizontal bar chart with the most important feature on top.
    pos = np.arange(sorted_idx.shape[0]) + .5
    #plt.subplot(1, 2, 2)
    plt.title('Feature Importance')
    plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]],
             color='r', align='center')
    plt.yticks(pos, important_features[sorted_idx[::-1]])
    plt.xlabel('Relative Importance')
    plt.draw()
    plt.show()
| 173 | + |
| 174 | + |
# ---- Script body: load the processed dataset, evaluate, time the final fit ----
processed_data = '../processed_data/bank-additional-full.csv'
data = pd.read_csv(processed_data)
train_data, cv_data, test_data = split_data(data)

# Feature column names (label dropped) -- read by find_key_attrs().
features_list = train_data.drop('y', axis=1).columns

select_model(train_data, cv_data)

start_time = datetime.now()
import matplotlib.pyplot as plt  # module-level; find_key_attrs() expects this global

print('Training...')
forest = RandomForestClassifier(n_estimators=400, oob_score=True)
prodict_y = train_evaluate(train_data, test_data, forest, n=7, frac=1, threshold=0.40)
# find_key_attrs(forest)

end_time = datetime.now()
delta_seconds = (end_time - start_time).seconds

print("Cost time: {}s".format(delta_seconds))
| 192 | + |
0 commit comments