# perform_random_forest.py
# standard library
import csv
import math
import pdb

# data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# scikit-learn: random forests and evaluation metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix

# optional tree visualization
import graphviz
# import pydot, pydotplus

# local helpers
from plot_confusion_matrix import plot_confusion_matrix
from tree_evaluation import reg_eval, clf_eval
from rf_tune import (find_n_estimators, find_max_depth,
                     find_min_sample_split, find_max_features)
def random_forest_reg(x_train, y_train, x_test, y_test,
                      num_estimator=20, min_samples_split=2, max_depth=None,
                      max_features='auto', random=1, tune=False):
    """Fit a random forest regressor, report train/test metrics, and optionally tune."""
    # note: max_features='auto' is removed in scikit-learn >= 1.3; use 1.0 there
    reg = RandomForestRegressor(n_estimators=num_estimator, random_state=random,
                                max_depth=max_depth, min_samples_split=min_samples_split,
                                max_features=max_features)
    reg.fit(x_train, y_train.values.ravel())

    print("on train set::")
    y_pred_train = reg.predict(x_train)
    reg_eval(reg, y_train.values.ravel(), y_pred_train)

    print("on test set::")
    y_pred_reg = reg.predict(x_test)
    y_test_reg = y_test.values.ravel()
    reg_eval(reg, y_test_reg, y_pred_reg)

    if tune:
        print('Tuning...')
        find_n_estimators('reg', reg, x_train, x_test, y_train, y_test)
        find_max_depth('reg', reg, x_train, x_test, y_train, y_test)
        find_min_sample_split('reg', reg, x_train, x_test, y_train, y_test)
        find_max_features('reg', reg, x_train, x_test, y_train, y_test)

    # Optional visualization of a single tree from the forest (requires graphviz):
    # dot_data = tree.export_graphviz(reg.estimators_[0], feature_names=labels,
    #                                 filled=True, special_characters=True)
    # graph = graphviz.Source(dot_data)
    # graph.format = 'png'
    # graph.render("./output/salary-predict", view=True)
def random_forest_clf(x_train, y_train, x_test, y_test,
                      num_estimator=20, min_samples_split=2, max_depth=None,
                      max_features='auto', random=1, tune=False):
    """Fit a random forest classifier, report train/test metrics, and optionally tune."""
    # note: max_features='auto' is removed in scikit-learn >= 1.3; use 'sqrt' there
    clf = RandomForestClassifier(n_estimators=num_estimator, random_state=random,
                                 max_depth=max_depth, min_samples_split=min_samples_split,
                                 max_features=max_features)
    clf.fit(x_train, y_train.values.ravel())

    print("on train set::")
    y_pred_train = clf.predict(x_train)
    clf_eval(clf, y_train.values.ravel(), y_pred_train.ravel())

    print("on test set::")
    y_pred_clf = clf.predict(x_test)
    clf_eval(clf, y_test.values.ravel(), y_pred_clf.ravel())

    if tune:
        print('Tuning...')
        find_n_estimators('clf', clf, x_train, x_test, y_train, y_test)
        find_max_depth('clf', clf, x_train, x_test, y_train, y_test)
        find_min_sample_split('clf', clf, x_train, x_test, y_train, y_test)
        find_max_features('clf', clf, x_train, x_test, y_train, y_test)
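

# Example usage: a minimal sketch of how these helpers might be driven. The CSV
# path, feature columns, and "target" column below are hypothetical and not part
# of the original script; substitute your own dataset and preprocessing.
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    data = pd.read_csv("data.csv")          # hypothetical input file
    x = data.drop(columns=["target"])       # hypothetical feature matrix
    y = data[["target"]]                    # hypothetical target, kept as a DataFrame
                                            # so the .values.ravel() calls above work
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1)

    # regression variant; set tune=True to sweep hyper-parameters via rf_tune
    random_forest_reg(x_train, y_train, x_test, y_test,
                      num_estimator=50, tune=False)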