-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregressions.py
90 lines (70 loc) · 4.02 KB
/
regressions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
import pandas as pd
import os
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.svm import SVR as SVR
from sklearn.linear_model import ElasticNet as EN
from sklearn.ensemble import GradientBoostingRegressor as GBR
def calc_rmse(yhat, y):
    """Return the root-mean-squared error between predictions *yhat* and targets *y*."""
    squared_errors = (yhat - y) ** 2
    return np.sqrt(np.mean(squared_errors))
def eval_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted *model* on train and test splits.

    Returns a tuple (rmse_train, rmse_test, r2score), each rounded to
    two decimal places; R^2 is computed on the test split only.
    """
    pred_test = model.predict(X_test)
    pred_train = model.predict(X_train)
    rmse_train = round(calc_rmse(pred_train, y_train), 2)
    rmse_test = round(calc_rmse(pred_test, y_test), 2)
    r2score = round(r2_score(y_test, pred_test), 2)
    return rmse_train, rmse_test, r2score
def run_models(X, y, models):
    """Grid-search each requested regressor on (X, y) and compare hold-out scores.

    Parameters
    ----------
    X, y : feature matrix and target vector (anything train_test_split accepts,
        e.g. numpy arrays or pandas objects).
    models : iterable of model-name strings; must be keys of model_names_dict
        below ('Linear Regression', 'Random Forest', 'Gradient Boosting',
        'K Neighbors', 'S V R', 'Elastic Net').

    Returns
    -------
    dict mapping each model name to (rmse_train, rmse_test, r2score) as
    produced by eval_model on the best grid-search estimator.
    """
    # BUG FIX: sklearn.cross_validation (imported at the top of this file) was
    # removed in scikit-learn 0.20; prefer sklearn.model_selection and fall
    # back only on very old installs where the top-level import still works.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    print('models within run_models:', models)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
    # Hyperparameter grids keyed by estimator class; keys are pipeline-step
    # prefixed names as make_pipeline generates them (lowercased class name).
    hyperpara_dict = { LR : {}, # use defaults only
                       RFR : { 'randomforestregressor__max_features' : ['auto'],
                               'randomforestregressor__max_depth': [10],
                               'randomforestregressor__bootstrap': [True],
                               'randomforestregressor__min_samples_leaf': [1, 4],
                               'randomforestregressor__min_samples_split': [2, 5],
                               'randomforestregressor__n_estimators': [10],
                               },
                       GBR : { 'gradientboostingregressor__n_estimators' : [100],
                               'gradientboostingregressor__max_depth': [5, 10],
                               'gradientboostingregressor__min_samples_split': [3, 10],
                               'gradientboostingregressor__learning_rate': [0.01, 0.05, 0.1],
                               'gradientboostingregressor__loss': ['ls'],
                               },
                       KNR : { 'kneighborsregressor__n_neighbors' : [1, 5, 10],
                               'kneighborsregressor__weights': ['uniform', 'distance'],
                               },
                       SVR : { 'svr__kernel': ['linear', 'rbf'],
                               'svr__C': [0.5, 10, 20],
                               'svr__epsilon': [0.1, 5, 10, 50],
                               },
                       EN : { 'elasticnet__alpha': [1, 0.1, 0.01], # equivalent to lambda; alpha=0 means no regularization, ie linear regression
                              'elasticnet__l1_ratio': [0.5, 0.7, 0.9], # l1=1 means L1 penalty, ie Lasso (not L2/Ridge)
                              'elasticnet__max_iter': [1000],
                              }
                       }
    model_names_dict = {'Linear Regression': LR, 'Random Forest': RFR, 'Gradient Boosting': GBR, 'K Neighbors': KNR, 'S V R': SVR, 'Elastic Net': EN}
    model_comparison_dict = {}
    for model_name in models:
        model = model_names_dict[model_name]
        print('model in run_models:', model)
        # data preprocessing (removing mean and scaling to unit variance with StandardScaler)
        pipeline = make_pipeline( StandardScaler(),
                                  model()
                                  )
        hyperparameters = hyperpara_dict[model]
        clf = GridSearchCV(pipeline, hyperparameters, cv=3) # cv=3 is same as cv=None (default)
        clf.fit(X_train, y_train)
        print('best params for {}:'.format(model), clf.best_params_)
        # BUG FIX: eval_model returns (rmse_train, rmse_test, r2score); the
        # original unpacked these into misordered names (r2score first), so
        # the local variables were mislabeled. The tuple stored in the dict
        # keeps the same value order as before: (rmse_train, rmse_test, r2score).
        rmse_train, rmse_test, r2score = eval_model(clf.best_estimator_, X_train, y_train, X_test, y_test)
        model_comparison_dict[model_name] = rmse_train, rmse_test, r2score
    return model_comparison_dict
if __name__ == '__main__':
    # BUG FIX: the original called run_models() with no arguments, but the
    # signature requires (X, y, models), so the script always raised a
    # TypeError. Demonstrate with a small synthetic regression problem.
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.rand(60, 3), columns=['f0', 'f1', 'f2'])
    y_demo = 2.0 * X_demo['f0'] - X_demo['f1'] + 0.1 * rng.rand(60)
    results = run_models(X_demo, y_demo, ['Linear Regression'])
    print('model comparison (rmse_train, rmse_test, r2score):', results)