# xGBoosting.py
from xgboost import XGBClassifier
import xgboost as xgb
from logistic_regression import *
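# NOTE: the star import above is assumed to provide the prepared feature matrices
# (xtrain/xvalid/test variants for BOW, TF-IDF, Word2Vec and Doc2Vec), the labels
# ytrain/yvalid, the test dataframe, numpy as np, and sklearn's f1_score.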
print('>>>>>>>>>>>>>>XGBOOST<<<<<<<<<<<')
# BOW features
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_bow, ytrain)
prediction = xgb_model.predict(xvalid_bow)
print(f'F1 score for XGBoost BOW features {f1_score(yvalid, prediction)}')
test_pred = xgb_model.predict(test_bow)
test['label'] = test_pred
submission = test[['id','label']]
submission.to_csv('Submissions/sub_xgb_bow.csv', index=False)
# TF-IDF features
# (keep the module alias `xgb` free for xgb.DMatrix / xgb.cv / xgb.train below)
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_tfidf, ytrain)
prediction = xgb_model.predict(xvalid_tfidf)
print(f'F1 score for XGBoost TF-IDF features {f1_score(yvalid, prediction)}')
# Word2Vec features
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000, nthread=3).fit(xtrain_w2v, ytrain)
prediction = xgb_model.predict(xvalid_w2v)
print(f'F1 score for XGBoost Word2Vec features {f1_score(yvalid, prediction)}')
# Doc2Vec features
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000, nthread=3).fit(xtrain_d2v, ytrain)
prediction = xgb_model.predict(xvalid_d2v)
print(f'F1 score for XGBoost Doc2Vec features {f1_score(yvalid, prediction)}')
'''
XGBOOST + WORD2VEC PARAMETER TUNING
'''
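# Tuning strategy used below: stage-wise grid search with 5-fold xgb.cv and the custom F1
# metric, locking in the winner of each stage before moving to the next:
#   1. max_depth and min_child_weight
#   2. subsample and colsample_bytree
#   3. learning rate (eta)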
# Use DMatrix, xgboost's optimized data structure; it holds both the features and the target.
dtrain = xgb.DMatrix(xtrain_w2v, label=ytrain)
dvalid = xgb.DMatrix(xvalid_w2v, label=yvalid)
dtest = xgb.DMatrix(test_w2v)
# Parameters that we are going to tune
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': .3,
    'subsample': 1,
    'colsample_bytree': 1
}
# define evaluation metric
def custom_eval(preds, dtrain):
    labels = dtrain.get_label().astype(int)
    preds = (preds >= 0.3).astype(int)
    return [('f1_score', f1_score(labels, preds))]
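# custom_eval follows the older xgb.cv feval contract: it receives the raw predicted
# probabilities and the evaluation DMatrix, and returns (metric_name, value) pairs, which
# surface in the CV results as 'test-f1_score-mean'. The 0.3 threshold (rather than the usual
# 0.5) is presumably chosen to favour recall on the rare positive class.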
# Tuning max_depth and min_child_weight
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6, 10)
    for min_child_weight in range(5, 8)
]
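# 4 x 3 = 12 candidate (max_depth, min_child_weight) pairs: max_depth in 6..9 and
# min_child_weight in 5..7, each scored with 5-fold cross-validation below.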
max_f1 = 0. # initializing with 0
best_params = None
cv_results = {}
for max_depth, min_child_weight in gridsearch_params:
    print(f'CV with max_depth={max_depth}, min_child_weight={min_child_weight}')
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Cross-validation
    cv_results = xgb.cv(params,
                        dtrain,
                        feval=custom_eval,
                        num_boost_round=200,
                        maximize=True,
                        seed=16,
                        nfold=5,
                        early_stopping_rounds=10
                        )
    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print(f'\tF1 Score {mean_f1} for {boost_rounds} rounds')
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (max_depth, min_child_weight)
print(f'Best params: {best_params[0]}, {best_params[1]}, F1 Score: {max_f1}')
#Updating max_depth and min_child_weight parameters.
params['max_depth'] = best_params[0]
params['min_child_weight'] = best_params[1]
# Tuning subsample and colsample_bytree
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i / 10. for i in range(5, 10)]
    for colsample in [i / 10. for i in range(5, 10)]
]
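# 5 x 5 = 25 combinations, sweeping both subsample and colsample_bytree over {0.5, 0.6, 0.7, 0.8, 0.9}.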
max_f1 = 0.
best_params = None
for subsample, colsample in gridsearch_params:
    print(f'CV with subsample={subsample}, colsample_bytree={colsample}')
    # Update our parameters ('colsample_bytree' is the actual xgboost parameter name)
    params['colsample_bytree'] = colsample
    params['subsample'] = subsample
    cv_results = xgb.cv(
        params,
        dtrain,
        feval=custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )
    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()  # position of the maximum element
    print(f'\tF1 Score {mean_f1} for {boost_rounds} rounds')
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample)
print(f'Best params: {best_params[0]}, {best_params[1]}, F1 Score: {max_f1}')
params['subsample'] = best_params[0]
params['colsample_bytree'] = best_params[1]
# Tuning the learning rate
max_f1 = 0.
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print(f'CV with eta={eta}')
    # Update ETA
    params['eta'] = eta
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval=custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )
    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print(f'\tF1 Score {mean_f1} for {boost_rounds} rounds')
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta
print(f'Best params: {best_params}, F1 Score: {max_f1}')
params['eta'] = best_params
# final parameters for best result
# final_params = {'colsample': 0.9,
# 'colsample_bytree': 0.5, 'eta': 0.1,
# 'max_depth': 8, 'min_child_weight': 6,
# 'objective': 'binary:logistic',
# 'subsample': 0.9}
# train model again with the tuned parameters
xgb_model = xgb.train(
    params,
    dtrain,
    feval=custom_eval,
    num_boost_round=1000,
    maximize=True,
    evals=[(dvalid, 'Validation')],
    early_stopping_rounds=10
)
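# NOTE: with early_stopping_rounds the Booster records its best iteration. Depending on the
# xgboost version, prediction may need ntree_limit=xgb_model.best_ntree_limit (older API) so
# that only the trees up to the early-stopping point are used; recent versions handle this by default.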
# final submission
test_pred = xgb_model.predict(dtest)
test['label'] = (test_pred >= 0.3).astype(int)
submission = test[['id','label']]
submission.to_csv('Submissions/sub_xgb_w2v_finetuned.csv', index=False)