Commit 427b848
Create 04_lightGBM_2.py
1 parent a80074d

1 file changed: +207 additions, -0 deletions

04_lightGBM_2.py

import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split  # cross_validation was renamed to model_selection; this import is unused below
import lightgbm as lgb
import gc

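# Train a LightGBM model with early stopping on a fixed validation set (no CV):
# builds lgb.Dataset objects for train/valid, trains for up to num_boost_round
# rounds, and reports the best iteration and its validation score.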
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                      feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true',  # because the training data is unbalanced (replaced with scale_pos_weight)
        'num_leaves': 31,  # should be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # minimum number of samples needed in a leaf (min_data_in_leaf)
        'max_bin': 255,  # number of bucketed bins for feature values
        'subsample': 0.6,  # subsample ratio of the training instances
        'subsample_freq': 0,  # frequency of subsampling; <=0 disables it
        'colsample_bytree': 0.3,  # subsample ratio of columns when constructing each tree
        'min_child_weight': 5,  # minimum sum of instance weight (hessian) needed in a leaf
        'subsample_for_bin': 200000,  # number of samples used to construct bins
        'min_split_gain': 0,  # minimum gain to split (min_gain_to_split); regularizes together with reg_alpha/reg_lambda
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics + ":", evals_results['valid'][metrics][n_estimators - 1])

    return bst1

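# TalkingData AdTracking click data: each row is an ad click described by ip,
# app, device, os and channel ids plus a click timestamp; is_attributed marks
# whether the click led to an app download (the prediction target).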
path = '../input/'

dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32'
}

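# Read only the needed columns with compact unsigned dtypes to keep memory down;
# skiprows drops the first ~145M rows of train.csv and nrows caps the read at
# the following 40M rows.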
print('loading train data...')
train_df = pd.read_csv(path+"train.csv", skiprows=range(1, 144903891), nrows=40000000, dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

print('loading test data...')
test_df = pd.read_csv(path+"test.csv", dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])

len_train = len(train_df)
train_df = pd.concat([train_df, test_df], sort=False)  # stack test below train so features are computed on both at once (DataFrame.append was removed in pandas 2.0)

del test_df
gc.collect()

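# Time features: hour of day and day of month from the click timestamp; they
# serve both as model inputs and as group-by keys for the aggregates below.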
print('Extracting new features...')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')

gc.collect()

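# Count features: clicks per (ip, day, hour), per (ip, app) and per (ip, app, os),
# each computed on the combined frame and merged back onto every row.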
print('grouping by ip-day-hour combination...')
gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_tcount'})
train_df = train_df.merge(gp, on=['ip', 'day', 'hour'], how='left')
del gp
gc.collect()

print('grouping by ip-app combination...')
gp = train_df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
train_df = train_df.merge(gp, on=['ip', 'app'], how='left')
del gp
gc.collect()

print('grouping by ip-app-os combination...')
gp = train_df[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()

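# Spread features: variance of click hour per (ip, day, channel) and per
# (ip, app, os), variance of click day per (ip, app, channel), and mean click
# hour per (ip, app, channel). Note that ip_tchan_count is a variance despite
# its name.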
# Adding features with var and mean hour (inspired by nuhsikander's script)
print('grouping by : ip_day_chl_var_hour')
gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'channel'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_tchan_count'})
train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
del gp
gc.collect()

print('grouping by : ip_app_os_var_hour')
gp = train_df[['ip', 'app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_app_os_var'})
train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()

print('grouping by : ip_app_channel_var_day')
gp = train_df[['ip', 'app', 'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[['day']].var().reset_index().rename(index=str, columns={'day': 'ip_app_channel_var_day'})
train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
del gp
gc.collect()

print('grouping by : ip_app_chl_mean_hour')
gp = train_df[['ip', 'app', 'channel', 'hour']].groupby(by=['ip', 'app', 'channel'])[['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
print("merging...")
train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
del gp
gc.collect()

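# Downcast the count features to uint16 to reclaim memory; this assumes no
# single group exceeds 65535 clicks.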
print("vars and data type: ")
143+
train_df.info()
144+
train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16')
145+
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
146+
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')
147+
148+
149+
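# Chronological split: rows appended after position len_train are the test set;
# the last 2.5M training rows become the validation set.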
test_df = train_df[len_train:]
val_df = train_df[(len_train - 2500000):len_train]
train_df = train_df[:(len_train - 2500000)]

print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))

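# Model inputs: raw id/time columns plus the engineered aggregates; the id/time
# columns are declared categorical so LightGBM can split on them natively.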
target = 'is_attributed'
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day',
              'ip_tcount', 'ip_tchan_count', 'ip_app_count',
              'ip_app_os_count', 'ip_app_os_var',
              'ip_app_channel_var_day', 'ip_app_channel_mean_hour']
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']

sub = pd.DataFrame()
sub['click_id'] = test_df['click_id'].astype('int')

gc.collect()

print("Training...")
start_time = time.time()

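# Overrides for the defaults inside lgb_modelfit_nocv: a larger learning rate
# with shallow trees (max_depth=3, num_leaves=7) and scale_pos_weight=99 to
# counter the extreme class imbalance.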
params = {
    'learning_rate': 0.15,
    #'is_unbalance': 'true',  # replaced with the scale_pos_weight argument
    'num_leaves': 7,  # 2^max_depth - 1
    'max_depth': 3,  # limits tree depth (-1 would mean no limit)
    'min_child_samples': 100,  # minimum number of samples needed in a leaf (min_data_in_leaf)
    'max_bin': 100,  # number of bucketed bins for feature values
    'subsample': 0.7,  # subsample ratio of the training instances
    'subsample_freq': 1,  # frequency of subsampling; <=0 disables it
    'colsample_bytree': 0.9,  # subsample ratio of columns when constructing each tree
    'min_child_weight': 0,  # minimum sum of instance weight (hessian) needed in a leaf
    'scale_pos_weight': 99  # because the training data is extremely unbalanced
}
bst = lgb_modelfit_nocv(params,
                        train_df,
                        val_df,
                        predictors,
                        target,
                        objective='binary',
                        metrics='auc',
                        early_stopping_rounds=30,
                        verbose_eval=True,
                        num_boost_round=500,
                        categorical_features=categorical)

print('[{}]: model training time'.format(time.time() - start_time))
del train_df
del val_df
gc.collect()

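# With early stopping, Booster.predict uses the best iteration by default
# (num_iteration=None), so no explicit iteration count is needed here.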
print("Predicting...")
sub['is_attributed'] = bst.predict(test_df[predictors])
print("writing...")
sub.to_csv('sub_lgb_balanced99.csv', index=False)
print("done...")
