import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
import lightgbm as lgb
import gc

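# Train a single LightGBM model against an explicit hold-out validation set
# (no cross-validation), stopping early once the validation metric stalls.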
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                      feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true',  # training data is unbalanced (replaced with scale_pos_weight)
        'num_leaves': 31,  # should be smaller than 2^max_depth
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # minimum number of samples in a leaf (min_data_in_leaf)
        'max_bin': 255,  # number of bucketed bins for feature values
        'subsample': 0.6,  # subsample ratio of the training instances
        'subsample_freq': 0,  # frequency of subsampling; <=0 disables it
        'colsample_bytree': 0.3,  # subsample ratio of columns when constructing each tree
        'min_child_weight': 5,  # minimum sum of instance weights (hessian) needed in a leaf
        'subsample_for_bin': 200000,  # number of samples used to construct feature bins
        'min_split_gain': 0,  # minimum gain required to make a split (min_gain_to_split)
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    # callbacks API (LightGBM >= 3.3); older versions took early_stopping_rounds,
    # verbose_eval and evals_result as keyword arguments of lgb.train
    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     num_boost_round=num_boost_round,
                     feval=feval,
                     callbacks=[lgb.early_stopping(early_stopping_rounds),
                                lgb.log_evaluation(verbose_eval),
                                lgb.record_evaluation(evals_results)])

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics + ":", evals_results['valid'][metrics][n_estimators - 1])

    return bst1

path = '../input/'

dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32'
}

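# skiprows keeps the header (row 0) and drops the first ~145M data rows, so only
# the last 40M clicks of the training file (assumed to be in time order) are read.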
print('loading train data...')
train_df = pd.read_csv(path + "train.csv", skiprows=range(1, 144903891), nrows=40000000, dtype=dtypes,
                       usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

print('loading test data...')
test_df = pd.read_csv(path + "test.csv", dtype=dtypes,
                      usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])

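# Stack test below train so the aggregate features below are computed once over
# the combined frame; the two parts are separated again by position afterwards.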
len_train = len(train_df)
train_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)  # DataFrame.append was removed in pandas 2.0

del test_df
gc.collect()

print('Extracting new features...')
click_time = pd.to_datetime(train_df.click_time)  # parse once, reuse for both features
train_df['hour'] = click_time.dt.hour.astype('uint8')
train_df['day'] = click_time.dt.day.astype('uint8')
del click_time

gc.collect()
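
# Each block below computes a per-group click count: 'channel' merely serves as
# the column to count over, and the result is renamed and merged onto every row.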
print('grouping by ip-day-hour combination...')
gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_tcount'})
train_df = train_df.merge(gp, on=['ip', 'day', 'hour'], how='left')
del gp
gc.collect()

print('grouping by ip-app combination...')
gp = train_df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
train_df = train_df.merge(gp, on=['ip', 'app'], how='left')
del gp
gc.collect()

print('grouping by ip-app-os combination...')
gp = train_df[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()

# Adding features with var and mean of hour (inspired by nuhsikander's script)
print('grouping by : ip_day_chl_var_hour')
# note: despite its name, 'ip_tchan_count' holds the variance of hour within ip-day-channel
gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'channel'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_tchan_count'})
train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
del gp
gc.collect()

print('grouping by : ip_app_os_var_hour')
gp = train_df[['ip', 'app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_app_os_var'})
train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()

print('grouping by : ip_app_channel_var_day')
gp = train_df[['ip', 'app', 'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[['day']].var().reset_index().rename(index=str, columns={'day': 'ip_app_channel_var_day'})
train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
del gp
gc.collect()

print('grouping by : ip_app_chl_mean_hour')
gp = train_df[['ip', 'app', 'channel', 'hour']].groupby(by=['ip', 'app', 'channel'])[['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
print("merging...")
train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
del gp
gc.collect()

print("vars and data type: ")
train_df.info()
train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16')
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')

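# Positional split: the appended test rows sit after the original training rows,
# and the last 2.5M training rows are held out for validation.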
test_df = train_df[len_train:]
val_df = train_df[(len_train - 2500000):len_train]
train_df = train_df[:(len_train - 2500000)]

print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))

target = 'is_attributed'
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day',
              'ip_tcount', 'ip_tchan_count', 'ip_app_count',
              'ip_app_os_count', 'ip_app_os_var',
              'ip_app_channel_var_day', 'ip_app_channel_mean_hour']
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']

sub = pd.DataFrame()
sub['click_id'] = test_df['click_id'].astype('int')

gc.collect()

print("Training...")
start_time = time.time()

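# scale_pos_weight upweights the rare positive class; 99 treats positives as
# roughly 100x rarer than negatives, a coarse stand-in for the true imbalance.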
params = {
    'learning_rate': 0.15,
    #'is_unbalance': 'true',  # replaced with scale_pos_weight
    'num_leaves': 7,  # 2^max_depth - 1
    'max_depth': 3,  # shallow trees to limit overfitting
    'min_child_samples': 100,  # minimum number of samples in a leaf (min_data_in_leaf)
    'max_bin': 100,  # number of bucketed bins for feature values
    'subsample': 0.7,  # subsample ratio of the training instances
    'subsample_freq': 1,  # frequency of subsampling; <=0 disables it
    'colsample_bytree': 0.9,  # subsample ratio of columns when constructing each tree
    'min_child_weight': 0,  # minimum sum of instance weights (hessian) needed in a leaf
    'scale_pos_weight': 99  # because the training data is extremely unbalanced
}
bst = lgb_modelfit_nocv(params,
                        train_df,
                        val_df,
                        predictors,
                        target,
                        objective='binary',
                        metrics='auc',
                        early_stopping_rounds=30,
                        verbose_eval=10,  # log every 10 rounds
                        num_boost_round=500,
                        categorical_features=categorical)

print('[{:.1f} s]: model training time'.format(time.time() - start_time))
del train_df
del val_df
gc.collect()
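
# Optional: rank features by split importance on the trained booster, e.g.
# print(sorted(zip(bst.feature_name(), bst.feature_importance()), key=lambda t: -t[1]))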

print("Predicting...")
sub['is_attributed'] = bst.predict(test_df[predictors])
print("writing...")
sub.to_csv('sub_lgb_balanced99.csv', index=False)
print("done...")