1
1
import pandas as pd
2
2
import numpy as np
3
3
from constants import columns , date_col , agg_cols
4
+ from clickhouse .client import clickhouse_client
5
+
4
6
import warnings
5
7
warnings .filterwarnings ("ignore" )
6
8
@@ -171,14 +173,14 @@ def set_target_columns(df):
171
173
def remove_date_col(df):
    """Drop the date column from ``df`` and hand the frame back.

    The column named by ``date_col`` is removed in place, so the caller's
    DataFrame is modified; the same object is returned so the call can be
    used in an assignment.
    """
    df.drop(columns=[date_col], inplace=True)
    return df
175
177
176
178
def get_cols_withohut_pid(df):
    """Return the frame together with its column labels except ``pid``.

    Args:
        df: DataFrame that has a ``pid`` column.

    Returns:
        A ``(df, cols)`` tuple: ``df`` is the same, unmodified object that
        was passed in and ``cols`` is the column index without ``pid``.

    Raises:
        KeyError: if ``df`` has no ``pid`` column.
    """
    # Drop the label from the column index directly; building a copy of the
    # whole frame just to read its columns is unnecessary.
    cols = df.columns.drop("pid")
    return df, cols
180
182
181
- def create_target_traffic_by_target_columns (target_columns ): # TODO take the data from the db
183
+ def create_target_traffic_by_target_columns (df , target_columns ):
182
184
"""Extract the traffic for next hours"""
183
185
next_hrs = []
184
186
for hr in [1 ,4 ,8 ,12 ,24 ,72 ,168 ]: # TODO dynamical value, as well dynamical for the database
@@ -188,25 +190,26 @@ def create_target_traffic_by_target_columns(target_columns): # TODO take the
188
190
return next_hrs
189
191
190
192
191
def pre_process_data():
    """Load the raw data and prepare it for the prediction step.

    Runs the pre-processing helpers in a fixed order, then derives the
    prediction inputs and finally drops rows with missing values.

    Returns:
        A ``(df, cat_features, cols, next_hrs)`` tuple, where ``df`` is the
        processed frame after ``dropna()``, ``cat_features`` comes from
        ``categorize_features``, ``cols`` from ``get_cols_withohut_pid`` and
        ``next_hrs`` from ``create_target_traffic_by_target_columns``.
    """
    # --- Pre-processing ---
    frame = read_data_csv()
    frame = sort_df_by_date_col(date_col, frame)
    frame = convert_df_to_datetime(frame)
    frame = filter_df_by_specific_date(frame, time_delta_years=1)
    frame = filter_df_with_most_frequent_pid(frame)
    frame = replace_null_values(frame)
    frame, cat_features = categorize_features(frame)
    frame = extract_date_components(frame, date_col)
    frame = add_traffic_table(frame)
    frame = convert_cat_features_to_dummies(frame, cat_features)
    frame = combine_all_pids(frame, date_col, agg_cols)

    # --- Setting data for predictions ---
    # Target columns are taken before the date column is removed.
    target_columns = set_target_columns(frame)
    frame = remove_date_col(frame)
    frame, cols = get_cols_withohut_pid(frame)
    next_hrs = create_target_traffic_by_target_columns(frame, target_columns)

    # --- Clear N/A ---
    # NOTE: cols and next_hrs are computed before the N/A rows are dropped.
    return frame.dropna(), cat_features, cols, next_hrs
0 commit comments