55import config
66import math
77import time
8- #import numpy as np
9- # curve-fit() function imported from scipy
10- #from scipy.optimize import curve_fit
8+ import logs as p
119from matplotlib import pyplot as plt
1210
1311
@@ -17,7 +15,7 @@ def log_errors(code,errors):
1715
1816# compares observed and corrected values. If not within a threshold, they are both marked as error [304]
1917def compare_observed_corrected (df ,field_observed ,field_corrected ):
20- print (" Comparing observed vs corrected for fields " + str (field_observed )+ " vs " + str (field_corrected ))
18+ p . log (" Comparing observed vs corrected for fields " + str (field_observed )+ " vs " + str (field_corrected ))
2119 df_comp = df [df ['field_id' ].isin ([field_observed ,field_corrected ]) ]
2220 temp_observed_errors = df_comp .groupby (['observation_date' ])['value' ].diff ().dropna ().abs ().gt (config .temperature_difference_allowed_obs_corr )
2321
@@ -30,7 +28,7 @@ def compare_observed_corrected (df,field_observed,field_corrected):
3028
3129# verifies that min is less than max at a given time. If not, both entries are marked as errors [305]
3230def compare_min_max (df ,field_min ,field_max ):
33- print (" Comparing min/max for fields " + str (field_min ) + "/" + str (field_max ))
31+ p . log (" Comparing min/max for fields " + str (field_min ) + "/" + str (field_max ))
3432 df_comp = df [df ['field_id' ].isin ([field_min ,field_max ])].sort_values (by = ['observation_date' ])
3533 obs_date = None
3634 min_temp = math .nan
@@ -52,7 +50,7 @@ def compare_min_max (df,field_min,field_max):
5250
5351# Verifies that the first field in the list is less than all other fields - same observation time. If not, marked as flagged [2]
5452def compare_field_less_than_other_fields (df ,fields ):
55- print (" Comparing that field " + str (fields [0 ])+ " is less than these fields: " + str (fields [1 ]))
53+ p . log (" Comparing that field " + str (fields [0 ])+ " is less than these fields: " + str (fields [1 ]))
5654 df_comp = df [df ['field_id' ].isin (fields )].sort_values (by = ['observation_date' ])
5755 obs_date = None
5856 min_temp = math .nan
@@ -84,7 +82,7 @@ def compare_field_less_than_other_fields(df,fields):
8482
8583# check that the min field is the min of all previous values of this field since the last min-field measurement, or within 24 hours. If not, marked as flagged [3]
8684def check_field_is_min_over_period (df ,min_field ,max_field ):
87- print (" Checking that field " + str (min_field )+ " is the minimum of all values of this field: " + str (max_field ))
85+ p . log (" Checking that field " + str (min_field )+ " is the minimum of all values of this field: " + str (max_field ))
8886 df_comp = df [df ['field_id' ].isin ([min_field ,max_field ])].sort_values (by = ['observation_date' ])
8987 obs_date = None
9088 min_temp = math .nan
@@ -106,7 +104,7 @@ def check_field_is_min_over_period(df,min_field,max_field):
106104
107105# check that the max field is the max of all previous values of this field since the last max-field measurement, or within 24 hours. If not, marked as flagged [4]
108106def check_field_is_max_over_period (df ,max_field ,min_field ):
109- print (" Checking that field " + str (max_field )+ " is the maximum of all values of this field: " + str (min_field ))
107+ p . log (" Checking that field " + str (max_field )+ " is the maximum of all values of this field: " + str (min_field ))
110108 df_comp = df [df ['field_id' ].isin ([max_field ,min_field ])].sort_values (by = ['observation_date' ])
111109 obs_date = None
112110 max_temp = math .nan
@@ -143,7 +141,7 @@ def compare_min_max_df (df,field_min,field_max):
143141
144142# checks air temperature and wet bulb are less than a certain threshold
145143def check_air_wet_bulb (df , fields ):
146- print (" Checking wet bulb for fields: " + str (fields ))
144+ p . log (" Checking wet bulb for fields: " + str (fields ))
147145 df_comp = df [df ['field_id' ].isin ([fields ])].sort_values (by = ['observation_date' ])
148146 obs_date = None
149147 f0 = math .nan
@@ -178,10 +176,9 @@ def check_air_wet_bulb(df, fields):
178176
179177
180178
181- # Detects outliers and flags them [1]
179+ # Detects outliers and flags them [1] and returns list of graph data
182180def flag_outliers (df , field_id ):
183-
184-
181+ outliers_data = []
185182 df_proc = df [df .field_id == field_id ].sort_values (by = ['observation_date' ])
186183
187184 #determine list of series that are eligible for validation based on rule: needs less than 5 days before or after with no data
@@ -203,9 +200,9 @@ def flag_outliers (df, field_id):
203200 standard_deviation = delta .std ()
204201 outliers = df_proc [df_proc .index .isin (delta [delta .gt (config .temperature_outlier_std_factor * standard_deviation )].index )]
205202 if outliers .size > 0 :
203+ ans_max = ans + config .temperature_outlier_std_factor * standard_deviation
204+ ans_min = ans - config .temperature_outlier_std_factor * standard_deviation
206205 if config .temperature_plot_outliers == True :
207- ans_max = ans + config .temperature_outlier_std_factor * standard_deviation
208- ans_min = ans - config .temperature_outlier_std_factor * standard_deviation
209206 fig , ax = plt .subplots (1 , figsize = (20 , 8 ))
210207 fig .autofmt_xdate ()
211208 ax .plot (x , y , '.' , color = 'black' , label = "data" )
@@ -218,10 +215,37 @@ def flag_outliers (df, field_id):
218215 #flag the outliers
219216 for ind ,outlier in outliers .iterrows ():
220217 df .at [ind ,'flagged' ]= 10
221-
218+ # Build graph json data
219+ data = "{\" data\" :["
220+ first_data = True
221+ for ind in x .keys ():
222+ if first_data == False :
223+ data = data + ","
224+ first_data = False
225+ data = data + "{\" x\" :\" " + str (x [ind ])+ "\" ,"
226+ data = data + "\" y\" :" + str (y [ind ])+ ","
227+ data = data + "\" ly\" :"
228+ if pd .isna (ans_min [ind ]):
229+ data = data + "null"
230+ else :
231+ data = data + str (ans_min [ind ])
232+ data = data + ",\" uy\" :"
233+ if pd .isna (ans_max [ind ]):
234+ data = data + "null"
235+ else :
236+ data = data + str (ans_max [ind ])
237+ data = data + ",\" outlier\" :"
238+ if ind in outliers :
239+ data = data + str (outliers [ind ])
240+ else :
241+ data = data + "null"
242+ data = data + "}"
243+ data = data + "]}"
244+ outliers_data .append ((field_id ,data ))
222245 obs_date = row ['observation_date' ]
223246 list_partial = []
224247 list_partial .append (row )
248+ return outliers_data
225249
226250
227251
@@ -231,11 +255,11 @@ def phase_2(entries,debug=False):
231255
232256 def logPerf (tic ,message ):
233257 toc = time .perf_counter ()
234- print (message , end = '' )
235- print (f": { toc - tic :0.4f} seconds" )
258+ p . log (message , end = '' )
259+ p . log (f": { toc - tic :0.4f} seconds" )
236260 return toc
237261
238- print ("Starting temperature phase 2" )
262+ p . log ("Starting temperature phase 2" )
239263 tic = time .perf_counter ()
240264 # execute post process id3 on the whole dataset, not one entry at a time
241265 df = pd .DataFrame (entries ,
@@ -271,7 +295,7 @@ def logPerf(tic,message):
271295 try :
272296 check_field_is_min_over_period (df_temp_nona , fields [0 ], fields [1 ])
273297 except :
274- print (df_temp_nona , fields [0 ], fields [1 ])
298+ p . log (df_temp_nona , fields [0 ], fields [1 ])
275299 tic = logPerf (tic , "Completed field is minimum of other fields over period of time" )
276300
277301 # check temperature is the max of other values within past 24 hours max
@@ -298,12 +322,14 @@ def logPerf(tic,message):
298322 tic = logPerf (tic , "Completed removing detected errors before outlier detection" )
299323
300324 # get series of values for a given field ID
325+ outliers_graph = []
301326 for field in config .temperature_stat_outliers :
302- flag_outliers (df_temp_cleaned , field )
327+ outliers_graph . append ( flag_outliers (df_temp_cleaned , field ) )
303328 tic = logPerf (tic , "Completed outlier detection" )
304329
305330
306331 # fit the series
307332 df_temp_cleaned .to_sql ('data_entries_corrected_final' , db .engine , if_exists = 'append' , index = False )
333+ return outliers_graph
308334
309335
0 commit comments