Commit 002e347

Adding outliers in a new table
1 parent eb54f14 commit 002e347

Showing 9 changed files with 151 additions and 48 deletions.

DRAW-post-processing/config.py

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 
 import phase1_methods as methods
 
-
 # assigning post-process ID's to field ID's
 # assign a TUPLE to multiple field_id's with one PPID; otherwise assign integer for single field_id
 ppid_to_field_id = {1: (4, 6, 7, 8, 67, 69),

DRAW-post-processing/database_connection.py

Lines changed: 8 additions & 2 deletions
@@ -26,7 +26,7 @@
 
 cursor = conn.cursor()
 
-url = "mysql+mysqlconnector://"+db_user+":"+db_passwd+"@l"+db_host+"/"+db_name
+url = "mysql+mysqlconnector://"+db_user+":"+db_passwd+"@"+db_host+"/"+db_name
 engine = sqlalchemy.create_engine(url)
 
 

@@ -55,4 +55,10 @@ def phase_2_data():
     sql_command = sql_commands.phase_2_data_sql
     cursor.execute(sql_command)
     result = cursor.fetchall()
-    return result
+    return result
+
+def outliers_stats():
+    sql_command=sql_commands.outliers_stats_sql
+    cursor.execute(sql_command)
+    result = cursor.fetchall()
+    return result
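The new helper returns the rows produced by outliers_stats_sql (added in sql_commands.py below): one (field_id, count) tuple per field that has entries flagged as outliers. A minimal usage sketch, assuming the module is imported as elsewhere in the codebase:

import database_connection as db

# Each row is a (field_id, count) tuple, per outliers_stats_sql in sql_commands.py.
for field_id, n_flagged in db.outliers_stats():
    print("field_id", field_id, "->", n_flagged, "entries flagged as outliers (flagged=10)")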

DRAW-post-processing/execute_post_process.py

Lines changed: 30 additions & 19 deletions
@@ -1,4 +1,5 @@
 import database_connection as db
+import datetime
 import tables
 import observation_reconciliation as reconcile
 import remove_low_transcription_users as remove_ltu

@@ -32,16 +33,15 @@
 
 import phase2_methods as id1p2_methods
 import time
-
+import logs as l
 import sef_gen
 
-#import argparse
 
 def logPerf(message):
     global tic
     toc = time.perf_counter()
-    print(message, end='')
-    print (f": {toc - tic:0.4f} seconds")
+    l.log(message, end='')
+    l.log (f": {toc - tic:0.4f} seconds")
     tic=toc
 
 # point data entry to particular post_processing algorithm for phase 1 depending on its post_process_id

@@ -95,7 +95,12 @@ def filter_id(pp_id, entry, phase):
     else:
         pass
 
+
+# REdirecting stdout to a stream so that it can be saved in a DB
+
 tic = time.perf_counter()
+start_time=datetime.datetime.now()
+
 
 # Experimental: implement a continue flag
 continue_flag=False

@@ -129,17 +134,17 @@ def filter_id(pp_id, entry, phase):
 
 logPerf("Created indexed for raw data processing")
 
-print ("Phase 1: ")
+l.log ("Phase 1: ")
 counter = 0
 for row in raw_entries:
     post_process_id = row[8]
     filter_id(post_process_id, row, 1)
     counter += 1
     if (counter % 1000) == 0:
-        print('.', end="")
+        l.log('.', end="")
         if (counter % 50000) == 0:
-            print("")
-print("")
+            l.log("")
+l.log("")
 logPerf("Phase 1 complete")
 
 # Save corrected data in database

@@ -173,18 +178,19 @@ def filter_id(pp_id, entry, phase):
 
 pressure_lead_digs_added = id1p2_methods.pressure_artificial_lead_digs_list()
 counter = 0
-print ("Phase 2:")
+l.log ("Phase 2:")
 for row in entries:
     post_process_id = row[8]
     filter_id(post_process_id, row, 2)
     counter += 1
     if (counter % 1000) == 0:
-        print('.', end="")
+        l.log('.', end="")
         if (counter % 50000) == 0:
-            print("")
+            l.log("")
 
 logPerf("Completed post-process 1 phase 2")
-id3p2.phase_2(entries)
+tables.create_outliers_graphs()
+outlier_graphs=id3p2.phase_2(entries)
 logPerf("Completed post-process 3 phase 2")
 
 tables.populate_final_corrected_table()

@@ -193,21 +199,30 @@ def filter_id(pp_id, entry, phase):
 logPerf("Phase 2 complete")
 
 ##################### EXECUTE PHASE 3 (ISO TRANSLATION) #########################
-print ("Phase 3:")
+l.log ("Phase 3:")
 tables.create_final_corrected_table_iso(continue_flag)
 entries=db.phase_2_data()
 for row in entries:
     post_process_id = row[8]
     filter_id(post_process_id, row, 3)
     counter += 1
     if (counter % 1000) == 0:
-        print('.', end="")
+        l.log('.', end="")
         if (counter % 50000) == 0:
-            print("")
+            l.log("")
 tables.populate_final_corrected_table_iso()
 
 logPerf("Completed phase 3")
 
+#################### Generating SEF files ##########################
+l.log("Generating SEF files")
+sef_gen.generateSEFs()
+logPerf("SEF files generated")
+
+#################### Saving report #############################
+report_id= tables.writeReport(l.report,start_time)
+tables.insert_outlier_graphs(report_id, outlier_graphs)
+tables.insert_outlier_stats(report_id, db.outliers_stats())
 
 ##################### DELETE ALL DISPENSABLE TABLES (KEEP FINAL + ERRORS/EDITS TABLES) ############################
 tables.delete_table('data_entries_raw')

@@ -216,7 +231,3 @@ def filter_id(pp_id, entry, phase):
 logPerf("cleaned up database")
 
 
-#################### Generating SEF files ##########################
-print("Generating SEF files")
-sef_gen.generateSEFs()
-logPerf("SEF files generated")

DRAW-post-processing/observation_reconciliation.py

Lines changed: 4 additions & 4 deletions
@@ -3,7 +3,7 @@
 import database_connection as db
 import statistics as stats
 import tables
-
+import logs as p
 cursor = db.cursor
 
 

@@ -13,7 +13,7 @@ def remove_duplicates():
 
     counter = 0
     checked_entries = {}
-    print("Reconciliation:")
+    p.log("Reconciliation:")
     for entry in data_entries:
         if entry[9] is not None and entry[0] not in checked_entries.keys():
             cursor.execute("SELECT * FROM data_entries_corrected "

@@ -50,7 +50,7 @@ def remove_duplicates():
             tables.add_to_duplicateless_table(*entry)
             counter += 1
             if (counter % 1000) == 0:
-                print('.', end="")
+                p.log('.', end="")
                 if (counter % 50000) == 0:
-                    print("")
+                    p.log("")
     tables.populate_duplicateless_table()

DRAW-post-processing/post_process_ids/id3/id_3_phase_2.py

Lines changed: 46 additions & 20 deletions
@@ -5,9 +5,7 @@
 import config
 import math
 import time
-#import numpy as np
-# curve-fit() function imported from scipy
-#from scipy.optimize import curve_fit
+import logs as p
 from matplotlib import pyplot as plt
 
 

@@ -17,7 +15,7 @@ def log_errors(code,errors):
 
 # compares observed and corrected values. If not within a threshold, they are both marked as error [304]
 def compare_observed_corrected (df,field_observed,field_corrected):
-    print(" Comparing observed vs corrected for fields " + str(field_observed)+" vs "+str(field_corrected))
+    p.log(" Comparing observed vs corrected for fields " + str(field_observed)+" vs "+str(field_corrected))
     df_comp=df[df['field_id'].isin([field_observed,field_corrected]) ]
     temp_observed_errors=df_comp.groupby(['observation_date'])['value'].diff().dropna().abs().gt(config.temperature_difference_allowed_obs_corr)
 

@@ -30,7 +28,7 @@ def compare_observed_corrected (df,field_observed,field_corrected):
 
 # verifies that min is less than max at a given time. If not, both entries are marked as errors [305]
 def compare_min_max (df,field_min,field_max):
-    print(" Comparing min/max for fields "+str(field_min) +"/"+str(field_max))
+    p.log(" Comparing min/max for fields "+str(field_min) +"/"+str(field_max))
     df_comp=df[df['field_id'].isin([field_min,field_max])].sort_values(by=['observation_date'])
     obs_date=None
     min_temp=math.nan

@@ -52,7 +50,7 @@ def compare_min_max (df,field_min,field_max):
 
 # Verifies that the first field in the list is less than all other fields - same observation time. If not, marked as flagged [2]
 def compare_field_less_than_other_fields(df,fields):
-    print (" Comparing that field "+str(fields[0])+" is less than these fields: "+str(fields[1]))
+    p.log (" Comparing that field "+str(fields[0])+" is less than these fields: "+str(fields[1]))
     df_comp=df[df['field_id'].isin(fields)].sort_values(by=['observation_date'])
     obs_date=None
     min_temp=math.nan

@@ -84,7 +82,7 @@ def compare_field_less_than_other_fields(df,fields):
 
 # check the the min field is the min of all previous fields between last min field measurement or 24 hours. If not marked as flagged [3]
 def check_field_is_min_over_period(df,min_field,max_field):
-    print (" Checking that field "+str(min_field)+" is the minimum of all values of this field: "+str(max_field))
+    p.log (" Checking that field "+str(min_field)+" is the minimum of all values of this field: "+str(max_field))
     df_comp=df[df['field_id'].isin([min_field,max_field])].sort_values(by=['observation_date'])
     obs_date=None
     min_temp=math.nan

@@ -106,7 +104,7 @@ def check_field_is_min_over_period(df,min_field,max_field):
 
 # check that the max field is the max of all previous fields between last max field measurement or 24 hours. If not marked as flagged [4]
 def check_field_is_max_over_period(df,max_field,min_field):
-    print (" Checking that field "+str(max_field)+" is the maximum of all values of this field: "+str(min_field))
+    p.log (" Checking that field "+str(max_field)+" is the maximum of all values of this field: "+str(min_field))
     df_comp=df[df['field_id'].isin([max_field,min_field])].sort_values(by=['observation_date'])
     obs_date=None
     max_temp=math.nan

@@ -143,7 +141,7 @@ def compare_min_max_df (df,field_min,field_max):
 
 # checks air temperature and wet bulb are less than a certain threshold
 def check_air_wet_bulb(df, fields):
-    print (" Checking wet bulb for fields: "+str(fields))
+    p.log (" Checking wet bulb for fields: "+str(fields))
     df_comp=df[df['field_id'].isin([fields])].sort_values(by=['observation_date'])
     obs_date=None
     f0=math.nan

@@ -178,10 +176,9 @@ def check_air_wet_bulb(df, fields):
 
 
 
-# Detects outliers and flags them [1]
+# Detects outliers and flags them [1] and returns list of graph data
 def flag_outliers (df, field_id):
-
-
+    outliers_data=[]
     df_proc=df[df.field_id==field_id].sort_values(by=['observation_date'])
 
     #determine list of series that are eligible for validation based on rule: needs less than 5 days before or after with no data

@@ -203,9 +200,9 @@ def flag_outliers (df, field_id):
         standard_deviation=delta.std()
         outliers=df_proc[df_proc.index.isin(delta[delta.gt(config.temperature_outlier_std_factor*standard_deviation)].index)]
         if outliers.size >0:
+            ans_max=ans+config.temperature_outlier_std_factor*standard_deviation
+            ans_min=ans-config.temperature_outlier_std_factor*standard_deviation
             if config.temperature_plot_outliers == True:
-                ans_max=ans+config.temperature_outlier_std_factor*standard_deviation
-                ans_min=ans-config.temperature_outlier_std_factor*standard_deviation
                 fig, ax = plt.subplots(1, figsize = (20, 8))
                 fig.autofmt_xdate()
                 ax.plot(x, y, '.', color ='black', label ="data")

@@ -218,10 +215,37 @@ def flag_outliers (df, field_id):
             #flag the outliers
             for ind,outlier in outliers.iterrows():
                 df.at[ind,'flagged']=10
-
+            # Build graph json data
+            data="{\"data\":["
+            first_data=True
+            for ind in x.keys():
+                if first_data==False:
+                    data=data+","
+                first_data=False
+                data=data+"{\"x\":\""+str(x[ind])+"\","
+                data=data+"\"y\":"+str(y[ind])+","
+                data=data+"\"ly\":"
+                if pd.isna(ans_min[ind]):
+                    data=data+"null"
+                else:
+                    data=data+str(ans_min[ind])
+                data=data+",\"uy\":"
+                if pd.isna(ans_max[ind]):
+                    data=data+"null"
+                else:
+                    data=data+str(ans_max[ind])
+                data=data+",\"outlier\":"
+                if ind in outliers:
+                    data=data+str(outliers[ind])
+                else:
+                    data=data+"null"
+                data=data+"}"
+            data=data+"]}"
+            outliers_data.append((field_id,data))
         obs_date=row['observation_date']
         list_partial=[]
         list_partial.append(row)
+    return outliers_data
 
 
 

@@ -231,11 +255,11 @@ def phase_2(entries,debug=False):
 
     def logPerf(tic,message):
         toc = time.perf_counter()
-        print(message, end='')
-        print (f": {toc - tic:0.4f} seconds")
+        p.log(message, end='')
+        p.log (f": {toc - tic:0.4f} seconds")
        return toc
 
-    print ("Starting temperature phase 2")
+    p.log ("Starting temperature phase 2")
     tic = time.perf_counter()
     # execute post process id3 on the whole dataset, not one entry at a time
     df=pd.DataFrame(entries,

@@ -271,7 +295,7 @@ def logPerf(tic,message):
         try:
             check_field_is_min_over_period(df_temp_nona, fields[0], fields[1])
         except:
-            print(df_temp_nona, fields[0], fields[1])
+            p.log(df_temp_nona, fields[0], fields[1])
     tic=logPerf(tic, "Completed field is minimum of other fields over period of time")
 
     # check temperature is the max of other values within past 24 hours max

@@ -298,12 +322,14 @@ def logPerf(tic,message):
     tic=logPerf(tic, "Completed removing detected errors before outlier detection")
 
     # get series of values for a given field ID
+    outliers_graph=[]
     for field in config.temperature_stat_outliers:
-        flag_outliers(df_temp_cleaned, field)
+        outliers_graph.append(flag_outliers(df_temp_cleaned, field))
     tic=logPerf(tic, "Completed outlier detection")
 
 
     # fit the series
     df_temp_cleaned.to_sql('data_entries_corrected_final', db.engine, if_exists='append', index=False)
+    return outliers_graph
 
 
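flag_outliers builds the per-field graph JSON by string concatenation. An equivalent, less error-prone construction with json.dumps is sketched below; build_graph_json is a hypothetical helper, not part of the commit, and x, y, ans_min, ans_max are assumed to be the index-aligned series used above, with outliers mapping index to value:

import json
import pandas as pd

# Hypothetical helper -- produces the same (field_id, json) tuple shape that
# flag_outliers appends to outliers_data.
def build_graph_json(field_id, x, y, ans_min, ans_max, outliers):
    points = []
    for ind in x.keys():
        points.append({
            "x": str(x[ind]),
            "y": float(y[ind]),
            "ly": None if pd.isna(ans_min[ind]) else float(ans_min[ind]),
            "uy": None if pd.isna(ans_max[ind]) else float(ans_max[ind]),
            "outlier": float(outliers[ind]) if ind in outliers else None,
        })
    return (field_id, json.dumps({"data": points}))

Note also that phase_2 appends the whole list returned by flag_outliers once per field, so outliers_graph ends up as a list of lists of (field_id, json) tuples; tables.insert_outlier_graphs presumably flattens it.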

DRAW-post-processing/sef_gen.py

Lines changed: 4 additions & 2 deletions
@@ -3,6 +3,8 @@
 import datetime
 import config
 import os
+import logs as p
+
 cursor = db.cursor
 
 def generateSEFs():

@@ -76,7 +78,7 @@ def getFilename(sef_type,type_result_set):
 
 
 def generateSEF(sef_type):
-    print("Generating SEF for type: " + sef_type)
+    p.log("Generating SEF for type: " + sef_type)
     if type(config.sef_type_to_field_id[sef_type]) == int:
         query="select value,observation_date from data_entries_corrected_final_iso where field_id = {} order by observation_date asc".format(config.sef_type_to_field_id[sef_type])
     else:

@@ -94,7 +96,7 @@ def generateSEF(sef_type):
                 "\t"+value+"\t|\t\n"
             type_result_set.append(result_str)
         except:
-            print ("Couldn't generate SEF line for value="+str(value)+", observation date ="+str(observation_date))
+            p.log ("Couldn't generate SEF line for value="+str(value)+", observation date ="+str(observation_date))
 
 
     (filename,index_start,index_end)=getFilename(sef_type, type_result_set)

DRAW-post-processing/setup_raw_data_table.py

Lines changed: 3 additions & 0 deletions
@@ -14,3 +14,6 @@ def set_up_raw_data_table(continue_flag):
     # TODO : update other field id's with their respective pp_id
 
     tables.create_raw_data_table(continue_flag)
+    tables.create_post_processing_reports_table()
+    tables.create_outliers_stats_table()
+
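The two table-creation helpers live in tables.py, which this diff view does not include. A hypothetical DDL sketch consistent with how the tables are filled in execute_post_process.py (writeReport, insert_outlier_stats); the column names are assumptions:

# Hypothetical schemas -- the real DDL is in tables.py, not shown in this commit.
create_post_processing_reports_sql = """
CREATE TABLE IF NOT EXISTS post_processing_reports (
    report_id INT AUTO_INCREMENT PRIMARY KEY,
    start_time DATETIME,
    report LONGTEXT
)"""

create_outliers_stats_sql = """
CREATE TABLE IF NOT EXISTS outliers_stats (
    report_id INT,
    field_id INT,
    flagged_count INT
)"""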

DRAW-post-processing/sql_commands.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ def create_error_edit_table(phase):
 phase_1_data_test_sql = "SELECT * FROM data_entries_corrected_duplicateless_test;"
 
 phase_2_data_sql = "select * from data_entries_corrected_final"
+outliers_stats_sql="select field_id,count(*) from data_entries_corrected_final where flagged=10 group by field_id"
 
 # MySQL commands used during post-processing phases:
 
