Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update checker.py #50

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 118 additions & 115 deletions data-checks/checker.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
# ---------------------------------------------------------
# Settings
import os

import pandas as pd
pd.options.plotting.backend = "plotly"

import numpy as np
import plotly
import plotly.graph_objects as go
import plotly.express as px
import argparse
from plotly.subplots import make_subplots



Expand All @@ -35,6 +31,7 @@ def __init__(self,
level = None,
ind_dict = None,
prefix = None,
postfix = None,
col_names_dict = None,
htrahshold = -3):
"""
Expand All @@ -58,8 +55,6 @@ def __init__(self,
self.outputs_path = self.path + '/' + 'out'
if not os.path.exists(self.outputs_path):
os.mkdir(self.outputs_path)
else:
self.outputs_path = outputs_path
# List files in path
self.files = os.listdir(self.path)

Expand All @@ -76,7 +71,8 @@ def __init__(self,
# 'i8' : 'mean_distance_per_week.csv',
# 'i9': 'week_home_vs_day_location_per_day.csv',
# 'i10' : 'origin_destination_matrix_time_per_day.csv',
# 'i11': 'unique_subscriber_home_locations_per_month.csv'}
# 'i11': 'unique_subscriber_home_locations_per_month.csv'

else:
self.ind_dict = ind_dict

Expand All @@ -85,6 +81,11 @@ def __init__(self,
else:
self.ind_dict = {k:prefix+v for (k,v) in self.ind_dict.items()}

if postfix is None:
pass
else:
self.ind_dict = {k:v+postfix for (k,v) in self.ind_dict.items()}

# Check if files exist
files_bol = all([os.path.isfile(self.path + '/' + self.ind_dict[key]) for key in self.ind_dict.keys()])
assert files_bol,"Some indicators don't exist. Check defaults or set self.ind_dict"
Expand Down Expand Up @@ -126,8 +127,8 @@ def load(file_path, timevar = None):
# Load data
df = pd.read_csv(file_path)
# Patch: cleaning of header rows repeated in the middle of the data
c1_name = df.columns[0]
df = df[~df[c1_name].astype(str).str.contains(c1_name)]
# c1_name = df.columns[0]
# df = df[~df[c1_name].astype(str).str.contains(c1_name)]
# Convert date vars
if timevar is None:
timevar = df.columns[0]
Expand Down Expand Up @@ -163,13 +164,8 @@ def time_complete(data, timevar = None, timefreq = 'D'):
data = data.reindex(full_time_range, fill_value=0)
return(data)

# Booleans
i1bol = 'i1' in self.ind_dict
i3bol = 'i3' in self.ind_dict
i5bol = 'i5' in self.ind_dict

# Indicator 1
if i1bol:
if 'i1' in self.ind_dict:
# self.i1_hour = remove_missings(self.i1, regionvar = self.col_names_dict['i1']['Geography'])\
# .groupby(['date', self.col_names_dict['i1']['Time']])\
# .agg({self.col_names_dict['i1']['Geography'] : pd.Series.nunique ,
Expand All @@ -189,7 +185,7 @@ def time_complete(data, timevar = None, timefreq = 'D'):
self.i1_date = time_complete(self.i1_date, 'date')

# Indicator 3
if i3bol:
if 'i3' in self.ind_dict:
self.i3_date = remove_missings(self.i3, regionvar = self.col_names_dict['i3']['Geography'])\
.groupby('date')\
.agg({self.col_names_dict['i3']['Geography'] : pd.Series.nunique ,
Expand All @@ -201,7 +197,7 @@ def time_complete(data, timevar = None, timefreq = 'D'):
self.i3_date = time_complete(self.i3_date, 'date')

# Indicator 5
if i5bol:
if 'i5' in self.ind_dict:
i5_nmissing = remove_missings(remove_missings(self.i5, self.col_names_dict['i5']['Geography_from']),
self.col_names_dict['i5']['Geography_to'])
self.i5_date = i5_nmissing\
Expand All @@ -217,49 +213,8 @@ def time_complete(data, timevar = None, timefreq = 'D'):
# Remove first day for plots since it doesn't have movements from the day before
# so it is biased by definition.
self.i5_date = self.i5_date[~(self.i5_date.index == self.i5_date.index.min())]

# Create a merged dataset making sure all indicators are in the same resolution
if (i1bol & i3bol):
index = [self.col_names_dict['i1']['Time'], self.col_names_dict['i1']['Geography']]

i3 = self.i3.rename(columns = {self.col_names_dict['i3']['Count'] : 'subs'})
self.merged = self.i1\
.groupby(index)\
.agg({self.col_names_dict['i1']['Count'] : np.sum})\
.reset_index()\
.merge(i3, on = index, how = 'outer')\
.fillna(0)\
.rename(columns = {self.col_names_dict['i1']['Count'] : 'trans'})

if i5bol:
# Just movements out of region
i5_org = self.i5[[self.col_names_dict['i5']['Time'],
self.col_names_dict['i5']['Geography_from'],
self.col_names_dict['i5']['Count']]]\
.rename(columns = {self.col_names_dict['i5']['Count'] : 'mov_out',
self.col_names_dict['i5']['Time'] : index[0],
self.col_names_dict['i5']['Geography_from'] : index[1]})\
.groupby(index)\
.agg({'mov_out' : np.sum})\
.reset_index()

# Just movements into a region
i5_dest = self.i5[[self.col_names_dict['i5']['Time'],
self.col_names_dict['i5']['Geography_to'],
self.col_names_dict['i5']['Count']]]\
.rename(columns = {self.col_names_dict['i5']['Count'] : 'mov_in',
self.col_names_dict['i5']['Time'] : index[0],
self.col_names_dict['i5']['Geography_to'] : index[1]})\
.groupby(index)\
.agg({'mov_in' : np.sum})\
.reset_index()

self.merged = self.merged\
.merge(i5_org, on = index, how = 'outer')\
.merge(i5_dest, on = index, how = 'outer')\
.fillna(0)

# ---------------------------------------------------------
# ---------------------------------------------------------
# Plots

def plot_i1_hist(self, show = True, export = True):
Expand All @@ -280,9 +235,10 @@ def plot_i1_count(self, show = True, export = True):
fig = go.Figure(data=go.Scatter(x=self.i1_date.index,
y=self.i1_date[self.col_names_dict['i1']['Count']]))
fig.update_layout(title_text="Indicator 1: Total number of transactions.")
fig.update_yaxes(rangemode="tozero")

print("Plotting indicator 1 daily count series...")

if export:
file_name = self.outputs_path + '/' + 'i1_count.html'
print('Saving: ' + file_name)
Expand All @@ -307,6 +263,7 @@ def plot_i3_count(self, show = True, export = True):
fig = go.Figure(data=go.Scatter(x=self.i3_date.index,
y=self.i3_date[self.col_names_dict['i3']['Count']]))
fig.update_layout(title_text="Indicator 3: Total number of daily active subscribers.")
fig.update_yaxes(rangemode="tozero")

print("Plotting indicator 3 histogram...")
if export:
Expand Down Expand Up @@ -334,14 +291,15 @@ def plot_i5_count(self, show = True, export = True):
fig = go.Figure(data=go.Scatter(x=self.i5_date.index,
y=self.i5_date[self.col_names_dict['i5']['Count']]))
fig.update_layout(title_text="Indicator 5: Total number of movements.")
fig.update_yaxes(rangemode="tozero")

print("Plotting indicator 5 daily movement counts...")
if export:
file_name = self.outputs_path + '/' + 'i5_count.html'
print('Saving: ' + file_name)
plotly.offline.plot(fig, filename = file_name, auto_open=False)
if show:
fig.show()

def plot_i5_region_count(self, show = True, export = True):
fig = go.Figure()
fig.add_trace(go.Scatter(x=self.i5_date.index,
Expand Down Expand Up @@ -371,51 +329,115 @@ def plot_region_missings(self, show = True, export = True):
plotly.offline.plot(fig, filename = file_name, auto_open=False)
if show:
fig.show()


def plot_i1_i3_count(self, show = True, export = True):
    """
    Plot the daily ratio of indicator 1 counts to indicator 3 counts
    (transactions per subscriber).

    Parameters
    ----------
    show : bool
        When True, display the figure with ``fig.show()``.
    export : bool
        When True, save the figure as 'i1_per_i3_count.html' inside
        ``self.outputs_path``.
    """
    # Dividing two Series aligns them on their index labels, so the result
    # is indexed by the union of the dates present in i1_date and i3_date.
    ratio = self.i1_date[self.col_names_dict['i1']['Count']] / self.i3_date[self.col_names_dict['i3']['Count']]

    # Take the x axis from the ratio itself: using only i1_date's index
    # would pair values with the wrong dates whenever the two indicators
    # do not cover exactly the same days.
    fig = go.Figure(data=go.Scatter(x=ratio.index,
                                    y=ratio))

    fig.update_layout(title_text="Indicator 1/ Indicator 3: Total number of transactions per subscriber.")
    fig.update_yaxes(rangemode="tozero")

    print("Plotting Total number of transactions per subscriber...")

    if export:
        file_name = self.outputs_path + '/' + 'i1_per_i3_count.html'
        print('Saving: ' + file_name)
        plotly.offline.plot(fig, filename = file_name, auto_open=False)
    if show:
        fig.show()


# Subscribers vs transactions scatter
def plot_subs_v_trans(self, show = True, export = True):
df = self.merged
# Variable to mark red obs with zeros
df['color'] = np.where((df['trans']== 0) | (df['subs']== 0),
'zeros',
'normal')
i1_i3 = df\
.rename(columns = {self.col_names_dict['i1']['Time'] : 'Date',
self.col_names_dict['i1']['Geography'] : 'Region',
'subs' : 'Number of subscribers',
'trans': 'Number of transactions'})
fig = i1_i3.plot.scatter(x="Number of subscribers",
y="Number of transactions",
color = 'color',
hover_data=['Date', 'Region'],
title = 'Number of subscrivers vs number of transactions.')
fig.update_layout(showlegend=False)

print("Plotting indicators 1 and 3 scatter...")
def plot_i5_i3_count(self, show = True, export = True):
    """
    Plot the daily ratio of indicator 5 counts to indicator 3 counts
    (movements per subscriber).

    Parameters
    ----------
    show : bool
        When True, display the figure with ``fig.show()``.
    export : bool
        When True, save the figure as 'i5_per_i3_count.html' inside
        ``self.outputs_path``.
    """
    # Dividing two Series aligns them on their index labels, so the result
    # is indexed by the union of the dates present in i5_date and i3_date.
    ratio = self.i5_date[self.col_names_dict['i5']['Count']] / self.i3_date[self.col_names_dict['i3']['Count']]

    # Take the x axis from the ratio itself: using only i3_date's index
    # would pair values with the wrong dates whenever the two indicators
    # do not cover exactly the same days.
    fig = go.Figure(data=go.Scatter(x=ratio.index,
                                    y=ratio))

    fig.update_layout(title_text="Indicator 5/ Indicator 3: Total number of movements per subscriber.")
    fig.update_yaxes(rangemode="tozero")

    print("Plotting Total number of movements per subscriber...")

    if export:
        file_name = self.outputs_path + '/' + 'i5_per_i3_count.html'
        print('Saving: ' + file_name)
        plotly.offline.plot(fig, filename = file_name, auto_open=False)
    if show:
        fig.show()


# ---------------------------------------------------------







# ---------------------------------------------------------
# Check pipelines
def completeness_checks(self, export = True, show = True):
    """
    Run every completeness plot whose indicator(s) are present in
    ``self.ind_dict`` and, when i1, i3 and i5 are all present, write a
    combined HTML report.

    Parameters
    ----------
    export : bool
        Forwarded to each plot method.
    show : bool
        Forwarded to each plot method, so callers can suppress the
        interactive figures (e.g. in batch runs).

    Notes
    -----
    The report 'data_check_report.html' is written to
    ``self.outputs_path`` and embeds the individual plot pages by
    relative file name through iframes.
    NOTE(review): the report is written regardless of ``export``; the
    embedded pages presumably only exist if the plots were exported
    -- confirm whether the report should be skipped otherwise.
    """
    has_i1 = 'i1' in self.ind_dict
    has_i3 = 'i3' in self.ind_dict
    has_i5 = 'i5' in self.ind_dict

    if has_i1:
        self.plot_i1_hist(export = export, show = show)
        self.plot_region_missings(export = export, show = show)
        self.plot_i1_count(export = export, show = show)
        self.plot_i1_n_regions(export = export, show = show)
    if has_i3:
        self.plot_i3_hist(export = export, show = show)
        self.plot_i3_count(export = export, show = show)
    if has_i5:
        self.plot_i5_count(export = export, show = show)

    # Ratio plots need both of the indicators they combine.
    if has_i1 and has_i3:
        self.plot_i1_i3_count(export = export, show = show)
    if has_i5 and has_i3:
        self.plot_i5_i3_count(export = export, show = show)

    # The combined report embeds plots from all three indicators.
    if has_i5 and has_i3 and has_i1:

        html_string = '''
<html>
<head>
<style type="text/css">
iframe[seamless] {border: none;}
</style>
</head>

<body>
<!-- <div id="includedContent"></div> -->
<iframe height="600" width="900" src="i1_count.html" seamless></iframe>

<iframe height="600" width="900" src="i1_hist.html" seamless></iframe>

<iframe height="600" width="900" src="i1_n_region.html" seamless></iframe>

<iframe height="600" width="900" src="i3_count.html" seamless></iframe>

<iframe height="600" width="900" src="i3_hist.html" seamless></iframe>

<iframe height="600" width="900" src="i5_count.html" seamless></iframe>

<iframe height="600" width="900" src="region_missings.html" seamless></iframe>

<iframe height="600" width="900" src="i1_per_i3_count.html" seamless></iframe>

<iframe height="600" width="900" src="i5_per_i3_count.html" seamless></iframe>


</body>
</html> '''

        file_name = self.outputs_path + '/' + 'data_check_report.html'
        # Context manager closes the file even if the write fails.
        with open(file_name, 'w') as f:
            f.write(html_string)



# USAGE OUTLIERS: Indicator wards and days with towers down
def usage_outliers(self, htrahshold = None):
Expand Down Expand Up @@ -460,29 +482,10 @@ def usage_outliers(self, htrahshold = None):
file.close()



# ---------------------------------------------------------
# Run script from the terminal

# if __name__ == "__main__":

# # Initialize parser
# parser = argparse.ArgumentParser()

# # Adding optional argument
# parser.add_argument("-p", "--Path")
# parser.add_argument("--Prefix")
# parser.add_argument("--Output")

# # Read arguments from command line
# args = parser.parse_args()

# # Create checker instance
# indicators_checker = checker(path = args.Path, prefix = args.Prefix, outputs_path = args.Output)

# #------------------------------------------------------------------------------
# # Export completeness plots
# indicators_checker.completeness_checks()

# #------------------------------------------------------------------------------
# # Export towers down sheet
# indicators_checker.usage_outliers()