1
1
#!/usr/bin/env python
2
2
from pyspark .sql import SparkSession
3
- from pyspark .sql .functions import col , window , asc , desc , lead , lag , udf , hour , month , dayofmonth , collect_list , lit , year , date_trunc , dayofweek
3
+ from pyspark .sql .functions import col , window , asc , desc , lead , lag , udf , hour , month , dayofmonth , dayofyear , collect_list , lit , year , date_trunc , dayofweek , when , unix_timestamp
4
4
import pyspark .sql .functions as F
5
5
from pyspark .sql .window import Window
6
6
from pyspark .sql .types import FloatType , IntegerType , DateType , TimestampType
7
7
from pyspark import SparkConf
8
- import datetime
8
+ from datetime import datetime , timedelta
9
9
import os
10
10
from math import isnan
11
11
import argparse
12
12
import json
13
+ import calendar
13
14
14
15
#read arguments
15
16
parser = argparse .ArgumentParser ()
21
22
#initiate spark context
22
23
spark = SparkSession .builder .appName ("SAIDI/SAIFI cluster size" ).getOrCreate ()
23
24
24
### It's really important that you partition on this data load!!! otherwise your executors will timeout and the whole thing will fail
start_time = '2018-07-01'
end_time = '2019-05-15'

#Roughly one partition per week of data is pretty fast and doesn't take too much shuffling
num_partitions = 30

def _build_time_predicates(start_ts, end_ts, n):
    """Return `n` non-overlapping "time >= X AND time < Y" SQL predicate strings
    that exactly cover the half-open range [start_ts, end_ts).

    start_ts/end_ts are UTC epoch seconds; each predicate becomes one Spark JDBC
    partition, so the read runs in parallel.

    The last bound is clamped to end_ts: with per-iteration
    `start + (i + 1) * stride` float arithmetic and int() truncation, the final
    predicate could fall short of end_ts, and rows matched by no predicate are
    silently never loaded.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    stride = (end_ts - start_ts) / n
    # Interior bounds from the stride; final bound clamped to the exact end.
    bounds = [int(start_ts + i * stride) for i in range(n)] + [int(end_ts)]
    preds = []
    for lo, hi in zip(bounds, bounds[1:]):
        lo_str = datetime.utcfromtimestamp(lo).strftime(fmt)
        hi_str = datetime.utcfromtimestamp(hi).strftime(fmt)
        preds.append("time >= '" + lo_str + "' AND time < '" + hi_str + "'")
    return preds

# This builds a list of predicates to query the data in parallel. Makes everything much faster
start_time_timestamp = calendar.timegm(datetime.strptime(start_time, "%Y-%m-%d").timetuple())
end_time_timestamp = calendar.timegm(datetime.strptime(end_time, "%Y-%m-%d").timetuple())
predicates = _build_time_predicates(start_time_timestamp, end_time_timestamp, num_partitions)

#This query should only get data from deployed devices in the deployment table
query = "(SELECT core_id, time, is_powered, product_id,millis, last_unplug_millis, last_plug_millis FROM powerwatch WHERE time >= '" + start_time + "' AND time < '" + end_time + "' AND (product_id = 7008 OR product_id = 7009 or product_id = 7010 or product_id = 7011 or product_id = 8462)) alias"

pw_df = spark.read.jdbc(
    url="jdbc:postgresql://timescale.ghana.powerwatch.io/powerwatch",
    table=query,
    predicates=predicates,
    properties={"user": args.user, "password": args.password, "driver": "org.postgresql.Driver"})

#if you have multiple saves below this prevents reloading the data every time
pw_df.cache()
34
56
35
57
# Window over each device's time-ordered stream: compare every sample with the
# previous power state, then keep only the rows where the state flipped.
w = Window.partitionBy("core_id").orderBy(asc("time"))
pw_df = (pw_df
         .withColumn("previous_power_state", lag("is_powered").over(w))
         .filter(col("previous_power_state") != col("is_powered")))
49
64
50
# A row counts as an outage only when it is an on -> off -> on pattern:
# currently unpowered, but powered in both the previous and the next sample.
pw_df = pw_df.withColumn("lagging_power", lag("is_powered", 1).over(w))
pw_df = pw_df.withColumn("leading_power", lead("is_powered", 1).over(w))
is_outage = (
    (col("is_powered") == 0)
    & (col("lagging_power") == 1)
    & (col("leading_power") == 1)
)
pw_df = pw_df.withColumn("outage", when(is_outage, 1).otherwise(0))
60
71
72
#now need the most accurate outage time possible for each outage event
def timeCorrect(time, millis, unplugMillis):
    """Correct a sample timestamp using the device's millisecond uptime counters.

    The device reports its uptime (`millis`) and the uptime recorded at the
    last plug/unplug event (`unplugMillis`); their difference is how long
    before `time` the actual transition happened.

    Returns `time` unchanged when the counters are unusable: unplug counter is
    zero, either counter is None or NaN, or the unplug counter is ahead of the
    current counter (inconsistent). Otherwise returns `time` shifted back by
    the elapsed counter difference.
    """
    if (unplugMillis == 0 or millis is None or unplugMillis is None
            or isnan(millis) or isnan(unplugMillis) or unplugMillis > millis):
        return time
    return time - timedelta(microseconds=(int(millis) - int(unplugMillis)) * 1000)
69
81
# Register the millis-correction UDF, then derive the precise event times:
# outage_time from the unplug counter and r_time from the re-plug counter.
udftimeCorrect = udf(timeCorrect, TimestampType())
pw_df = (pw_df
         .withColumn("outage_time", udftimeCorrect("time", "millis", "last_unplug_millis"))
         .withColumn("r_time", udftimeCorrect("time", "millis", "last_plug_millis")))
@@ -77,7 +89,6 @@ def timeCorrect(time, millis, unplugMillis):
77
89
# Keep only rows flagged as genuine outage events; each surviving row carries
# both an outage time and (after the joins above) a restore time.
pw_df = pw_df.filter(col("outage") != 0)
79
91
80
-
81
92
#record the duration of the outage
82
93
def calculateDuration (startTime , endTime ):
83
94
delta = endTime - startTime
@@ -88,7 +99,7 @@ def calculateDuration(startTime, endTime):
88
99
# Duration of each outage from its corrected start/end timestamps.
pw_df = pw_df.withColumn("outage_duration", udfcalculateDuration("outage_time", "restore_time"))

# Collect the neighboring outages (up to window_size rows either side, within
# the same day of year) around each outage so co-occurring outages can later
# be grouped into a single event.
window_size = 150
w = (Window.partitionBy(dayofyear("outage_time"))
     .orderBy(asc("outage_time"))
     .rowsBetween(-window_size, window_size))
pw_df = pw_df.withColumn("outage_window_list", collect_list(F.struct("outage_time", "core_id")).over(w))
93
104
94
105
def filterOutage (time , core_id , timeList ):
@@ -111,7 +122,7 @@ def filterOutage(time, core_id, timeList):
111
122
# Tag each surviving row with a count of one so later aggregations can sum it.
pw_df = pw_df.withColumn("outage_number", F.lit(1))

#okay now we have a list of all outages where at least one other device also had an outage within a time window
#pw_df.cache()
115
126
116
127
### SAIFI ###
117
128
#note that this the raw number of sensors that go out rather than a single metric per "outage"
@@ -134,7 +145,7 @@ def filterOutage(time, core_id, timeList):
134
145
# Sum outages within each calendar hour, then average those sums across days
# for every hour-of-day bucket to get the mean hourly outage profile.
outages_by_hour = (outages_by_hour
                   .groupBy("outage_date_hour").sum()
                   .withColumn("outage_hour", hour("outage_date_hour"))
                   .groupBy("outage_hour").avg()
                   .orderBy("outage_hour"))
outages_by_hour.show(30)
138
149
139
150
140
151
#pw_df = pw_df.select("time","core_id","outage_duration","outage_number")
0 commit comments