-
Notifications
You must be signed in to change notification settings - Fork 2
/
time_hist2.py
258 lines (208 loc) · 8.11 KB
/
time_hist2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
# In[2]:
class t_hist():
    '''
    Class which takes solar wind data and creates some time history
    for some specific time.

    Input:
        Data ---------- A DataFrame of solar wind data at 5 minute
                        cadence and datetime index.
        Historic_time - The number of minutes into the past you
                        would like the history for. (E.g. for 1hr ago
                        you would input 60 minutes).
        window_mins --- If averaging for the time history, then this
                        input specifies the window length, in minutes,
                        centred on the historic_time specified.
    '''

    def __init__(self, data, historic_time, window_mins):
        self.data = data
        self.time = historic_time
        self.window = window_mins

    def _hours_label(self):
        '''
        Return the whole number of hours in historic_time as a string.

        Fractional hours are truncated (90 minutes -> '1'). All digits
        are kept, so 600 minutes gives '10' rather than just its
        first character.
        '''
        return str(int(self.time // 60))

    def avg_hist(self):
        '''
        Function which returns a historic_time value, averaged over
        the window_mins.

        Output:
            - A dataframe of values of the time history. Columns are
              the input column names suffixed with '_<N>hr'. Rows
              earlier than (first timestamp + historic_time +
              window_mins/2) are dropped.

        Raises:
            ValueError - if the index is not a DatetimeIndex, or if
                         historic_time is not a whole number of hours.
        '''
        # Check that indices are datetime
        self.is_datetime()
        if self.time % 60:
            raise ValueError('Please choose a historic time value '+
                             'which correspond to an integer '+
                             'number of hours!')
        window_s = timedelta(minutes=self.time + self.window/2.0)
        # Label slicing includes both end points, so the newest edge is
        # pulled in by 5 minutes; at 5 minute cadence the slice then
        # holds window_mins/5 samples.
        window_e = timedelta(minutes=self.time + 5 -
                             self.window/2.0)
        indices = self.data.index
        # Slices that fall before the start of the data are empty and
        # average to NaN; those leading rows are trimmed on return.
        hist = [self.data.loc[i - window_s:i - window_e].mean().values
                for i in indices]
        col_label = '_' + self._hours_label() + 'hr'
        columns = [i + col_label for i in self.data.columns]
        th_df = pd.DataFrame(hist, index=indices, columns=columns)
        if len(indices) == 0:
            # Nothing to trim; avoids indexing the first timestamp.
            return th_df
        return th_df.loc[th_df.index[0] + window_s:]

    def instant_hist(self):
        '''
        Function which returns an instantaneous historic_time value.

        Output:
            - A dataframe of instantaneous values corresponding to
              historic_time minutes in the past. Columns are suffixed
              with '_<N>min' below one hour and '_<N>hr_I' otherwise.
              Only timestamps at least historic_time after the first
              timestamp are present.

        Raises:
            ValueError - if the index is not a DatetimeIndex, or if
                         historic_time is not a multiple of 5 minutes.
            KeyError --- if a required past timestamp is missing from
                         the index (raised by the .loc lookup).
        '''
        # Check that indices are datetime
        self.is_datetime()
        if self.time % 5:
            raise ValueError('Please choose a historic time value '+
                             'which correspond to a multiple of 5 '+
                             'minutes!')
        t_offset = timedelta(minutes=self.time)
        indices = self.data.index
        # The output is labelled with exactly the timestamps that were
        # looked up, rather than a positional slice that assumes one
        # row per 5 minutes.
        if len(indices):
            targets = indices[indices >= indices[0] + t_offset]
        else:
            targets = indices
        hist = [self.data.loc[i - t_offset].values for i in targets]
        if self.time < 60:
            # historic_time is a multiple of 5 here, so int() is exact.
            col_label = '_' + str(int(self.time)) + 'min'
        else:
            col_label = '_' + self._hours_label() + 'hr_I'
        columns = [i + col_label for i in self.data.columns]
        return pd.DataFrame(hist, index=targets, columns=columns)

    def is_datetime(self):
        '''
        Raise ValueError unless the data index is a pandas
        DatetimeIndex.
        '''
        if not isinstance(self.data.index, pd.DatetimeIndex):
            raise ValueError('Dataframe index is not '+
                             'in the correct datetime '+
                             'format')
# In[3]:
def cleaning_data(data, safe_cols=(), sigma_val=4):
    '''
    Function which removes data which is 'sigma_val' stdevs above
    the mean.

    Only the upper tail is tested: a value is kept when it is
    strictly below mean + sigma_val * std of its column, and treated
    as 'bad' when it is at or above that limit. Values far below the
    mean are not removed.

    Note: 4 sigma encompasses ~99.994% of the data.
          ~1 real piece of 5 min data is removed for
          every 55 days of such data (assuming Gaussian).

    Inputs:
        data ------ DataFrame to clean. It is not modified.
        safe_cols - Columns in the data which one might like to
                    keep without any changes (i.e., if there are
                    no null values in the initial dataset etc.).
                    None is treated as "no safe columns".
        sigma_val - (float) number of standard deviations from the
                    mean to consider as the limit of 'good' data.

    Returns:
        - Cleaned solar wind data dataframe. It keeps the full index
          of the input; rejected values are replaced by NaN so that
          the rows can be interpolated afterwards.
        - Dataframe of 'bad' solar wind data, indexed by the rows
          where at least one value was rejected. Safe columns appear
          in it as all-NaN columns.
    '''
    if safe_cols is None:
        safe_cols = ()

    cleaned_cols = []
    trash_cols = []
    # Looping through dataframe columns and separating 'bad' values
    for name in data.columns:
        column = data[name]
        if name in safe_cols:
            cleaned_cols.append(column)
            # Empty placeholder: keeps the column present in the trash
            # frame without contributing any row of its own.
            trash_cols.append(pd.Series(index=column.index[:0],
                                        name=name, dtype=float))
            continue
        limit = column.mean() + column.std() * sigma_val
        # where() masks rejected values with NaN instead of dropping
        # the row, so a timestamp survives even if every column at
        # that timestamp is rejected.
        cleaned_cols.append(column.where(column < limit))
        trash_cols.append(column[column >= limit])

    if cleaned_cols:
        sw_c_df = pd.concat(cleaned_cols, axis=1)
        trash_df = pd.concat(trash_cols, axis=1)
    else:
        # pd.concat refuses an empty list; a frame without columns
        # yields two empty frames.
        sw_c_df = pd.DataFrame()
        trash_df = pd.DataFrame()

    # Checking if the trash data contains non-'bad' data.
    check_trash(trash_df)
    return (sw_c_df, trash_df)
#################################
def sw_interp(data, method='linear'):
    '''
    Fill NaN values in a dataframe by interpolation.

    Thin wrapper around the pandas ``interpolate`` method of `data`;
    see the Pandas documentation for the available methods.

    Input:
        data --- the (cleaned) dataframe to interpolate.
        method - method of interpolation, passed through unchanged.

    Return:
        - The interpolated data, as returned by ``data.interpolate``.
    '''
    interpolated = data.interpolate(method=method)
    return interpolated
#################################
def check_trash(trash_data):
    '''
    Report columns of removed data that are not a single repeated
    value.

    For every column, the mean of the removed values is compared
    with their maximum. When the mean is strictly below the maximum
    the removed values are not all identical (so they are not just
    one fill value) and a message naming the column is printed.

    Input:
        trash_data - dataframe of removed values.

    Returns:
        - Nothing; the report is written with print().
    '''
    for name in trash_data.columns:
        removed = trash_data[name]
        # An all-NaN column compares False here and is skipped.
        if not (removed.mean() < removed.max()):
            continue
        print('Some real data has been removed from: ',name)
# In[4]:
def time_history(data, auto=True):
    '''
    Build the full set of time-history columns for a dataframe.

    Averaged histories are centred on their respective look-back
    time.

    Input:
        data - a Pandas DataFrame containing 5 minute cadence
               data.
               MUST HAVE DATETIME INDEX.
        auto - When exactly True, the data is first passed through
               cleaning_data() (4 sigma) and then sw_interp()
               (linear) before the histories are computed.
               Otherwise the data is used as given; in that case
               call cleaning_data() and sw_interp() yourself and
               pass the result of sw_interp() here. Doing it
               manually is also the way to get hold of the
               non-interpolated data and the trash data, which
               this function does not return.

    Output:
        A DataFrame concatenated column-wise from
            - the (possibly cleaned) data
            - t-6hrs  (1hr avg)
            - t-5hrs  (1hr avg)
            - t-3hrs  (30min avg)
            - t-1hrs  (30min avg)
            - t-45min (instant)
            - t-30min (instant)
            - t-15min (instant)
            - t-10min (instant)
            - t-5min  (instant)
    '''
    if auto is True:
        cleaned, _trash = cleaning_data(data, sigma_val=4,
                                        safe_cols=[None])
        data = sw_interp(cleaned, method='linear')

    # (look-back minutes, averaging window minutes)
    averaged = [(360, 60), (300, 60), (180, 30), (60, 30)]
    # look-back minutes for the instantaneous values
    instantaneous = [45, 30, 15, 10, 5]

    frames = [data]
    for lag, width in averaged:
        frames.append(t_hist(data, lag, width).avg_hist())
    for lag in instantaneous:
        frames.append(t_hist(data, lag, 0).instant_hist())
    return pd.concat(frames, axis=1)