# generate_dataset.py
import os
import argparse
import itertools
import numpy as np
import pandas as pd
from functools import reduce
from utils.argument_parser import str2bool, limited_number_type
parser = argparse.ArgumentParser(description='Generate synthetic dataset.')
parser.add_argument('--num', required=True, type=int, help='number of instances to generate')
parser.add_argument('--dims', type=int, nargs='+', default=[10, 12, 10, 8, 5], help='dimension sizes')
parser.add_argument('--seed', type=int, default=123, help='initial random seed')
parser.add_argument('--data-root', type=str, default='./data', help='output directory')
parser.add_argument('--dataset-name', type=str, help='name of the new dataset')
parser.add_argument('--weibull-alpha', type=limited_number_type(float, minimum=0), nargs=2, default=[0.5, 1.0],
                    metavar=('min', 'max'),
                    help='range of the Weibull shape parameter (alpha) controlling the distribution of real values of normal leaf elements')
parser.add_argument('--zero-rate', type=limited_number_type(float, 0, 1), nargs=2, default=[0.0, 0.25],
metavar=('min', 'max'), help='range of percentage of leaf elements to set to zero')
parser.add_argument('--noise-level', type=limited_number_type(float, 0, 1), nargs=2, default=[0.0, 0.25],
metavar=('min', 'max'), help='range of relative forecasting error of the normal leaf elements')
parser.add_argument('--anomaly-severity', type=limited_number_type(float, 0, 1), nargs=2, default=[0.2, 1.0],
metavar=('min', 'max'), help='range of severity of the anomalies')
parser.add_argument('--anomaly-deviation', type=limited_number_type(float, 0, 1), nargs=2, default=[0.0, 0.1],
metavar=('min', 'max'), help='range of deviation of the anomalies')
parser.add_argument('--num-anomaly', type=limited_number_type(int, minimum=1), nargs=2, default=[1, 3],
metavar=('min', 'max'), help='range of number of anomalies')
parser.add_argument('--num-anomaly-elements', type=limited_number_type(int, minimum=1), nargs=2, default=[1, 3],
metavar=('min', 'max'), help='range of number of elements within each anomaly')
parser.add_argument('--only-last-layer', type=str2bool, nargs='?', const=True, default=False,
                    help='place all anomalies in the last (deepest) layer, i.e., on leaf elements')
args = parser.parse_args()
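# Example invocation (hypothetical values, matching the defaults above):
#   python generate_dataset.py --num 100 --dims 10 12 10 8 5 --seed 121 \
#       --zero-rate 0.0 0.25 --noise-level 0.0 0.25 --num-anomaly 1 3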
# Seeds used per dataset: S: 121, L: 122, H: 123
rng = np.random.default_rng(args.seed)
# Basic settings
# S: dimensions = {'a': 10, 'b': 12, 'c': 10, 'd': 8, 'e': 5}
# L: dimensions = {'a': 10, 'b': 24, 'c': 10, 'd': 15}
# H: dimensions = {'a': 10, 'b': 5, 'c': 250, 'd': 20, 'e': 8, 'f': 12}
dimensions = dict(zip('abcdefghijklmnopqrstuvwxyz', args.dims))
# S and L use 1000 files each; the H dataset uses 100
number_of_files = args.num
# The normal ('real') value distribution
weibull_alpha = [min(args.weibull_alpha), max(args.weibull_alpha)]
# Noise parameters
zero_rate = [min(args.zero_rate), max(args.zero_rate)]
noise_level = [min(args.noise_level), max(args.noise_level)] # S,H: [0.0,0.25], L: [0.0,0.1]
# Anomaly parameters
anomaly_severity = [min(args.anomaly_severity), max(args.anomaly_severity)] # S,H: [0.2,1.0], L: [0.5,1.0]
anomaly_deviation = [min(args.anomaly_deviation), max(args.anomaly_deviation)] # S,H: [0.0,0.1], L: [0.0,0.0]
num_anomaly = [min(args.num_anomaly), max(args.num_anomaly)] # S,H: [1,3], L: [1,5]
num_anomaly_elements = [min(args.num_anomaly_elements), max(args.num_anomaly_elements)] # S,H: [1,3], L: [1,1]
only_last_layer = args.only_last_layer # S,H: False, L: True
# Set the output path
data_root = args.data_root
if args.dataset_name is None:
folder = '-'.join(sorted([k + str(v) for k, v in dimensions.items()]))
else:
folder = args.dataset_name
save_path = os.path.join(data_root, folder)
os.makedirs(save_path, exist_ok=True)
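# With the default dimensions and no --dataset-name, the folder name is derived from
# the sorted dimension sizes, e.g. './data/a10-b12-c10-d8-e5'.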
def get_dataset_properties(dimensions):
sel_zero_rate = rng.uniform(zero_rate[0], zero_rate[1])
sel_noise_level = rng.uniform(noise_level[0], noise_level[1])
print('zero_rate', sel_zero_rate)
print('noise_level', sel_noise_level)
sel_num_anomalies = rng.integers(num_anomaly[0], num_anomaly[1], endpoint=True)
print('num_anomalies', sel_num_anomalies)
anomaly_properties = dict()
for i in range(sel_num_anomalies):
if not only_last_layer:
anomaly_level = rng.integers(1, len(dimensions), endpoint=True)
else:
anomaly_level = len(dimensions)
elements = rng.integers(num_anomaly_elements[0], num_anomaly_elements[1], endpoint=True)
        # Add the noise level to the severity so anomalies do not end up too similar to normal data.
severity = rng.uniform(anomaly_severity[0], anomaly_severity[1]) + sel_noise_level
deviation = rng.uniform(anomaly_deviation[0], anomaly_deviation[1])
anomaly_properties[i] = {'level': anomaly_level, 'elements': elements,
'severity': severity, 'deviation': deviation}
print('anomaly_properties', anomaly_properties)
return sel_zero_rate, sel_noise_level, anomaly_properties
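# The returned anomaly_properties maps an anomaly index to its parameters, e.g.
# (hypothetical draw): {0: {'level': 2, 'elements': 3, 'severity': 0.74, 'deviation': 0.05}}.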
def get_anomaly_locations(df, dimensions, anomaly_properties):
def _get_anomaly_locations(anomaly_level, anomaly_elements, current_anomalies, depth=0):
        if len(dimensions) < anomaly_level:
            raise ValueError("anomaly_level must be less than or equal to the number of dimensions.")
if np.prod(sorted(list(dimensions.values()), reverse=True)[:anomaly_level]) < anomaly_elements:
raise ValueError("Impossible to get that many anomaly elements with specified input.")
anomaly_dims = sorted(rng.choice(list(dimensions.keys()), size=anomaly_level, replace=False))
lowest_layer = len(anomaly_dims) == len(dimensions)
        # Do not reuse the same combination of anomaly dimensions (unless in the lowest layer)
used_dims = [ca['dimensions'] for ca in current_anomalies]
if anomaly_dims in used_dims and not lowest_layer:
print('Anomaly dimension used, recursive call.')
            return _get_anomaly_locations(anomaly_level, anomaly_elements, current_anomalies, depth=depth + 1)
all_selected_dim_elements = []
for ad in anomaly_dims:
dim_elements = list(range(1, dimensions[ad] + 1))
# We do not want any overlap on current anomalies
# For example, if one anomaly is c=c5 then we do not want an overlapping one at e.g. c=c5&b=b3.
for ca in current_anomalies:
if ad in ca['dimensions']:
# Overlap
idx = ca['dimensions'].index(ad)
used_elements = set([int(a[idx][len(ad):]) for a in ca['cuboids']])
dim_elements = list(set(dim_elements) - used_elements)
if len(dim_elements) == 0:
print('All elements in dimension used, recursive call.')
return _get_anomaly_locations(anomaly_level, anomaly_elements, current_anomalies, depth=depth+1)
selected_dim_elements = rng.choice(dim_elements, size=anomaly_elements, replace=True)
selected_dim_elements = [ad + str(e) for e in selected_dim_elements]
all_selected_dim_elements.append(selected_dim_elements)
anomaly_cuboids = list(zip(*all_selected_dim_elements))
# When in the lowest layer, we do not want any anomalies with real == 0 and predict == 0,
# since we can't predict these (i.e., they would automatically be a false negative when evaluating).
if lowest_layer:
for cuboid in anomaly_cuboids:
element = list(zip(anomaly_dims, cuboid))
                value = df.loc[reduce(lambda c1, c2: c1 & c2, [df[k] == v for k, v in element]), 'real'].iloc[0]
                if value == 0:
print('Tried anomaly value is 0, recursive call.')
                    return _get_anomaly_locations(anomaly_level, anomaly_elements, current_anomalies, depth=depth + 1)
# Make sure the anomalies are unique
if len(np.unique(anomaly_cuboids, axis=0)) < anomaly_elements:
print('Non-unique elements found, recursive call.')
            return _get_anomaly_locations(anomaly_level, anomaly_elements, current_anomalies, depth=depth + 1)
return anomaly_dims, anomaly_cuboids
anomalies = []
for properties in anomaly_properties.values():
level = properties['level']
elements = properties['elements']
anomaly_dims, anomaly_cuboids = _get_anomaly_locations(level, elements, anomalies)
anomalies.append({'dimensions': anomaly_dims, 'cuboids': anomaly_cuboids})
return anomalies
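# Each returned anomaly records the dimensions it spans and its cuboids, e.g.
# (hypothetical draw): [{'dimensions': ['a', 'c'], 'cuboids': [('a3', 'c5'), ('a7', 'c2')]}].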
def get_anomaly_masks(df, anomalies):
anomaly_masks = []
for anomaly in anomalies:
dims = anomaly['dimensions']
cuboid_mask = []
        for cuboid in anomaly['cuboids']:
            masks = [df[d] == c for d, c in zip(dims, cuboid)]
mask = np.logical_and.reduce(masks)
cuboid_mask.append(mask)
cuboid_mask = np.logical_or.reduce(cuboid_mask)
anomaly_masks.append(cuboid_mask)
return anomaly_masks
def scale_anomaly(row, properties):
severity = properties['severity']
deviation = properties['deviation']
r = rng.normal(loc=severity, scale=deviation)
v = max(row * (1 - r), 0.0)
return v
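# Worked example (hypothetical numbers): with severity 0.5 and deviation 0.0,
# r is exactly 0.5 and a value of 100 is scaled to max(100 * (1 - 0.5), 0.0) = 50.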
def generate_labels(anomalies):
labels = []
for anomaly in anomalies:
dims = anomaly['dimensions']
for element in anomaly['cuboids']:
            label = "&".join(sorted("=".join(t) for t in zip(dims, element)))
labels.append(label)
labels = ';'.join(labels)
return labels
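# Label format example (hypothetical draw): an anomaly over dimensions a and c with
# cuboids ('a3', 'c5') and ('a7', 'c2'), plus a single-dimension anomaly on b, yields
# 'a=a3&c=c5;a=a7&c=c2;b=b1'.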
def create_metadata(df, anomaly_masks, anomaly_properties, zero_rate, noise_level, direction):
mask = np.logical_or.reduce(anomaly_masks)
normal_predict_amount = df.loc[~mask, 'predict'].abs().sum()
normal_predict_error = (df.loc[~mask, 'real'] - df.loc[~mask, 'predict']).abs().sum()
abnormal_predict_amount = df.loc[mask, 'predict'].abs().sum()
abnormal_predict_error = (df.loc[mask, 'real'] - df.loc[mask, 'predict']).abs().sum()
anomaly_significance = abnormal_predict_error / df['predict'].sum()
anomaly_severities = ';'.join([str(round(anomaly_properties[i]['severity'], 2)) for i in range(len(anomaly_masks))])
anomaly_deviations = ';'.join([str(round(anomaly_properties[i]['deviation'], 2)) for i in range(len(anomaly_masks))])
anomaly_elements = ';'.join([str(round(anomaly_properties[i]['elements'], 2)) for i in range(len(anomaly_masks))])
metadata = {'total_real_amount': df['real'].sum().round(2),
'total_predict_amount': df['predict'].sum().round(2),
'normal_predict_amount': round(normal_predict_amount, 2),
'normal_predict_error': round(normal_predict_error, 2),
'abnormal_predict_amount': round(abnormal_predict_amount, 2),
'abnormal_predict_error': round(abnormal_predict_error, 2),
'anomaly_significance': round(anomaly_significance, 2),
'zero_rate': round(zero_rate, 2),
'noise_level': round(noise_level, 2),
'elements_per_anomaly': anomaly_elements,
'anomaly_severity': anomaly_severities,
'anomaly_deviation': anomaly_deviations,
'anomaly_direction': direction
}
return metadata
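# Worked example (hypothetical numbers): if the anomalous elements account for 120 of
# absolute prediction error and the total predicted amount is 6000, then
# anomaly_significance = 120 / 6000 = 0.02.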
def generate_dataset(dimensions):
sel_zero_rate, sel_noise_level, anomaly_properties = get_dataset_properties(dimensions)
dimension_values = [[dimension + str(i) for i in range(1, num + 1)] for dimension, num in dimensions.items()]
# Add all combinations
df = pd.DataFrame(list(itertools.product(*dimension_values)), columns=dimensions.keys())
# Add real values
alpha = rng.uniform(weibull_alpha[0], weibull_alpha[1])
df['real'] = rng.weibull(alpha, len(df)) * 100
print('Added real')
# Add zero rows
df['real'] = df['real'] * (rng.uniform(size=len(df)) > sel_zero_rate)
print('Added zero rows')
# Apply noise to the predictions
df['predict'] = df['real'] + df['real'] * rng.normal(loc=0, size=len(df), scale=sel_noise_level)
print('Added noise')
    # Randomly swap 'real' and 'predict' so the prediction errors are distributed equally on both sides
p = df['predict'].copy()
r = rng.integers(0, 1, size=len(df), endpoint=True)
df['predict'] = np.where(r == 1, df['real'], df['predict'])
df['real'] = np.where(r == 0, df['real'], p)
df.loc[df['predict'] < 0, 'predict'] = 0.0
del p
del r
print('Swap predict/real')
# Create and add anomalies
anomalies = get_anomaly_locations(df, dimensions, anomaly_properties)
print('anomalies', anomalies)
anomaly_masks = get_anomaly_masks(df, anomalies)
# We set the direction following the prediction error direction in the normal data.
# This is to make sure that the error direction in the normal data does not overweigh
# the error introduced by the anomalies.
direction = 1 if df['real'].sum() > df['predict'].sum() else 0
for i, anomaly_mask in enumerate(anomaly_masks):
properties = anomaly_properties[i]
if direction == 0:
df.loc[anomaly_mask, 'real'] = df.loc[anomaly_mask, 'predict'] # reset the noise
df.loc[anomaly_mask, 'real'] = df.loc[anomaly_mask, 'real'].apply(scale_anomaly, properties=properties)
else:
df.loc[anomaly_mask, 'predict'] = df.loc[anomaly_mask, 'real'] # reset the noise
df.loc[anomaly_mask, 'predict'] = df.loc[anomaly_mask, 'predict'].apply(scale_anomaly, properties=properties)
labels = generate_labels(anomalies)
metadata = create_metadata(df, anomaly_masks, anomaly_properties, sel_zero_rate, sel_noise_level, direction)
return df, labels, metadata
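# Illustration of the direction logic above (hypothetical sums): if real.sum() = 1050
# exceeds predict.sum() = 1000, then direction = 1 and the anomalous 'predict' values
# are reset to 'real' and scaled down, so the injected error points the same way as
# the prevailing noise instead of being cancelled by it.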
def generate_filename(n, used_names):
    # Draw random n-digit numbers until one not already in used_names is found.
    range_start = 10**(n - 1)
    range_end = (10**n) - 1
    filename = None
    while filename is None or filename in used_names:
        filename = str(rng.integers(range_start, range_end, endpoint=True))
    return filename
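# Example (hypothetical draw): generate_filename(6, set()) returns a six-digit string
# such as '537216', drawn uniformly from [100000, 999999].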
if __name__ == "__main__":
used_names = set()
dataset_info = dict()
for i in range(number_of_files):
filename = generate_filename(6, used_names)
used_names.add(filename)
print('creating file', filename)
df, labels, metadata = generate_dataset(dimensions)
df.to_csv(os.path.join(save_path, filename + '.csv'), index=False)
dataset_info[filename] = {'set': labels, **metadata}
del df
print('-----------------------')
        # Save inside the loop so intermediate results survive an interruption
df_info = pd.DataFrame.from_dict(dataset_info, orient='index')
df_info = df_info.rename_axis('timestamp').reset_index()
df_info.to_csv(os.path.join(save_path, 'injection_info.csv'), index=False)
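# Output layout (for the defaults above): one '<filename>.csv' per generated instance,
# with one row per leaf element and columns a..e plus 'real' and 'predict', and a single
# 'injection_info.csv' with columns 'timestamp', 'set' (the ground-truth labels), and the
# metadata fields from create_metadata.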