-
Notifications
You must be signed in to change notification settings - Fork 2
/
meta_features.py
352 lines (285 loc) · 11.2 KB
/
meta_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# @Author: Joey Teng
# @Email: joey.teng.dev@gmail.com
# @Filename: meta_features.py
# @Last modified by: Joey Teng
# @Last modified time: 27-Mar-2018
"""Define and calculate meta-features using given clusters.
See function meta_features().
"""
import collections
import itertools
import math
import numpy
INFINITESIMAL = 1e-323
def size_versus_number_of_clusters(clusters):
    """Summarise how many clusters exist for each cluster size.

    Args:
        clusters (list): list of clusters; each cluster is a mapping that
            provides an integer under the key 'size'

    Returns:
        dict:
            'average' (float): mean cluster size (NaN if `clusters` is empty)
            'standard deviation' (float): population standard deviation of
                the sizes, as computed by numpy.std (NaN if `clusters` is
                empty)
            'range' (int): largest size minus smallest size (0 if
                `clusters` is empty)
            'stats' (collections.defaultdict):
                {size (int): quantity (int), ...}
    """
    sizes = [cluster['size'] for cluster in clusters]

    # Sizes that never occur read back as 0.
    stats = collections.defaultdict(int)
    for size in sizes:
        stats[size] += 1

    if not sizes:
        # min()/max() raise ValueError on an empty sequence and numpy only
        # yields NaN after emitting a RuntimeWarning, so the empty summary
        # is spelled out explicitly.
        return {
            'average': float('nan'),
            'standard deviation': float('nan'),
            'range': 0,
            'stats': stats}

    return {
        'average': numpy.average(sizes),
        'standard deviation': numpy.std(sizes),
        'range': max(sizes) - min(sizes),
        'stats': stats}
def volume_versus_size(clusters):
    """Group cluster volumes by cluster size.

    Args:
        clusters (list): list of clusters; each cluster provides the keys
            'size' and 'volume'

    Returns:
        collections.defaultdict: {size (int): volumes (list of floats)}
            Volumes appear in the order their clusters were given; a size
            that never occurs reads back as an empty list.
    """
    volumes_by_size = collections.defaultdict(list)
    for entry in clusters:
        bucket = volumes_by_size[entry['size']]
        bucket.append(entry['volume'])
    return volumes_by_size
def log_volume_versus_size(clusters):
    """Group cluster log-volumes by cluster size.

    Args:
        clusters (list): list of clusters; each cluster provides the keys
            'size' and 'log-volume'

    Returns:
        collections.defaultdict: {size (int): log-volumes (list of floats)}
            Log-volumes appear in the order their clusters were given; a
            size that never occurs reads back as an empty list.
    """
    log_volumes_by_size = collections.defaultdict(list)
    for entry in clusters:
        bucket = log_volumes_by_size[entry['size']]
        bucket.append(entry['log-volume'])
    return log_volumes_by_size
def calculate_inverse_density(cluster):
    """Return the inverse density of a single cluster.

    inverse of density = volume / size

    Args:
        cluster (dict): one cluster providing the keys 'volume' and 'size'

    Returns:
        float: inverse of density
    """
    volume = cluster['volume']
    size = cluster['size']
    return volume / size
def inverse_density_distribution(clusters, slots):
    """Histogram the inverse densities of clusters into equal-width slots.

    The slot index of a value v is int((v - lb) / interval), where
        lb = lower bound (smallest inverse density)
        hb = higher bound (largest inverse density)
        interval = range / slots = (hb - lb) / slots
    When the interval is zero (all values equal) it is replaced by
    max(lb, 1.0) so that every value falls into slot 0.

    NOTE(review): with this formula the largest value appears to land in
    slot index `slots`, so up to slots + 1 slots can be populated —
    confirm whether that is intended.

    Args:
        clusters (list): list of clusters
        slots (int): number of intervals

    Returns:
        dict:
            'interval' (float): width of one slot, None if `clusters` is
                empty
            'min' (float): lower bound
            'average' (float): numpy.average of the inverse densities
            'standard deviation' (float): numpy.std of the inverse densities
            'range' (float): higher bound - lower bound
            'stats' (collections.defaultdict):
                {slot index (int): quantity (int), ...}

    Raises:
        ZeroDivisionError, ValueError: re-raised with the offending
            expression after the inputs have been printed for diagnosis
    """
    densities = [calculate_inverse_density(cluster) for cluster in clusters]
    histogram = collections.defaultdict(int)

    if not densities:
        # Nothing to bin: both bounds keep the placeholder value.
        lowest = highest = INFINITESIMAL
        width = None
    else:
        lowest, highest = min(densities), max(densities)
        width = (highest - lowest) / slots
        if math.isclose(width, 0):
            width = max(lowest, float(1))  # prevent ZeroDivisionError

        for density in densities:
            try:
                histogram[int((density - lowest) / width)] += 1
            except (ZeroDivisionError, ValueError) as error:
                # Dump the raw inputs before re-raising with the expression.
                print("Densities: {}".format(densities))
                print("Volumes: {}".format(
                    [cluster['volume'] for cluster in clusters]))
                print("Size: {}".format(
                    [cluster['size'] for cluster in clusters]))
                if isinstance(error, ZeroDivisionError):
                    raise ZeroDivisionError(
                        "({} - {}) / {}".format(density, lowest, width))
                raise ValueError(
                    "({} - {}) / {}\n{}".format(
                        density, lowest, width, error))

    return {'interval': width,
            'min': lowest,
            'average': numpy.average(densities),
            'standard deviation': numpy.std(densities),
            'range': highest - lowest,
            'stats': histogram}
def calculate_inverse_log_density(cluster):
    """Return the logarithm of the inverse density of a single cluster.

    inverse of density-log = log-volume - ln(size)

    Args:
        cluster (dict): one cluster providing the keys 'log-volume' and
            'size'

    Returns:
        float: inverse of density-log
            -inf if log-volume = -inf
    """
    log_volume = cluster['log-volume']
    log_size = math.log(cluster['size'])
    return log_volume - log_size
def inverse_log_density_distribution(clusters, slots):
    """Histogram the inverse log-densities of clusters into equal-width slots.

    inverse_log_density = log-volume - ln(size)

    Only finite values are binned. The slot index of a value v is
    int((v - lb) / interval), where
        lb = lower bound (smallest finite value)
        hb = higher bound (largest finite value)
        interval = range / slots = (hb - lb) / slots
    When the interval is zero (all values equal) it is replaced by
    max(lb, 1.0) so that every value falls into slot 0. The number of
    non-finite values is stored under slot -1.

    NOTE(review): with this formula the largest value appears to land in
    slot index `slots`, so up to slots + 1 slots can be populated —
    confirm whether that is intended.

    Args:
        clusters (list): list of clusters
        slots (int): number of intervals

    Returns:
        dict:
            'interval' (float): width of one slot, None if there is no
                finite value
            'min' (float): lower bound
            'average' (float): numpy.average of the finite values
            'standard deviation' (float): numpy.std of the finite values
            'range' (float): higher bound - lower bound
            'stats' (collections.defaultdict):
                {-1: quantity of non-finite values,
                 slot index (int): quantity (int), ...}

    Raises:
        ZeroDivisionError, ValueError: re-raised with the offending
            expression after the inputs have been printed for diagnosis
    """
    every_value = [
        calculate_inverse_log_density(cluster) for cluster in clusters]
    finite_values = [
        value for value in every_value if math.isfinite(value)]
    histogram = collections.defaultdict(int)

    if not finite_values:
        # Nothing to bin: both bounds keep the placeholder value.
        lowest = highest = INFINITESIMAL
        width = None
    else:
        lowest, highest = min(finite_values), max(finite_values)
        width = (highest - lowest) / slots
        if math.isclose(width, 0):
            width = max(lowest, float(1))  # prevent ZeroDivisionError

        for value in finite_values:
            try:
                histogram[int((value - lowest) / width)] += 1
            except (ZeroDivisionError, ValueError) as error:
                # Dump the raw inputs before re-raising with the expression.
                print("Densities: {}".format(finite_values))
                print("Volumes: {}".format(
                    [cluster['volume'] for cluster in clusters]))
                print("Size: {}".format(
                    [cluster['size'] for cluster in clusters]))
                if isinstance(error, ZeroDivisionError):
                    raise ZeroDivisionError(
                        "({} - {}) / {}".format(value, lowest, width))
                raise ValueError(
                    "({} - {}) / {}\n{}".format(
                        value, lowest, width, error))

    # Everything that was filtered out as non-finite (e.g. -inf volume).
    histogram[-1] = len(every_value) - len(finite_values)

    return {'interval': width,
            'min': lowest,
            'average': numpy.average(finite_values),
            'standard deviation': numpy.std(finite_values),
            'range': highest - lowest,
            'stats': histogram}
def label_versus_meta_features(clusters, func, *args, **kwargs):
    """Evaluate one meta-feature separately for every label.

    The clusters of each label are passed to `func` on their own. An extra
    '_population' entry holds the meta-feature over all clusters of all
    labels combined; if `clusters` itself contains a '_population' key,
    that entry takes precedence over the combined list.

    Args:
        clusters (dict): {label: list of clusters carrying that label, ...}
        func (function):
            the function used to calculate the required meta-feature;
            called as func(list of clusters, *args, **kwargs)
        *args: extra positional arguments forwarded to `func`
        **kwargs: extra keyword arguments forwarded to `func`

    Returns:
        dict: stats
            {label (label): corresponding meta-feature, ...}
    """
    everything = list(itertools.chain.from_iterable(clusters.values()))
    grouped = {'_population': everything}
    grouped.update(clusters)
    return {
        label: func(members, *args, **kwargs)
        for label, members in grouped.items()}
def meta_features(clusters):  # TODO
    """Calculate all the enabled meta-features from the given clusters.

    Every meta-feature is evaluated per label and over the whole
    population through label_versus_meta_features().

    Args:
        clusters (dict): clusters grouped by label
            {label: [{
                'vertices' (list): vertices
                    all the vertices on/defining the hull
                'points' (list): vertices
                    all the instances that are in the hull
                    (same label, as homogeneity is maintained)
                'size' (int): the number of instances belonging to this
                    hull, len(vertices) + len(points)
                'volume' (float):
                    the volume in the Euclidean n-dimensional space
                    occupied by the hull
                'log-volume' (float): read by the log-based meta-features
                'label' (int):
                    the category that the hull belongs to
            }, ...], ...}

    Returns:
        meta-features (dict):
            {
                'Number of Clusters' ()
                'Size versus Number of Clusters' ()
                'log-Volume versus Size' ()
                'Inverse Log Density distribution over 10 intervals' ()
            }
            each value is a dict {label: meta-feature, ...} that also
            contains a '_population' entry
    """
    def per_label(func, *args):
        # Evaluate `func` for each label and for the whole population.
        return label_versus_meta_features(clusters, func, *args)

    # The raw-volume variants ('Volume versus Size' via volume_versus_size
    # and 'Inverse Density distribution over 10 intervals' via
    # inverse_density_distribution) are intentionally left disabled here.
    features = {}
    features['Number of Clusters'] = per_label(len)
    features['Size versus Number of Clusters'] = per_label(
        size_versus_number_of_clusters)
    features['log-Volume versus Size'] = per_label(log_volume_versus_size)
    features['Inverse Log Density distribution over 10 intervals'] = (
        per_label(inverse_log_density_distribution, 10))
    return features