forked from VincentGranville/Main
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatureClustering.py
130 lines (113 loc) · 3.95 KB
/
featureClustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
correlMatrix = [
[1.0000,0.1983,0.2134,0.0932,0.0790,-0.0253,0.0076,0.6796,0.2566],
[0.1983,1.0000,0.2100,0.1989,0.5812,0.2095,0.1402,0.3436,0.5157],
[0.2134,0.2100,1.0000,0.2326,0.0985,0.3044,-0.0160,0.3000,0.1927],
[0.0932,0.1989,0.2326,1.0000,0.1822,0.6644,0.1605,0.1678,0.2559],
[0.0790,0.5812,0.0985,0.1822,1.0000,0.2264,0.1359,0.2171,0.3014],
[-0.0253,0.2095,0.3044,0.6644,0.2264,1.0000,0.1588,0.0698,0.2701],
[0.0076,0.1402,-0.0160,0.1605,0.1359,0.1588,1.0000,0.0850,0.2093],
[0.6796,0.3436,0.3000,0.1678,0.2171,0.0698,0.0850,1.0000,0.3508],
[0.2566,0.5157,0.1927,0.2559,0.3014,0.2701,0.2093,0.3508,1.0000]]
dim = len(correlMatrix)
threshold = 0.4 # two features with |correl|>threshold are connected
pairs = {}
for i in range(dim):
for j in range(i+1,dim):
dist = abs(correlMatrix[i][j])
if dist > threshold:
pairs[(i,j)] = abs(correlMatrix[i][j])
pairs[(j,i)] = abs(correlMatrix[i][j])
# connected components algo to detect feature clusters on feature pairs
#---
# PART 1: Initialization.
point=[]
NNIdx={}
idxHash={}
n=0
for key in pairs:
idx = key[0]
idx2 = key[1]
if idx in idxHash:
idxHash[idx]=idxHash[idx]+1
else:
idxHash[idx]=1
point.append(idx)
NNIdx[idx]=idx2
n=n+1
hash={}
for i in range(n):
idx=point[i]
if idx in NNIdx:
substring="~"+str(NNIdx[idx])
string=""
if idx in hash:
string=str(hash[idx])
if substring not in string:
if idx in hash:
hash[idx]=hash[idx]+substring
else:
hash[idx]=substring
substring="~"+str(idx)
if NNIdx[idx] in hash:
string=hash[NNIdx[idx]]
if substring not in string:
if NNIdx[idx] in hash:
hash[NNIdx[idx]]=hash[NNIdx[idx]]+substring
else:
hash[NNIdx[idx]]=substring
#---
# PART 2: Find the connected components
i=0;
status={}
stack={}
onStack={}
cliqueHash={}
while i<n:
while (i<n and point[i] in status and status[point[i]]==-1):
# point[i] already assigned to a clique, move to next point
i=i+1
nstack=1
if i<n:
idx=point[i]
stack[0]=idx; # initialize the point stack, by adding $idx
onStack[idx]=1;
size=1 # size of the stack at any given time
while nstack>0:
idx=stack[nstack-1]
if (idx not in status) or status[idx] != -1:
status[idx]=-1 # idx considered processed
if i<n:
if point[i] in cliqueHash:
cliqueHash[point[i]]=cliqueHash[point[i]]+"~"+str(idx)
else:
cliqueHash[point[i]]="~"+str(idx)
nstack=nstack-1
aux=hash[idx].split("~")
aux.pop(0) # remove first (empty) element of aux
for idx2 in aux:
# loop over all points that have point idx as nearest neighbor
idx2=int(idx2)
if idx2 not in status or status[idx2] != -1:
# add point idx2 on the stack if it is not there yet
if idx2 not in onStack:
stack[nstack]=idx2
nstack=nstack+1
onStack[idx2]=1
#---
# PART 3: Save results.
clusterID = 1
for clique in cliqueHash:
cluster = cliqueHash[clique]
cluster = cluster.replace('~', ' ')
print("Feature Cluster number %2d: features %s" %(clusterID, cluster))
clusterID += 1
clusteredFeature = {}
for feature in range(dim):
for clique in cliqueHash:
if str(feature) in cliqueHash[clique]:
clusteredFeature[feature] = True
for feature in range(dim):
if feature not in clusteredFeature:
cluster = " "+str(feature)
print("Feature Cluster number %2d: features %s" %(clusterID, cluster))
clusterID += 1