-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathLabelGenerate.py
198 lines (156 loc) · 6.92 KB
/
LabelGenerate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import random
import time
import utils as ut
import numpy as np
from tqdm import tqdm
from TreeStructure import Node, BinaryTree, ItemsTree
def ClusterTree(file_name, k=10, cluster_size=6):
    """Build the initial clustering tree for a dataset and save it to disk.

    Loads the dataset with ``ut.LoadDataset``, runs top-down clustering via
    ``BinaryTree.TDcluster``, appends build statistics to
    ``<dir>/树的相关信息.txt`` and serializes the tree to ``<dir>/Tree.txt``,
    where ``<dir>`` is the directory part of *file_name*.

    Parameters
    ----------
    file_name : str
        "/"-separated path to the dataset file (HDF5, per the caller).
    k : int
        Cluster-count parameter forwarded to ``TDcluster``.
    cluster_size : int
        Leaf cluster-size parameter forwarded to ``TDcluster``.
    """
    print(file_name)
    # Directory prefix of file_name, including the trailing "/"
    # (empty string when file_name has no directory part).
    dir_list = file_name.split("/")
    save_name = "".join(part + "/" for part in dir_list[:-1])
    # Load the dataset.
    dataMatrix, trainDataset, trainKnn = ut.LoadDataset(file_name)
    data_num = dataMatrix.shape[0]
    # Basic information for the cluster tree: every point starts at the root.
    pointsIndex = list(range(data_num))
    tree = BinaryTree()
    pointNum, cpoint = ut.CentralPointOfCluster(pointsIndex, dataMatrix)
    # Construct the root node of the tree.
    tree.root = Node(cpoint)
    tree.root.pointnum = pointNum
    tree.root.Id = 0
    # Run the clustering and time it.
    start = time.perf_counter()
    tree.TDcluster(k, tree.root, pointsIndex, dataMatrix, cluster_size=cluster_size)
    end = time.perf_counter()
    timeUsable = end - start
    # Collect statistics of the freshly built tree.
    deep = tree.TreeInformation(tree.root)
    tree.postOrder(tree.root)
    # Explicit utf-8: the report contains Chinese text and must not depend
    # on the platform's default encoding (e.g. cp936 on Chinese Windows).
    with open(save_name + "树的相关信息.txt", "w", encoding="utf-8") as file:
        file.write("{}数据集构建树完成,用时{}\n".format(file_name, timeUsable))
        file.write("\n" + "*" * 5 + "树的基本信息" + "*" * 5 + "\n")
        file.write("树的深度为:{}\n".format(deep))
        file.write("树的节点总数:{}\n".format(len(tree.nodes)))
        file.write("树的叶子节点数:{}\n".format(len(tree.leafNodes)))
        file.write("*" * 25 + "\n")
    print("聚类树完成,用时{}".format(timeUsable))
    # Persist the tree.
    tree.SaveTree(save_name + "Tree.txt")
    print("树已保存至{}".format(save_name + "Tree.txt"))
def OptimizeIRepartionTree(file_name, k=10, cluster_size=6, portion_size=400):
    """Optimize a previously built tree by incremental repartitioning.

    Loads ``<dir>/Tree.txt`` into an ``ItemsTree``, runs
    ``IncrementalRepartion`` on it, appends the timings to
    ``<dir>/树的相关信息.txt`` and saves the result as
    ``<dir>/IRepartionTree``.

    Parameters
    ----------
    file_name : str
        "/"-separated path to the dataset file; its directory also holds
        the serialized trees and the report file.
    k, cluster_size, portion_size : int
        Forwarded unchanged to ``ItemsTree.IncrementalRepartion``.
    """
    # Directory prefix of file_name, including the trailing "/".
    dir_list = file_name.split("/")
    save_name = "".join(part + "/" for part in dir_list[:-1])
    # Load the dataset.
    dataMatrix, trainDataset, trainKnn = ut.LoadDataset(file_name)
    # Load the cluster tree built by ClusterTree and time the load.
    tree = ItemsTree()
    start = time.perf_counter()
    tree.LoadTree(save_name + "Tree.txt")
    end = time.perf_counter()
    # Explicit utf-8: the report contains Chinese text (see ClusterTree).
    with open(save_name + "树的相关信息.txt", "a", encoding="utf-8") as f:
        f.write("\n针对Tree结构进行递增的重分配\n")
        f.write("登录Tree树结构用时:{}\n".format(end - start))
    print("\n针对Tree结构进行递增的重分配")
    print("登录Tree树结构用时:{}".format(end - start))
    # Run the incremental repartition and time it.
    start = time.perf_counter()
    tree.IncrementalRepartion(dataMatrix, file_name, k=k, cluster_size=cluster_size, portion_size=portion_size)
    end = time.perf_counter()
    with open(save_name + "树的相关信息.txt", "a", encoding="utf-8") as f:
        f.write("优化时间为:{}\n".format(end - start))
    print("优化时间为:{}\n".format(end - start))
    # Persist the optimized tree.
    tree.SaveTree(save_name + "IRepartionTree")
def LabelGeneration(file_name, c=1):
    """Generate the labelled training set for the neural network.

    Loads ``<dir>/IRepartionTree``, maps every leaf node id to a dense
    cluster label, and saves two NumPy files next to the dataset:
    ``ClusterList`` (the point-index list of each leaf cluster) and
    ``train_with_label`` (pairs of [training point, cluster label]).

    Parameters
    ----------
    file_name : str
        "/"-separated path to the dataset file.
    c : int
        Number of candidate leaves requested from ``tree.search_C``;
        only the first returned leaf determines the label.
    """
    # Directory prefix of file_name, including the trailing "/".
    dir_list = file_name.split("/")
    save_name = "".join(part + "/" for part in dir_list[:-1])
    # Load the optimized cluster tree and time the load.
    tree = ItemsTree()
    start = time.perf_counter()
    tree.LoadTree(save_name + "IRepartionTree")
    end = time.perf_counter()
    # Explicit utf-8: the report contains Chinese text (see ClusterTree).
    with open(save_name + "树的相关信息.txt", "a", encoding="utf-8") as f:
        f.write("\n生成神经网络的训练集\n")
        f.write("登录IRepartionTree结构用时:{}\n".format(end - start))
    print("\n生成神经网络的训练集")
    print("登录IRepartionTree结构用时:{}".format(end - start))
    # Reset before traversal in case postOrder was already called and
    # tree.nodes still holds results from a previous run.
    tree.nodes = []
    tree.postOrder(tree.root)
    # Index the nodes by Id. Use a DISTINCT empty placeholder per slot:
    # the original shared a single Node([]) instance across the whole list,
    # which is a latent aliasing bug even though placeholders are never
    # mutated here.
    nodeList = [Node([]) for _ in range(tree.recursion + 1)]
    for node in tree.nodes:
        if not nodeList[node.Id].cpoint:
            nodeList[node.Id] = node
        else:
            print("出现重复节点,代码有误")
    dataMatrix, trainDataset, trainKnn = ut.LoadDataset(file_name)
    # Map each leaf node Id to a dense label, collecting its point indices.
    label_dict = {}
    ClusterList = []
    flag = 0
    print(len(nodeList))
    for node in nodeList:
        if node.pointIndex is not None:
            ClusterList.append(node.pointIndex)
            label_dict[str(node.Id)] = flag
            flag += 1
    print(len(label_dict.keys()))
    print(len(ClusterList))
    # Clusters have different sizes, so the list is ragged: an explicit
    # object array is required — modern NumPy raises on implicit ragged input.
    np.save(save_name + "ClusterList", np.array(ClusterList, dtype=object))
    # Label every training point with the cluster of its nearest leaf.
    train_with_label = []
    for trainPoint in tqdm(trainDataset):
        candid, leafNodes = tree.search_C(trainPoint, c)
        index = label_dict[str(leafNodes[0].Id)]
        train_with_label.append([trainPoint, index])
    np.save(save_name + "train_with_label", np.array(train_with_label, dtype=object))
def TestRepartionTree(file_name, c=1, k=10):
    """Evaluate the optimized IRepartionTree on the test set.

    Loads ``<dir>/IRepartionTree``, times a k-NN search over every test
    point, then computes recall/ratio via ``ut.ShowResult`` and appends
    everything to ``<dir>/树的相关信息.txt``.

    Parameters
    ----------
    file_name : str
        "/"-separated path to the dataset file.
    c : int
        Number of candidate leaves requested per query from ``search_C``.
    k : int
        Number of nearest neighbours to retrieve per query.
    """
    # Directory prefix of file_name, including the trailing "/".
    dir_list = file_name.split("/")
    save_name = "".join(part + "/" for part in dir_list[:-1])
    # Load the dataset.
    dataMatrix, testDataset, testKnn = ut.LoadDataset(file_name)
    # Load the optimized cluster tree and time the load.
    tree = ItemsTree()
    start = time.perf_counter()
    tree.LoadTree(save_name + "IRepartionTree")
    end = time.perf_counter()
    # All report writes happen inside this block: f must still be open for
    # the timing/recall lines written after the search loop below.
    # Explicit utf-8: the report contains Chinese text (see ClusterTree).
    with open(save_name + "树的相关信息.txt", "a", encoding="utf-8") as f:
        f.write("\n\n" + "#" * 5 + "测试IRepartionTree树的相关信息" + "#" * 5 + "\n")
        f.write("登录树结构用时:{}\n\n".format(end - start))
        print("\n" + "#" * 5 + "测试IRepartionTree树的相关信息" + "#" * 5)
        print("登录树结构用时:{}".format(end - start))
        # Time a k-NN query for every test point. When fewer than k
        # candidates come back, ask for all of them (same effect as the
        # original if/else, collapsed with min()).
        start = time.perf_counter()
        for index in range(testDataset.shape[0]):
            testPoint = testDataset[index]
            candid, leafNodes = tree.search_C(testPoint, c)
            predict, predictDistance = ut.Knn(min(k, len(candid)), testPoint, candid, dataMatrix)
        end = time.perf_counter()
        f.write("不添加重复元素的平均时间:{}\n".format((end - start) / testDataset.shape[0]))
        print("不添加重复元素的平均时间:{}".format((end - start) / testDataset.shape[0]))
        recall, ratio = ut.ShowResult(tree, dataMatrix, testDataset, testKnn,
                                      file_name.split('.')[0] + "Repartionc1nohave.txt", hava_element=False)
        f.write("不添加重复元素的召回率是:{}\n".format(recall))
        f.write("不添加重复元素的Ratio是:{}\n".format(ratio))
if __name__ == "__main__":
    # Additional datasets that can be appended to the list:
    #   "Zipf/sun/datasetKnn.hdf5", "Zipf/enron/datasetKnn.hdf5",
    #   "Zipf/nuswide/datasetKnn.hdf5", "Zipf/notre/datasetKnn.hdf5",
    #   "Zipf/sift/datasetKnn.hdf5"
    FileName = ["Zipf/audio/datasetKnn.hdf5"]
    cluster_size = 1
    for file_name in FileName:
        # Pipeline: build the initial cluster tree, optimize it with
        # incremental repartitioning, then evaluate the optimized tree.
        ClusterTree(file_name, k=10, cluster_size=cluster_size)
        OptimizeIRepartionTree(file_name, k=10,
                               cluster_size=cluster_size, portion_size=400)
        # LabelGeneration(file_name, c=1)
        TestRepartionTree(file_name, c=1, k=10)