Skip to content

Commit 4f0e5f9

Browse files
author
东凡
committed
repeat the iterated clustering twice for better performance
1 parent 27fa33b commit 4f0e5f9

File tree

1 file changed

+56
-18
lines changed

1 file changed

+56
-18
lines changed

elec_cut.py

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os
1+
22
import pandas as pd
33
import numpy as np
44
import matplotlib.pyplot as plt
@@ -110,8 +110,7 @@ def get_slice_feature(data):
110110
p_10 = np.percentile(data['Ia'][data['cut_points'] == index], 20)
111111
iqr.append(p_90 - p_10)
112112

113-
slice_feature = pd.DataFrame({'indexs': indexs,
114-
'length': length,
113+
slice_feature = pd.DataFrame({'length': length,
115114
'mean_ia': mean_ia,
116115
'peak_count': peak_count,
117116
'accumulate_step': accumulate_step,
@@ -123,7 +122,17 @@ def get_slice_feature(data):
123122
return slice_feature
124123

125124

126-
def labeling_slice_feature(slice_feature, features):
125+
def labeling_slice_feature(slice_feature, features, times, vote):
126+
'''
127+
Label the slice_feature data.frame with outcomes of KMeans clustering.
128+
129+
Note: the 'times' and 'vote' parameters are closely related; the pairs used are (1, 1) for the first pass and (2, 0.5) for the second.
130+
:param slice_feature: input data.
131+
:param features: the features used for KMeans clustering.
132+
:param times: which clustering pass this call is: 1 for the first pass, 2 for the second.
133+
:param vote: threshold fraction used when times == 2: a slice is labeled 1 if the sum of its per-feature labels is at least vote * len(features), otherwise 0.
134+
:return: labeled_slice_feature
135+
'''
127136
labeled_slice_feature = slice_feature.copy()
128137
for feature in features:
129138
label_name = '%s_label' % feature
@@ -141,29 +150,60 @@ def labeling_slice_feature(slice_feature, features):
141150

142151
label_names = ['%s_label' % x for x in features]
143152
slice_sum_label = np.sum(labeled_slice_feature[label_names], axis=1)
144-
slice_sum_label[slice_sum_label > 0] = 1
153+
if times == 1:
154+
slice_sum_label[slice_sum_label > 0] = 1
155+
elif times == 2:
156+
slice_sum_label[slice_sum_label < vote * len(features)] = 0
157+
slice_sum_label[slice_sum_label >= vote * len(features)] = 1
158+
else:
159+
print('error')
145160
labeled_slice_feature['slice_sum_label'] = slice_sum_label
146161

147162
return labeled_slice_feature
148163

149164

150-
def cluster_work_others(data, labeled_slice_feature):
165+
def post_cluster(data, labeled_slice_feature):
    '''
    Broadcast each slice's cluster label onto the rows of ``data``.

    :param data: DataFrame with a 'cut_points' column whose values are matched
        against the index of ``labeled_slice_feature``. Modified in place: a
        'sum_label' column is added (or overwritten).
    :param labeled_slice_feature: DataFrame with a 'slice_sum_label' column,
        indexed by slice identifier.
    :return: the same ``data`` object, now carrying 'sum_label'. Rows whose
        cut point matches no slice keep the default label 0.
    '''
    slice_labels = labeled_slice_feature['slice_sum_label']
    row_slices = data['cut_points']
    the_labels = np.zeros(data.shape[0])

    # One vectorised membership test per distinct label instead of one
    # whole-column comparison per slice; labels are visited in ascending
    # order, exactly as before, so the resulting array is unchanged.
    for label in np.unique(slice_labels):
        slice_ids = labeled_slice_feature.index[slice_labels == label]
        the_labels[row_slices.isin(slice_ids).to_numpy()] = label
    data['sum_label'] = the_labels

    return data
159176

160177

161178
def plot_cluster(data):
    '''
    Scatter-plot 'Ia' against the row index, one colour per cluster label,
    draw a vertical line at every cut point, and show the figure.

    Colours are assigned by the position of each label in ascending order:
    two labels use grey/blue, three labels use grey/red/blue. For any other
    number of labels 'error' is printed and matplotlib's default colours are
    used, where the code previously failed with a NameError.

    :param data: DataFrame with 'sum_label' and 'Ia' columns.
    :return: None

    NOTE(review): ``cut_points`` is not a parameter; it is read from module
    scope and must be defined before this function is called.
    '''
    # np.unique already returns its values sorted, so no extra sort is needed.
    labels = np.unique(data['sum_label'])
    palettes = {2: ['grey', 'blue'],
                3: ['grey', 'red', 'blue']}
    colors = palettes.get(len(labels))
    if colors is None:
        print('error')
        # color=None lets matplotlib pick from its default colour cycle.
        colors = [None] * len(labels)

    for label, color in zip(labels, colors):
        in_cluster = data['sum_label'] == label
        plt.scatter(data.index[in_cluster].values,
                    data.loc[in_cluster, 'Ia'].values,
                    s=2, color=color)
    plt.vlines(cut_points, 0, np.max(data['Ia']), linewidth=0.5, color='g')
    plt.show()
193+
194+
195+
def post_cluster_2(data, labeled_slice_feature, times, vote):
    '''
    Run a second clustering pass on the slices the first pass labeled 0 and
    write the combined labels onto ``data``.

    Combined slice label = first-pass label + second-pass label, where slices
    that did not take part in the second pass count as 1. So a slice labeled
    1 by the first pass ends up as 2, and the remaining slices end up as 0 or
    1 according to the second pass.

    :param data: row-level DataFrame, forwarded to ``post_cluster`` (which
        adds/overwrites its 'sum_label' column in place).
    :param labeled_slice_feature: first-pass output carrying a
        'slice_sum_label' column. It is copied, so the caller's frame is not
        modified and repeated calls give the same result.
    :param times: forwarded to ``labeling_slice_feature``.
    :param vote: forwarded to ``labeling_slice_feature``.
    :return: the result of ``post_cluster`` for the combined labels.

    NOTE(review): ``features`` is not a parameter; it is read from module
    scope and must be defined before this function is called.
    '''
    # Work on a copy: overwriting the caller's 'slice_sum_label' in place made
    # a second call on the same frame compound the labels.
    labeled = labeled_slice_feature.copy()
    first_pass = labeled['slice_sum_label']

    rest_slice_feature = labeled.loc[first_pass == 0, features]
    rest_labeled = labeling_slice_feature(rest_slice_feature, features, times, vote)

    # Align the second-pass labels with every slice; slices absent from the
    # second pass (already labeled by the first) contribute 1.
    second_pass = rest_labeled['slice_sum_label'].reindex(labeled.index).fillna(1)
    labeled['slice_sum_label'] = first_pass + second_pass

    return post_cluster(data, labeled)
167207

168208

169209
if __name__ == '__main__':
@@ -182,14 +222,12 @@ def plot_cluster(data):
182222
features = ['var', 'peak_count_ave', 'accumulate_step_ave', 'iqr']
183223

184224
slice_feature = get_slice_feature(data)
185-
labeled_slice_feature = labeling_slice_feature(slice_feature, features)
186-
data = cluster_work_others(data, labeled_slice_feature)
225+
labeled_slice_feature = labeling_slice_feature(slice_feature, features, 1, 1)
226+
data = post_cluster(data, labeled_slice_feature)
187227
plot_cluster(data)
188228

189-
190-
191-
192-
229+
data_2 = post_cluster_2(data, labeled_slice_feature, 2, 0.5)
230+
plot_cluster(data_2)
193231

194232

195233

0 commit comments

Comments
 (0)