1
- import os
1
+
2
2
import pandas as pd
3
3
import numpy as np
4
4
import matplotlib .pyplot as plt
@@ -110,8 +110,7 @@ def get_slice_feature(data):
110
110
p_10 = np .percentile (data ['Ia' ][data ['cut_points' ] == index ], 20 )
111
111
iqr .append (p_90 - p_10 )
112
112
113
- slice_feature = pd .DataFrame ({'indexs' : indexs ,
114
- 'length' : length ,
113
+ slice_feature = pd .DataFrame ({'length' : length ,
115
114
'mean_ia' : mean_ia ,
116
115
'peak_count' : peak_count ,
117
116
'accumulate_step' : accumulate_step ,
@@ -123,7 +122,17 @@ def get_slice_feature(data):
123
122
return slice_feature
124
123
125
124
126
- def labeling_slice_feature (slice_feature , features ):
125
+ def labeling_slice_feature (slice_feature , features , times , vote ):
126
+ '''
127
+ Label the slice_feature data.frame with outcomes of KMeans clustering.
128
+
129
+ Note:'times' and 'vote' paras are closely related. (1, 1)|(2, 0.5)
130
+ :param slice_feature: input data.
131
+ :param features: the features used for KMeans clustering.
132
+ :param times: this function will be used twice.
133
+ :param vote: vote the outcomes of using feature for KMeans.
134
+ :return: labeled_slice_feature
135
+ '''
127
136
labeled_slice_feature = slice_feature .copy ()
128
137
for feature in features :
129
138
label_name = '%s_label' % feature
@@ -141,29 +150,60 @@ def labeling_slice_feature(slice_feature, features):
141
150
142
151
label_names = ['%s_label' % x for x in features ]
143
152
slice_sum_label = np .sum (labeled_slice_feature [label_names ], axis = 1 )
144
- slice_sum_label [slice_sum_label > 0 ] = 1
153
+ if times == 1 :
154
+ slice_sum_label [slice_sum_label > 0 ] = 1
155
+ elif times == 2 :
156
+ slice_sum_label [slice_sum_label < vote * len (features )] = 0
157
+ slice_sum_label [slice_sum_label >= vote * len (features )] = 1
158
+ else :
159
+ print ('error' )
145
160
labeled_slice_feature ['slice_sum_label' ] = slice_sum_label
146
161
147
162
return labeled_slice_feature
148
163
149
164
150
def post_cluster(data, labeled_slice_feature):
    '''
    Broadcast every slice's label onto the rows of ``data`` that belong to it.

    A row belongs to the slice whose index value in ``labeled_slice_feature``
    equals the row's 'cut_points' value. Rows whose 'cut_points' value matches
    no slice get the label 0.

    Note: ``data`` is modified in place (a 'sum_label' column is added or
    overwritten) and the same object is returned.

    :param data: DataFrame with a 'cut_points' column.
    :param labeled_slice_feature: DataFrame with a 'slice_sum_label' column,
        indexed by the values found in data['cut_points'].
    :return: data, with a float 'sum_label' column.
    '''
    slice_labels = labeled_slice_feature['slice_sum_label']
    # Collapse duplicated slice ids to their highest label, so the lookup
    # index is unique (Series.map requires that) and a duplicated id still
    # resolves to the label an ascending pass over the labels would leave.
    slice_labels = slice_labels.groupby(level=0).max()

    # One vectorized lookup instead of building a full-length boolean mask
    # over data for every single slice.
    row_labels = data['cut_points'].map(slice_labels)

    # Unmatched rows (and slices with a missing label) default to 0; the
    # column is always stored as float.
    data['sum_label'] = row_labels.fillna(0).astype(float)

    return data
159
176
160
177
161
178
def plot_cluster(data):
    '''
    Scatter-plot 'Ia' against the row index, one colour per 'sum_label' value,
    draw a vertical line at every cut point and show the figure.

    Two labels are drawn grey/blue and three labels grey/red/blue, assigned to
    the labels in ascending order. Any other number of labels falls back to
    matplotlib's default colour cycle instead of failing on an undefined
    colour list.

    Note: the vertical lines use ``cut_points``, a name this function does not
    receive as a parameter; it must exist at module level when this is called.

    :param data: DataFrame with 'Ia' and 'sum_label' columns.
    :return: None
    '''
    # np.unique already returns its values sorted.
    labels = np.unique(data['sum_label'])

    palettes = {2: ['grey', 'blue'],
                3: ['grey', 'red', 'blue']}
    colors = palettes.get(len(labels))
    if colors is None:
        # 'C0', 'C1', ... index into matplotlib's default property cycle.
        colors = ['C%d' % i for i in range(len(labels))]

    for label, color in zip(labels, colors):
        in_label = data['sum_label'] == label
        plt.scatter(data.index[in_label].values,
                    data.loc[in_label, 'Ia'].values,
                    s=2, color=color)

    plt.vlines(cut_points, 0, np.max(data['Ia']), linewidth=0.5, color='g')
    plt.show()
193
+
194
+
195
def post_cluster_2(data, labeled_slice_feature, times, vote):
    '''
    Run a second labeling round on the slices labeled 0 in the first round,
    combine both rounds into one label per slice and project it onto data.

    The combined label is the sum of both rounds. Slices that did not take
    part in the second round count as 1 there, so with 0/1 labels from the
    first round the result is 2 for slices labeled 1 in the first round, and
    0 or 1 (the second-round label) for the rest.

    Note: this function reads the module-level name ``features`` (it is not a
    parameter). The caller's labeled_slice_feature is left untouched; data is
    passed on to post_cluster.

    :param data: per-row DataFrame handed to post_cluster.
    :param labeled_slice_feature: first-round result holding a
        'slice_sum_label' column and the feature columns.
    :param times: forwarded to labeling_slice_feature.
    :param vote: forwarded to labeling_slice_feature.
    :return: the result of post_cluster(data, <combined labels>).
    '''
    # Work on a copy so the caller's first-round labels are not overwritten;
    # otherwise a repeated call would start from already-merged labels.
    combined = labeled_slice_feature.copy()

    rest_slice_feature = combined.loc[combined['slice_sum_label'] == 0, features]
    rest_labeled_slice_feature = labeling_slice_feature(rest_slice_feature, features, times, vote)

    label_1 = combined[['slice_sum_label']]
    label_2 = rest_labeled_slice_feature[['slice_sum_label']]
    label_12 = pd.merge(label_1, label_2, how='outer', left_index=True, right_index=True)
    # Slices missing from the second round have no second label: count as 1.
    label_12 = label_12.fillna(1)
    combined['slice_sum_label'] = label_12.sum(axis=1)

    return post_cluster(data, combined)
167
207
168
208
169
209
if __name__ == '__main__' :
@@ -182,14 +222,12 @@ def plot_cluster(data):
182
222
features = ['var' , 'peak_count_ave' , 'accumulate_step_ave' , 'iqr' ]
183
223
184
224
slice_feature = get_slice_feature (data )
185
- labeled_slice_feature = labeling_slice_feature (slice_feature , features )
186
- data = cluster_work_others (data , labeled_slice_feature )
225
+ labeled_slice_feature = labeling_slice_feature (slice_feature , features , 1 , 1 )
226
+ data = post_cluster (data , labeled_slice_feature )
187
227
plot_cluster (data )
188
228
189
-
190
-
191
-
192
-
229
+ data_2 = post_cluster_2 (data , labeled_slice_feature , 2 , 0.5 )
230
+ plot_cluster (data_2 )
193
231
194
232
195
233
0 commit comments