import csv
import math
import time

import numpy as np
import pandas as pd
6
class FrechetDistance(object):
    """Compute discrete Frechet distances between developer commit-count
    time series stored in CSV files, and write similarity tables back out
    as CSV next to the inputs.
    """

    def __init__(self) -> None:
        super().__init__()

    def get_frechet_distance(self, directory_name):
        """Run the full pipeline for one repository directory.

        Reads ``commitday.csv`` and ``commitweek.csv`` under
        *directory_name*, appends a per-column "Summary" row, extracts the
        top committers, and writes one-vs-rest Frechet similarity files
        (normalized and raw) into the same directory.
        """
        start = time.time()  # measure total running time
        topnum = 50  # number of top committers kept by SumCSV
        directpath = directory_name

        # --- per-day series ---
        filename = directpath + "commitday.csv"
        tmpfilename = directpath + "sum_commitday.csv"
        topfilename = directpath + "frechet_topday.csv"
        self.SumCSV(filename, tmpfilename, topfilename, topnum)
        filename = tmpfilename
        outname = directpath
        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide_day.csv', True, True)
        # The un-normalized run keeps absolute distances (series that are
        # similar after normalization AND have large absolute values).
        self.getDistanceToAll(filename, outname + 'OvR_Divide_day.csv', False, True)

        # --- per-week series ---
        filename = directpath + "commitweek.csv"
        tmpfilename = directpath + "sum_commitweek.csv"
        topfilename = directpath + "frechet_topweek.csv"
        self.SumCSV(filename, tmpfilename, topfilename, topnum)
        filename = tmpfilename
        outname = directpath
        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide_week.csv', True, True)
        self.getDistanceToAll(filename, outname + 'OvR_Divide_week.csv', False, True)

        end = time.time()
        print("运行时间:" + str(end - start) + "s")

    # Euclidean distance between two 2-D points.
    def euc_dist(self, pt1, pt2):
        return math.sqrt((pt2[0] - pt1[0]) * (pt2[0] - pt1[0])
                         + (pt2[1] - pt1[1]) * (pt2[1] - pt1[1]))

    def _c(self, ca, i, j, P, Q):
        """Recursive helper for the discrete Frechet distance, memoized in
        the numpy table *ca* (initialized to -1 for "not yet computed").

        BUG FIX: the original called ``euc_dist`` and ``_c`` without
        ``self.`` and raised NameError on first use.
        """
        if ca[i, j] > -1:
            return ca[i, j]
        elif i == 0 and j == 0:
            ca[i, j] = self.euc_dist(P[0], Q[0])
        elif i > 0 and j == 0:
            ca[i, j] = max(self._c(ca, i - 1, 0, P, Q), self.euc_dist(P[i], Q[0]))
        elif i == 0 and j > 0:
            ca[i, j] = max(self._c(ca, 0, j - 1, P, Q), self.euc_dist(P[0], Q[j]))
        elif i > 0 and j > 0:
            ca[i, j] = max(min(self._c(ca, i - 1, j, P, Q),
                               self._c(ca, i - 1, j - 1, P, Q),
                               self._c(ca, i, j - 1, P, Q)),
                           self.euc_dist(P[i], Q[j]))
        else:
            ca[i, j] = float("inf")
        return ca[i, j]

    def dis(self, ca, x, y, P, Q):
        """Iterative (bottom-up) equivalent of :meth:`_c`.

        *x* and *y* are the indices of the LAST points of P and Q (as
        passed by frechetDist); the table is filled for 0..x and 0..y.

        BUG FIX: the original looped ``range(x)``/``range(y)``, so the last
        row and column were never filled and the return value relied on
        leaked loop variables (NameError for single-point inputs); the
        cache-hit branch also did an early ``return`` instead of skipping.
        """
        for i in range(x + 1):
            for j in range(y + 1):
                if ca[i, j] > -1:
                    continue  # already computed
                elif i == 0 and j == 0:
                    ca[i, j] = self.euc_dist(P[0], Q[0])
                elif i > 0 and j == 0:
                    ca[i, j] = max(ca[i - 1, 0], self.euc_dist(P[i], Q[0]))
                elif i == 0 and j > 0:
                    ca[i, j] = max(ca[0, j - 1], self.euc_dist(P[0], Q[j]))
                else:  # i > 0 and j > 0
                    ca[i, j] = max(min(ca[i - 1, j], ca[i - 1, j - 1], ca[i, j - 1]),
                                   self.euc_dist(P[i], Q[j]))
        return ca[x, y]

    def frechetDist(self, P, Q):
        """Discrete Frechet distance between two polygonal lines.

        Algorithm: http://www.kr.tuwien.ac.at/staff/eiter/et-archive/cdtr9464.pdf
        P and Q are sequences of 2-element points.
        """
        ca = np.multiply(np.ones((len(P), len(Q))), -1)
        return self.dis(ca, len(P) - 1, len(Q) - 1, P, Q)

    def loadCSV(self, filename, isNormalize=True):
        """Load a summed commit CSV produced by :meth:`SumCSV`.

        Drops the first (date) column and the trailing "Summary" row.  When
        *isNormalize* is True, each column is divided by its summary total
        and scaled by 1000 (per-mille share of that column's commits).

        Returns ``(data, column_headers)`` with headers excluding the date
        column.
        """
        df = pd.read_csv(filename)
        data = np.array(df.loc[:, :])
        data = data[:, 1:data.shape[1]]  # drop the date column

        if isNormalize:
            for i in range(data.shape[1]):
                # The "Summary" row (last row) holds the column total.
                # (renamed from `sum`, which shadowed the builtin)
                total = data[data.shape[0] - 1, i]
                print("sum=" + str(total))
                for j in range(data.shape[0] - 1):
                    data[j, i] = data[j, i] / total
                    data[j, i] = data[j, i] * 1000
        data = data[0:data.shape[0] - 1, :]  # drop the "Summary" row
        column_headers = list(df.columns.values)  # header labels, used for indexing
        column_headers = column_headers[1:len(column_headers)]
        return data, column_headers

    def getDistanceToAll(self, filename, savename, isNormalize=True, isDivide=True):
        """Frechet distance between every author column and the overall
        ("All") column, written as a single-row CSV to *savename*.

        When *isDivide* is True the value written is ``1/distance`` (a
        similarity: larger means more alike); a zero distance is written
        as 1.
        """
        print("计算各个人和总体之间的FreChet距离")
        if isNormalize:
            print("--------------有归一化:")
        else:
            print("--------------没有有归一化:")
        dataSet, dataHeader = self.loadCSV(filename, isNormalize)
        author_cnts = len(dataHeader) - 1  # first header is the "All" column
        disMatrix = np.zeros([1, author_cnts])

        # Build one polyline per column: (time-index, value) pairs.
        PQ_List = []
        for i in range(len(dataHeader)):
            P = []
            for j in range(dataSet.shape[0]):
                P.append((j, dataSet[j, i]))
            PQ_List.append(P)

        print("一共有" + str(author_cnts) + "个Commit次数超过100的开发者")
        for i in range(author_cnts):
            print("处理第" + str(i) + "个author:" + dataHeader[i + 1])
            disMatrix[0, i] = self.frechetDist(PQ_List[0], PQ_List[i + 1])
            if isDivide:
                # Reciprocal so that larger means more similar.
                if disMatrix[0, i] != 0:
                    disMatrix[0, i] = 1 / disMatrix[0, i]
                else:
                    disMatrix[0, i] = 1
            print(dataHeader[0] + " Vs " + dataHeader[i + 1] + ",相似距离(值越小,越相似)为:" + str(disMatrix[0, i]))

        # Write the result row to CSV (with ensures the handle is closed
        # even if a write fails; the original leaked it on error).
        with open(savename, 'w', newline='') as out:
            csv_write = csv.writer(out, dialect='excel')
            header = dataHeader.copy()
            header[0] = " "  # blank out the corner cell
            csv_write.writerow(header)
            output = disMatrix[0].tolist()
            output.insert(0, "All")
            csv_write.writerow(output)

    def getDistanceMatrix(self, filename, savename, isNormalize=True, isDivide=True):
        """Pairwise Frechet distance matrix over all columns ("All" plus
        every author), written to *savename*.

        BUG FIX: ``loadCSV`` and ``frechetDist`` were called without
        ``self.`` and raised NameError.
        """
        print("计算" + filename + "的FreChet距离")
        if isNormalize:
            print("--------------有归一化:")
        else:
            print("--------------没有有归一化:")

        # Load the data.
        dataSet, dataHeader = self.loadCSV(filename, isNormalize)
        # Number of developers (excluding the "All" column).
        author_cnts = len(dataHeader) - 1
        print("一共有" + str(author_cnts) + "个Commit次数超过100的开发者")

        with open(savename, 'w', newline='') as out:
            csv_write = csv.writer(out, dialect='excel')
            header = dataHeader.copy()
            header[0] = " "
            csv_write.writerow(header)

            # Symmetric distance matrix; only the upper triangle is computed.
            output = np.zeros((len(dataHeader), len(dataHeader)))
            for i in range(len(dataHeader)):
                P = []  # reference polyline
                for j in range(dataSet.shape[0]):
                    P.append((j, dataSet[j, i]))
                for j in range(i + 1, len(dataHeader)):
                    Q = []
                    for k in range(dataSet.shape[0]):
                        Q.append((k, dataSet[k, j]))
                    res = self.frechetDist(P, Q)
                    # Reciprocal so that larger means more similar.
                    if isDivide:
                        if res == 0:
                            res = 0
                        else:
                            res = 1 / res
                    print(dataHeader[i] + " Vs " + dataHeader[j] + ",相似距离(值越小,越相似)为:" + str(res))
                    output[i, j] = res
                    output[j, i] = res
            output = output.tolist()
            # NOTE(review): row 0 ("All") is intentionally not written,
            # matching the original behavior.
            for i in range(1, len(dataHeader)):
                output[i][0] = dataHeader[i]
                csv_write.writerow(output[i])

    def SumCSV(self, filename, outputfile, topfile, topnum):
        """Copy *filename* to *outputfile*, append a "Summary" row with
        per-column totals, and write the *topnum* most active authors to
        *topfile* as ``rank_index,name,commit_total`` lines.

        Column layout assumed by the +1/+2 offsets below: column 0 is the
        date, column 1 an aggregate ("All"), authors start at column 2 —
        TODO confirm against the files produced upstream.
        """
        array = []
        head = []
        with open(filename, 'r') as f:
            with open(outputfile, 'w') as out:
                line = f.readline()
                out.write(line)
                head = line.strip()
                head = head.split(',')
                length = len(head)
                # One counter per non-date column (array is one shorter than head).
                array = [0 for i in range(1, length)]
                for line in f.readlines():
                    out.write(line)
                    line = line.strip()
                    line = line.split(',')
                    for i in range(1, len(line)):
                        array[i - 1] += int(line[i])
                out.write("Summary")
                for i in range(0, len(array)):
                    out.write("," + str(array[i]))
        # Rank author columns (array[1:] skips the aggregate) by total commits.
        index = np.argsort(array[1:])
        with open(topfile, 'w') as top:
            indexlen = len(index)
            if topnum >= indexlen:
                topnum = indexlen
            for i in range(topnum):
                name = head[index[indexlen - 1 - i] + 2]
                commitnum = array[index[indexlen - 1 - i] + 1]
                top.write(str(index[indexlen - 1 - i]) + ',' + str(name) + "," + str(commitnum) + "\n ")

    def GenerateFunction(self):
        """Legacy driver with hard-coded alluxio paths, kept for reference.

        BUG FIX: all helpers were called without ``self.`` (NameError) and
        ``SumCSV`` was called with only two of its four required arguments
        (TypeError); top-committer output paths and topnum=50 are now
        supplied.
        """
        start = time.time()  # measure total running time

        # per-day series
        filename = 'files/commitday.csv'
        tmpfilename = 'files/sum_commitday.csv'
        self.SumCSV(filename, tmpfilename, 'files/frechet_topday.csv', 50)
        filename = tmpfilename
        outname = 'outcomes/alluxio/alluxio'
        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide.csv', True, True)
        # Un-normalized run keeps absolute distances.
        self.getDistanceToAll(filename, outname + 'OvR_Divide.csv', False, True)
        self.getDistanceMatrix(filename, outname + 'RvR_Normal_Divide.csv', True, True)
        self.getDistanceMatrix(filename, outname + 'RvR_Divide.csv', False, True)

        # per-week series
        filename = 'files/commitweek.csv'
        tmpfilename = 'files/sum_commitweek.csv'
        self.SumCSV(filename, tmpfilename, 'files/frechet_topweek.csv', 50)
        filename = tmpfilename
        outname = 'outcomes/alluxio_original/alluxio_original'
        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide.csv', True, True)
        self.getDistanceToAll(filename, outname + 'OvR_Divide.csv', False, True)
        self.getDistanceMatrix(filename, outname + 'RvR_Normal_Divide.csv', True, True)
        self.getDistanceMatrix(filename, outname + 'RvR_Divide.csv', False, True)

        end = time.time()
        print("运行时间:" + str(end - start) + "s")