
Commit 3f70db8

committed
frechet-distance version is finished
1 parent 363213d commit 3f70db8

File tree

.vscode/settings.json
FrechetDistance.py
JavaGitMiner.py
repoDB_Options.py

4 files changed: +322 −13 lines changed

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/usr/bin/python3"
+}

FrechetDistance.py

Lines changed: 263 additions & 3 deletions
@@ -1,7 +1,267 @@
-
+import numpy as np
+import pandas as pd
+import time
+import csv
+import math
 class FrechetDistance(object):
     def __init__(self) -> None:
         super().__init__()

-    def get_frechet_distance(self, repo_name):
-        return 0
+
+    def get_frechet_distance(self, directory_name):
+        start = time.time()  # measure total runtime
+        topnum = 50
+        directpath = directory_name
+        filename = directpath + "commitday.csv"
+        '''per-day granularity'''
+        tmpfilename = directpath + "sum_commitday.csv"
+        topfilename = directpath + "frechet_topday.csv"
+        self.SumCSV(filename, tmpfilename, topfilename, topnum)
+        filename = tmpfilename
+        outname = directpath
+        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide_day.csv', True, True)
+        self.getDistanceToAll(filename, outname + 'OvR_Divide_day.csv', False, True)  # no normalization here, to keep absolute distances (similar after normalization && large raw values before it)
+        # self.getDistanceMatrix(filename, outname + 'RvR_Normal_Divide.csv', True, True)
+        # self.getDistanceMatrix(filename, outname + 'RvR_Divide.csv', False, True)
+
+        '''per-week granularity'''
+        filename = directpath + "commitweek.csv"
+        tmpfilename = directpath + "sum_commitweek.csv"
+        topfilename = directpath + "frechet_topweek.csv"
+        self.SumCSV(filename, tmpfilename, topfilename, topnum)
+        filename = tmpfilename
+        outname = directpath
+        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide_week.csv', True, True)
+        self.getDistanceToAll(filename, outname + 'OvR_Divide_week.csv', False, True)  # no normalization here, to keep absolute distances
+        # self.getDistanceMatrix(filename, outname + 'RvR_Normal_Divide.csv', True, True)
+        # self.getDistanceMatrix(filename, outname + 'RvR_Divide.csv', False, True)
+
+        end = time.time()
+        print("Runtime: " + str(end - start) + "s")
+
+
+    # Everything below was ported from the original standalone Python script.
+    # Euclidean distance.
+    def euc_dist(self, pt1, pt2):
+        return math.sqrt((pt2[0] - pt1[0]) * (pt2[0] - pt1[0]) + (pt2[1] - pt1[1]) * (pt2[1] - pt1[1]))
+
+    def _c(self, ca, i, j, P, Q):
+        if ca[i, j] > -1:
+            return ca[i, j]
+        elif i == 0 and j == 0:
+            ca[i, j] = self.euc_dist(P[0], Q[0])
+        elif i > 0 and j == 0:
+            ca[i, j] = max(self._c(ca, i - 1, 0, P, Q), self.euc_dist(P[i], Q[0]))
+        elif i == 0 and j > 0:
+            ca[i, j] = max(self._c(ca, 0, j - 1, P, Q), self.euc_dist(P[0], Q[j]))
+        elif i > 0 and j > 0:
+            ca[i, j] = max(min(self._c(ca, i - 1, j, P, Q), self._c(ca, i - 1, j - 1, P, Q), self._c(ca, i, j - 1, P, Q)), self.euc_dist(P[i], Q[j]))
+        else:
+            ca[i, j] = float("inf")
+        return ca[i, j]
+
+    def dis(self, ca, x, y, P, Q):
+        # iterative DP that fills the coupling matrix up to and including (x, y)
+        for i in range(x + 1):
+            for j in range(y + 1):
+                if ca[i, j] > -1:
+                    continue
+                elif i == 0 and j == 0:
+                    ca[i, j] = self.euc_dist(P[0], Q[0])
+                elif i > 0 and j == 0:
+                    ca[i, j] = max(ca[i - 1, 0], self.euc_dist(P[i], Q[0]))
+                elif i == 0 and j > 0:
+                    ca[i, j] = max(ca[0, j - 1], self.euc_dist(P[0], Q[j]))
+                elif i > 0 and j > 0:
+                    A = ca[i - 1, j]
+                    B = ca[i - 1, j - 1]
+                    C = ca[i, j - 1]
+                    ca[i, j] = max(min(A, B, C), self.euc_dist(P[i], Q[j]))
+                else:
+                    ca[i, j] = float("inf")
+        return ca[x, y]
+
+    """
+    Computes the discrete Frechet distance between two polygonal lines.
+    Algorithm: http://www.kr.tuwien.ac.at/staff/eiter/et-archive/cdtr9464.pdf
+    P and Q are arrays of 2-element arrays (points).
+    """
+    def frechetDist(self, P, Q):
+        ca = np.ones((len(P), len(Q)))
+        ca = np.multiply(ca, -1)
+        # print(ca)
+        # return self._c(ca, len(P) - 1, len(Q) - 1, P, Q)
+        return self.dis(ca, len(P) - 1, len(Q) - 1, P, Q)
+
+
+    def loadCSV(self, filename, isNormalize=True):
+        df = pd.read_csv(filename)  # by default this reads the first sheet of the workbook
+        data = np.array(df.loc[:, :])  # main data, including the summary row
+        # --- data cleaning: normalize first
+        data = data[:, 1:data.shape[1]]
+
+        if isNormalize:
+            for i in range(data.shape[1]):
+                sum = data[data.shape[0] - 1, i]
+                print("sum=" + str(sum))
+                for j in range(data.shape[0] - 1):
+                    data[j, i] = data[j, i] / sum
+                    if isNormalize:
+                        data[j, i] = data[j, i] * 1000
+        data = data[0:data.shape[0] - 1, :]
+        column_headers = list(df.columns.values)  # column headers, used for indexing
+        column_headers = column_headers[1:column_headers.__len__()]
+        # print(column_headers)
+        return data, column_headers
+
+
+    def getDistanceToAll(self, filename, savename, isNormalize=True, isDivide=True):
+        print("Computing the Frechet distance between each contributor and the overall series")
+        if isNormalize:
+            print("-------------- with normalization:")
+        else:
+            print("-------------- without normalization:")
+        dataSet, dataHeader = self.loadCSV(filename, isNormalize)
+        author_cnts = dataHeader.__len__() - 1
+        disMatrix = np.zeros([1, author_cnts])
+        standard = dataSet[:, 0]
+        PQ_List = []
+        for i in range(dataHeader.__len__()):
+            P = []
+            for j in range(dataSet.shape[0]):
+                P.append((j, dataSet[j, i]))
+            PQ_List.append(P)
+        print("There are " + str(author_cnts) + " developers with more than 100 commits")
+        for i in range(author_cnts):
+            print("Processing author " + str(i) + ": " + dataHeader[i + 1])
+            disMatrix[0, i] = self.frechetDist(PQ_List[0], PQ_List[i + 1])
+            if isDivide:
+                if disMatrix[0, i] != 0:
+                    disMatrix[0, i] = 1 / disMatrix[0, i]
+                else:
+                    disMatrix[0, i] = 1
+            print(dataHeader[0] + " Vs " + dataHeader[i + 1] + ", similarity distance (smaller means more similar): " + str(disMatrix[0, i]))
+
+        # write the results to a CSV file
+        out = open(savename, 'w', newline='')
+        # set up the CSV writer
+        csv_write = csv.writer(out, dialect='excel')
+        # write the rows
+        header = dataHeader.copy()
+        # header.insert(0, " ")
+        header[0] = " "
+        csv_write.writerow(header)
+        output = disMatrix[0].tolist()
+        output.insert(0, "All")
+        csv_write.writerow(output)
+
+        out.close()
+
+    def getDistanceMatrix(self, filename, savename, isNormalize=True, isDivide=True):
+
+        print("Computing the Frechet distance matrix for " + filename)
+        if isNormalize:
+            print("-------------- with normalization:")
+        else:
+            print("-------------- without normalization:")
+
+        # load the data
+        dataSet, dataHeader = self.loadCSV(filename, isNormalize)
+        # number of developers (excluding All)
+        author_cnts = dataHeader.__len__() - 1
+        print("There are " + str(author_cnts) + " developers with more than 100 commits")
+
+        # write the results to a CSV file
+        out = open(savename, 'w', newline='')
+        # set up the CSV writer
+        csv_write = csv.writer(out, dialect='excel')
+        # write the rows
+        header = dataHeader.copy()
+        header[0] = " "
+        csv_write.writerow(header)
+
+        # the output distance matrix
+        output = np.zeros((dataHeader.__len__(), dataHeader.__len__()))
+        for i in range(dataHeader.__len__()):
+            P = []  # reference curve
+            for j in range(dataSet.shape[0]):
+                P.append((j, dataSet[j, i]))
+            for j in range(i + 1, dataHeader.__len__()):
+                Q = []
+                for k in range(dataSet.shape[0]):
+                    Q.append((k, dataSet[k, j]))
+                res = self.frechetDist(P, Q)
+                '''take the reciprocal here'''
+                if isDivide:
+                    if res == 0:
+                        res = 0
+                    else:
+                        res = 1 / res
+                print(dataHeader[i] + " Vs " + dataHeader[j] + ", similarity distance (smaller means more similar): " + str(res))
+                output[i, j] = res
+                output[j, i] = res
+        output = output.tolist()
+        for i in range(1, dataHeader.__len__()):
+            output[i][0] = dataHeader[i]
+            csv_write.writerow(output[i])
+        out.close()
+
+
+    def SumCSV(self, filename, outputfile, topfile, topnum):
+        array = []
+        head = []
+        with open(filename, 'r') as f:
+            with open(outputfile, 'w') as out:
+                line = f.readline()
+                out.write(line)
+                head = line.strip()
+                head = head.split(',')
+                length = len(head)
+                array = [0 for i in range(1, length)]  # array has one column fewer than head
+                for line in f.readlines():
+                    out.write(line)
+                    line = line.strip()
+                    line = line.split(',')
+                    for i in range(1, len(line)):
+                        array[i - 1] += int(line[i])
+                out.write("Summary")
+                for i in range(0, len(array)):
+                    out.write("," + str(array[i]))
+        index = np.argsort(array[1:])
+        with open(topfile, 'w') as top:
+            indexlen = len(index)
+            if topnum >= indexlen:
+                topnum = indexlen
+            for i in range(topnum):
+                name = head[index[indexlen - 1 - i] + 2]
+                commitnum = array[index[indexlen - 1 - i] + 1]
+                top.write(str(index[indexlen - 1 - i]) + ',' + str(name) + "," + str(commitnum) + "\n")
+
+    def GenerateFunction(self):
+        start = time.time()  # measure total runtime
+
+        '''per-day granularity'''
+        # filename = 'files/alluxio.csv'
+        filename = 'files/commitday.csv'
+        tmpfilename = 'files/sum_commitday.csv'
+        self.SumCSV(filename, tmpfilename, 'files/frechet_topday.csv', 50)  # topfile path and topnum assumed here to satisfy SumCSV's signature
+        filename = tmpfilename
+        outname = 'outcomes/alluxio/alluxio'
+        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide.csv', True, True)
+        self.getDistanceToAll(filename, outname + 'OvR_Divide.csv', False, True)  # no normalization, to keep absolute distances (similar after normalization && large raw values before it)
+        self.getDistanceMatrix(filename, outname + 'RvR_Normal_Divide.csv', True, True)
+        self.getDistanceMatrix(filename, outname + 'RvR_Divide.csv', False, True)
+
+        '''per-week granularity'''
+        # filename = 'files/alluxio_original.csv'
+        filename = 'files/commitweek.csv'
+        tmpfilename = 'files/sum_commitweek.csv'
+        self.SumCSV(filename, tmpfilename, 'files/frechet_topweek.csv', 50)  # topfile path and topnum assumed here as well
+        filename = tmpfilename
+        outname = 'outcomes/alluxio_original/alluxio_original'
+        self.getDistanceToAll(filename, outname + 'OvR_Normal_Divide.csv', True, True)
+        self.getDistanceToAll(filename, outname + 'OvR_Divide.csv', False, True)  # no normalization, to keep absolute distances
+        self.getDistanceMatrix(filename, outname + 'RvR_Normal_Divide.csv', True, True)
+        self.getDistanceMatrix(filename, outname + 'RvR_Divide.csv', False, True)
+
+        end = time.time()
+        print("Runtime: " + str(end - start) + "s")

JavaGitMiner.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import os
22
import jpype
33
from FrechetDistance import FrechetDistance
4+
import repoDB_Options
5+
import getpass
6+
47

58
class GitMiner():
69
def __init__(self, jar_path = 'GitMiner-1.0-SNAPSHOT.jar') -> None:
@@ -11,9 +14,16 @@ def __init__(self, jar_path = 'GitMiner-1.0-SNAPSHOT.jar') -> None:
1114
jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % self.jar_path)
1215
self.frechet = FrechetDistance()
1316

14-
def get_repo_name_from_url(self,url):
17+
def get_path_prefix_from_url(self,url):
1518
repo_name = ""
16-
return repo_name
19+
stringlist = url.split("/")
20+
tmp_name = stringlist[len(stringlist) - 1]
21+
tmp_name = tmp_name.split('.')
22+
repo_name = tmp_name[0]
23+
print("this repo_name is" + repo_name)
24+
db = repoDB_Options.repoDB_Options()
25+
prefix = str(db.get_repo_path_prefix(repo_name))
26+
return prefix
1727

1828
def git_clone(self, url):
1929
# 引入java程序中的类.路径应该是项目中的package包路径.类名
@@ -23,14 +33,15 @@ def git_clone(self, url):
2333
# 执行类中的函数了
2434
res = mdg.generateNew(url)
2535
print(res)
26-
repo_name = self.get_repo_name_from_url(url)
27-
self.frechet.get_frechet_distance(repo_name)
36+
path_prefix = self.get_path_prefix_from_url(url)
37+
self.frechet.get_frechet_distance(path_prefix)
2838
return res
2939

3040

3141
if __name__ == '__main__':
3242
git_miner = GitMiner()
33-
res = git_miner.git_clone(url="git@github.com:OpenSrcRepoDataMining/alluxio.git")
43+
#res = git_miner.git_clone(url="git@github.com:OpenSrcRepoDataMining/alluxio.git")
44+
res = git_miner.git_clone(url="git@github.com:njubigdata04/InvertedIndexWithHbase.git")
3445
print(res)
3546

3647
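
As a side note, the URL-to-repo-name step of get_path_prefix_from_url can be illustrated on its own. The helper below is hypothetical and keeps only the split logic shown in the hunk; the repoDB_Options lookup that turns the repo name into a path prefix is omitted.

# Hypothetical standalone helper mirroring the split logic in get_path_prefix_from_url.
def repo_name_from_url(url):
    last_segment = url.split("/")[-1]   # e.g. "InvertedIndexWithHbase.git"
    return last_segment.split('.')[0]   # strip the ".git" suffix

print(repo_name_from_url("git@github.com:njubigdata04/InvertedIndexWithHbase.git"))
# prints: InvertedIndexWithHbase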

repoDB_Options.py

Lines changed: 40 additions & 5 deletions
@@ -61,8 +61,38 @@ def get_repo_base_information(self, repo_name):
         ]
         res['top_ten_frechet'] = source

-
-
+        source = []
+        header = ['frechet', 'commits', 'contributor']
+        source.append(header)
+        # TODO: get the path prefix
+        dirpath = self.get_repo_path_prefix(repo_name)
+        filename = dirpath + "OvR_Normal_Divide_day.csv"
+        topfilename = dirpath + "frechet_topday.csv"
+        allindex = []
+        allcommit = []
+        allname = []
+        with open(topfilename, 'r') as top:
+            for line in top.readlines():
+                line = line.strip()
+                line = line.split(',')
+                allindex.append(int(line[0]))
+                allcommit.append(int(line[2]))
+                allname.append(line[1])
+        with open(filename, 'r') as f:
+            line = f.readline()
+            line = f.readline()
+            line = line.strip()
+            line = line.split(',')
+            for i in range(len(allindex)):
+                insertrow = []
+                index = allindex[i]
+                frechetnum = line[index + 1]
+                insertrow.append(frechetnum)
+                insertrow.append(allcommit[i])
+                insertrow.append(allname[i])
+                source.append(insertrow)
+        print(source)
+        res['top_ten_frechet'] = source
         return res

     # fetch database information together with column names
@@ -248,11 +278,16 @@ def print_format_datas(self, datas):
             for item in data:
                 print(item, '\t', end="")
             print()
-
+    # get the path prefix including the repo index, e.g. /home/username/.gitminer/0/
+    def get_repo_path_prefix(self, repo_name):
+        prefix = '/home/' + getpass.getuser() + '/.gitminer/'
+        prefix = prefix + str(self.get_repo_index(repo_name)) + '/csv/'
+        return prefix

 if __name__ == '__main__':
     repoDB = repoDB_Options()
     # datas = repoDB.get_FileContributorMatrix("FileContributorMatrix5")
     # datas = repoDB.get_repo_base_information("a")
-    datas = repoDB.get_repo_status()
-    print(datas)
+    # datas = repoDB.get_repo_status()
+    # print(datas)
+    repoDB.get_repo_base_information('InvertedIndexWithHbase')
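
The new get_repo_path_prefix and the frechet-reading block above assume a fixed on-disk layout under the user's .gitminer directory. A small sketch of that convention follows; the repo index value is illustrative, and the getpass import is assumed to be present at the top of repoDB_Options.py (the hunks above do not show it).

# Illustrative sketch of the expected layout; repo_index = 0 is an assumption.
import getpass

def example_prefix(repo_index=0):
    return '/home/' + getpass.getuser() + '/.gitminer/' + str(repo_index) + '/csv/'

prefix = example_prefix()
# One "<column index>,<contributor>,<commit count>" line per top contributor (written by SumCSV):
print(prefix + "frechet_topday.csv")
# A header row plus one "All" row of one-vs-rest Frechet scores (written by getDistanceToAll):
print(prefix + "OvR_Normal_Divide_day.csv")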
