Skip to content

Commit

Permalink
final
Browse files Browse the repository at this point in the history
  • Loading branch information
LeonVitanos committed Mar 15, 2021
1 parent 71f42ff commit 093b64d
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 95 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ FROM python:3.6-slim

RUN apt-get update && apt-get install -y curl

RUN curl -sL https://deb.nodesource.com/setup_13.x | bash - && apt-get install -y git nodejs cloc
RUN curl -sL https://deb.nodesource.com/setup_13.x | bash - && apt-get install -y git nodejs cloc python3-pip

RUN pip3 install seaborn matplotlib

WORKDIR /usr/jquery-data

Expand Down
137 changes: 90 additions & 47 deletions script.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import os
import sys
import itertools
import subprocess
import json
import pathlib
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def merge_ranges(ranges):
ranges = iter(sorted(ranges))
Expand All @@ -19,22 +23,23 @@ def merge_ranges(ranges):
yield current_start, current_stop

if __name__ == "__main__":

#Get the pairs of directories of the current path
rootdir = os.path.dirname(os.path.realpath(__file__))
paths = os.listdir(rootdir)
paths = [x for x in paths if os.path.isdir(os.path.join(rootdir, x))]
paths.sort(key=lambda s: list(map(int, s.split('.'))))
#paths = paths[0:4]
pairs = list(itertools.combinations(paths, 2))

n_code = []
for path in paths:
#Run cloc, with .json as output, and JavaScript as the selected language
result = subprocess.run(['cloc', '--not-match-f=intro.js|outro.js|classes.js|event.js', '-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
lines_of_code = json.loads(result.stdout)
lines_of_code = lines_of_code["JavaScript"]["code"] + lines_of_code["JavaScript"]["comment"] + lines_of_code["JavaScript"]["blank"]
result = subprocess.run(['cloc', '--fullpath',
'--not-match-f=\"src/intro.js|src/outro.js\"',
'-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
lines_of_code = json.loads(result.stdout)["JavaScript"]["code"]
n_code.append(lines_of_code)
#print(path, "&", lines_of_code, "\\\\")
print(path, "&", lines_of_code, "\\\\")

#Initialize the matrix with 1 on the diagonal and 0 everywhere else
data=[]
Expand All @@ -47,46 +52,76 @@ def merge_ranges(ranges):
data[-1].extend(zeros)
i+=1

if os.path.exists("temp.js"):
os.remove("temp.js")
for pair in pairs:
#Run jsinspect with a specific threshold. Request the output in JSON form. Ignore files which cannot be read by jsinspect
result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "30", "--ignore",
'src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js',
os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
try:
#Run jsinspect with a specific threshold. Request the output in JSON form. Ignore files which cannot be read by jsinspect
result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "10", "--ignore",
'src/intro.js|src/outro.js',
os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
out = json.loads(result.stdout.decode('utf-8'))
sim_n_code = 0

files =[]
files_code_lines=[]

#For every match, add the files and lines of these files to two lists
for match in out:
flag=False
for i, instance in enumerate(match["instances"]):
if i+1==len(match["instances"]):
break
if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
#Versions are different
flag=True
break
if flag:
for i, instance in enumerate(match["instances"]):
p = match["instances"][i]
if p["path"] not in files:
files.append(p["path"])
files_code_lines.append([])
files_code_lines[files.index(p["path"])].append(p["lines"])

#From these two lists, take the number of lines while removing overlapping ranges
sim_n_code = 0
for lines in files_code_lines:
merged_ranges = merge_ranges(lines)
for lines in merged_ranges:
sim_n_code += lines[1]-lines[0]+1
except:
sim_n_code=0
print(result.stdout.decode('utf-8')[0:1000])
print("An exception occurred")
print("An exception occurred with jsinspect", result.stdout.decode('utf-8')[0:200])

sim_n_code = 0

files =[]
files_code_lines=[]

#For every match, add the files and lines of these files to two lists
for match in out:
flag=False
for i, instance in enumerate(match["instances"]):
if i+1==len(match["instances"]):
break
if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
#Versions are different
flag=True
break
if flag:
for i, instance in enumerate(match["instances"]):
p = match["instances"][i]
if p["path"] not in files:
files.append(p["path"])
files_code_lines.append([])
files_code_lines[files.index(p["path"])].append(p["lines"])

#From these two lists, take the number of lines while removing overlapping ranges
sim_n_code = 0
n_splits = 0
for matches in files_code_lines:
n_splits+=len(matches)

for i, lines in enumerate(files_code_lines):
#Find the merged ranges
merged_ranges = merge_ranges(lines)
m = []
for r in merged_ranges:
m.append(r)

#Open file, and save to a temporary file the lines that are in the match range
f = open(os.path.join(rootdir,files[i][2:]), "r")
lines = f.readlines()
temp = open("temp.js", "a")
for j, line in enumerate(lines):
for r in m:
if r[0]-1 <= j <= r[1]-1:
temp.write("%s" % line)
break
temp.close()

if files:
#Now we run cloc on this file to count only the code lines
try:
result = subprocess.run(['cloc', '-json', '-include-lang=JavaScript', "temp.js"], stdout=subprocess.PIPE)
lines_of_code = json.loads(result.stdout)
sim_n_code = lines_of_code["JavaScript"]["code"]-len(files)
except:
sim_n_code = 0
print("An exception occurred with cloc", result.stdout.decode('utf-8')[0:200])
open('temp.js', 'w').close() #clean file

#Having the #lines_of_code for both versions, as well as the #similar_lines_of_code we can calculate the coverage
index1 = paths.index(pair[0])
Expand All @@ -95,10 +130,18 @@ def merge_ranges(ranges):
data[index2][index1] = coverage
print(pair[0], "#code:", n_code[index1], pair[1], "#code:", n_code[index2], "Pair #sim:", sim_n_code,
"Coverage:", coverage)

fig = px.imshow(data, x=paths, y=paths,
color_continuous_scale=["white", "cyan", "lightgreen", "yellow", "red"])

fig.show()


#Don't draw the upper half and the diagonal (masked cells are hidden in the heatmap)
mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = True

#Color palette similar to the paper
cmap = LinearSegmentedColormap.from_list(
name='test',
colors=["white", "cyan", "lightgreen", "yellow", "red"]
)

fig, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(data, cmap=cmap, square=True, vmin=0, vmax=1,
linewidths=0.01, linecolor='grey', xticklabels=paths, yticklabels=paths, mask=mask, cbar_kws={"orientation": "horizontal"})
fig.savefig(os.path.join(rootdir, "out", "heatmap.png"), dpi=400)
138 changes: 91 additions & 47 deletions test/script.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import os
import sys
import itertools
import subprocess
import json
import pathlib
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def merge_ranges(ranges):
ranges = iter(sorted(ranges))
Expand All @@ -19,22 +23,24 @@ def merge_ranges(ranges):
yield current_start, current_stop

if __name__ == "__main__":

#Get the pairs of directories of the current path
rootdir = os.path.dirname(os.path.realpath(__file__))
paths = os.listdir(rootdir)
paths = [x for x in paths if os.path.isdir(os.path.join(rootdir, x))]
paths.remove("out")
paths.sort(key=lambda s: list(map(int, s.split('.'))))
#paths = paths[0:4]
pairs = list(itertools.combinations(paths, 2))

n_code = []
for path in paths:
#Run cloc, with .json as output, and JavaScript as the selected language
result = subprocess.run(['cloc', '--not-match-f=intro.js|outro.js|classes.js|event.js', '-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
lines_of_code = json.loads(result.stdout)
lines_of_code = lines_of_code["JavaScript"]["code"] + lines_of_code["JavaScript"]["comment"] + lines_of_code["JavaScript"]["blank"]
result = subprocess.run(['cloc', '--fullpath',
'--not-match-f=\"src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js|src/manipulation.js\"',
'-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
lines_of_code = json.loads(result.stdout)["JavaScript"]["code"]
n_code.append(lines_of_code)
#print(path, "&", lines_of_code, "\\\\")
print(path, "&", lines_of_code, "\\\\")

#Initialize the matrix with 1 on the diagonal and 0 everywhere else
data=[]
Expand All @@ -47,46 +53,76 @@ def merge_ranges(ranges):
data[-1].extend(zeros)
i+=1

if os.path.exists("temp.js"):
os.remove("temp.js")
for pair in pairs:
#Run jsinspect with a specific threshold. Request the output in JSON form. Ignore files which cannot be read by jsinspect
result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "30", "--ignore",
'src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js',
os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
try:
#Run jsinspect with a specific threshold. Request the output in JSON form. Ignore files which cannot be read by jsinspect
result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "10", "--ignore",
'src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js|src/manipulation.js',
os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
out = json.loads(result.stdout.decode('utf-8'))
sim_n_code = 0

files =[]
files_code_lines=[]

#For every match, add the files and lines of these files to two lists
for match in out:
flag=False
for i, instance in enumerate(match["instances"]):
if i+1==len(match["instances"]):
break
if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
#Versions are different
flag=True
break
if flag:
for i, instance in enumerate(match["instances"]):
p = match["instances"][i]
if p["path"] not in files:
files.append(p["path"])
files_code_lines.append([])
files_code_lines[files.index(p["path"])].append(p["lines"])

#From these two lists, take the number of lines while removing overlapping ranges
sim_n_code = 0
for lines in files_code_lines:
merged_ranges = merge_ranges(lines)
for lines in merged_ranges:
sim_n_code += lines[1]-lines[0]+1
except:
sim_n_code=0
print(result.stdout.decode('utf-8')[0:1000])
print("An exception occurred")
print("An exception occurred with jsinspect", result.stdout.decode('utf-8')[0:200])

sim_n_code = 0

files =[]
files_code_lines=[]

#For every match, add the files and lines of these files to two lists
for match in out:
flag=False
for i, instance in enumerate(match["instances"]):
if i+1==len(match["instances"]):
break
if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
#Versions are different
flag=True
break
if flag:
for i, instance in enumerate(match["instances"]):
p = match["instances"][i]
if p["path"] not in files:
files.append(p["path"])
files_code_lines.append([])
files_code_lines[files.index(p["path"])].append(p["lines"])

#From these two lists, take the number of lines while removing overlapping ranges
sim_n_code = 0
n_splits = 0
for matches in files_code_lines:
n_splits+=len(matches)

for i, lines in enumerate(files_code_lines):
#Find the merged ranges
merged_ranges = merge_ranges(lines)
m = []
for r in merged_ranges:
m.append(r)

#Open file, and save to a temporary file the lines that are in the match range
f = open(os.path.join(rootdir,files[i][2:]), "r")
lines = f.readlines()
temp = open("temp.js", "a")
for j, line in enumerate(lines):
for r in m:
if r[0]-1 <= j <= r[1]-1:
temp.write("%s" % line)
break
temp.close()

if files:
#Now we run cloc on this file to count only the code lines
try:
result = subprocess.run(['cloc', '-json', '-include-lang=JavaScript', "temp.js"], stdout=subprocess.PIPE)
lines_of_code = json.loads(result.stdout)
sim_n_code = lines_of_code["JavaScript"]["code"]-len(files)
except:
sim_n_code = 0
print("An exception occurred with cloc", result.stdout.decode('utf-8')[0:200])
open('temp.js', 'w').close() #clean file

#Having the #lines_of_code for both versions, as well as the #similar_lines_of_code we can calculate the coverage
index1 = paths.index(pair[0])
Expand All @@ -95,10 +131,18 @@ def merge_ranges(ranges):
data[index2][index1] = coverage
print(pair[0], "#code:", n_code[index1], pair[1], "#code:", n_code[index2], "Pair #sim:", sim_n_code,
"Coverage:", coverage)

fig = px.imshow(data, x=paths, y=paths,
color_continuous_scale=["white", "cyan", "lightgreen", "yellow", "red"])

fig.show()


#Don't draw the upper half and the diagonal (masked cells are hidden in the heatmap)
mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = True

#Color palette similar to the paper
cmap = LinearSegmentedColormap.from_list(
name='test',
colors=["white", "cyan", "lightgreen", "yellow", "red"]
)

fig, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(data, cmap=cmap, square=True, vmin=0, vmax=1,
linewidths=0.01, linecolor='grey', xticklabels=paths, yticklabels=paths, mask=mask, cbar_kws={"orientation": "horizontal"})
fig.savefig(os.path.join(rootdir, "out", "heatmap.png"), dpi=400)

0 comments on commit 093b64d

Please sign in to comment.