final

LeonVitanos · Mar 15, 2021 · 093b64d · 093b64d
1 parent 71f42ff
commit 093b64d
Show file tree

Hide file tree

Showing 3 changed files with 184 additions and 95 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,9 @@ FROM python:3.6-slim
 
 RUN apt-get update && apt-get install -y curl
 
-RUN curl -sL https://deb.nodesource.com/setup_13.x | bash - && apt-get install -y git nodejs cloc
+RUN curl -sL https://deb.nodesource.com/setup_13.x | bash - && apt-get install -y git nodejs cloc python3-pip
+
+RUN pip3 install seaborn matplotlib
 
 WORKDIR /usr/jquery-data
 

diff --git a/script.py b/script.py
@@ -1,9 +1,13 @@
 import os
+import sys
 import itertools
 import subprocess
 import json
 import pathlib
-import plotly.express as px
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.colors import LinearSegmentedColormap
 
 def merge_ranges(ranges):
     ranges = iter(sorted(ranges))
@@ -19,22 +23,23 @@ def merge_ranges(ranges):
     yield current_start, current_stop
 
 if __name__ == "__main__":
-
     #Get the pairs of directories of the current path
     rootdir = os.path.dirname(os.path.realpath(__file__))
     paths = os.listdir(rootdir)
     paths = [x for x in paths if os.path.isdir(os.path.join(rootdir, x))]
     paths.sort(key=lambda s: list(map(int, s.split('.'))))
+    #paths = paths[0:4]
     pairs = list(itertools.combinations(paths, 2))
 
     n_code = []    
     for path in paths:
         #Run cloc, with .json as output, and JavaScript as the selected language
-        result = subprocess.run(['cloc', '--not-match-f=intro.js|outro.js|classes.js|event.js', '-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
-        lines_of_code = json.loads(result.stdout)
-        lines_of_code = lines_of_code["JavaScript"]["code"] + lines_of_code["JavaScript"]["comment"] + lines_of_code["JavaScript"]["blank"]
+        result = subprocess.run(['cloc', '--fullpath', 
+        '--not-match-f=\"src/intro.js|src/outro.js\"',
+        '-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
+        lines_of_code = json.loads(result.stdout)["JavaScript"]["code"]
         n_code.append(lines_of_code)
-        #print(path, "&", lines_of_code, "\\\\")
+        print(path, "&", lines_of_code, "\\\\")
 
     #Initialize matrix with 1 in diagonal and all other to 0
     data=[]
@@ -47,46 +52,76 @@ def merge_ranges(ranges):
         data[-1].extend(zeros)
         i+=1
 
+    if os.path.exists("temp.js"):
+        os.remove("temp.js")
     for pair in pairs:
-        #Run jsinspect with specific threshold. Request output to be at json form. Ignore files which cannot be read by jsinspect
-        result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "30", "--ignore", 
-        'src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js', 
-        os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
         try:
+            #Run jsinspect with a specific threshold and request JSON output. Ignore files which cannot be read by jsinspect
+            result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "10", "--ignore", 
+            'src/intro.js|src/outro.js',
+            os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
             out = json.loads(result.stdout.decode('utf-8'))
-            sim_n_code = 0
-
-            files =[]
-            files_code_lines=[]
-
-            #For every match, add the files and lines of these files to two lists
-            for match in out: 
-                flag=False
-                for i, instance in enumerate(match["instances"]):
-                    if i+1==len(match["instances"]):
-                        break
-                    if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
-                        #Versions are different
-                        flag=True
-                        break
-                if flag:
-                    for i, instance in enumerate(match["instances"]):
-                        p = match["instances"][i]
-                        if p["path"] not in files:
-                            files.append(p["path"])
-                            files_code_lines.append([])
-                        files_code_lines[files.index(p["path"])].append(p["lines"])
-
-            #From these two lists, take then number of lines while removing overlapping ranges
-            sim_n_code = 0
-            for lines in files_code_lines:
-                merged_ranges = merge_ranges(lines)
-                for lines in merged_ranges:
-                    sim_n_code += lines[1]-lines[0]+1
         except:
             sim_n_code=0
-            print(result.stdout.decode('utf-8')[0:1000])
-            print("An exception occurred") 
+            print("An exception occurred with jsinspect", result.stdout.decode('utf-8')[0:200])
+
+        sim_n_code = 0
+
+        files =[]
+        files_code_lines=[]
+
+        #For every match, add the files and lines of these files to two lists
+        for match in out: 
+            flag=False
+            for i, instance in enumerate(match["instances"]):
+                if i+1==len(match["instances"]):
+                    break
+                if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
+                    #Versions are different
+                    flag=True
+                    break
+            if flag:
+                for i, instance in enumerate(match["instances"]):
+                    p = match["instances"][i]
+                    if p["path"] not in files:
+                        files.append(p["path"])
+                        files_code_lines.append([])
+                    files_code_lines[files.index(p["path"])].append(p["lines"])
+
+        #From these two lists, take the number of lines while removing overlapping ranges
+        sim_n_code = 0
+        n_splits = 0
+        for matches in files_code_lines:
+            n_splits+=len(matches)
+
+        for i, lines in enumerate(files_code_lines):
+            #Find the merged ranges
+            merged_ranges = merge_ranges(lines)
+            m = []
+            for r in merged_ranges:
+                m.append(r)
+
+            #Open file, and save to a temporary file the lines that are in the match range
+            f = open(os.path.join(rootdir,files[i][2:]), "r")
+            lines = f.readlines()
+            temp = open("temp.js", "a")
+            for j, line in enumerate(lines):
+                for r in m:
+                    if r[0]-1 <= j <= r[1]-1:
+                        temp.write("%s" % line)
+                        break
+            temp.close()
+
+        if files:
+            #Now we run cloc on this file to count only the code lines
+            try:
+                result = subprocess.run(['cloc', '-json', '-include-lang=JavaScript', "temp.js"], stdout=subprocess.PIPE)
+                lines_of_code = json.loads(result.stdout)
+                sim_n_code = lines_of_code["JavaScript"]["code"]-len(files)
+            except:
+                sim_n_code = 0
+                print("An exception occurred with cloc", result.stdout.decode('utf-8')[0:200])
+            open('temp.js', 'w').close() #clean file
 
         #Having the #lines_of_code for both versions, as well as the #similar_lines_of_code we can calculate the coverage
         index1 = paths.index(pair[0])
@@ -95,10 +130,18 @@ def merge_ranges(ranges):
         data[index2][index1] = coverage
         print(pair[0], "#code:", n_code[index1], pair[1], "#code:", n_code[index2], "Pair #sim:", sim_n_code, 
         "Coverage:", coverage)
-
-    fig = px.imshow(data, x=paths, y=paths,
-        color_continuous_scale=["white", "cyan", "lightgreen", "yellow", "red"])
 
-    fig.show()
-
-
+    #Mask the upper half and the diagonal so they are not drawn in the heatmap
+    mask = np.zeros_like(data)
+    mask[np.triu_indices_from(mask)] = True
+
+    #Color palette similar to paper
+    cmap = LinearSegmentedColormap.from_list(
+        name='test', 
+        colors=["white", "cyan", "lightgreen", "yellow", "red"]
+    )
+
+    fig, ax = plt.subplots(figsize=(20, 20))
+    ax = sns.heatmap(data, cmap=cmap, square=True, vmin=0, vmax=1,
+    linewidths=0.01, linecolor='grey', xticklabels=paths, yticklabels=paths, mask=mask, cbar_kws={"orientation": "horizontal"})
+    fig.savefig(os.path.join(rootdir, "out", "heatmap.png"), dpi=400)
diff --git a/test/script.py b/test/script.py
@@ -1,9 +1,13 @@
 import os
+import sys
 import itertools
 import subprocess
 import json
 import pathlib
-import plotly.express as px
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.colors import LinearSegmentedColormap
 
 def merge_ranges(ranges):
     ranges = iter(sorted(ranges))
@@ -19,22 +23,24 @@ def merge_ranges(ranges):
     yield current_start, current_stop
 
 if __name__ == "__main__":
-
     #Get the pairs of directories of the current path
     rootdir = os.path.dirname(os.path.realpath(__file__))
     paths = os.listdir(rootdir)
     paths = [x for x in paths if os.path.isdir(os.path.join(rootdir, x))]
+    paths.remove("out")
     paths.sort(key=lambda s: list(map(int, s.split('.'))))
+    #paths = paths[0:4]
     pairs = list(itertools.combinations(paths, 2))
 
     n_code = []    
     for path in paths:
         #Run cloc, with .json as output, and JavaScript as the selected language
-        result = subprocess.run(['cloc', '--not-match-f=intro.js|outro.js|classes.js|event.js', '-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
-        lines_of_code = json.loads(result.stdout)
-        lines_of_code = lines_of_code["JavaScript"]["code"] + lines_of_code["JavaScript"]["comment"] + lines_of_code["JavaScript"]["blank"]
+        result = subprocess.run(['cloc', '--fullpath', 
+        '--not-match-f=\"src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js|src/manipulation.js\"',
+        '-json', '-include-lang=JavaScript', os.path.join(rootdir, path, "src")], stdout=subprocess.PIPE)
+        lines_of_code = json.loads(result.stdout)["JavaScript"]["code"]
         n_code.append(lines_of_code)
-        #print(path, "&", lines_of_code, "\\\\")
+        print(path, "&", lines_of_code, "\\\\")
 
     #Initialize matrix with 1 in diagonal and all other to 0
     data=[]
@@ -47,46 +53,76 @@ def merge_ranges(ranges):
         data[-1].extend(zeros)
         i+=1
 
+    if os.path.exists("temp.js"):
+        os.remove("temp.js")
     for pair in pairs:
-        #Run jsinspect with specific threshold. Request output to be at json form. Ignore files which cannot be read by jsinspect
-        result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "30", "--ignore", 
-        'src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js', 
-        os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
         try:
+            #Run jsinspect with a specific threshold and request JSON output. Ignore files which cannot be read by jsinspect
+            result = subprocess.run(["jsinspect", "-reporter", "json", "-t", "10", "--ignore", 
+            'src/intro.js|src/outro.js|src/attributes/classes.js|src/event.js|src/manipulation.js',
+            os.path.join(pair[0], "src"), os.path.join(pair[1], "src")], stdout=subprocess.PIPE, cwd=rootdir)
             out = json.loads(result.stdout.decode('utf-8'))
-            sim_n_code = 0
-
-            files =[]
-            files_code_lines=[]
-
-            #For every match, add the files and lines of these files to two lists
-            for match in out: 
-                flag=False
-                for i, instance in enumerate(match["instances"]):
-                    if i+1==len(match["instances"]):
-                        break
-                    if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
-                        #Versions are different
-                        flag=True
-                        break
-                if flag:
-                    for i, instance in enumerate(match["instances"]):
-                        p = match["instances"][i]
-                        if p["path"] not in files:
-                            files.append(p["path"])
-                            files_code_lines.append([])
-                        files_code_lines[files.index(p["path"])].append(p["lines"])
-
-            #From these two lists, take then number of lines while removing overlapping ranges
-            sim_n_code = 0
-            for lines in files_code_lines:
-                merged_ranges = merge_ranges(lines)
-                for lines in merged_ranges:
-                    sim_n_code += lines[1]-lines[0]+1
         except:
             sim_n_code=0
-            print(result.stdout.decode('utf-8')[0:1000])
-            print("An exception occurred") 
+            print("An exception occurred with jsinspect", result.stdout.decode('utf-8')[0:200])
+
+        sim_n_code = 0
+
+        files =[]
+        files_code_lines=[]
+
+        #For every match, add the files and lines of these files to two lists
+        for match in out: 
+            flag=False
+            for i, instance in enumerate(match["instances"]):
+                if i+1==len(match["instances"]):
+                    break
+                if not pathlib.Path(match["instances"][i]["path"]).parts[0] == pathlib.Path(match["instances"][i+1]["path"]).parts[0]:
+                    #Versions are different
+                    flag=True
+                    break
+            if flag:
+                for i, instance in enumerate(match["instances"]):
+                    p = match["instances"][i]
+                    if p["path"] not in files:
+                        files.append(p["path"])
+                        files_code_lines.append([])
+                    files_code_lines[files.index(p["path"])].append(p["lines"])
+
+        #From these two lists, take the number of lines while removing overlapping ranges
+        sim_n_code = 0
+        n_splits = 0
+        for matches in files_code_lines:
+            n_splits+=len(matches)
+
+        for i, lines in enumerate(files_code_lines):
+            #Find the merged ranges
+            merged_ranges = merge_ranges(lines)
+            m = []
+            for r in merged_ranges:
+                m.append(r)
+
+            #Open file, and save to a temporary file the lines that are in the match range
+            f = open(os.path.join(rootdir,files[i][2:]), "r")
+            lines = f.readlines()
+            temp = open("temp.js", "a")
+            for j, line in enumerate(lines):
+                for r in m:
+                    if r[0]-1 <= j <= r[1]-1:
+                        temp.write("%s" % line)
+                        break
+            temp.close()
+
+        if files:
+            #Now we run cloc on this file to count only the code lines
+            try:
+                result = subprocess.run(['cloc', '-json', '-include-lang=JavaScript', "temp.js"], stdout=subprocess.PIPE)
+                lines_of_code = json.loads(result.stdout)
+                sim_n_code = lines_of_code["JavaScript"]["code"]-len(files)
+            except:
+                sim_n_code = 0
+                print("An exception occurred with cloc", result.stdout.decode('utf-8')[0:200])
+            open('temp.js', 'w').close() #clean file
 
         #Having the #lines_of_code for both versions, as well as the #similar_lines_of_code we can calculate the coverage
         index1 = paths.index(pair[0])
@@ -95,10 +131,18 @@ def merge_ranges(ranges):
         data[index2][index1] = coverage
         print(pair[0], "#code:", n_code[index1], pair[1], "#code:", n_code[index2], "Pair #sim:", sim_n_code, 
         "Coverage:", coverage)
-
-    fig = px.imshow(data, x=paths, y=paths,
-        color_continuous_scale=["white", "cyan", "lightgreen", "yellow", "red"])
 
-    fig.show()
-
-
+    #Mask the upper half and the diagonal so they are not drawn in the heatmap
+    mask = np.zeros_like(data)
+    mask[np.triu_indices_from(mask)] = True
+
+    #Color palette similar to paper
+    cmap = LinearSegmentedColormap.from_list(
+        name='test', 
+        colors=["white", "cyan", "lightgreen", "yellow", "red"]
+    )
+
+    fig, ax = plt.subplots(figsize=(20, 20))
+    ax = sns.heatmap(data, cmap=cmap, square=True, vmin=0, vmax=1,
+    linewidths=0.01, linecolor='grey', xticklabels=paths, yticklabels=paths, mask=mask, cbar_kws={"orientation": "horizontal"})
+    fig.savefig(os.path.join(rootdir, "out", "heatmap.png"), dpi=400)