Change data extraction to also return node features for each method graph

emalgorithm · emalgorithm · commit 72f7c488a349 · 2019-03-06T13:59:35.000Z
diff --git a/data_extraction.py b/data_extraction.py
@@ -4,10 +4,102 @@
 from graph_pb2 import Graph
 import networkx as nx
 import sys
+import numpy as np
 
 sys.setrecursionlimit(10000)
 
 
+def get_dataset_from_dir(dir="../corpus/r252-corpus-features/", max_files=100):
+    """
+    Extract the methods data from the "*.proto" files found recursively under a directory.
+    :param dir: directory to search for "*.proto" files
+    :param max_files: maximum number of files to process, None to process all of them
+    :return: (methods_source, methods_names, methods_graphs), the concatenation over the
+    processed files of the three lists returned by get_file_methods_data
+    """
+    methods_source, methods_names, methods_graphs = [], [], []
+
+    proto_files = list(Path(dir).rglob("*.proto"))
+    print("A total of {} files have been found".format(len(proto_files)))
+
+    for i, file in enumerate(proto_files[:max_files]):
+        print("Extracting data from file {}".format(i+1))
+        file_methods_source, file_methods_names, file_methods_graph = get_file_methods_data(file)
+        methods_source += file_methods_source
+        methods_names += file_methods_names
+        methods_graphs += file_methods_graph
+
+    return methods_source, methods_names, methods_graphs
+
+
+def get_file_methods_data(file):
+    """
+    Extract the source code tokens, name subtokens and graph for the methods in a source file
+    represented by a graph. Method names are split into subtokens. Constructors are not
+    included in the methods.
+    :param file: path object of the .proto file holding the serialized graph, must provide
+    `name` and `open`
+    :return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of
+    the tokens for the source of ith method in the file (the method name is replaced by '_'),
+    methods_names[i] is a list of subtokens for the name of the ith method in the file, and
+    methods_graph[i] is the pair (method_edges, non_tokens_nodes_features) returned by
+    get_method_edges for the ith method node.
+    """
+    # NOTE(review): get_file_graph opens the file on its own, so the file is read twice per
+    # call. The third value it returns (the file edges) is not needed here.
+    adj_list, nodes, _ = get_file_graph(file)
+
+    # The class name is the part of the file name before the first '.', e.g. "Test" for
+    # "Test.java.proto". It has to be a string: comparing the method name against the list
+    # returned by split() never matches, so no constructor would ever be discarded.
+    # NOTE(review): constructors of nested classes are named differently from the file and
+    # are presumably still kept - confirm whether that matters for the corpus.
+    class_name = file.name.split('.')[0]
+
+    # f.read() loads the whole file, so the handle is not needed once the graph is parsed
+    # and can be released before the methods are processed.
+    with file.open('rb') as f:
+        g = Graph()
+        g.ParseFromString(f.read())
+
+    methods_source = []
+    methods_names = []
+    methods_graph = []
+
+    for node in g.node:
+        # Method declarations are the nodes whose contents is "METHOD"
+        if node.contents != "METHOD":
+            continue
+
+        method_name_node = get_method_name_node(g, node)
+
+        # If method name is the same as class name, then the method is a constructor,
+        # so discard it
+        if method_name_node.contents == class_name:
+            continue
+
+        method_edges, method_nodes, non_tokens_nodes_features = get_method_edges(
+            node.id, adj_list, nodes)
+        methods_graph.append((method_edges, non_tokens_nodes_features))
+        methods_names.append(split_identifier_into_parts(method_name_node.contents))
+
+        # Tokens are collected in the iteration order of method_nodes.
+        # NOTE(review): this presumably matches the order of the tokens in the source code -
+        # confirm against get_file_graph.
+        method_source = []
+
+        for other_node in method_nodes.values():
+            if other_node.id == method_name_node.id:
+                # Replace method name with '_' in method source code
+                method_source.append('_')
+            elif is_token(other_node):
+                method_source.append(other_node.contents)
+
+        methods_source.append(method_source)
+
+    return methods_source, methods_names, methods_graph
+
+
 def get_file_graph(file):
     with file.open('rb') as f:
         g = Graph()
@@ -38,9 +130,17 @@ def get_method_edges(method_node_id, file_adj_list, file_nodes):
     method_nodes = {node_id: node for node_id, node in file_nodes.items() if node_id in
                     method_nodes_ids}
 
-    methods_edges = remap_edges(methods_edges, method_nodes)
+    methods_edges, non_tokens_nodes_features = remap_edges(methods_edges, method_nodes)
+
+    return methods_edges, method_nodes, non_tokens_nodes_features
+
+
+def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
+    method_nodes_ids.append(node_id)  # record the node before recursing, so cycles terminate
 
-    return methods_edges, method_nodes
+    for edge in file_adj_list[node_id]:  # follow every outgoing edge except NEXT_TOKEN ones
+        if edge['edge_type'] != FeatureEdge.NEXT_TOKEN and edge['destination'] not in method_nodes_ids:
+            get_method_nodes_rec(edge['destination'], method_nodes_ids, file_adj_list)
 
 
 def remap_edges(edges, nodes):
@@ -55,30 +155,25 @@ def remap_edges(edges, nodes):
             old_id_to_new_id[node_value.id] = i
             i += 1
 
+    non_tokens_nodes_features = np.zeros((len(nodes_values) - len(old_id_to_new_id), 11))
+    j = i
     # Set new ids for other nodes
     for node_value in nodes_values:
         if not is_token(node_value):
             old_id_to_new_id[node_value.id] = i
+            non_tokens_nodes_features[i - j][node_value.type - 1] = 1
             i += 1
 
     for edge in edges:
         new_edges.append((old_id_to_new_id[edge[0]], old_id_to_new_id[edge[1]]))
 
-    return new_edges
+    return new_edges, non_tokens_nodes_features
 
 
 def is_token(node_value):
     return node_value.type == FeatureNode.TOKEN or node_value.type == FeatureNode.IDENTIFIER_TOKEN
 
 
-def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
-    method_nodes_ids.append(node_id)
-
-    for edge in file_adj_list[node_id]:
-        if edge['edge_type'] != FeatureEdge.NEXT_TOKEN and edge['destination'] not in method_nodes_ids:
-            get_method_nodes_rec(edge['destination'], method_nodes_ids, file_adj_list)
-
-
 def get_tokens(g):
     """
     Get the tokens for a file. Identifiers are split in subtokens.
@@ -98,72 +193,6 @@ def get_tokens(g):
     return tokens
 
 
-def get_methods_source_and_name(file, nx_graph):
-    """
-    Extract the source code token and identifier names for methods in a source file represented
-    by a graph. Identifier tokens are split into subtokens. Constructors are not included in the
-    methods.
-    :param file: file
-    :return: (methods_source, methods_names) where methods_source[i] is a list of the tokens for
-    the source of ith method in the file, and methods_names[i] is a list of tokens for name of the
-    ith
-    """
-    adj_list, nodes, edges = get_file_graph(file)
-
-    with file.open('rb') as f:
-        class_name = file.name.split('.')
-
-        g = Graph()
-        g.ParseFromString(f.read())
-        methods_source = []
-        methods_names = []
-        methods_graph = []
-        # class_name_node = get_class_name_node(g)
-
-        for node in g.node:
-            if node.contents == "METHOD":
-                method_name_node = get_method_name_node(g, node)
-
-                # If method name is the same as class name, then method name is constructor,
-                # so discard it
-                if method_name_node.contents == class_name:
-                    continue
-
-                method_edges, method_nodes = get_method_edges(node.id, adj_list, nodes)
-                methods_graph.append(method_edges)
-                methods_names.append(split_identifier_into_parts(method_name_node.contents))
-
-                # start_line_number = node.startLineNumber
-                # end_line_number = node.endLineNumber
-                # method_source = []
-                # for other_node in g.node:
-                #     if other_node.startLineNumber >= start_line_number and other_node.endLineNumber \
-                #             <= end_line_number:
-                #         # if other_node.type == FeatureNode.TOKEN:
-                #         #     method_source.append(other_node.contents)
-                #         # elif other_node.type == FeatureNode.IDENTIFIER_TOKEN:
-                #         #     sub_identifiers = split_identifier_into_parts(other_node.contents)
-                #         #     method_source += sub_identifiers
-                #         if other_node.id == method_name_node.id:
-                #             method_source.append('_')
-                #         elif other_node.type == FeatureNode.TOKEN or other_node.type == \
-                #                 FeatureNode.IDENTIFIER_TOKEN:
-                #             method_source.append(other_node.contents)
-
-                method_source = []
-
-                for other_node in method_nodes.values():
-                    if other_node.id == method_name_node.id:
-                        method_source.append('_')
-                    elif other_node.type == FeatureNode.TOKEN or other_node.type == \
-                            FeatureNode.IDENTIFIER_TOKEN:
-                        method_source.append(other_node.contents)
-
-                methods_source.append(method_source)
-
-        return methods_source, methods_names, methods_graph
-
-
 def get_method_name_node(g, method_node):
     method_id = method_node.id
     method_name_node_id = 0
@@ -191,27 +220,6 @@ def get_class_name_node(g):
     return class_associated_nodes[1]
 
 
-def get_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
-    methods_source = []
-    methods_names = []
-    methods_graphs = []
-
-    proto_files = list(Path(dir).rglob("*.proto"))
-    print("A total of {} files have been found".format(len(proto_files)))
-
-    for i, file in enumerate(proto_files):
-        nx_graph = get_nx_graph(file)
-        # if i % 100 == 0:
-        print("Extracting data from file {}".format(i+1))
-        file_methods_source, file_methods_names, file_methods_graph = get_methods_source_and_name(
-            file, nx_graph)
-        methods_source += file_methods_source
-        methods_names += file_methods_names
-        methods_graphs += file_methods_graph
-
-    return methods_source, methods_names, methods_graphs
-
-
 def get_nx_graph(file):
     nx_graph = nx.DiGraph()
     with file.open('rb') as f:
@@ -235,12 +243,12 @@ def get_tokens_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
 
     # proto_files = [Path("../features-javac-master/Test.java.proto")]
 
-    for i, file in enumerate(proto_files[:100]):
-        nx_graph = get_nx_graph(file)
+    for i, file in enumerate(proto_files):
+        # nx_graph = get_nx_graph(file)
         if i % 10 == 0:
             print("Extracting data from file {}".format(i+1))
         file_methods_source, file_methods_names, file_methods_graph = \
-            get_methods_source_and_name(file, nx_graph)
+            get_file_methods_data(file)
         methods_source += file_methods_source
         methods_names += file_methods_names
         methods_graphs += file_methods_graph
@@ -255,13 +263,13 @@ def get_method_nodes(method_node, file_graph):
     return method_nodes
 
 
-def get_method_nodes_rec(node, file_graph, method_nodes):
-    print(len(method_nodes))
-    for e in file_graph.edge:
-        neighbour = e.destinationId
-        if neighbour not in method_nodes:
-            method_nodes.append(neighbour)
-            get_method_nodes(neighbour, nx_graph, method_nodes)
+# def get_method_nodes_rec(node, file_graph, method_nodes):
+#     print(len(method_nodes))
+#     for e in file_graph.edge:
+#         neighbour = e.destinationId
+#         if neighbour not in method_nodes:
+#             method_nodes.append(neighbour)
+#             get_method_nodes(neighbour, nx_graph, method_nodes)
 
 
 def get_augmented_graph(file):
diff --git a/data_generation.py b/data_generation.py
@@ -1,8 +1,8 @@
 from data_extraction import get_dataset_from_dir, get_tokens_dataset_from_dir
 import pickle
 
-methods_source, methods_names, methods_graphs = get_tokens_dataset_from_dir(
+methods_source, methods_names, methods_graphs = get_dataset_from_dir(
     "../corpus/r252-corpus-features/")
 
 pickle.dump({'methods_source': methods_source, 'methods_names': methods_names, 'methods_graphs':
-    methods_graphs}, open('data/methods_tokens_graphs.pkl', 'wb'))
+    methods_graphs}, open('data/methods_tokens_graphs2.pkl', 'wb'))