Add documentation in data_processing files

emalgorithm · emalgorithm · commit 95349c51909b · 2019-03-15T00:03:35.000Z
diff --git a/README.md b/README.md
@@ -1,3 +1,4 @@
+## Running the Code
 In order to extract the features from the corpus proto files, run:
 python data_generation.py
 
@@ -7,8 +8,30 @@ python train.py --model_name="lstm_gcn_to_lstm_attention" --device=cuda:0 --prin
 All the possible options when running a model can be seen by running:
 python train.py --help
 
+## Pretrained Models
 A pretrained version of the best performing model (as a state dictionary) can be downloaded at 
 https://drive.google.com/file/d/1fm7hGzr-tziNhUMh8duc8s4j5gWW3uKm/view?usp=sharing
 
-
+## High-Level Code Structure
+- data_processing/: contains the code for extracting, storing, analysing and processing data
+    - data_analysis.ipynb: notebook containing analysis of the extracted data
+    - data_extraction.py: contains the logic to extract the feature data from the proto files of 
+    the corpus
+    - data_generation.py: script to run to generate the feature data  
+    - data_util.py: contains utilities to work with data
+    - text_util.py: contains utilities to work with text
+- models/: contains all the code for the different models
+    - full_model.py: class of the complete methodNaming model
+    - gat_encoder.py: class for the Graph Attention Network encoder
+    - gcn_encoder.py: class for the Graph Convolutional Network encoder
+    - graph_attention_layer.py: class for the Graph Attention Layer used by the Graph Attention 
+    Network 
+    - graph_convolutional_layer.py: class for the Graph Convolutional Layer used by the Graph 
+    Convolutional Network 
+    - lstm_decoder.py: class for the LSTM sequence decoder
+    - lstm_encoder.py: class for the LSTM sequence encoder
+- training/: contains code to train and evaluate the models
+    - evaluation_util.py: contains utilities to compute evaluation metrics
+    - train.py: entry-point for training the models
+    - train_model.py: contains logic to train the models
 
diff --git a/data_processing/data_extraction.py b/data_processing/data_extraction.py
@@ -10,19 +10,19 @@
 
 
 def get_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
+    """
+    Extract the source code, names and graph structures of the methods.
+    :param dir: directory where to look for proto files
+    :return: (methods_source, methods_names, methods_graphs)
+    """
     methods_source = []
     methods_names = []
     methods_graphs = []
 
     proto_files = list(Path(dir).rglob("*.proto"))
     print("A total of {} files have been found".format(len(proto_files)))
 
-    # proto_files = [Path("../features-javac-master/Test.java.proto")]
-
     for i, file in enumerate(proto_files):
-        # nx_graph = get_nx_graph(file)
-        # if i % 100 == 0:
-        print("Extracting data from file {}".format(i+1))
         file_methods_source, file_methods_names, file_methods_graph = get_file_methods_data(
             file)
         methods_source += file_methods_source
@@ -34,9 +34,8 @@ def get_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
 
 def get_file_methods_data(file):
     """
-    Extract the source code tokens, identifier names and graph for methods in a source file
-    represented by a graph. Identifier tokens are split into subtokens. Constructors are not
-    included in the methods.
+    Extract the source code tokens, identifier names and graph for methods in a source file.
+    Identifier tokens are split into subtokens. Constructors are not included in the methods.
     :param file: file
     :return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of the tokens for
     the source of ith method in the file, methods_names[i] is a list of tokens for name of the
@@ -68,23 +67,6 @@ def get_file_methods_data(file):
                 methods_graph.append((method_edges, non_tokens_nodes_features))
                 methods_names.append(split_identifier_into_parts(method_name_node.contents))
 
-                # start_line_number = node.startLineNumber
-                # end_line_number = node.endLineNumber
-                # method_source = []
-                # for other_node in g.node:
-                #     if other_node.startLineNumber >= start_line_number and other_node.endLineNumber \
-                #             <= end_line_number:
-                #         # if other_node.type == FeatureNode.TOKEN:
-                #         #     method_source.append(other_node.contents)
-                #         # elif other_node.type == FeatureNode.IDENTIFIER_TOKEN:
-                #         #     sub_identifiers = split_identifier_into_parts(other_node.contents)
-                #         #     method_source += sub_identifiers
-                #         if other_node.id == method_name_node.id:
-                #             method_source.append('_')
-                #         elif other_node.type == FeatureNode.TOKEN or other_node.type == \
-                #                 FeatureNode.IDENTIFIER_TOKEN:
-                #             method_source.append(other_node.contents)
-
                 method_source = []
 
                 for other_node in method_nodes.values():
@@ -101,6 +83,9 @@ def get_file_methods_data(file):
 
 
 def get_file_graph(file):
+    """
+    Compute graph for the given file.
+    """
     with file.open('rb') as f:
         g = Graph()
         g.ParseFromString(f.read())
@@ -117,6 +102,9 @@ def get_file_graph(file):
 
 
 def get_method_edges(method_node_id, file_adj_list, file_nodes):
+    """
+    Compute edges of a method graph for a method starting at the node 'method_node_id'.
+    """
     method_nodes_ids = []
 
     get_method_nodes_rec(method_node_id, method_nodes_ids, file_adj_list)
@@ -136,6 +124,9 @@ def get_method_edges(method_node_id, file_adj_list, file_nodes):
 
 
 def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
+    """
+    Utility to recursively collect the ids of all nodes of a method graph.
+    """
     method_nodes_ids.append(node_id)
 
     for edge in file_adj_list[node_id]:
@@ -144,6 +135,9 @@ def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
 
 
 def remap_edges(edges, nodes):
+    """
+    Remap edges so that node ids start from 0 and are consecutive.
+    """
     old_id_to_new_id = {}
     i = 0
     nodes_values = sorted(nodes.values(), key=lambda node: node.id)
@@ -174,26 +168,10 @@ def is_token(node_value):
     return node_value.type == FeatureNode.TOKEN or node_value.type == FeatureNode.IDENTIFIER_TOKEN
 
 
-def get_tokens(g):
+def get_method_name_node(g, method_node):
     """
-    Get the tokens for a file. Identifiers are split in subtokens.
-    :param g: graph representing the file
-    :return: list of tokens
+    Return the node corresponding to the name of a method.
     """
-    token_nodes = list(filter(lambda n: n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN),
-                         g.node))
-    tokens = []
-    for token_node in token_nodes:
-        if token_node.type == FeatureNode.IDENTIFIER_TOKEN:
-            sub_identifiers = split_identifier_into_parts(token_node.contents)
-            tokens += sub_identifiers
-        else:
-            tokens.append(token_node.contents)
-
-    return tokens
-
-
-def get_method_name_node(g, method_node):
     method_id = method_node.id
     method_name_node_id = 0
 
@@ -221,6 +199,9 @@ def get_class_name_node(g):
 
 
 def get_nx_graph(file):
+    """
+    Get networkx graph corresponding to a file.
+    """
     nx_graph = nx.DiGraph()
     with file.open('rb') as f:
         g = Graph()
@@ -231,116 +212,3 @@ def get_nx_graph(file):
                          edge.type][0]
             nx_graph.add_edge(edge.sourceId, edge.destinationId, edge_type=edge_type)
     return nx_graph
-
-
-def get_tokens_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
-    methods_source = []
-    methods_names = []
-    methods_graphs = []
-
-    proto_files = list(Path(dir).rglob("*.proto"))
-    print("A total of {} files have been found".format(len(proto_files)))
-
-    # proto_files = [Path("../features-javac-master/Test.java.proto")]
-
-    for i, file in enumerate(proto_files):
-        # nx_graph = get_nx_graph(file)
-        if i % 10 == 0:
-            print("Extracting data from file {}".format(i+1))
-        file_methods_source, file_methods_names, file_methods_graph = \
-            get_file_methods_data(file)
-        methods_source += file_methods_source
-        methods_names += file_methods_names
-        methods_graphs += file_methods_graph
-
-    return methods_source, methods_names, methods_graphs
-
-
-def get_method_nodes(method_node, file_graph):
-    method_nodes = [method_node]
-    get_method_nodes_rec(method_node, file_graph, method_nodes)
-
-    return method_nodes
-
-
-# def get_method_nodes_rec(node, file_graph, method_nodes):
-#     print(len(method_nodes))
-#     for e in file_graph.edge:
-#         neighbour = e.destinationId
-#         if neighbour not in method_nodes:
-#             method_nodes.append(neighbour)
-#             get_method_nodes(neighbour, nx_graph, method_nodes)
-
-
-def get_augmented_graph(file):
-    # TODO: Does each method in a file have a different graph?
-    with file.open('rb') as f:
-        g = Graph()
-        g.ParseFromString(f.read())
-
-        augmented_graph = nx.Graph()
-        new_node_id = max([node.id for node in g.node]) + 1
-
-        split_identifiers_node = [node for node in g.node if node.type == FeatureNode.IDENTIFIER_TOKEN
-                                  and len(split_identifier_into_parts(node.contents)) > 1]
-
-        # Add all edges
-        for edge in g.edge:
-            edge_type = [name for name, value in list(vars(FeatureEdge).items())[8:] if value ==
-                         edge.type][0]
-            augmented_graph.add_edge(edge.sourceId, edge.destinationId, edge_type=edge_type)
-
-        # Add new edges for split identifiers and sub identifiers
-        for node in split_identifiers_node:
-            sub_identifiers = split_identifier_into_parts(node.contents)
-            sub_identifiers_ids = list(range(new_node_id, new_node_id + len(sub_identifiers)))
-            new_node_id += len(sub_identifiers)
-
-            # ADD NEXT_TOKEN edge from node before identifier to first sub-identifier
-            previous_token_node_id = find_previous_token_node_id(node, g)
-            augmented_graph.add_edge(previous_token_node_id, sub_identifiers_ids[0],
-                                     edge_type="NEXT_TOKEN")
-
-            # ADD NEXT_TOKEN edge from last sub-identifier to node after identifier
-            next_token_node_id = find_next_token_node_id(node, g)
-            augmented_graph.add_edge(sub_identifiers_ids[-1], next_token_node_id,
-                                     edge_type="NEXT_TOKEN")
-
-            # ADD AST_CHILD edge from ast parent of node to first sub-identifier
-            # ast_parent_node_id = find_ast_parent_node_id(node, g)
-            # augmented_graph.add_edge(ast_parent_node_id, sub_identifiers_ids[0],
-            #                          edge_type="ASSOCIATED_TOKEN")
-
-            for i, sub_identifier_id in enumerate(sub_identifiers_ids):
-                # Add IN_TOKEN edges from sub-identifiers to identifier
-                augmented_graph.add_edge(sub_identifier_id, node.id, edge_type="IN_TOKEN")
-
-                # ADD NEXT_TOKEN edges from sub-identifier to next sub-identifier
-                if i < len(sub_identifiers_ids) - 1:
-                    augmented_graph.add_edge(sub_identifiers_ids[i], sub_identifiers_ids[i + 1],
-                                             edge_type="NEXT_TOKEN")
-    return augmented_graph
-
-
-def find_previous_token_node_id(node, g):
-    for edge in g.edge:
-        if edge.destinationId == node.id and edge.type == FeatureEdge.NEXT_TOKEN:
-            return edge.sourceId
-
-    return None
-
-
-def find_next_token_node_id(node, g):
-    for edge in g.edge:
-        if edge.sourceId == node.id and edge.type == FeatureEdge.NEXT_TOKEN:
-            return edge.destinationId
-
-    return None
-
-
-def find_ast_parent_node_id(node, g):
-    for edge in g.edge:
-        if edge.destinationId == node.id and edge.type == FeatureEdge.ASSOCIATED_TOKEN:
-            return edge.sourceId
-
-    return None
diff --git a/data_processing/data_generation.py b/data_processing/data_generation.py
@@ -1,8 +1,10 @@
 from data_processing.data_extraction import get_dataset_from_dir
 import pickle
 
+# Generate data
 methods_source, methods_names, methods_graphs = get_dataset_from_dir(
     "../corpus/r252-corpus-features/")
 
+# Store data
 pickle.dump({'methods_source': methods_source, 'methods_names': methods_names, 'methods_graphs':
     methods_graphs}, open('data/methods_tokens_graphs.pkl', 'wb'))
diff --git a/data_processing/data_util.py b/data_processing/data_util.py