10
10
11
11
12
12
def get_dataset_from_dir (dir = "../corpus/r252-corpus-features/" ):
13
+ """
14
+ Extract methods source code, names and graphs structure.
15
+ :param dir: directory where to look for proto files
16
+ :return: (methods_source, methods_names, methods_graphs)
17
+ """
13
18
methods_source = []
14
19
methods_names = []
15
20
methods_graphs = []
16
21
17
22
proto_files = list (Path (dir ).rglob ("*.proto" ))
18
23
print ("A total of {} files have been found" .format (len (proto_files )))
19
24
20
- # proto_files = [Path("../features-javac-master/Test.java.proto")]
21
-
22
25
for i , file in enumerate (proto_files ):
23
- # nx_graph = get_nx_graph(file)
24
- # if i % 100 == 0:
25
- print ("Extracting data from file {}" .format (i + 1 ))
26
26
file_methods_source , file_methods_names , file_methods_graph = get_file_methods_data (
27
27
file )
28
28
methods_source += file_methods_source
@@ -34,9 +34,8 @@ def get_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
34
34
35
35
def get_file_methods_data (file ):
36
36
"""
37
- Extract the source code tokens, identifier names and graph for methods in a source file
38
- represented by a graph. Identifier tokens are split into subtokens. Constructors are not
39
- included in the methods.
37
+ Extract the source code tokens, identifier names and graph for methods in a source file.
38
+ Identifier tokens are split into subtokens. Constructors are not included in the methods.
40
39
:param file: file
41
40
:return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of the tokens for
42
41
the source of ith method in the file, methods_names[i] is a list of tokens for name of the
@@ -68,23 +67,6 @@ def get_file_methods_data(file):
68
67
methods_graph .append ((method_edges , non_tokens_nodes_features ))
69
68
methods_names .append (split_identifier_into_parts (method_name_node .contents ))
70
69
71
- # start_line_number = node.startLineNumber
72
- # end_line_number = node.endLineNumber
73
- # method_source = []
74
- # for other_node in g.node:
75
- # if other_node.startLineNumber >= start_line_number and other_node.endLineNumber \
76
- # <= end_line_number:
77
- # # if other_node.type == FeatureNode.TOKEN:
78
- # # method_source.append(other_node.contents)
79
- # # elif other_node.type == FeatureNode.IDENTIFIER_TOKEN:
80
- # # sub_identifiers = split_identifier_into_parts(other_node.contents)
81
- # # method_source += sub_identifiers
82
- # if other_node.id == method_name_node.id:
83
- # method_source.append('_')
84
- # elif other_node.type == FeatureNode.TOKEN or other_node.type == \
85
- # FeatureNode.IDENTIFIER_TOKEN:
86
- # method_source.append(other_node.contents)
87
-
88
70
method_source = []
89
71
90
72
for other_node in method_nodes .values ():
@@ -101,6 +83,9 @@ def get_file_methods_data(file):
101
83
102
84
103
85
def get_file_graph (file ):
86
+ """
87
+ Compute graph for the given file.
88
+ """
104
89
with file .open ('rb' ) as f :
105
90
g = Graph ()
106
91
g .ParseFromString (f .read ())
@@ -117,6 +102,9 @@ def get_file_graph(file):
117
102
118
103
119
104
def get_method_edges (method_node_id , file_adj_list , file_nodes ):
105
+ """
106
+ Compute edges of a method graph for a method starting at the node 'method_node_id'.
107
+ """
120
108
method_nodes_ids = []
121
109
122
110
get_method_nodes_rec (method_node_id , method_nodes_ids , file_adj_list )
@@ -136,6 +124,9 @@ def get_method_edges(method_node_id, file_adj_list, file_nodes):
136
124
137
125
138
126
def get_method_nodes_rec (node_id , method_nodes_ids , file_adj_list ):
127
+ """
128
+ Utilities to recursively retrieve all edges of a method graph.
129
+ """
139
130
method_nodes_ids .append (node_id )
140
131
141
132
for edge in file_adj_list [node_id ]:
@@ -144,6 +135,9 @@ def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
144
135
145
136
146
137
def remap_edges (edges , nodes ):
138
+ """
139
+ Remap edges so that ids start from 0 and are consecutive.
140
+ """
147
141
old_id_to_new_id = {}
148
142
i = 0
149
143
nodes_values = sorted (nodes .values (), key = lambda node : node .id )
@@ -174,26 +168,10 @@ def is_token(node_value):
174
168
return node_value .type == FeatureNode .TOKEN or node_value .type == FeatureNode .IDENTIFIER_TOKEN
175
169
176
170
177
- def get_tokens ( g ):
171
+ def get_method_name_node ( g , method_node ):
178
172
"""
179
- Get the tokens for a file. Identifiers are split in subtokens.
180
- :param g: graph representing the file
181
- :return: list of tokens
173
+ Return the node corresponding to the name of a method.
182
174
"""
183
- token_nodes = list (filter (lambda n : n .type in (FeatureNode .TOKEN , FeatureNode .IDENTIFIER_TOKEN ),
184
- g .node ))
185
- tokens = []
186
- for token_node in token_nodes :
187
- if token_node .type == FeatureNode .IDENTIFIER_TOKEN :
188
- sub_identifiers = split_identifier_into_parts (token_node .contents )
189
- tokens += sub_identifiers
190
- else :
191
- tokens .append (token_node .contents )
192
-
193
- return tokens
194
-
195
-
196
- def get_method_name_node (g , method_node ):
197
175
method_id = method_node .id
198
176
method_name_node_id = 0
199
177
@@ -221,6 +199,9 @@ def get_class_name_node(g):
221
199
222
200
223
201
def get_nx_graph (file ):
202
+ """
203
+ Get networkx graph corresponding to a file.
204
+ """
224
205
nx_graph = nx .DiGraph ()
225
206
with file .open ('rb' ) as f :
226
207
g = Graph ()
@@ -231,116 +212,3 @@ def get_nx_graph(file):
231
212
edge .type ][0 ]
232
213
nx_graph .add_edge (edge .sourceId , edge .destinationId , edge_type = edge_type )
233
214
return nx_graph
234
-
235
-
236
- def get_tokens_dataset_from_dir (dir = "../corpus/r252-corpus-features/" ):
237
- methods_source = []
238
- methods_names = []
239
- methods_graphs = []
240
-
241
- proto_files = list (Path (dir ).rglob ("*.proto" ))
242
- print ("A total of {} files have been found" .format (len (proto_files )))
243
-
244
- # proto_files = [Path("../features-javac-master/Test.java.proto")]
245
-
246
- for i , file in enumerate (proto_files ):
247
- # nx_graph = get_nx_graph(file)
248
- if i % 10 == 0 :
249
- print ("Extracting data from file {}" .format (i + 1 ))
250
- file_methods_source , file_methods_names , file_methods_graph = \
251
- get_file_methods_data (file )
252
- methods_source += file_methods_source
253
- methods_names += file_methods_names
254
- methods_graphs += file_methods_graph
255
-
256
- return methods_source , methods_names , methods_graphs
257
-
258
-
259
- def get_method_nodes (method_node , file_graph ):
260
- method_nodes = [method_node ]
261
- get_method_nodes_rec (method_node , file_graph , method_nodes )
262
-
263
- return method_nodes
264
-
265
-
266
- # def get_method_nodes_rec(node, file_graph, method_nodes):
267
- # print(len(method_nodes))
268
- # for e in file_graph.edge:
269
- # neighbour = e.destinationId
270
- # if neighbour not in method_nodes:
271
- # method_nodes.append(neighbour)
272
- # get_method_nodes(neighbour, nx_graph, method_nodes)
273
-
274
-
275
- def get_augmented_graph (file ):
276
- # TODO: Does each method in a file have a different graph?
277
- with file .open ('rb' ) as f :
278
- g = Graph ()
279
- g .ParseFromString (f .read ())
280
-
281
- augmented_graph = nx .Graph ()
282
- new_node_id = max ([node .id for node in g .node ]) + 1
283
-
284
- split_identifiers_node = [node for node in g .node if node .type == FeatureNode .IDENTIFIER_TOKEN
285
- and len (split_identifier_into_parts (node .contents )) > 1 ]
286
-
287
- # Add all edges
288
- for edge in g .edge :
289
- edge_type = [name for name , value in list (vars (FeatureEdge ).items ())[8 :] if value ==
290
- edge .type ][0 ]
291
- augmented_graph .add_edge (edge .sourceId , edge .destinationId , edge_type = edge_type )
292
-
293
- # Add new edges for split identifiers and sub identifiers
294
- for node in split_identifiers_node :
295
- sub_identifiers = split_identifier_into_parts (node .contents )
296
- sub_identifiers_ids = list (range (new_node_id , new_node_id + len (sub_identifiers )))
297
- new_node_id += len (sub_identifiers )
298
-
299
- # ADD NEXT_TOKEN edge from node before identifier to first sub-identifier
300
- previous_token_node_id = find_previous_token_node_id (node , g )
301
- augmented_graph .add_edge (previous_token_node_id , sub_identifiers_ids [0 ],
302
- edge_type = "NEXT_TOKEN" )
303
-
304
- # ADD NEXT_TOKEN edge from last sub-identifier to node after identifier
305
- next_token_node_id = find_next_token_node_id (node , g )
306
- augmented_graph .add_edge (sub_identifiers_ids [- 1 ], next_token_node_id ,
307
- edge_type = "NEXT_TOKEN" )
308
-
309
- # ADD AST_CHILD edge from ast parent of node to first sub-identifier
310
- # ast_parent_node_id = find_ast_parent_node_id(node, g)
311
- # augmented_graph.add_edge(ast_parent_node_id, sub_identifiers_ids[0],
312
- # edge_type="ASSOCIATED_TOKEN")
313
-
314
- for i , sub_identifier_id in enumerate (sub_identifiers_ids ):
315
- # Add IN_TOKEN edges from sub-identifiers to identifier
316
- augmented_graph .add_edge (sub_identifier_id , node .id , edge_type = "IN_TOKEN" )
317
-
318
- # ADD NEXT_TOKEN edges from sub-identifier to next sub-identifier
319
- if i < len (sub_identifiers_ids ) - 1 :
320
- augmented_graph .add_edge (sub_identifiers_ids [i ], sub_identifiers_ids [i + 1 ],
321
- edge_type = "NEXT_TOKEN" )
322
- return augmented_graph
323
-
324
-
325
- def find_previous_token_node_id (node , g ):
326
- for edge in g .edge :
327
- if edge .destinationId == node .id and edge .type == FeatureEdge .NEXT_TOKEN :
328
- return edge .sourceId
329
-
330
- return None
331
-
332
-
333
- def find_next_token_node_id (node , g ):
334
- for edge in g .edge :
335
- if edge .sourceId == node .id and edge .type == FeatureEdge .NEXT_TOKEN :
336
- return edge .destinationId
337
-
338
- return None
339
-
340
-
341
- def find_ast_parent_node_id (node , g ):
342
- for edge in g .edge :
343
- if edge .destinationId == node .id and edge .type == FeatureEdge .ASSOCIATED_TOKEN :
344
- return edge .sourceId
345
-
346
- return None
0 commit comments