4
4
from graph_pb2 import Graph
5
5
import networkx as nx
6
6
import sys
7
+ import numpy as np
7
8
8
9
sys .setrecursionlimit (10000 )
9
10
10
11
12
def get_dataset_from_dir(dir="../corpus/r252-corpus-features/", max_files=100):
    """
    Collect the per-method data of the ``*.proto`` files found under a directory.

    The directory is searched recursively. Each selected file is passed to
    get_file_methods_data and the three lists it returns are concatenated, in file order.

    :param dir: root directory searched recursively for ``*.proto`` files
    :param max_files: maximum number of files to process; ``None`` processes every file
    found. The default of 100 keeps the limit this function previously hard-coded.
    :return: (methods_source, methods_names, methods_graphs), the concatenation over all
    processed files of the three lists returned by get_file_methods_data
    """
    methods_source = []
    methods_names = []
    methods_graphs = []

    proto_files = list(Path(dir).rglob("*.proto"))
    print("A total of {} files have been found".format(len(proto_files)))

    # Slicing with None keeps the whole list, so max_files=None means "no limit".
    for i, file in enumerate(proto_files[:max_files]):
        print("Extracting data from file {}".format(i + 1))
        file_methods_source, file_methods_names, file_methods_graph = \
            get_file_methods_data(file)
        methods_source += file_methods_source
        methods_names += file_methods_names
        methods_graphs += file_methods_graph

    return methods_source, methods_names, methods_graphs
33
+
34
+
35
def get_file_methods_data(file):
    """
    Extract the source code tokens, identifier names and graph for methods in a source file
    represented by a graph. Method names are split into subtokens. Constructors are not
    included in the methods.
    :param file: path object of the serialized graph; it must provide ``name`` and ``open``
    :return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of
    the token contents of the ith method in the file (with the method name replaced by '_'),
    methods_names[i] is the list of subtokens of the name of the ith method, and
    methods_graph[i] is the pair (method_edges, non_tokens_nodes_features) returned by
    get_method_edges for the ith method.
    """
    adj_list, nodes, _ = get_file_graph(file)

    # Keep only the first dot-separated component of the file name (e.g. "Test" for
    # "Test.java.proto"). The previous code kept the whole list returned by split(), which
    # can never equal a string, so constructors were never filtered out.
    # NOTE(review): assumes the file is named after its top-level class - confirm.
    class_name = file.name.split('.')[0]

    with file.open('rb') as f:
        g = Graph()
        g.ParseFromString(f.read())

    methods_source = []
    methods_names = []
    methods_graph = []

    for node in g.node:
        if node.contents != "METHOD":
            continue

        method_name_node = get_method_name_node(g, node)

        # If method name is the same as class name, then the method is a constructor,
        # so discard it
        if method_name_node.contents == class_name:
            continue

        method_edges, method_nodes, non_tokens_nodes_features = get_method_edges(
            node.id, adj_list, nodes)
        methods_graph.append((method_edges, non_tokens_nodes_features))
        methods_names.append(split_identifier_into_parts(method_name_node.contents))

        method_source = []
        for other_node in method_nodes.values():
            if other_node.id == method_name_node.id:
                # Replace method name with '_' in method source code
                method_source.append('_')
            elif is_token(other_node):
                method_source.append(other_node.contents)

        methods_source.append(method_source)

    return methods_source, methods_names, methods_graph
101
+
102
+
11
103
def get_file_graph (file ):
12
104
with file .open ('rb' ) as f :
13
105
g = Graph ()
@@ -38,9 +130,17 @@ def get_method_edges(method_node_id, file_adj_list, file_nodes):
38
130
method_nodes = {node_id : node for node_id , node in file_nodes .items () if node_id in
39
131
method_nodes_ids }
40
132
41
- methods_edges = remap_edges (methods_edges , method_nodes )
133
+ methods_edges , non_tokens_nodes_features = remap_edges (methods_edges , method_nodes )
134
+
135
+ return methods_edges , method_nodes , non_tokens_nodes_features
136
+
137
+
138
def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
    """
    Append to method_nodes_ids the ids of all nodes reachable from node_id without following
    NEXT_TOKEN edges.

    Nodes are appended in depth-first pre-order, the same order the former recursive
    implementation produced. The walk is iterative, so its depth is not bounded by the
    interpreter recursion limit, and a set is used for the "already collected" check so that
    each edge is examined in constant time.

    :param node_id: id of the node the traversal starts from; it is always appended
    :param method_nodes_ids: list of already collected node ids; it is extended in place
    :param file_adj_list: mapping from a node id to its outgoing edges, each edge being a
    mapping with the keys 'edge_type' and 'destination'
    :return: None
    """
    # Ids already present in the caller's list count as visited, as they did before.
    visited = set(method_nodes_ids)

    method_nodes_ids.append(node_id)
    visited.add(node_id)

    # Each stack entry is the partially consumed edge iterator of a node on the current path.
    pending = [iter(file_adj_list[node_id])]
    while pending:
        for edge in pending[-1]:
            destination = edge['destination']
            if destination not in visited and edge['edge_type'] != FeatureEdge.NEXT_TOKEN:
                visited.add(destination)
                method_nodes_ids.append(destination)
                # Descend now; the remaining edges of the parent are resumed afterwards.
                pending.append(iter(file_adj_list[destination]))
                break
        else:
            # Every edge of the node on top of the stack has been handled.
            pending.pop()
44
144
45
145
46
146
def remap_edges (edges , nodes ):
@@ -55,30 +155,25 @@ def remap_edges(edges, nodes):
55
155
old_id_to_new_id [node_value .id ] = i
56
156
i += 1
57
157
158
+ non_tokens_nodes_features = np .zeros ((len (nodes_values ) - len (old_id_to_new_id ), 11 ))
159
+ j = i
58
160
# Set new ids for other nodes
59
161
for node_value in nodes_values :
60
162
if not is_token (node_value ):
61
163
old_id_to_new_id [node_value .id ] = i
164
+ non_tokens_nodes_features [i - j ][node_value .type - 1 ] = 1
62
165
i += 1
63
166
64
167
for edge in edges :
65
168
new_edges .append ((old_id_to_new_id [edge [0 ]], old_id_to_new_id [edge [1 ]]))
66
169
67
- return new_edges
170
+ return new_edges , non_tokens_nodes_features
68
171
69
172
70
173
def is_token(node_value):
    """
    Tell whether a node is a token node.
    :param node_value: node object exposing a ``type`` attribute
    :return: True if the node type is FeatureNode.TOKEN or FeatureNode.IDENTIFIER_TOKEN,
    False otherwise
    """
    token_types = (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN)
    return node_value.type in token_types
72
175
73
176
74
- def get_method_nodes_rec (node_id , method_nodes_ids , file_adj_list ):
75
- method_nodes_ids .append (node_id )
76
-
77
- for edge in file_adj_list [node_id ]:
78
- if edge ['edge_type' ] != FeatureEdge .NEXT_TOKEN and edge ['destination' ] not in method_nodes_ids :
79
- get_method_nodes_rec (edge ['destination' ], method_nodes_ids , file_adj_list )
80
-
81
-
82
177
def get_tokens (g ):
83
178
"""
84
179
Get the tokens for a file. Identifiers are split in subtokens.
@@ -98,72 +193,6 @@ def get_tokens(g):
98
193
return tokens
99
194
100
195
101
- def get_methods_source_and_name (file , nx_graph ):
102
- """
103
- Extract the source code token and identifier names for methods in a source file represented
104
- by a graph. Identifier tokens are split into subtokens. Constructors are not included in the
105
- methods.
106
- :param file: file
107
- :return: (methods_source, methods_names) where methods_source[i] is a list of the tokens for
108
- the source of ith method in the file, and methods_names[i] is a list of tokens for name of the
109
- ith
110
- """
111
- adj_list , nodes , edges = get_file_graph (file )
112
-
113
- with file .open ('rb' ) as f :
114
- class_name = file .name .split ('.' )
115
-
116
- g = Graph ()
117
- g .ParseFromString (f .read ())
118
- methods_source = []
119
- methods_names = []
120
- methods_graph = []
121
- # class_name_node = get_class_name_node(g)
122
-
123
- for node in g .node :
124
- if node .contents == "METHOD" :
125
- method_name_node = get_method_name_node (g , node )
126
-
127
- # If method name is the same as class name, then method name is constructor,
128
- # so discard it
129
- if method_name_node .contents == class_name :
130
- continue
131
-
132
- method_edges , method_nodes = get_method_edges (node .id , adj_list , nodes )
133
- methods_graph .append (method_edges )
134
- methods_names .append (split_identifier_into_parts (method_name_node .contents ))
135
-
136
- # start_line_number = node.startLineNumber
137
- # end_line_number = node.endLineNumber
138
- # method_source = []
139
- # for other_node in g.node:
140
- # if other_node.startLineNumber >= start_line_number and other_node.endLineNumber \
141
- # <= end_line_number:
142
- # # if other_node.type == FeatureNode.TOKEN:
143
- # # method_source.append(other_node.contents)
144
- # # elif other_node.type == FeatureNode.IDENTIFIER_TOKEN:
145
- # # sub_identifiers = split_identifier_into_parts(other_node.contents)
146
- # # method_source += sub_identifiers
147
- # if other_node.id == method_name_node.id:
148
- # method_source.append('_')
149
- # elif other_node.type == FeatureNode.TOKEN or other_node.type == \
150
- # FeatureNode.IDENTIFIER_TOKEN:
151
- # method_source.append(other_node.contents)
152
-
153
- method_source = []
154
-
155
- for other_node in method_nodes .values ():
156
- if other_node .id == method_name_node .id :
157
- method_source .append ('_' )
158
- elif other_node .type == FeatureNode .TOKEN or other_node .type == \
159
- FeatureNode .IDENTIFIER_TOKEN :
160
- method_source .append (other_node .contents )
161
-
162
- methods_source .append (method_source )
163
-
164
- return methods_source , methods_names , methods_graph
165
-
166
-
167
196
def get_method_name_node (g , method_node ):
168
197
method_id = method_node .id
169
198
method_name_node_id = 0
@@ -191,27 +220,6 @@ def get_class_name_node(g):
191
220
return class_associated_nodes [1 ]
192
221
193
222
194
- def get_dataset_from_dir (dir = "../corpus/r252-corpus-features/" ):
195
- methods_source = []
196
- methods_names = []
197
- methods_graphs = []
198
-
199
- proto_files = list (Path (dir ).rglob ("*.proto" ))
200
- print ("A total of {} files have been found" .format (len (proto_files )))
201
-
202
- for i , file in enumerate (proto_files ):
203
- nx_graph = get_nx_graph (file )
204
- # if i % 100 == 0:
205
- print ("Extracting data from file {}" .format (i + 1 ))
206
- file_methods_source , file_methods_names , file_methods_graph = get_methods_source_and_name (
207
- file , nx_graph )
208
- methods_source += file_methods_source
209
- methods_names += file_methods_names
210
- methods_graphs += file_methods_graph
211
-
212
- return methods_source , methods_names , methods_graphs
213
-
214
-
215
223
def get_nx_graph (file ):
216
224
nx_graph = nx .DiGraph ()
217
225
with file .open ('rb' ) as f :
@@ -235,12 +243,12 @@ def get_tokens_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
235
243
236
244
# proto_files = [Path("../features-javac-master/Test.java.proto")]
237
245
238
- for i , file in enumerate (proto_files [: 100 ] ):
239
- nx_graph = get_nx_graph (file )
246
+ for i , file in enumerate (proto_files ):
247
+ # nx_graph = get_nx_graph(file)
240
248
if i % 10 == 0 :
241
249
print ("Extracting data from file {}" .format (i + 1 ))
242
250
file_methods_source , file_methods_names , file_methods_graph = \
243
- get_methods_source_and_name (file , nx_graph )
251
+ get_file_methods_data (file )
244
252
methods_source += file_methods_source
245
253
methods_names += file_methods_names
246
254
methods_graphs += file_methods_graph
@@ -255,13 +263,13 @@ def get_method_nodes(method_node, file_graph):
255
263
return method_nodes
256
264
257
265
258
- def get_method_nodes_rec (node , file_graph , method_nodes ):
259
- print (len (method_nodes ))
260
- for e in file_graph .edge :
261
- neighbour = e .destinationId
262
- if neighbour not in method_nodes :
263
- method_nodes .append (neighbour )
264
- get_method_nodes (neighbour , nx_graph , method_nodes )
266
+ # def get_method_nodes_rec(node, file_graph, method_nodes):
267
+ # print(len(method_nodes))
268
+ # for e in file_graph.edge:
269
+ # neighbour = e.destinationId
270
+ # if neighbour not in method_nodes:
271
+ # method_nodes.append(neighbour)
272
+ # get_method_nodes(neighbour, nx_graph, method_nodes)
265
273
266
274
267
275
def get_augmented_graph (file ):
0 commit comments