Skip to content

Commit 72f7c48

Browse files
committed
Change data extraction to also have node features
1 parent 371d0eb commit 72f7c48

File tree

2 files changed

+118
-110
lines changed

2 files changed

+118
-110
lines changed

data_extraction.py

Lines changed: 116 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,102 @@
44
from graph_pb2 import Graph
55
import networkx as nx
66
import sys
7+
import numpy as np
78

89
sys.setrecursionlimit(10000)
910

1011

12+
def get_dataset_from_dir(dir="../corpus/r252-corpus-features/", max_files=100):
    """
    Collect the per-method data of every ``*.proto`` file found under a directory.

    The directory is searched recursively. Each file is handed to
    ``get_file_methods_data`` and the per-file results are concatenated.

    :param dir: root directory that is searched recursively for ``*.proto`` files
    :param max_files: upper bound on the number of files that are processed; the default of
    100 keeps the previous hard-coded limit. Pass None to process every file found.
    :return: (methods_source, methods_names, methods_graphs), three parallel lists holding the
    concatenated results returned by get_file_methods_data for each processed file.
    """
    methods_source = []
    methods_names = []
    methods_graphs = []

    proto_files = list(Path(dir).rglob("*.proto"))
    print("A total of {} files have been found".format(len(proto_files)))

    # The limit is applied after the total is reported, exactly as before.
    if max_files is not None:
        proto_files = proto_files[:max_files]

    for i, file in enumerate(proto_files):
        print("Extracting data from file {}".format(i+1))
        file_methods_source, file_methods_names, file_methods_graph = \
            get_file_methods_data(file)
        methods_source.extend(file_methods_source)
        methods_names.extend(file_methods_names)
        methods_graphs.extend(file_methods_graph)

    return methods_source, methods_names, methods_graphs
33+
34+
35+
def get_file_methods_data(file):
    """
    Extract the source code tokens, identifier names and graph for methods in a source file
    represented by a graph. Method names are split into subtokens; source tokens are kept
    as they appear in the graph. Constructors are not included in the methods.
    :param file: path object of the serialized graph file (must provide ``name`` and ``open``)
    :return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of the tokens for
    the source of ith method in the file (with the method's own name replaced by '_'),
    methods_names[i] is a list of subtokens for the name of the ith method in the file, and
    methods_graph[i] is the tuple (method_edges, non_tokens_nodes_features) returned by
    get_method_edges for the ith method.
    """
    adj_list, nodes, _ = get_file_graph(file)

    # The class name is the first dot-separated component of the file name. Comparing the
    # whole split() list against a string was always False, so constructors were never
    # filtered out.
    # NOTE(review): assumes file names look like "<ClassName>.java.proto" - confirm.
    class_name = file.name.split('.')[0]

    g = Graph()
    with file.open('rb') as f:
        g.ParseFromString(f.read())

    methods_source = []
    methods_names = []
    methods_graph = []

    for node in g.node:
        if node.contents != "METHOD":
            continue

        method_name_node = get_method_name_node(g, node)

        # If method name is the same as class name, then method is a constructor,
        # so discard it
        if method_name_node.contents == class_name:
            continue

        method_edges, method_nodes, non_tokens_nodes_features = get_method_edges(
            node.id, adj_list, nodes)
        methods_graph.append((method_edges, non_tokens_nodes_features))
        methods_names.append(split_identifier_into_parts(method_name_node.contents))

        method_source = []
        for other_node in method_nodes.values():
            if other_node.id == method_name_node.id:
                # Replace method name with '_' in method source code
                method_source.append('_')
            elif is_token(other_node):
                method_source.append(other_node.contents)

        methods_source.append(method_source)

    return methods_source, methods_names, methods_graph
101+
102+
11103
def get_file_graph(file):
12104
with file.open('rb') as f:
13105
g = Graph()
@@ -38,9 +130,17 @@ def get_method_edges(method_node_id, file_adj_list, file_nodes):
38130
method_nodes = {node_id: node for node_id, node in file_nodes.items() if node_id in
39131
method_nodes_ids}
40132

41-
methods_edges = remap_edges(methods_edges, method_nodes)
133+
methods_edges, non_tokens_nodes_features = remap_edges(methods_edges, method_nodes)
134+
135+
return methods_edges, method_nodes, non_tokens_nodes_features
136+
137+
138+
def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
    """
    Collect, in depth-first pre-order, the ids of all nodes reachable from a node without
    following NEXT_TOKEN edges.

    The traversal is iterative (explicit stack) so deep graphs cannot hit the interpreter
    recursion limit, and uses a set for the visited check instead of scanning the list.
    The visiting order is the same as the recursive formulation.

    :param node_id: id of the node the traversal starts from; it is always appended
    :param method_nodes_ids: list that is extended in place with the visited node ids; ids
    already present in it are treated as visited
    :param file_adj_list: mapping node id -> list of edge dicts with the keys 'edge_type'
    and 'destination'
    :return: None, the result is accumulated in method_nodes_ids
    """
    visited = set(method_nodes_ids)
    visited.add(node_id)
    method_nodes_ids.append(node_id)

    # Each stack entry is the partially consumed edge iterator of a node on the current path.
    pending = [iter(file_adj_list[node_id])]
    while pending:
        for edge in pending[-1]:
            if edge['edge_type'] == FeatureEdge.NEXT_TOKEN:
                continue
            destination = edge['destination']
            if destination in visited:
                continue
            visited.add(destination)
            method_nodes_ids.append(destination)
            # Descend first; the parent's iterator resumes after the child is exhausted.
            pending.append(iter(file_adj_list[destination]))
            break
        else:
            pending.pop()
44144

45145

46146
def remap_edges(edges, nodes):
@@ -55,30 +155,25 @@ def remap_edges(edges, nodes):
55155
old_id_to_new_id[node_value.id] = i
56156
i += 1
57157

158+
non_tokens_nodes_features = np.zeros((len(nodes_values) - len(old_id_to_new_id), 11))
159+
j = i
58160
# Set new ids for other nodes
59161
for node_value in nodes_values:
60162
if not is_token(node_value):
61163
old_id_to_new_id[node_value.id] = i
164+
non_tokens_nodes_features[i - j][node_value.type - 1] = 1
62165
i += 1
63166

64167
for edge in edges:
65168
new_edges.append((old_id_to_new_id[edge[0]], old_id_to_new_id[edge[1]]))
66169

67-
return new_edges
170+
return new_edges, non_tokens_nodes_features
68171

69172

70173
def is_token(node_value):
    """
    Tell whether a graph node is a token node.

    :param node_value: node whose ``type`` attribute is inspected
    :return: True if the type is FeatureNode.TOKEN or FeatureNode.IDENTIFIER_TOKEN
    """
    token_types = (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN)
    return node_value.type in token_types
72175

73176

74-
def get_method_nodes_rec(node_id, method_nodes_ids, file_adj_list):
75-
method_nodes_ids.append(node_id)
76-
77-
for edge in file_adj_list[node_id]:
78-
if edge['edge_type'] != FeatureEdge.NEXT_TOKEN and edge['destination'] not in method_nodes_ids:
79-
get_method_nodes_rec(edge['destination'], method_nodes_ids, file_adj_list)
80-
81-
82177
def get_tokens(g):
83178
"""
84179
Get the tokens for a file. Identifiers are split in subtokens.
@@ -98,72 +193,6 @@ def get_tokens(g):
98193
return tokens
99194

100195

101-
def get_methods_source_and_name(file, nx_graph):
102-
"""
103-
Extract the source code token and identifier names for methods in a source file represented
104-
by a graph. Identifier tokens are split into subtokens. Constructors are not included in the
105-
methods.
106-
:param file: file
107-
:return: (methods_source, methods_names) where methods_source[i] is a list of the tokens for
108-
the source of ith method in the file, and methods_names[i] is a list of tokens for name of the
109-
ith
110-
"""
111-
adj_list, nodes, edges = get_file_graph(file)
112-
113-
with file.open('rb') as f:
114-
class_name = file.name.split('.')
115-
116-
g = Graph()
117-
g.ParseFromString(f.read())
118-
methods_source = []
119-
methods_names = []
120-
methods_graph = []
121-
# class_name_node = get_class_name_node(g)
122-
123-
for node in g.node:
124-
if node.contents == "METHOD":
125-
method_name_node = get_method_name_node(g, node)
126-
127-
# If method name is the same as class name, then method name is constructor,
128-
# so discard it
129-
if method_name_node.contents == class_name:
130-
continue
131-
132-
method_edges, method_nodes = get_method_edges(node.id, adj_list, nodes)
133-
methods_graph.append(method_edges)
134-
methods_names.append(split_identifier_into_parts(method_name_node.contents))
135-
136-
# start_line_number = node.startLineNumber
137-
# end_line_number = node.endLineNumber
138-
# method_source = []
139-
# for other_node in g.node:
140-
# if other_node.startLineNumber >= start_line_number and other_node.endLineNumber \
141-
# <= end_line_number:
142-
# # if other_node.type == FeatureNode.TOKEN:
143-
# # method_source.append(other_node.contents)
144-
# # elif other_node.type == FeatureNode.IDENTIFIER_TOKEN:
145-
# # sub_identifiers = split_identifier_into_parts(other_node.contents)
146-
# # method_source += sub_identifiers
147-
# if other_node.id == method_name_node.id:
148-
# method_source.append('_')
149-
# elif other_node.type == FeatureNode.TOKEN or other_node.type == \
150-
# FeatureNode.IDENTIFIER_TOKEN:
151-
# method_source.append(other_node.contents)
152-
153-
method_source = []
154-
155-
for other_node in method_nodes.values():
156-
if other_node.id == method_name_node.id:
157-
method_source.append('_')
158-
elif other_node.type == FeatureNode.TOKEN or other_node.type == \
159-
FeatureNode.IDENTIFIER_TOKEN:
160-
method_source.append(other_node.contents)
161-
162-
methods_source.append(method_source)
163-
164-
return methods_source, methods_names, methods_graph
165-
166-
167196
def get_method_name_node(g, method_node):
168197
method_id = method_node.id
169198
method_name_node_id = 0
@@ -191,27 +220,6 @@ def get_class_name_node(g):
191220
return class_associated_nodes[1]
192221

193222

194-
def get_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
195-
methods_source = []
196-
methods_names = []
197-
methods_graphs = []
198-
199-
proto_files = list(Path(dir).rglob("*.proto"))
200-
print("A total of {} files have been found".format(len(proto_files)))
201-
202-
for i, file in enumerate(proto_files):
203-
nx_graph = get_nx_graph(file)
204-
# if i % 100 == 0:
205-
print("Extracting data from file {}".format(i+1))
206-
file_methods_source, file_methods_names, file_methods_graph = get_methods_source_and_name(
207-
file, nx_graph)
208-
methods_source += file_methods_source
209-
methods_names += file_methods_names
210-
methods_graphs += file_methods_graph
211-
212-
return methods_source, methods_names, methods_graphs
213-
214-
215223
def get_nx_graph(file):
216224
nx_graph = nx.DiGraph()
217225
with file.open('rb') as f:
@@ -235,12 +243,12 @@ def get_tokens_dataset_from_dir(dir="../corpus/r252-corpus-features/"):
235243

236244
# proto_files = [Path("../features-javac-master/Test.java.proto")]
237245

238-
for i, file in enumerate(proto_files[:100]):
239-
nx_graph = get_nx_graph(file)
246+
for i, file in enumerate(proto_files):
247+
# nx_graph = get_nx_graph(file)
240248
if i % 10 == 0:
241249
print("Extracting data from file {}".format(i+1))
242250
file_methods_source, file_methods_names, file_methods_graph = \
243-
get_methods_source_and_name(file, nx_graph)
251+
get_file_methods_data(file)
244252
methods_source += file_methods_source
245253
methods_names += file_methods_names
246254
methods_graphs += file_methods_graph
@@ -255,13 +263,13 @@ def get_method_nodes(method_node, file_graph):
255263
return method_nodes
256264

257265

258-
def get_method_nodes_rec(node, file_graph, method_nodes):
259-
print(len(method_nodes))
260-
for e in file_graph.edge:
261-
neighbour = e.destinationId
262-
if neighbour not in method_nodes:
263-
method_nodes.append(neighbour)
264-
get_method_nodes(neighbour, nx_graph, method_nodes)
266+
# def get_method_nodes_rec(node, file_graph, method_nodes):
267+
# print(len(method_nodes))
268+
# for e in file_graph.edge:
269+
# neighbour = e.destinationId
270+
# if neighbour not in method_nodes:
271+
# method_nodes.append(neighbour)
272+
# get_method_nodes(neighbour, nx_graph, method_nodes)
265273

266274

267275
def get_augmented_graph(file):

data_generation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from data_extraction import get_dataset_from_dir, get_tokens_dataset_from_dir
22
import pickle
33

4-
# Build the dataset from the corpus directory and store it as a pickle file.
methods_source, methods_names, methods_graphs = get_dataset_from_dir(
    "../corpus/r252-corpus-features/")

dataset = {
    'methods_source': methods_source,
    'methods_names': methods_names,
    'methods_graphs': methods_graphs,
}

# Use a context manager so the output file is flushed and closed deterministically
# instead of relying on garbage collection of the anonymous file object.
with open('data/methods_tokens_graphs2.pkl', 'wb') as output_file:
    pickle.dump(dataset, output_file)

0 commit comments

Comments
 (0)