-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGraphml_Filter.py
244 lines (180 loc) · 9.64 KB
/
Graphml_Filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import argparse
import gzip
import logging
import os
import xml.etree.ElementTree as ET
from ast import literal_eval
from xml.etree.ElementTree import ElementTree
def getGraphmlFiles(dirPath):
    """Load every graphml file of a directory as a parsed XML root element.

    Plain ``.graphml`` files and gzip-compressed ``.graphml.gz`` files are
    both read; every other directory entry is ignored.

    Parameters
    ----------
    dirPath : str
        Path of the directory holding the graphml files (a trailing path
        separator is optional)

    Returns
    -------
    dict
        a dict with the name of each file (without its ``.graphml`` /
        ``.graphml.gz`` extension) as keys and the root elements of the
        parsed XML trees as values

    Raises
    ------
    OSError
        If the directory cannot be listed or a file cannot be read
    xml.etree.ElementTree.ParseError
        If a file does not contain well-formed XML
    """
    gzipSuffix = ".graphml.gz"
    plainSuffix = ".graphml"
    # Dict of the data from the graphml files
    treeDataDict = {}
    for fileName in os.listdir(dirPath):
        # os.path.join works whether or not dirPath ends with a separator
        filePath = os.path.join(dirPath, fileName)
        # The compressed suffix is tested first because only its ".gz" ending
        # distinguishes it from a plain graphml file name
        if fileName.endswith(gzipSuffix):
            # The context manager guarantees the gzip handle is closed
            with gzip.open(filePath, 'rb') as compressedFile:
                fileData = compressedFile.read()
            # Slice the suffix off the end only, so a name that contains
            # ".graphml" elsewhere is left intact
            treeDataDict[fileName[:-len(gzipSuffix)]] = ET.fromstring(fileData)
        elif fileName.endswith(plainSuffix):
            treeDataDict[fileName[:-len(plainSuffix)]] = ET.parse(filePath).getroot()
    return treeDataDict
def _resolveWatchedKeys(root, namespace, domain, wantedKeys):
    """Map the ids of the <key> declarations of `domain` to their accepted values."""
    watchedKeys = {}
    for key in root.findall('key', namespace):
        # .get() is used because a <key> may omit 'for' or 'attr.name'
        attrName = key.attrib.get('attr.name')
        if key.attrib.get('for') == domain and attrName in wantedKeys:
            watchedKeys[key.attrib['id']] = wantedKeys[attrName]
    return watchedKeys


def _matchesWatchedKeys(element, namespace, watchedKeys):
    """Tell whether `element` fulfils at least one watched key (or none is set)."""
    # No condition = accept everything
    if not watchedKeys:
        return True
    for keyId, acceptedValues in watchedKeys.items():
        data = element.find(".//data[@key='%s']" % keyId, namespace)
        # An element without this data entry simply does not match
        if data is not None and data.text in acceptedValues:
            return True
    return False


def filterTreeDataDictFromKeys(treeDataDict, resultdirectory, nodeKeys, edgesKeys):
    """Filter every tree on nodes/edges keys and save the result as graphml files.

    For each tree, the nodes fulfilling at least one of the `nodeKeys`
    conditions are selected. The edges connected to one of those nodes and
    fulfilling at least one of the `edgesKeys` conditions are kept, together
    with both of their end nodes. Every other node and edge is removed from
    the tree, which is then written to
    ``<resultdirectory>/<tree name>.graphml``.

    The trees are modified in place.

    Parameters
    ----------
    treeDataDict : dict
        dict mapping a name to the root element of the XML tree to filter
    resultdirectory : str
        path to results directory where the filtered trees will be saved
        (a trailing path separator is optional)
    nodeKeys : dict
        dict mapping an attribute name to the collection of values a node
        may have for it (can be empty: every node is then selected)
    edgesKeys : dict
        dict mapping an attribute name to the collection of values an edge
        may have for it (can be empty: every connected edge is then kept)
    """
    # The namespace must be given...
    namespace = {"": "http://graphml.graphdrawing.org/xmlns"}
    # Used to save the file without a namespace prefix on every tag
    ET.register_namespace('', 'http://graphml.graphdrawing.org/xmlns')

    for keyTreeName, root in treeDataDict.items():
        # The key ids are resolved for each tree because the id given to an
        # attribute name may differ from one file to another
        nodesKeysToWatch = _resolveWatchedKeys(root, namespace, "node", nodeKeys)
        edgesKeysToWatch = _resolveWatchedKeys(root, namespace, "edge", edgesKeys)

        graph = root.find("graph", namespace)
        nodes = graph.findall("node", namespace)

        # Ids of the nodes that fulfil the node conditions
        nodesToKeep = {
            node.attrib["id"]
            for node in nodes
            if _matchesWatchedKeys(node, namespace, nodesKeysToWatch)
        }

        # "Foreign" nodes: the ones connected to a selected node by a kept
        # edge. They are gathered apart so that the edge selection below only
        # depends on the initially selected nodes.
        newNodesToKeep = set()
        # Edges are tracked as elements, not ids, because an edge may have no id
        edgesToRemove = []
        for edge in graph.findall("edge", namespace):
            source = edge.attrib["source"]
            target = edge.attrib["target"]
            isConnected = source in nodesToKeep or target in nodesToKeep
            if isConnected and _matchesWatchedKeys(edge, namespace, edgesKeysToWatch):
                newNodesToKeep.add(source)
                newNodesToKeep.add(target)
            else:
                edgesToRemove.append(edge)

        # We merge the two nodes sets
        nodesToKeep |= newNodesToKeep

        # Remove everything that has not been selected
        for node in nodes:
            if node.attrib["id"] not in nodesToKeep:
                graph.remove(node)
        for edge in edgesToRemove:
            graph.remove(edge)

        # Finally save the new graph in the result directory (with the same name)
        ElementTree(root).write(os.path.join(resultdirectory, keyTreeName + ".graphml"))
if __name__ == '__main__':
    # Script entry point: read the command line arguments, load the graphml
    # files of the sources directory, filter them on the given nodes/edges
    # keys and save the filtered graphs into the results directory.

    # Without a configured handler the INFO messages below are never shown
    logging.basicConfig(level=logging.INFO)

    # Create the parser
    parser = argparse.ArgumentParser(
        description='The arguments needed are the source path, the destination path and the nodes/edges keys to filter on')
    # Add the arguments (all of them are optional positionals with a default)
    parser.add_argument('sourcesPath', metavar='SourcesPath', nargs="?",
                        help='The sources directory path (must exists)', default="./sources/")
    parser.add_argument('resultsPath', metavar='ResultsPath', nargs="?", help='The results directory path',
                        default="./results/")
    parser.add_argument('nodesKeys', metavar='NodesKeys', nargs="?",
                        help='The keys with the values the nodes must follow', default='{"Country": ["RU"]}')
    parser.add_argument('edgesKeys', metavar='EdgesKeys', nargs="?",
                        help='The keys with the values the edges must follow', default="{}")
    # Parse the arguments
    args = parser.parse_args()

    # Check that the sources path exists (else raise an Exception)
    sourcesPath = args.sourcesPath
    if not os.path.exists(sourcesPath):
        raise FileNotFoundError("%s path does not exist" % sourcesPath)
    # Get the path where to save the filtered files
    resultsPath = args.resultsPath

    # Parse the keys as dict literals; abort with a usage error if one of
    # them is malformed (literal_eval raises SyntaxError as well as
    # ValueError) or is not a dict
    parsedKeys = {}
    for argumentName in ("nodesKeys", "edgesKeys"):
        rawValue = getattr(args, argumentName)
        try:
            parsedValue = literal_eval(rawValue)
        except (ValueError, SyntaxError) as error:
            parser.error("%s is not a valid Python literal (%s): %r" % (argumentName, error, rawValue))
        if not isinstance(parsedValue, dict):
            parser.error("%s must be a dict literal, got: %r" % (argumentName, rawValue))
        parsedKeys[argumentName] = parsedValue
    nodeKeys = parsedKeys["nodesKeys"]
    edgesKeys = parsedKeys["edgesKeys"]

    # Create the results directory if it does not exist yet
    if os.path.exists(resultsPath):
        logging.info("Results directory already exists")
    else:
        os.makedirs(resultsPath)
        logging.info("Results directory has been created")

    # Gather the data from the graphml files in the source directory
    treeDataDict = getGraphmlFiles(sourcesPath)
    # Filter the trees and save them back into the results directory (filtered from given keys)
    # For example if we only want to keep the nodes from Russia (RU) and keep the nodes connected to them
    # The nodeKeys will be {"Country": ["RU"]}
    filterTreeDataDictFromKeys(treeDataDict, resultsPath, nodeKeys, edgesKeys)
    print("The files have been filtered")