-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextractcERGM2subgraphs.py
executable file
·319 lines (262 loc) · 10.7 KB
/
extractcERGM2subgraphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#!/usr/bin/env python2
##############################################################################
#
# extractcERGM2subgraphs.py - get subgraphs for cERGM goodness-of-fit
#
# File: extractcERGM2subgraphs.py
# Author: Alex Stivala
# Created: March 2022
#
##############################################################################
"""Extract subgraphs for cERGM goodness-of-fit
Input file is a Pajek format network file with either 1 edge per
line or 1 node per line format (see
https://snap.stanford.edu/snappy/doc/reference/LoadPajek.html)
For each input graphs, writes new file with
the induced subgraph induced by the union
of the nodes in the last term (time period), and all nodes in
earlier term that receive arcs from them.
Output files are in cwd, with _cergm2_subgraph appended before suffix (.net.gz)
in Pajek network format (compressed with gzip).
Also outputs files with term for each node similarly as _cergm2_subgraph_term.txt.gz
(also compressed)
WARNING: the output files are overwritten if they exist.
This is a reimplementation of the subgraph extract code from
plotEstimNetDirectedcERGMSimFit2.R (R/igraph version) using Python and SNAP
instead as R is too slow and not scalable:
the original R/igraph code takes several hours per network on the patent
data (approx 100 000 nodes per time period) for example, making it
unusuable since large numbers of networks are needed (and the cluster has
a 48 hour time limit; even embarrassing parallelism is not practical either
since in R even 'multicore' means the entire memory is copied for each
core), while this takes approx 1 minute, and also uses less memory.
For SNAP see
http://snap.stanford.edu/snappy/index.html
Used version 4.1.0.
NB using Python 2.7 as could net get SNAP to install in Python 3.
Usage:
python2 extractcERGM2subgraphs.py netfilename termfilename simNetFilePrefix
netfilename is the Pajek format observed graph (the input arclistFile
for EstimNetDirected)
termfilename is the time periods (terms) of the nodes in the same
format as any node attribute for EstimNetDirected. I.e. the first
line is the attribute name (must be 'term' for this one) and each
subsequent line is the value for a node, in order of the node
numbering used in the arc list file (netfilename) i.e. first line
after header has term (time period) for node 0, the next line for node
1, and so on. The terms themsevles are numbered from 0 for the first
term (time period). This is the same format used for terms in cERGM
estimation in EstimNetDirected (and simulation in SimulateERGM); see
add_Cergm_terms_to_digraph() in src/graph.c
simNetFilePreifx is the prefix of the simulated network filenames
this files have _x.net appended by EstimNetDirected or SimulateERGM,
where is task or iteration number.
"""
import sys,os,time
import glob,re
import getopt
import gzip
import tempfile
import snap
from inducedSubgraphcERGM2 import cERGM2_subgraph,write_graph_file_compressed,write_term_file_compressed
#-----------------------------------------------------------------------------
#
# Functions
#
#-----------------------------------------------------------------------------
def glob_re(pattern, strings):
"""
Regular expression glob
https://stackoverflow.com/questions/13031989/regular-expression-usage-in-glob-glob
"""
return filter(re.compile(pattern).match, strings)
def convert_estimnetdirected_to_pajek_format(infile, outf):
"""
Convert EstimNEtDirected Pajek network file format to the more verbose
Pajek file format where each node is specified explicitly.
EstimNetDirected can use a format like this:
*Vertices 9032
*Arcs
641 1
739 1
...
where the nodes are (implicitly) numbered 1..N (9032 in the example above).
This usually works with most software that uses Pajek format.
However not unfortunately with snap.LoadPajek() which requires the
nodes all to be named, eg:
*Vertices 9032
1
2
...
9032
*Arcs
641 1
739 1
...
This function does that conversion. Note that EstimNeDirected writes
Pajek .net files in this format, so not nedded for those, but
many programs write in the simpler format (without all the vertices named)
and EstimNetDirected can read it, but snaplLoadPajek() cannot.
Parameters:
infile - filename of EstimNetDirected Pajek format file to read
outf - open file object of verbose Pajek format to write
Return value:
0 if ok else nonzero on error
"""
dat = open(infile).readlines()
if dat[0].split(' ')[0].lower() != "*vertices":
sys.stderr.write("Bad input file " + infile + " (no *vertices)\n")
return -1
vertcount = int(dat[0].split(' ')[1])
if dat[1].rstrip().lower() != "*arcs":
sys.stderr.write("Bad input file " + infile + " (no *arcs)\n")
return -1
nodenums = list(set([int(x) for x in flatten([y.rstrip().split(' ') for y in dat[2:]])]))
if min(nodenums) < 1 or max(nodenums) > vertcount:
sys.stderr.write("Bad input file " + infile + "(bad node number)\n")
outf.write("*Vertices " + str(vertcount)+ "\n")
outf.write("\n".join([str(x) for x in range(1, vertcount+1)]))
outf.write("\n")
outf.write("".join(dat[1:]))
return 0
def flatten(t):
""" flatten list of lists to a single list
https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
"""
return [item for sublist in t for item in sublist]
#-----------------------------------------------------------------------------
#
# main
#
#-----------------------------------------------------------------------------
def usage(progname):
"""
print usage msg and exit
"""
sys.stderr.write("usage: " + progname + " netfilename termfilename simNEtFilePrefix\n")
sys.exit(1)
def main():
"""
See usage message in module header block
"""
directed = False
try:
opts,args = getopt.getopt(sys.argv[1:], '')
except:
usage(sys.argv[0])
for opt,arg in opts:
usage(sys.argv[0])
if len(args) != 3:
usage(sys.argv[0])
netfilename = args[0]
termfilename = args[1]
simnetfileprefix = args[2]
##
## Load observed graph
##
print netfilename
tmpfd, tmpfile = tempfile.mkstemp()
tmpf = os.fdopen(tmpfd, "w")
if convert_estimnetdirected_to_pajek_format(netfilename, tmpf) != 0:
sys.stderr.write('ERROR: bad data in ' + netfilename + '\n')
sys.exit(1)
tmpf.close()
# Need to use PNEANet not PNGraph to allow attributes with AddIntAtrN etc.
G = snap.LoadPajek(snap.PNEANet, tmpfile)
os.remove(tmpfile)
snap.PrintInfo(G)
##
## Load term (time period) data
##
termdat = open(termfilename).readlines()
if (termdat[0].rstrip() != "term"):
sys.stderr.write("ERROR: expecting 'term' as first line of " + termfilename + "\n")
sys.exit(1)
terms = [int(x) for x in termdat[1:]]
print 'maxterm = ', max(terms)
##
## Annotate observed graph with term data
##
assert(len(terms) == G.GetNodes())
for (i, node) in enumerate(G.Nodes()):
G.AddIntAttrDatN(node, terms[i], "term")
##
## Get subgraph for observed graph
##
start = time.time()
Gsubgraph = cERGM2_subgraph(G)
print 'Getting subgraph took ', time.time() - start, 's'
snap.PrintInfo(Gsubgraph)
##
## Write subgraph and terms for its nodes (compressed)
##
# build a dictionary mapping nodeid:term
# so that can be written to term file in correct order.
# Note that then the index in nodelist of a nodeid can be used
# as sequential node number of each node.
nodelist = list()
termdict = dict() # map nodeid : term
for node in Gsubgraph.Nodes():
nodelist.append(node.GetId())
termdict[node.GetId()] = Gsubgraph.GetIntAttrDatN(node.GetId(), "term")
outfilename = os.path.splitext(netfilename)[0] + '_cergm2_subgraph' + os.path.extsep + "net.gz"
termoutfilename = os.path.splitext(netfilename)[0] + '_cergm2_subraph_term' + os.path.extsep + "txt.gz"
print 'Writing observed subgraph to ', outfilename
write_graph_file_compressed(outfilename, Gsubgraph, nodelist)
print 'writing observed subgraph terms to ', termoutfilename
write_term_file_compressed(termoutfilename, Gsubgraph, nodelist, termdict)
##
## Load simulated graphs, annotate with terms, and get subgraphs for each
## and write them to compressed (gzip) files
##
graph_glob = simnetfileprefix + "_[0-9]*[.]net$"
sim_files = glob_re(graph_glob, os.listdir("."))
gz = False
if len(sim_files) == 0:
print 'No .net files with prefix ', simnetfileprefix, ' found, trying .net.gz instead'
graph_glob = simnetfileprefix + "_[0-9]*[.]net.gz$"
sim_files = glob_re(graph_glob, os.listdir("."))
gz = True
for simfile in sim_files:
if gz:
print simfile
tmpfd, tmpfile = tempfile.mkstemp()
tmpf = os.fdopen(tmpfd, "w")
try:
tmpf.write(gzip.open(simfile).read())
tmpf.close()
G = snap.LoadPajek(snap.PNEANet, tmpfile)
finally:
os.remove(tmpfile)
else:
print simfile
G = snap.LoadPajek(snap.PNEANet, simfile)
assert(len(terms) == G.GetNodes())
for (i, node) in enumerate(G.Nodes()):
G.AddIntAttrDatN(node, terms[i], "term")
snap.PrintInfo(G)
start = time.time()
Gsubgraph = cERGM2_subgraph(G)
print 'Getting subgraph took ', time.time() - start, 's'
snap.PrintInfo(Gsubgraph)
# build a dictionary mapping nodeid:term
# so that can be written to term file in correct order.
# Note that then the index in nodelist of a nodeid can be used
# as sequential node number of each node.
nodelist = list()
termdict = dict() # map nodeid : term
for node in Gsubgraph.Nodes():
nodelist.append(node.GetId())
termdict[node.GetId()] = Gsubgraph.GetIntAttrDatN(node.GetId(), "term")
if gz:
regxp = re.compile("_([0-9]*)[.]net.gz$")
else:
regxp = re.compile("_([0-9]*)[.]net$")
outfilename = regxp.sub("_cergm2_subgraph_\\1.net.gz", simfile)
termoutfilename = regxp.sub("_cergm2_subgraph_term_\\1.txt.gz", simfile)
print 'Writing simulated subgraph to ', outfilename
write_graph_file_compressed(outfilename, Gsubgraph, nodelist)
print 'writing simulated subgraph terms to ', termoutfilename
write_term_file_compressed(termoutfilename, Gsubgraph, nodelist, termdict)
if __name__ == "__main__":
main()