-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathload_pokec_data.py
158 lines (128 loc) · 5.37 KB
/
load_pokec_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
##############################################################################
#
# load_pokec_data.py - load Pokec social network data in SNAP
#
#
# File: load_pokec_data.py
# Author: Alex Stivala
# Created: November 2018
#
##############################################################################
"""Function to load the Pokec social network data from zip file and
convert to SNAP format.
The Pokec social network data is from SNAP
https://snap.stanford.edu/data/soc-Pokec.html
See documentation in
https://snap.stanford.edu/data/soc-pokec-readme.txt and source
citation:
L. Takac, M. Zabovsky. Data Analysis in Public Social Networks,
International Scientific Conference & International Workshop
Present Day Trends of Innovations, May 2012 Lomza, Poland.
https://snap.stanford.edu/data/soc-pokec.pdf
Reference for SNAP collection of data sets:
@misc{snapnets,
author = {Jure Leskovec and Andrej Krevl},
title = {{SNAP Datasets}: {Stanford} Large Network Dataset Collection},
howpublished = {\url{http://snap.stanford.edu/data}},
month = jun,
year = 2014
}
Input files (in specified directory):
soc-pokec-profiles.txt.gz
soc-pokec-relationships.txt.gz
For SNAP see
http://snap.stanford.edu/snappy/index.html
Used version 4.1.0.
E.g.
(G, profile, profile_colnames) = load_pokec_data('/home/stivala/SNAPestimations/pokec/')
NB this uses at least 0.5 GB memory and tmp directory space
"""
import os,sys
import glob
import tempfile
import gzip
import csv
import snap
#-----------------------------------------------------------------------------
#
# Functions
#
#-----------------------------------------------------------------------------
def cleanup_tmpdir(tmpdir):
    """
    Remove a temporary directory and the files directly inside it.

    This is best-effort: if anything cannot be removed (e.g. the
    directory does not exist, or it contains a subdirectory) a warning
    is written to stderr and the function returns normally.

    Parameters:
       tmpdir - temporary directory to remove

    Return value: None
    """
    try:
        # os.listdir (unlike glob "*") also returns dot-files, so a hidden
        # file cannot make the final rmdir fail with "directory not empty".
        for entry in os.listdir(tmpdir):
            os.remove(os.path.join(tmpdir, entry))
        os.rmdir(tmpdir)
    except OSError as inst:
        # "except ... as ..." is valid on both Python 2.6+ and Python 3.
        sys.stderr.write('WARNING: could not remove temp files'
                         ' in ' + tmpdir + '\n' + str(inst) + '\n')
def load_pokec_data(indirname):
    """Load the Pokec data from the specified directory.

    Parameters:
       indirname - path name of directory to load from; it must contain
                   soc-pokec-relationships.txt.gz and
                   soc-pokec-profiles.txt.gz

    Return value:
       tuple (G, profile, profile_colnames) where
         G - SNAP directed graph object as returned by
             snap.LoadEdgeList(snap.PNGraph, ...) on the relationships
             edge list
         profile - dictionary mapping node ID (int) to list
                   of attributes (all strings)
         profile_colnames - dict mapping attribute name to
                   index of the profile list so e.g. we can look
                   up AGE of userid 123 with
                    profile[123][profile_colnames['AGE']]

    Raises:
       AssertionError if the number of nodes in G differs from the
       number of rows read from the profiles file (sanity check only;
       not evaluated when Python runs with -O).

    Note that in SNAP, node IDs are unique integers and do not have to
    be 0..N-1. However EstimNetDirected requires the node ids in the
    Pajek files for its input are numbered 1..N. Fortunately the SNAP
    Pokec network data has nodes numbered 1..N already so we can just
    directly use those as the node ids and not have to do any
    renumbering etc.
    """
    infilename = "soc-pokec-relationships.txt.gz"
    tmpdir = tempfile.mkdtemp()
    try:
        # snap.LoadEdgeList reads from a file name, so the gzip data is
        # first decompressed into a temporary file.
        filename = os.path.join(tmpdir, "soc-pokec-relationships.txt")
        # Binary mode on both sides: the decompressed bytes are copied
        # verbatim (text mode 'w' rejects bytes on Python 3). Copying in
        # fixed-size chunks avoids holding the whole decompressed edge
        # list in memory, and the with-blocks close both handles even if
        # the copy fails.
        with gzip.open(os.path.join(indirname, infilename), 'rb') as fin:
            with open(filename, 'wb') as fout:
                for chunk in iter(lambda: fin.read(1024 * 1024), b''):
                    fout.write(chunk)
        G = snap.LoadEdgeList(snap.PNGraph, filename, 0, 1, '\t')
    finally:
        cleanup_tmpdir(tmpdir)

    # https://snap.stanford.edu/data/soc-pokec-readme.txt
    # but 'user_id' column 0 used as dict key so not included here
    colnames = [ 'public', 'completion_percentage',
                 'gender', 'region', 'last_login', 'registration',
                 'AGE', 'body', 'I_am_working_in_field',
                 'spoken_languages', 'hobbies',
                 'I_most_enjoy_good_food', 'pets', 'body_type',
                 'my_eyesight', 'eye_color', 'hair_color',
                 'hair_type', 'completed_level_of_education',
                 'favourite_color', 'relation_to_smoking',
                 'relation_to_alcohol', 'sign_in_zodiac',
                 'on_pokec_i_am_looking_for', 'love_is_for_me',
                 'relation_to_casual_sex', 'my_partner_should_be',
                 'marital_status', 'children',
                 'relation_to_children', 'I_like_movies',
                 'I_like_watching_movie', 'I_like_music',
                 'I_mostly_like_listening_to_music',
                 'the_idea_of_good_evening',
                 'I_like_specialties_from_kitchen', 'fun',
                 'I_am_going_to_concerts', 'my_active_sports',
                 'my_passive_sports', 'profession', 'I_like_books',
                 'life_style', 'music', 'cars', 'politics',
                 'relationships', 'art_culture',
                 'hobbies_interests', 'science_technologies',
                 'computers_internet', 'education', 'sport',
                 'movies', 'travelling', 'health',
                 'companies_brands', 'more']
    profile_colnames = {name: col for (col, name) in enumerate(colnames)}

    profilepath = os.path.join(indirname, "soc-pokec-profiles.txt.gz")
    if sys.version_info[0] >= 3:
        # The Python 3 csv module needs a text-mode stream opened with
        # newline=''.
        # NOTE(review): assumes the profiles file is UTF-8 encoded --
        # confirm against the data set.
        profilefile = gzip.open(profilepath, 'rt', encoding='utf-8',
                                newline='')
    else:
        # The Python 2 csv module reads byte strings.
        profilefile = gzip.open(profilepath, 'rb')
    with profilefile:
        # Column 0 is the user id (dict key); the remaining columns are
        # kept as a list of strings in file order. Building the dict
        # straight from the reader avoids an intermediate list of rows.
        profiledict = {int(row[0]): row[1:]
                       for row in csv.reader(profilefile, delimiter='\t')}

    # Sanity check: one profile row per graph node.
    assert G.GetNodes() == len(profiledict), \
        "number of graph nodes does not match number of profile rows"
    return (G, profiledict, profile_colnames)