Skip to content

Commit

Permalink
update python README
Browse files Browse the repository at this point in the history
  • Loading branch information
muhanzhang committed Sep 14, 2018
1 parent 058196c commit 46da136
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 27 deletions.
20 changes: 10 additions & 10 deletions MATLAB/Main.m
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,21 @@
setting = 1;
switch setting
case 1 % traditional link prediction benchmarks
numOfExperiment = 1;
%workers = 5; % number of workers running parallelly
workers = 0; % change workers to 0 to disable parallel loop of multiple exps
ratioTrain = 0.9; % train split ratio
numOfExperiment = 10;
workers = 2; % number of workers running parallelly
%workers = 0; % change workers to 0 to disable parallel loop of multiple exps
ratioTrain = 0.5; % train split ratio
connected = false; % whether to sample test links while ensuring the remaining net is connected
%dataname = strvcat('USAir','NS','PB','Yeast','Celegans','Power','Router','Ecoli');
%dataname = strvcat('USAir','NS','Yeast','Celegans','Power','Router'); %
dataname = strvcat('USAir','NS','Yeast','Celegans','Power','Router'); %
%dataname = strvcat('PB', 'Ecoli'); % set workers 5, h=1 for SEAL due to memory issues
%dataname = strvcat('PB', 'Ecoli'); % set workers 2, h=1 for WL alone due to memory issues
dataname = strvcat('USAir');
dataname = strvcat('PB', 'Ecoli'); % set workers 2, h=1 for WL alone due to memory issues
%dataname = strvcat('USAir');
%method = [1, 2, 3, 4, 5, 6, 7, 8, 9]; % 1: SEAL, 2: Heuristic methods 3: Traditional latent feature methods, 4: WLNM 5: WL graph kernel, 6: Embedding methods
%method =[1, 2, 3, 4, 5, 6];
method =[1];
method =[1, 2, 3, 4, 5, 6];
%method =[1];
h = 'auto'; % the maximum hop to extract enclosing subgraphs, h = 'auto' means to automatically select h from 1, 2
%h = 1;
h = 1;
include_embedding = 0; % whether to include node embeddings in node information matrix of SEAL, needs node2vec software
include_attribute = 0;
portion = 1; % portion of observed links selected as training data
Expand Down
2 changes: 1 addition & 1 deletion MATLAB/ensemble_heuristics.m
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
% -all_sims: similarity scores of all heuristics
% --Output--
% -auc: the AUC score on testing links

%
% *author: Muhan Zhang, Washington University in St. Louis
%%

Expand Down
2 changes: 2 additions & 0 deletions MATLAB/generate_embeddings.m
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
% --Output--
% -node_embeddings: a matrix, ith row contains the ith node's embeddings
%
% *author: Muhan Zhang, Washington University in St. Louis
%%

if nargin < 3
emd_method = 'node2vec'
Expand Down
6 changes: 4 additions & 2 deletions MATLAB/sample_neg.m
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,17 @@
% -test: half test positive adjacency matrix
% -k: how many times of negative links (w.r.t. pos links) to
% sample
% -portion: if specified, only a portion of the sampled train
% and test links be returned
% -portion: if specified to (e.g.) 0.5, only 50% of the sampled
% train and test links will be returned
% -evaluate_on_all_unseen: if true, will not randomly sample
% negative testing links, but regard
% all links unseen during training as
% neg testing links; train negative links
% are sampled in the original way
% --Output--
% column indices for four datasets
%
% *author: Muhan Zhang, Washington University in St. Louis
%%

if nargin < 3
Expand Down
1 change: 1 addition & 0 deletions Python/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.pyc
backup/
*results.txt
test_scores.txt
Binary file removed Python/.util_functions.py.swp
Binary file not shown.
2 changes: 1 addition & 1 deletion Python/Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
args.train_dir = os.path.join(args.file_dir, 'data/{}'.format(args.train_name))
args.test_dir = os.path.join(args.file_dir, 'data/{}'.format(args.test_name))
train_idx = np.loadtxt(args.train_dir, dtype=int)
test_idx = np.loadtxt(args.test_dir, dtype=int)[:50]
test_idx = np.loadtxt(args.test_dir, dtype=int)
max_idx = max(np.max(train_idx), np.max(test_idx))
net = ssp.csc_matrix((np.ones(len(train_idx)), (train_idx[:, 0], train_idx[:, 1])), shape=(max_idx+1, max_idx+1))
net[train_idx[:, 1], train_idx[:, 0]] = 1 # add symmetric edges
Expand Down
24 changes: 16 additions & 8 deletions Python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@ About

Python version of SEAL (learning from Subgraphs, Embeddings, and Attributes for Link prediction).

Usages
------
Installation
------------

Please download our [\[pytorch_DGCNN software\]](https://github.com/muhanzhang/pytorch_DGCNN) to the same level as the root SEAL folder (not this Python folder). DGCNN is the default graph neural network in SEAL.

Install pytorch_DGCNN according to its instructions.


Usages
------

Type "python Main.py" to try SEAL on the USAir network.

Type:
Expand All @@ -21,25 +25,29 @@ Type:

to run SEAL on the NS network with 50% observed links randomly removed as testing links, hop number set automatically from {1, 2}, and node2vec embeddings included.

Type:

python Main.py --data-name PPI_subgraph --test-ratio 0.5 --use-embedding --use-attribute

to run SEAL on PPI_subgraph with node attributes included. The node attributes are assumed to be saved in the _group_ of the _.mat_ file.

Type:

python Main.py --train-name PB_train.txt --test-name PB_test.txt --hop 1

to run SEAL on a custom splitting of train and test links, where each row of "PB_train.txt" is an observed training link, each row of "PB_test.txt" is an unobserved testing link. Note that links in "PB_train.txt" will be used to construct the observed network, yet it is not necessary to train SEAL on all links in "PB_train.txt" especially when the number of observed links is huge. To set a maximum number of links to train on, append "--max-train-num 10000" for example.

Sometimes even extracting 1-hop enclosing subgraphs for some links leads to unaffordable number of nodes in the enclosing subgraphs, especially in Twitter-type networks where a hub node can have millions of followers. To deal with this case, append "--max-nodes-per-hop 100" for example to restrict the number of nodes in each hop to be less than 100 using random sampling. SEAL will still have excellent performance.
Sometimes even extracting 1-hop enclosing subgraphs for some links leads to an unaffordable number of nodes in the enclosing subgraphs, especially in Twitter-type networks where a hub node can have millions of followers. To deal with this case, append "--max-nodes-per-hop 100" for example to restrict the number of nodes in each hop to be less than 100 using random sampling. SEAL still shows excellent performance.


Requirements
------------

Tested with Python 2.7, PyTorch 0.4.0.

All python libraries required by pytorch_DGCNN such as networkx, tqdm, sklearn etc. are required.

Python libraries gensim and scipy are required.
Required Python libraries: gensim and scipy, plus all Python libraries required by pytorch_DGCNN (such as networkx, tqdm, sklearn, etc.).

A network embedding software node2vec has been included in "software/". If it does not work, you may need to reinstall it from source.
If you want to enable embeddings for link prediction, please install the network embedding software 'node2vec' in "software/" (if the included one does not work).


Reference
Expand All @@ -56,4 +64,4 @@ If you find the code useful, please cite our paper:

Muhan Zhang, Washington University in St. Louis
muhan@wustl.edu
2/10/2018
9/5/2018
11 changes: 11 additions & 0 deletions Python/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#
# Download and build pytorch_DGCNN (the default graph neural network used by
# SEAL) at the same level as the root SEAL folder.
#
# Run this script from inside SEAL/Python: the first step moves two levels up,
# to the directory that contains the SEAL folder.

# Abort on the first failing step so that later steps never run in the wrong
# directory (e.g. unzip/make after a failed cd).
set -e

# Remember where we started so we can return there at the end.
start_dir="$(pwd)"

cd ../../

# Skip the clone when a previous run already fetched the repository.
if [ ! -d pytorch_DGCNN ]; then
    git clone https://github.com/muhanzhang/pytorch_DGCNN
fi

# "git clone" names the directory after the repository ("pytorch_DGCNN");
# the "-master" suffix only appears in GitHub zip downloads.
cd pytorch_DGCNN

# -o: overwrite without prompting so the script can be re-run unattended.
unzip -o pytorch_structure2vec-master.zip
cd pytorch_structure2vec-master/s2vlib/
make -j4

# Return to the directory the script was started from (SEAL/Python).
cd "$start_dir"


11 changes: 6 additions & 5 deletions Python/util_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@
import scipy.sparse as ssp
from sklearn import metrics
from gensim.models import Word2Vec
import warnings
warnings.simplefilter('ignore', ssp.SparseEfficiencyWarning)
cur_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append('%s/../../pytorch_DGCNN' % cur_dir)
sys.path.append('%s/software/node2vec/src' % cur_dir)
from util import S2VGraph
import node2vec


def sample_neg(net, test_ratio=0.1, train_pos=None, test_pos=None, max_train_num=None):
# get upper triangular matrix
net_triu = ssp.triu(net, k=1)
Expand Down Expand Up @@ -71,19 +72,19 @@ def links2subgraphs(A, train_pos, train_neg, test_pos, test_neg, h=1, max_nodes_
print('\033[91mChoose h=1\033[0m')

# extract enclosing subgraphs
max_n_label = {'val': 0}
max_n_label = {'value': 0}
def helper(A, links, g_label):
g_list = []
for i, j in tqdm(zip(links[0], links[1])):
g, n_labels, n_features = subgraph_extraction_labeling((i, j), A, h, max_nodes_per_hop, node_information)
max_n_label['val'] = max(max(n_labels), max_n_label['val'])
max_n_label['value'] = max(max(n_labels), max_n_label['value'])
g_list.append(S2VGraph(g, g_label, n_labels, n_features))
return g_list
print('Enclosing subgraph extraction begins...')
train_graphs = helper(A, train_pos, 1) + helper(A, train_neg, 0)
test_graphs = helper(A, test_pos, 1) + helper(A, test_neg, 0)
print(max_n_label)
return train_graphs, test_graphs, max_n_label['val']
return train_graphs, test_graphs, max_n_label['value']


def subgraph_extraction_labeling(ind, A, h=1, max_nodes_per_hop=None, node_information=None):
Expand Down Expand Up @@ -120,7 +121,7 @@ def subgraph_extraction_labeling(ind, A, h=1, max_nodes_per_hop=None, node_infor
if node_information is not None:
features = node_information[nodes]
# construct nx graph
g = nx.from_numpy_matrix(subgraph.toarray())
g = nx.from_scipy_sparse_matrix(subgraph)
return g, labels.tolist(), features


Expand Down

0 comments on commit 46da136

Please sign in to comment.