update python README

LingLing-85 · Sep 14, 2018 · 46da136 · 46da136
1 parent 058196c
commit 46da136
Show file tree

Hide file tree

Showing 10 changed files with 52 additions and 27 deletions.
diff --git a/MATLAB/Main.m b/MATLAB/Main.m
@@ -10,21 +10,21 @@
 setting = 1;
 switch setting
 case 1  % traditional link prediction benchmarks
-    numOfExperiment = 1;        
-    %workers = 5;  % number of workers running parallelly
-    workers = 0;  % change workers to 0 to disable parallel loop of multiple exps
-    ratioTrain = 0.9; % train split ratio
+    numOfExperiment = 10;        
+    workers = 2;  % number of workers running parallelly
+    %workers = 0;  % change workers to 0 to disable parallel loop of multiple exps
+    ratioTrain = 0.5; % train split ratio
     connected = false; % whether to sample test links while ensuring the remaining net is connected
     %dataname = strvcat('USAir','NS','PB','Yeast','Celegans','Power','Router','Ecoli');
-    %dataname = strvcat('USAir','NS','Yeast','Celegans','Power','Router'); % 
+    dataname = strvcat('USAir','NS','Yeast','Celegans','Power','Router'); % 
     %dataname = strvcat('PB', 'Ecoli');  % set workers 5,  h=1 for SEAL due to memory issues
-    %dataname = strvcat('PB', 'Ecoli'); % set workers 2, h=1 for WL alone due to memory issues
-    dataname = strvcat('USAir');
+    dataname = strvcat('PB', 'Ecoli'); % set workers 2, h=1 for WL alone due to memory issues
+    %dataname = strvcat('USAir');
     %method = [1, 2, 3, 4, 5, 6, 7, 8, 9];  % 1: SEAL,  2: Heuristic methods 3: Traditional latent feature methods,  4: WLNM 5: WL graph kernel, 6: Embedding methods
-    %method =[1, 2, 3, 4, 5, 6];
-    method =[1];
+    method =[1, 2, 3, 4, 5, 6];
+    %method =[1];
     h = 'auto';  % the maximum hop to extract enclosing subgraphs, h = 'auto' means to automatically select h from 1, 2
-    %h = 1;
+    h = 1;
     include_embedding = 0;  % whether to include node embeddings in node information matrix of SEAL, needs node2vec software
     include_attribute = 0;
     portion = 1;  % portion of observed links selected as training data

diff --git a/MATLAB/ensemble_heuristics.m b/MATLAB/ensemble_heuristics.m
@@ -10,7 +10,7 @@
 %  -all_sims: similarity scores of all heuristics
 %  --Output--
 %  -auc: the AUC score on testing links
-
+%
 %  *author: Muhan Zhang, Washington University in St. Louis
 %%
 

diff --git a/MATLAB/generate_embeddings.m b/MATLAB/generate_embeddings.m
@@ -8,6 +8,8 @@
 %  --Output--
 %  -node_embeddings: a matrix, ith row contains the ith node's embeddings
 %
+%  *author: Muhan Zhang, Washington University in St. Louis
+%%
 
 if nargin < 3
     emd_method = 'node2vec'

diff --git a/MATLAB/sample_neg.m b/MATLAB/sample_neg.m
@@ -10,15 +10,17 @@
 %  -test: half test positive adjacency matrix
 %  -k: number of negative links to sample, as a multiple of
 %      the number of positive links
-%  -portion: if specified, only a portion of the sampled train
-%            and test links be returned
+%  -portion: if set to (e.g.) 0.5, only 50% of the sampled
+%            train and test links will be returned
 %  -evaluate_on_all_unseen: if true, will not randomly sample
 %                          negative testing links, but regard
 %                          all links unseen during training as 
 %                          neg testing links; train negative links 
 %                          are sampled in the original way
 %  --Output--
 %  column indices for four datasets
+%
+%  *author: Muhan Zhang, Washington University in St. Louis
 %%
 
 if nargin < 3

diff --git a/Python/.gitignore b/Python/.gitignore
@@ -1,3 +1,4 @@
 *.pyc
 backup/
 *results.txt
+test_scores.txt
diff --git a/Python/.util_functions.py.swp b/Python/.util_functions.py.swp
diff --git a/Python/Main.py b/Python/Main.py
@@ -74,7 +74,7 @@
     args.train_dir = os.path.join(args.file_dir, 'data/{}'.format(args.train_name))
     args.test_dir = os.path.join(args.file_dir, 'data/{}'.format(args.test_name))
     train_idx = np.loadtxt(args.train_dir, dtype=int)
-    test_idx = np.loadtxt(args.test_dir, dtype=int)[:50]
+    test_idx = np.loadtxt(args.test_dir, dtype=int)
     max_idx = max(np.max(train_idx), np.max(test_idx))
     net = ssp.csc_matrix((np.ones(len(train_idx)), (train_idx[:, 0], train_idx[:, 1])), shape=(max_idx+1, max_idx+1))
     net[train_idx[:, 1], train_idx[:, 0]] = 1  # add symmetric edges

diff --git a/Python/README.md b/Python/README.md
@@ -6,13 +6,17 @@ About
 
 Python version of SEAL (learning from Subgraphs, Embeddings, and Attributes for Link prediction).
 
-Usages
-------
+Installation
+------------
 
 Please download our [\[pytorch_DGCNN software\]](https://github.com/muhanzhang/pytorch_DGCNN) to the same level as the root SEAL folder (not this Python folder). DGCNN is the default graph neural network in SEAL.
 
 Install pytorch_DGCNN according to its instructions.
 
+
+Usages
+------
+
 Type "python Main.py" to try SEAL on the USAir network.
 
 Type:
@@ -21,25 +25,29 @@ Type:
 
 to run SEAL on the NS network with 50% observed links randomly removed as testing links, hop number set automatically from {1, 2}, and node2vec embeddings included.
 
+Type:
+
+    python Main.py --data-name PPI_subgraph --test-ratio 0.5 --use-embedding --use-attribute
+
+to run SEAL on PPI_subgraph with node embeddings and node attributes included. The node attributes are assumed to be saved in the _group_ of the _.mat_ file.
+
 Type:
 
     python Main.py --train-name PB_train.txt --test-name PB_test.txt --hop 1
 
 to run SEAL on a custom split of train and test links, where each row of "PB_train.txt" is an observed training link and each row of "PB_test.txt" is an unobserved testing link. Note that links in "PB_train.txt" will be used to construct the observed network, yet it is not necessary to train SEAL on all links in "PB_train.txt", especially when the number of observed links is huge. To set a maximum number of links to train on, append "--max-train-num 10000" for example.
 
-Sometimes even extracting 1-hop enclosing subgraphs for some links leads to unaffordable number of nodes in the enclosing subgraphs, especially in Twitter-type networks where a hub node can have millions of followers. To deal with this case, append "--max-nodes-per-hop 100" for example to restrict the number of nodes in each hop to be less than 100 using random sampling. SEAL will still have excellent performance.
+Sometimes even extracting 1-hop enclosing subgraphs for some links leads to an unaffordable number of nodes in the enclosing subgraphs, especially in Twitter-type networks where a hub node can have millions of followers. To deal with this case, append "--max-nodes-per-hop 100" for example to restrict the number of nodes in each hop to be less than 100 using random sampling. SEAL still shows excellent performance.
 
 
 Requirements
 ------------
 
 Tested with Python 2.7, PyTorch 0.4.0.
 
-All python libraries required by pytorch_DGCNN such as networkx, tqdm, sklearn etc. are required.
-
-Python libraries gensim and scipy are required.
+Required Python libraries: gensim and scipy, plus all Python libraries required by pytorch_DGCNN (networkx, tqdm, sklearn, etc.).
 
-A network embedding software node2vec has been included in "software/". If it does not work, you may need to reinstall it from source.
+The network embedding software 'node2vec' is included in "software/". If you want to enable embeddings for link prediction and the included copy does not work, please reinstall it from source.
 
 
 Reference
@@ -56,4 +64,4 @@ If you find the code useful, please cite our paper:
 
 Muhan Zhang, Washington University in St. Louis
 muhan@wustl.edu
-2/10/2018
+9/5/2018
diff --git a/Python/install.sh b/Python/install.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Clone and build pytorch_DGCNN next to the SEAL folder; run this from SEAL/Python.
+cd ../../
+git clone https://github.com/muhanzhang/pytorch_DGCNN
+cd pytorch_DGCNN
+unzip pytorch_structure2vec-master.zip
+cd pytorch_structure2vec-master/s2vlib/
+make -j4
+cd ../../../SEAL/Python
+
+
diff --git a/Python/util_functions.py b/Python/util_functions.py
@@ -11,13 +11,14 @@
 import scipy.sparse as ssp
 from sklearn import metrics
 from gensim.models import Word2Vec
+import warnings
+warnings.simplefilter('ignore', ssp.SparseEfficiencyWarning)
 cur_dir = os.path.dirname(os.path.realpath(__file__))
 sys.path.append('%s/../../pytorch_DGCNN' % cur_dir)
 sys.path.append('%s/software/node2vec/src' % cur_dir)
 from util import S2VGraph
 import node2vec
 
-
 def sample_neg(net, test_ratio=0.1, train_pos=None, test_pos=None, max_train_num=None):
     # get upper triangular matrix
     net_triu = ssp.triu(net, k=1)
@@ -71,19 +72,19 @@ def links2subgraphs(A, train_pos, train_neg, test_pos, test_neg, h=1, max_nodes_
             print('\033[91mChoose h=1\033[0m')
 
     # extract enclosing subgraphs
-    max_n_label = {'val': 0}
+    max_n_label = {'value': 0}
     def helper(A, links, g_label):
         g_list = []
         for i, j in tqdm(zip(links[0], links[1])):
             g, n_labels, n_features = subgraph_extraction_labeling((i, j), A, h, max_nodes_per_hop, node_information)
-            max_n_label['val'] = max(max(n_labels), max_n_label['val'])
+            max_n_label['value'] = max(max(n_labels), max_n_label['value'])
             g_list.append(S2VGraph(g, g_label, n_labels, n_features))
         return g_list
     print('Enclosing subgraph extraction begins...')
     train_graphs = helper(A, train_pos, 1) + helper(A, train_neg, 0)
     test_graphs = helper(A, test_pos, 1) + helper(A, test_neg, 0)
     print(max_n_label)
-    return train_graphs, test_graphs, max_n_label['val']
+    return train_graphs, test_graphs, max_n_label['value']
 
 
 def subgraph_extraction_labeling(ind, A, h=1, max_nodes_per_hop=None, node_information=None):
@@ -120,7 +121,7 @@ def subgraph_extraction_labeling(ind, A, h=1, max_nodes_per_hop=None, node_infor
     if node_information is not None:
         features = node_information[nodes]
     # construct nx graph
-    g = nx.from_numpy_matrix(subgraph.toarray())
+    g = nx.from_scipy_sparse_matrix(subgraph)
     return g, labels.tolist(), features