Skip to content

Commit

Permalink
Repo update w/ ZINC-full dataset (graphdeeplearning#46)
Browse files Browse the repository at this point in the history
* ZINC full data and scripts

* README updates

* README updates
  • Loading branch information
vijaydwivedi75 authored Oct 7, 2020
1 parent 527e16b commit 7446184
Show file tree
Hide file tree
Showing 10 changed files with 738 additions and 21 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@

## Updates

**Oct 7, 2020**
* Repo updated to DGL 0.5.2 and PyTorch 1.6.0. Please update your environment using the provided yml files ([CPU](./environment_cpu.yml), [GPU](./environment_gpu.yml)).
* Added the [ZINC-full](./data/script_download_molecules.sh) dataset (249K molecular graphs) along with its [scripts](./scripts/ZINC-full/).


**Jun 11, 2020**
* Second release of the project. Major updates:
+ Added experimental pipeline for Weisfeiler-Lehman-GNNs operating on dense rank-2 tensors.
Expand Down
2 changes: 1 addition & 1 deletion data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def LoadData(DATASET_NAME):
return SuperPixDataset(DATASET_NAME)

# handling for (ZINC) molecule dataset
if DATASET_NAME == 'ZINC':
if DATASET_NAME == 'ZINC' or DATASET_NAME == 'ZINC-full':
return MoleculeDataset(DATASET_NAME)

# handling for the TU Datasets
Expand Down
27 changes: 17 additions & 10 deletions data/molecules.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,21 @@


class MoleculeDGL(torch.utils.data.Dataset):
def __init__(self, data_dir, split, num_graphs):
def __init__(self, data_dir, split, num_graphs=None):
self.data_dir = data_dir
self.split = split
self.num_graphs = num_graphs

with open(data_dir + "/%s.pickle" % self.split,"rb") as f:
self.data = pickle.load(f)

# loading the sampled indices from file ./zinc_molecules/<split>.index
with open(data_dir + "/%s.index" % self.split,"r") as f:
data_idx = [list(map(int, idx)) for idx in csv.reader(f)]
self.data = [ self.data[i] for i in data_idx[0] ]

assert len(self.data)==num_graphs, "Sample num_graphs again; available idx: train/val/test => 10k/1k/1k"
if self.num_graphs in [10000, 1000]:
# loading the sampled indices from file ./zinc_molecules/<split>.index
with open(data_dir + "/%s.index" % self.split,"r") as f:
data_idx = [list(map(int, idx)) for idx in csv.reader(f)]
self.data = [ self.data[i] for i in data_idx[0] ]

assert len(self.data)==num_graphs, "Sample num_graphs again; available idx: train/val/test => 10k/1k/1k"

"""
data is a list of Molecule dict objects with following attributes
Expand Down Expand Up @@ -104,9 +105,15 @@ def __init__(self, name='Zinc'):

data_dir='./data/molecules'

self.train = MoleculeDGL(data_dir, 'train', num_graphs=10000)
self.val = MoleculeDGL(data_dir, 'val', num_graphs=1000)
self.test = MoleculeDGL(data_dir, 'test', num_graphs=1000)
if self.name == 'ZINC-full':
data_dir='./data/molecules/zinc_full'
self.train = MoleculeDGL(data_dir, 'train', num_graphs=220011)
self.val = MoleculeDGL(data_dir, 'val', num_graphs=24445)
self.test = MoleculeDGL(data_dir, 'test', num_graphs=5000)
else:
self.train = MoleculeDGL(data_dir, 'train', num_graphs=10000)
self.val = MoleculeDGL(data_dir, 'val', num_graphs=1000)
self.test = MoleculeDGL(data_dir, 'test', num_graphs=1000)
print("Time taken: {:.4f}s".format(time.time()-t0))


Expand Down
393 changes: 393 additions & 0 deletions data/molecules/prepare_molecules_ZINC_full.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/script_download_molecules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ else
curl https://www.dropbox.com/s/gpzovwqqsudarvq/ZINC-full.pkl?dl=1 -o ZINC-full.pkl -J -L -k
fi


8 changes: 4 additions & 4 deletions environment_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ dependencies:
- python=3.7.4
- python-dateutil=2.8.0
- pip=19.2.3
- pytorch=1.3
- torchvision==0.4.2
- pytorch=1.6.0
- torchvision==0.7.0
- pillow==6.1
- dgl=0.4.2
- dgl=0.5.2
- numpy=1.16.4
- matplotlib=3.1.0
- tensorboard=1.14.0
Expand Down Expand Up @@ -41,4 +41,4 @@ dependencies:
- tensorflow==2.1.0
- tensorflow-estimator==2.1.0
- tensorboard==2.1.1
- ogb==1.1.1
- ogb==1.2.2
11 changes: 5 additions & 6 deletions environment_gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,15 @@ channels:
- anaconda
- defaults
dependencies:
- cuda10.0
- cudatoolkit=10.0
- cudatoolkit=10.2
- cudnn=7.6.5
- python=3.7.4
- python-dateutil=2.8.0
- pip=19.2.3
- pytorch=1.3
- torchvision==0.4.2
- pytorch=1.6.0
- torchvision==0.7.0
- pillow==6.1
- dgl-cuda10.0=0.4.2
- dgl-cuda10.2=0.5.2
- numpy=1.16.4
- matplotlib=3.1.0
- tensorboard=1.14.0
Expand Down Expand Up @@ -45,4 +44,4 @@ dependencies:
- tensorflow-gpu==2.1.0
- tensorflow-estimator==2.1.0
- tensorboard==2.1.1
- ogb==1.1.1
- ogb==1.2.2
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/bin/bash


############
# Usage
############

# bash script_main_molecules_graph_regression_ZINC-full_100k.sh
#
# Creates a detached tmux session and types every benchmark batch into it.
# The script itself returns right away; the shell inside the session works
# through the batches one after another, then kills the session.



############
# GNNs
############

#MLP
#GCN
#GraphSage
#GatedGCN
#GAT
#MoNet
#GIN
#3WLGNN
#RingGNN



############
# ZINC-full - 4 RUNS
############

seed0=41
seed1=95
seed2=12
seed3=35
code=main_molecules_graph_regression.py
dataset=ZINC-full
out_dir=out/molecules_graph_regression/ZINC_full/

# Name of the tmux session that hosts all runs. Every tmux call below targets
# it explicitly: without -t, tmux resolves the target from the caller's
# context, so launching this script from inside another tmux session would
# type the commands into that session's pane instead of this one.
session=benchmark

# Seed used by the run on each GPU: the job with --gpu_id i gets seeds[i].
seeds=($seed0 $seed1 $seed2 $seed3)

# queue_runs MODEL [EXTRA_ARGS]
#
# Types one batch into the session: four background runs of $code (GPU ids
# 0-3, one seed each) using MODEL's ZINC 100k config, followed by `wait` so
# the session's shell does not start the next batch until all four finish.
# EXTRA_ARGS, when given, is inserted verbatim just before --config.
queue_runs() {
    local model=$1
    # Trailing space only when extra args are present, so the command text is
    # spaced the same with and without them.
    local extra="${2:+$2 }"
    local config="configs/molecules_graph_regression_${model}_ZINC_100k.json"
    local batch=""
    local gpu
    for gpu in 0 1 2 3; do
        # Each command goes on its own line of the typed text, hence the
        # literal newline inside the string (continuation must stay at col 0).
        batch="$batch
python $code --dataset $dataset --out_dir $out_dir --gpu_id $gpu --seed ${seeds[$gpu]} ${extra}--config '$config' &"
    done
    tmux send-keys -t "$session" "$batch
wait" C-m
}

tmux new -s "$session" -d
tmux send-keys -t "$session" "source activate benchmark_gnn" C-m

# Default runs for every model, in the order listed under "GNNs" above.
for model in MLP GCN GraphSage GatedGCN GAT MoNet GIN 3WLGNN RingGNN; do
    queue_runs "$model"
done

# Second pass with edge features enabled, for the models run with that flag.
for model in GatedGCN 3WLGNN RingGNN; do
    queue_runs "$model" "--edge_feat True"
done

# Typed last, so it only executes after every queued batch has completed.
tmux send-keys -t "$session" "tmux kill-session -t $session" C-m











Loading

0 comments on commit 7446184

Please sign in to comment.