diff --git a/expts/configs/config_gpspp_10M_pcqm4m.yaml b/expts/configs/config_gpspp_10M_pcqm4m.yaml deleted file mode 100644 index c6862cb06..000000000 --- a/expts/configs/config_gpspp_10M_pcqm4m.yaml +++ /dev/null @@ -1,227 +0,0 @@ -# GPS++ model with the PCQMv2 dataset on IPU. -constants: - name: &name pcqm4mv2_gpspp_4layer - seed: &seed 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - accelerator: - type: ipu # cpu or ipu or gpu - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: "graph" - df_path: graphium/data/PCQM4Mv2/pcqm4mv2.csv #graphium/data/PCQM4Mv2/pcqm4mv2.csv - # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv - # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] - # sample_size: 80000 # use sample_size for test - splits_path: graphium/data/PCQM4Mv2/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - # graphium/data/PCQM4Mv2/split_dict.pt - # graphium/data/PCQM4Mv2/pcqm4m_split.csv - split_names: ["train", "valid", "test-dev"] - label_normalization: - method: "normal" - min_clipping: 0 - max_clipping: 50 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 20 - featurization_progress: True - featurization_backend: "loky" - processed_graph_data_path: "/tmp/graphium_data/PCQM4Mv2/" - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - conformer_property_list: [positions_3d] # 3D_bias - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - la_pos: &pos_enc - pos_type: laplacian_eigvec_eigval #laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_type: rwse - ksteps: 16 - - - # Data handling-related - batch_size_training: 16 - batch_size_inference: 16 - # cache_data_path: . - num_workers: 20 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- featurization_backend: "loky" - - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 20 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 60 - - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 16 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 120 - # test-dev max nodes: 50, max_edges: 116 - # test-challenge max nodes: 51, max_edges: 106 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["eigvecs", "eigvals"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.18 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rwse"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.18 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - gaussian_pos: # 3D_bias - encoder_type: "gaussian_kernel" - input_keys: ["positions_3d"] - output_keys: ["feat", "graph_gaussian_bias_3d"] - num_heads: 32 - num_layers: 1 #2 - embed_dim: 32 - out_dim: 32 # need num of gaussian kernels 128 - # but currently it checks pe_out_dim == pe_out_dim in encoder_manager.py, line 128 - use_input_keys_prefix: False - - - gnn: # Set as null to avoid a post-nn network - out_dim: 256 - hidden_dims: 256 - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - pooling: [sum] - virtual_node: 'none' - layer_type: 'pyg:gps' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - layer_kwargs: # Parameters for the model itself. 
You could define dropout_attn: 0.1 - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 - out_dim: 256 - in_dim_edges: 128 - out_dim_edges: 128 - attn_type: "full-attention" # "full-attention", "none" - precision: &precision 16 - biased_attention_key: "graph_gaussian_bias_3d" # 3D_bias - attn_kwargs: - num_heads: 32 - droppath_rate_attn: 0.0 - droppath_rate_ffn: 0.0 - - - post_nn: null - - task_heads: - homolumo: - out_dim: 1 - hidden_dims: 256 - depth: 2 # Not needed if we have hidden_dims - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: ["mae", "pearsonr"] - loss_fun: - homolumo: mae_ipu - random_seed: *seed - optim_kwargs: - lr: 4.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - # loss_scaling: 1024 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor homolumo/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore: ignore nan values from loss - flag_kwargs: - n_steps: 0 # 1 - alpha: 0.0 # 0.01 - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: flatten - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label diff --git a/expts/hydra-configs/accelerator/ipu.yaml b/expts/hydra-configs/accelerator/ipu.yaml index 6e7fc8e06..43e4455ef 100644 --- a/expts/hydra-configs/accelerator/ipu.yaml +++ b/expts/hydra-configs/accelerator/ipu.yaml @@ -1,9 +1,16 @@ type: ipu ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
- replicationFactor(16) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 128) - - Precision.enableStochasticRounding(True) \ No newline at end of file + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + +ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(1) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) diff --git a/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml b/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml new file mode 100644 index 000000000..6502f9414 --- /dev/null +++ b/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml @@ -0,0 +1,25 @@ +# @package _global_ + +datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 16 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 60 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 30 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 120 + # Data handling-related + batch_size_training: 32 + batch_size_inference: 16 + +predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + +trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 diff --git a/expts/configs/config_mpnn_10M_pcqm4m.yaml b/expts/hydra-configs/dataset/pcqm4m.yaml similarity index 70% rename from expts/configs/config_mpnn_10M_pcqm4m.yaml rename to expts/hydra-configs/dataset/pcqm4m.yaml index 63fab1970..391bb21de 100644 --- a/expts/configs/config_mpnn_10M_pcqm4m.yaml +++ b/expts/hydra-configs/dataset/pcqm4m.yaml @@ -1,52 +1,4 @@ -# Testing the mpnn only model with the PCQMv2 dataset on IPU. -constants: - name: &name pcqm4mv2_mpnn_4layer - seed: &seed 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 20 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 60 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 16 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 120 - # Data handling-related - batch_size_training: 64 - batch_size_inference: 16 - predictor: - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 4 - - ipu_config: - - deviceIterations(20) # IPU would require large batches to be ready for the model. 
- - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 128) - - Precision.enableStochasticRounding(True) - -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# batch_size_training: 256 -# batch_size_inference: 64 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 +# @package _global_ datamodule: module_type: "MultitaskFromSmilesDataModule" @@ -56,27 +8,25 @@ datamodule: homolumo: df: null task_level: "graph" - df_path: graphium/data/PCQM4M/pcqm4mv2-20k.csv + df_path: graphium/data/PCQM4M/pcqm4mv2.csv # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly smiles_col: "cxsmiles" label_cols: ["homo_lumo_gap"] - sample_size: 8000 # use sample_size for test - # splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - # split_names: ["train", "valid", "test-dev"] + # sample_size: 8000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] # graphium/data/PCQM4Mv2/split_dict.pt # graphium/data/PCQM4Mv2/pcqm4m_split.csv - split_val: 0.1 - split_test: 0.1 - seed: *seed + # split_val: 0.1 + # split_test: 0.1 + seed: ${constants.seed} label_normalization: method: "normal" - min_clipping: 0 - max_clipping: 50 # Featurization prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 20 + featurization_n_jobs: 30 featurization_progress: True featurization_backend: "loky" processed_graph_data_path: "../datacache/PCQM4Mv2/" @@ -114,8 +64,6 @@ datamodule: num_workers: 30 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. - featurization_backend: "loky" - architecture: model_type: FullGraphMultiTaskNetwork @@ -182,19 +130,6 @@ architecture: last_normalization: *normalization residual_type: simple virtual_node: 'none' - layer_type: 'pyg:gps' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - layer_kwargs: # Parameters for the model itself. 
You could define dropout_attn: 0.1 - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 - out_dim: 256 - in_dim_edges: 128 - out_dim_edges: 128 - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - graph_output_nn: graph: @@ -225,10 +160,12 @@ architecture: #Task-specific predictor: metrics_on_progress_bar: - homolumo: ["mae", "pearsonr"] + homolumo: [] + metrics_on_training_set: + homolumo: ["pearsonr"] loss_fun: homolumo: mae_ipu - random_seed: *seed + random_seed: ${constants.seed} optim_kwargs: lr: 4.e-4 # warmup can be scheduled using torch_scheduler_kwargs # weight_decay: 1.e-7 @@ -252,7 +189,7 @@ metrics: - name: mae metric: mae_ipu target_nan_mask: null - multitask_handling: flatten + multitask_handling: mean-per-label threshold_kwargs: null - name: pearsonr metric: pearsonr_ipu @@ -261,9 +198,10 @@ metrics: multitask_handling: mean-per-label trainer: + seed: ${constants.seed} logger: save_dir: logs/PCQMv2 - name: *name + name: ${constants.name} project: PCQMv2_mpnn #early_stopping: # monitor: *monitor @@ -272,7 +210,7 @@ trainer: # mode: &mode min model_checkpoint: dirpath: models_checkpoints/PCMQ4Mv2/ - filename: *name + filename: ${constants.name} #monitor: *monitor #mode: *mode save_top_k: 1 diff --git a/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml b/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml new file mode 100644 index 000000000..a321e835d --- /dev/null +++ b/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml @@ -0,0 +1,13 @@ +# @package _global_ + +# GPS++ model with the PCQMv2 dataset. +constants: + name: pcqm4mv2_gpspp_4layer + entity: "multitask-gnn" + seed: 42 + max_epochs: 100 + raise_train_error: true # Whether the code should raise an error if it crashes during training + +trainer: + model_checkpoint: + dirpath: models_checkpoints/PCMQ4Mv2/gpspp/ diff --git a/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml b/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml new file mode 100644 index 000000000..08b1b1f3c --- /dev/null +++ b/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml @@ -0,0 +1,13 @@ +# @package _global_ + +# MPNN model with the PCQMv2 dataset. +constants: + name: pcqm4mv2_mpnn_4layer + entity: "multitask-gnn" + seed: 42 + max_epochs: 100 + raise_train_error: true # Whether the code should raise an error if it crashes during training + +trainer: + model_checkpoint: + dirpath: models_checkpoints/PCMQ4Mv2/mpnn/ diff --git a/expts/hydra-configs/model/gpspp.yaml b/expts/hydra-configs/model/gpspp.yaml new file mode 100644 index 000000000..0b231fcf1 --- /dev/null +++ b/expts/hydra-configs/model/gpspp.yaml @@ -0,0 +1,38 @@ +# @package _global_ + +architecture: + pe_encoders: + encoders: + gaussian_pos: + encoder_type: "gaussian_kernel" + input_keys: ["positions_3d"] + output_keys: ["feat", "nodepair_gaussian_bias_3d"] + num_heads: 32 + num_layers: 1 #2 + embed_dim: 32 + out_dim: 32 # need num of gaussian kernels 128 + # but currently it checks pe_out_dim == pe_out_dim in encoder_manager.py, line 128 + use_input_keys_prefix: False + + gnn: + layer_type: 'pyg:gps' + layer_kwargs: # Parameters for the model itself. 
You could define dropout_attn: 0.1 + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 + out_dim: 256 + in_dim_edges: 128 + out_dim_edges: 128 + attn_type: "full-attention" # "full-attention", "none" + precision: &precision 16-true + biased_attention_key: "nodepair_gaussian_bias_3d" # 3D_bias + attn_kwargs: + num_heads: 32 + droppath_rate_attn: 0.0 + droppath_rate_ffn: 0.0 + +datamodule: + args: # Matches that in the test_multitask_datamodule.py case. + featurization: + conformer_property_list: [positions_3d] diff --git a/expts/hydra-configs/model/mpnn.yaml b/expts/hydra-configs/model/mpnn.yaml new file mode 100644 index 000000000..4a8a428e8 --- /dev/null +++ b/expts/hydra-configs/model/mpnn.yaml @@ -0,0 +1,16 @@ +# @package _global_ + +architecture: + gnn: + layer_type: 'pyg:gps' + layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1 + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 + out_dim: 256 + in_dim_edges: 128 + out_dim_edges: 128 + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null diff --git a/expts/main_run_multitask.py b/expts/main_run_multitask.py index 64e1c185b..c14670377 100644 --- a/expts/main_run_multitask.py +++ b/expts/main_run_multitask.py @@ -73,7 +73,6 @@ def main(cfg: DictConfig) -> None: save_params_to_wandb(trainer.logger, cfg, predictor, datamodule) # Determine the max num nodes and edges in training and validation - logger.info("About to set the max nodes etc.") predictor.set_max_nodes_edges_per_graph(datamodule, stages=["train", "val"]) # Run the model training diff --git a/expts/neurips2023_configs/config_large_gcn.yaml b/expts/neurips2023_configs/config_large_gcn.yaml index 7745693b3..033b8a5f5 100644 --- a/expts/neurips2023_configs/config_large_gcn.yaml +++ b/expts/neurips2023_configs/config_large_gcn.yaml @@ -3,6 +3,7 @@ constants: name: &name neurips2023_large_data_gcn seed: &seed 42 raise_train_error: true # Whether the code should raise an error if it crashes during training + entity: multitask-gnn accelerator: type: ipu # cpu or ipu or gpu @@ -170,7 +171,6 @@ datamodule: num_workers: 32 # -1 to use all persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. - featurization_backend: "loky" architecture: diff --git a/expts/neurips2023_configs/debug/config_large_gcn_debug.yaml b/expts/neurips2023_configs/debug/config_large_gcn_debug.yaml index db18ee10c..d6e4cb724 100644 --- a/expts/neurips2023_configs/debug/config_large_gcn_debug.yaml +++ b/expts/neurips2023_configs/debug/config_large_gcn_debug.yaml @@ -3,6 +3,7 @@ constants: name: &name neurips2023_large_data_gcn_debug seed: &seed 100 raise_train_error: true # Whether the code should raise an error if it crashes during training + entity: multitask-gnn accelerator: type: ipu # cpu or ipu or gpu @@ -169,7 +170,6 @@ datamodule: num_workers: 30 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. 
- featurization_backend: "loky" architecture: diff --git a/expts/run_validation_test.py b/expts/run_validation_test.py index cf6248d24..06804301c 100644 --- a/expts/run_validation_test.py +++ b/expts/run_validation_test.py @@ -4,7 +4,7 @@ from os.path import dirname, abspath import yaml from copy import deepcopy -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf import timeit from loguru import logger from datetime import datetime @@ -20,41 +20,33 @@ load_trainer, save_params_to_wandb, load_accelerator, - load_yaml_config, ) from graphium.utils.safe_run import SafeRun +import hydra # WandB import wandb # Set up the working directory MAIN_DIR = dirname(dirname(abspath(graphium.__file__))) - -# CONFIG_FILE = "expts/configs/config_mpnn_10M_b3lyp.yaml" -# CONFIG_FILE = "expts/configs/config_mpnn_10M_pcqm4m.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_debug.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_large_mpnn.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_large_gcn.yaml" -CONFIG_FILE = "expts/neurips2023_configs/debug/config_large_gcn_debug.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_large_gin.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_large_gine.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_small_gcn.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_large_gcn.yaml" -# CONFIG_FILE = "exptas/neurips2023_configs/config_small_gin.yaml" -# CONFIG_FILE = "expts/neurips2023_configs/config_small_gine.yaml" os.chdir(MAIN_DIR) -def main(cfg: DictConfig, run_name: str = "main", add_date_time: bool = True) -> None: +@hydra.main(version_base=None, config_path="hydra-configs", config_name="main") +def main(cfg: DictConfig) -> None: + cfg = OmegaConf.to_container(cfg, resolve=True) + + run_name: str = "main" + add_date_time: bool = True + st = timeit.default_timer() date_time_suffix = "" if add_date_time: date_time_suffix = datetime.now().strftime("%d.%m.%Y_%H.%M.%S") - cfg = deepcopy(cfg) - wandb.init(project=cfg["constants"]["name"], config=cfg) + wandb.init(entity=cfg["constants"]["entity"], project=cfg["constants"]["name"], config=cfg) # Initialize the accelerator cfg, accelerator_type = load_accelerator(cfg) @@ -110,12 +102,4 @@ def main(cfg: DictConfig, run_name: str = "main", add_date_time: bool = True) -> if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--config", help="Path to the config file", default=None) - - args, unknown_args = parser.parse_known_args() - if args.config is not None: - CONFIG_FILE = args.config - cfg = load_yaml_config(CONFIG_FILE, MAIN_DIR, unknown_args) - - main(cfg) + main()
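Note on how the new Hydra config groups fit together (a hedged sketch: the top-level main.yaml referenced by @hydra.main(config_name="main") in run_validation_test.py is not part of this diff, so the defaults list below is an assumption inferred from the directory layout, not the actual file):

    # expts/hydra-configs/main.yaml -- hypothetical composition, for illustration only
    defaults:
      - accelerator: ipu                  # expts/hydra-configs/accelerator/ipu.yaml
      - dataset: pcqm4m                   # expts/hydra-configs/dataset/pcqm4m.yaml
      - dataset/accelerator: pcqm4m_ipu   # IPU dataloader, batch-size and precision overrides
      - model: gpspp                      # or: mpnn
      - experiment: pcqm4m_gpspp          # or: pcqm4m_mpnn
      - _self_

Because every group file in this diff starts with "# @package _global_", each selected file is merged at the config root, so the selected groups together reassemble the equivalent of the deleted monolithic expts/configs/*.yaml files. If the defaults are wired as sketched above, a run could be switched from the command line instead of editing a CONFIG_FILE constant, for example:

    python expts/run_validation_test.py model=mpnn experiment=pcqm4m_mpnn dataset/accelerator=pcqm4m_ipu

Both the main.yaml contents and the exact override spelling are assumptions based on standard Hydra behaviour; only the individual group files are defined in this diff.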