Merge pull request #419 from datamol-io/pcqm4m_changes

pcqm4m changes with mpnn and gps++ configs, fix run_validation_test.py
datamol-io · Aug 1, 2023 · 66c7adc · 66c7adc
2 parents b0d4fd5 + f60b968
commit 66c7adc
Show file tree

Hide file tree

Showing 12 changed files with 145 additions and 339 deletions.
diff --git a/expts/configs/config_gpspp_10M_pcqm4m.yaml b/expts/configs/config_gpspp_10M_pcqm4m.yaml
diff --git a/expts/hydra-configs/accelerator/ipu.yaml b/expts/hydra-configs/accelerator/ipu.yaml
@@ -1,9 +1,16 @@
 type: ipu
 ipu_config:
-    - deviceIterations(5) # IPU would require large batches to be ready for the model.
+    - deviceIterations(30) # IPU would require large batches to be ready for the model.
     - replicationFactor(16)
     # - enableProfiling("graph_analyser")       # The folder where the profile will be stored
     # - enableExecutableCaching("pop_compiler_cache")
     - TensorLocations.numIOTiles(128)
-    - _Popart.set("defaultBufferingDepth", 128)
-    - Precision.enableStochasticRounding(True)
+    - _Popart.set("defaultBufferingDepth", 96)
+    - Precision.enableStochasticRounding(True)
+
+ipu_inference_config:  # inference-time counterpart of ipu_config -- TODO confirm how the loader selects it
+    # Use a single device iteration and a replication factor of 1 during inference.
+    # Gradient accumulation is set to 1 in the code, not in this config.
+    - deviceIterations(1)
+    - replicationFactor(1)
+    - Precision.enableStochasticRounding(False)  # stochastic rounding is off here, unlike in ipu_config
diff --git a/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml b/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml
@@ -0,0 +1,25 @@
+# @package _global_
+
+datamodule:
+  args:
+    ipu_dataloader_training_opts:
+      mode: async
+      max_num_nodes_per_graph: 16 # train max nodes: 20, max_edges: 54. NOTE(review): 16 < 20, presumably a per-graph average budget; confirm
+      max_num_edges_per_graph: 60  # above the observed train max_edges of 54
+    ipu_dataloader_inference_opts:
+      mode: async
+      max_num_nodes_per_graph: 30 # valid max nodes: 51, max_edges: 118. NOTE(review): 30 < 51, presumably a per-graph average budget; confirm
+      max_num_edges_per_graph: 120  # above the observed valid max_edges of 118
+    # Data handling-related
+    batch_size_training: 32
+    batch_size_inference: 16
+
+predictor:
+  metrics_every_n_train_steps: 1000
+  optim_kwargs:
+    loss_scaling: 1024  # NOTE(review): presumably compensates for the 16-bit precision set under trainer; confirm
+
+trainer:
+  trainer:
+    precision: 16-true  # NOTE(review): looks like Lightning's true half-precision setting; confirm
+    accumulate_grad_batches: 2