Merge pull request #521 from ndickson-nvidia/atom_order

Reorder atoms in label data (Fixes 502) + documenting C++
datamol-io · Sep 23, 2024 · e887176 · e887176
2 parents 24412f6 + f123565
commit e887176
Show file tree

Hide file tree

Showing 29 changed files with 1,520 additions and 1,210 deletions.
diff --git a/graphium/data/datamodule.py b/graphium/data/datamodule.py
@@ -11,7 +11,6 @@
 --------------------------------------------------------------------------------
 """
 
-
 import tempfile
 from contextlib import redirect_stderr, redirect_stdout
 from typing import Type, List, Dict, Union, Any, Callable, Optional, Tuple, Iterable, Literal
@@ -918,10 +917,14 @@ def encode_feature_options(options, name, encoding_function):
 
         explicit_H = featurization["explicit_H"] if "explicit_H" in featurization else False
         add_self_loop = featurization["add_self_loop"] if "add_self_loop" in featurization else False
+        merge_equivalent_mols = (
+            featurization["merge_equivalent_mols"] if "merge_equivalent_mols" in featurization else True
+        )
 
         # Save these for calling graphium_cpp.prepare_and_save_data later
         self.add_self_loop = add_self_loop
         self.explicit_H = explicit_H
+        self.merge_equivalent_mols = merge_equivalent_mols
 
         self.preprocessing_n_jobs = preprocessing_n_jobs
 
@@ -1163,6 +1166,7 @@ def prepare_data(self):
             self.add_self_loop,
             self.explicit_H,
             self.preprocessing_n_jobs,
+            self.merge_equivalent_mols,
         )
         self._len = self._get_len_from_cached_file()
 
@@ -1722,8 +1726,15 @@ def get_data_hash(self):
         """
         Get a hash specific to a dataset.
         Useful to cache the pre-processed data.
-        """
-        args = {}
+        Don't include options only used at data loading time, such as
+        most featurization options, but include options used during
+        pre-processing, like merge_equivalent_mols.
+        """
+        args = {
+            "add_self_loop": self.add_self_loop,
+            "explicit_H": self.explicit_H,
+            "merge_equivalent_mols": self.merge_equivalent_mols,
+        }
         # pop epoch_sampling_fraction out when creating hash
         # so that the data cache does not need to be regenerated
         # when epoch_sampling_fraction has changed.

diff --git a/graphium/graphium_cpp/commute.cpp b/graphium/graphium_cpp/commute.cpp
@@ -9,6 +9,8 @@
 #include <stdint.h>
 #include <vector>
 
+// Computes the "commute distance" between each pair of nodes, outputting to `matrix`.
+// See the declaration in commute.h for more details.
 template<typename T>
 void compute_commute_distances(
     const uint32_t n,
@@ -35,8 +37,10 @@ void compute_commute_distances(
         full_sum = T(row_starts[n]);
     }
 
+    // Allocate the memory for the output
     matrix.resize(n * n);
 
+    // Compute the distances from the pseudoinverse
     for (size_t row = 0, row_diag_index = 0, i = 0; row < n; ++row, row_diag_index += (n + 1)) {
         for (size_t col = 0, col_diag_index = 0; col < n; ++col, ++i, col_diag_index += (n + 1)) {
             matrix[i] = full_sum * (
@@ -47,17 +51,16 @@ void compute_commute_distances(
     }
 }
 
-template
-void compute_commute_distances<float>(
+// Explicit instantiations of `compute_commute_distances` for `float` and `double`
+template void compute_commute_distances<float>(
     const uint32_t n,
     const uint32_t* row_starts,
     const uint32_t* neighbors,
     LaplacianData<float>& data,
     std::vector<float>& laplacian_pseudoinverse,
     std::vector<float>& matrix,
     const float* weights);
-template
-void compute_commute_distances<double>(
+template void compute_commute_distances<double>(
     const uint32_t n,
     const uint32_t* row_starts,
     const uint32_t* neighbors,

diff --git a/graphium/graphium_cpp/commute.h b/graphium/graphium_cpp/commute.h
@@ -8,6 +8,23 @@
 #include <stdint.h>
 #include <vector>
 
+//! Computes the "commute distance", `2*E*(P_ii + P_jj - 2*P_ij)`, for each node pair `ij`,
+//! where P is the Laplacian pseudoinverse and E is the total number of unique edges.
+//! Template type `T` can be `float` or `double`.  Implementation is in commute.cpp
+//!
+//! @param n Number of nodes
+//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
+//!                   neighbors start, plus one at the end to indicate the full length of
+//!                   `neighbors`
+//! @param neighbors Concatenated array of all neighbors of all nodes, in order
+//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
+//! @param laplacian_pseudoinverse If empty, this will be filled with the pseudoinverse of the
+//!                                graph Laplacian matrix, else its contents will be assumed to
+//!                                contain the cached pseudoinverse of the graph Laplacian
+//! @param matrix The output commute distances for all `n^2` node pairs
+//! @param weights Optional array of edge weights, in the order corresponding with neighbors.
+//!                If non-null, the distances will be scaled by the sum of all weights, instead
+//!                of `2*E`.
 template<typename T>
 void compute_commute_distances(
     const uint32_t n,
@@ -18,17 +35,17 @@ void compute_commute_distances(
     std::vector<T>& matrix,
     const T* weights = nullptr);
 
-extern template
-void compute_commute_distances<float>(
+// Explicit instantiation declarations of `compute_commute_distances` for `float` and `double`.
+// The matching explicit instantiation definitions are in commute.cpp.
+extern template void compute_commute_distances<float>(
     const uint32_t n,
     const uint32_t* row_starts,
     const uint32_t* neighbors,
     LaplacianData<float>& data,
     std::vector<float>& laplacian_pseudoinverse,
     std::vector<float>& matrix,
     const float* weights);
-extern template
-void compute_commute_distances<double>(
+extern template void compute_commute_distances<double>(
     const uint32_t n,
     const uint32_t* row_starts,
     const uint32_t* neighbors,

diff --git a/graphium/graphium_cpp/electrostatic.cpp b/graphium/graphium_cpp/electrostatic.cpp
@@ -9,6 +9,8 @@
 #include <stdint.h>
 #include <vector>
 
+// Computes the pseudoinverse of the graph Laplacian, outputting to `matrix`.
+// See the declaration in electrostatic.h for more details.
 template<typename T>
 void compute_laplacian_pseudoinverse(
     const uint32_t n,
@@ -24,6 +26,8 @@ void compute_laplacian_pseudoinverse(
         compute_laplacian_eigendecomp(n, row_starts, neighbors, Normalization::NONE, data, 1, nullptr, weights);
     }
 
+    // Allocate the space for the output and initialize to zero.
+    // The clear() call is so that resize() initializes all values to zero.
     matrix.clear();
     matrix.resize(size_t(n) * n, T(0));
     const T maxEigenvalue = data.eigenvalues.back();
@@ -49,6 +53,7 @@ void compute_laplacian_pseudoinverse(
     }
 }
 
+// Explicit instantiations of `compute_laplacian_pseudoinverse` for `float` and `double`
 template void compute_laplacian_pseudoinverse<float>(
     const uint32_t n,
     const uint32_t* row_starts,
@@ -64,6 +69,8 @@ template void compute_laplacian_pseudoinverse<double>(
     std::vector<double>& matrix,
     const double* weights);
 
+// Computes the "electrostatic interactions" between each pair of nodes, outputting to `matrix`.
+// See the declaration in electrostatic.h for more details.
 template<typename T>
 void compute_electrostatic_interactions(
     const uint32_t n,
@@ -78,6 +85,7 @@ void compute_electrostatic_interactions(
         compute_laplacian_pseudoinverse(n, row_starts, neighbors, data, laplacian_pseudoinverse, weights);
     }
 
+    // Allocate the memory for the output
     matrix.resize(n * n);
 
     // Subtract the diagonal value from each column
@@ -88,6 +96,7 @@ void compute_electrostatic_interactions(
     }
 }
 
+// Explicit instantiations of `compute_electrostatic_interactions` for `float` and `double`
 template void compute_electrostatic_interactions<float>(
     const uint32_t n,
     const uint32_t* row_starts,

diff --git a/graphium/graphium_cpp/electrostatic.h b/graphium/graphium_cpp/electrostatic.h
@@ -8,6 +8,18 @@
 #include <stdint.h>
 #include <vector>
 
+//! Computes the pseudoinverse of the graph Laplacian matrix.
+//! Template type `T` can be `float` or `double`.  Implementation is in electrostatic.cpp
+//!
+//! @param n Number of nodes
+//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
+//!                   neighbors start, plus one at the end to indicate the full length of
+//!                   `neighbors`
+//! @param neighbors Concatenated array of all neighbors of all nodes, in order
+//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
+//! @param matrix The output pseudoinverse of the graph Laplacian matrix
+//! @param weights Optional array of edge weights, in the order corresponding with neighbors.
+//!                If null, the edge weights are all 1.
 template<typename T>
 void compute_laplacian_pseudoinverse(
     const uint32_t n,
@@ -17,6 +29,8 @@ void compute_laplacian_pseudoinverse(
     std::vector<T>& matrix,
     const T* weights = nullptr);
 
+// Explicit instantiation declarations of `compute_laplacian_pseudoinverse` for `float` and `double`.
+// The matching explicit instantiation definitions are in electrostatic.cpp.
 extern template void compute_laplacian_pseudoinverse<float>(
     const uint32_t n,
     const uint32_t* row_starts,
@@ -32,6 +46,24 @@ extern template void compute_laplacian_pseudoinverse<double>(
     std::vector<double>& matrix,
     const double* weights);
 
+//! Computes the "electrostatic interactions", `P_ij - P_jj`, for each node pair `ij`,
+//! where P is the Laplacian pseudoinverse.
+//! Template type `T` can be `float` or `double`.  Implementation is in electrostatic.cpp
+//!
+//! @param n Number of nodes
+//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
+//!                   neighbors start, plus one at the end to indicate the full length of
+//!                   `neighbors`
+//! @param neighbors Concatenated array of all neighbors of all nodes, in order
+//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
+//! @param laplacian_pseudoinverse If empty, this will be filled with the pseudoinverse of the
+//!                                graph Laplacian matrix, else its contents will be assumed to
+//!                                contain the cached pseudoinverse of the graph Laplacian
+//! @param matrix The output electrostatic interactions for all `n^2` node pairs, i.e. the
+//!               pseudoinverse of the graph Laplacian matrix, with the diagonal subtracted from
+//!               each column, stored in row-major order.
+//! @param weights Optional array of edge weights, in the order corresponding with neighbors.
+//!                If null, the edge weights are all 1.
 template<typename T>
 void compute_electrostatic_interactions(
     const uint32_t n,
@@ -42,6 +74,8 @@ void compute_electrostatic_interactions(
     std::vector<T>& matrix,
     const T* weights = nullptr);
 
+// Explicit instantiation declarations of `compute_electrostatic_interactions` for `float` and `double`.
+// The matching explicit instantiation definitions are in electrostatic.cpp.
 extern template void compute_electrostatic_interactions<float>(
     const uint32_t n,
     const uint32_t* row_starts,