Skip to content

Commit

Permalink
Merge pull request #521 from ndickson-nvidia/atom_order
Browse files Browse the repository at this point in the history
Reorder atoms in label data (Fixes 502) + documenting C++
  • Loading branch information
DomInvivo authored Sep 23, 2024
2 parents 24412f6 + f123565 commit e887176
Show file tree
Hide file tree
Showing 29 changed files with 1,520 additions and 1,210 deletions.
17 changes: 14 additions & 3 deletions graphium/data/datamodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
--------------------------------------------------------------------------------
"""


import tempfile
from contextlib import redirect_stderr, redirect_stdout
from typing import Type, List, Dict, Union, Any, Callable, Optional, Tuple, Iterable, Literal
Expand Down Expand Up @@ -918,10 +917,14 @@ def encode_feature_options(options, name, encoding_function):

explicit_H = featurization["explicit_H"] if "explicit_H" in featurization else False
add_self_loop = featurization["add_self_loop"] if "add_self_loop" in featurization else False
merge_equivalent_mols = (
featurization["merge_equivalent_mols"] if "merge_equivalent_mols" in featurization else True
)

# Save these for calling graphium_cpp.prepare_and_save_data later
self.add_self_loop = add_self_loop
self.explicit_H = explicit_H
self.merge_equivalent_mols = merge_equivalent_mols

self.preprocessing_n_jobs = preprocessing_n_jobs

Expand Down Expand Up @@ -1163,6 +1166,7 @@ def prepare_data(self):
self.add_self_loop,
self.explicit_H,
self.preprocessing_n_jobs,
self.merge_equivalent_mols,
)
self._len = self._get_len_from_cached_file()

Expand Down Expand Up @@ -1722,8 +1726,15 @@ def get_data_hash(self):
"""
Get a hash specific to a dataset.
Useful to cache the pre-processed data.
"""
args = {}
Don't include options only used at data loading time, such as
most featurization options, but include options used during
pre-processing, like merge_equivalent_mols.
"""
args = {
"add_self_loop": self.add_self_loop,
"explicit_H": self.explicit_H,
"merge_equivalent_mols": self.merge_equivalent_mols,
}
# pop epoch_sampling_fraction out when creating hash
# so that the data cache does not need to be regenerated
# when epoch_sampling_fraction has changed.
Expand Down
11 changes: 7 additions & 4 deletions graphium/graphium_cpp/commute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <stdint.h>
#include <vector>

// Computes the "commute distance" between each pair of nodes, outputting to `matrix`.
// See the declaration in commute.h for more details.
template<typename T>
void compute_commute_distances(
const uint32_t n,
Expand All @@ -35,8 +37,10 @@ void compute_commute_distances(
full_sum = T(row_starts[n]);
}

// Allocate the memory for the output
matrix.resize(n * n);

// Compute the distances from the pseudoinverse
for (size_t row = 0, row_diag_index = 0, i = 0; row < n; ++row, row_diag_index += (n + 1)) {
for (size_t col = 0, col_diag_index = 0; col < n; ++col, ++i, col_diag_index += (n + 1)) {
matrix[i] = full_sum * (
Expand All @@ -47,17 +51,16 @@ void compute_commute_distances(
}
}

template
void compute_commute_distances<float>(
// Explicit instantiations of `compute_commute_distances` for `float` and `double`
// `float` instantiation: compiles the template definition above into this file, so that
// code which only sees the `extern template` declaration in commute.h can link against it.
template void compute_commute_distances<float>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
LaplacianData<float>& data,
std::vector<float>& laplacian_pseudoinverse,
std::vector<float>& matrix,
const float* weights);
template
void compute_commute_distances<double>(
template void compute_commute_distances<double>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
Expand Down
25 changes: 21 additions & 4 deletions graphium/graphium_cpp/commute.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@
#include <stdint.h>
#include <vector>

//! Computes the "commute distance", `2*E*(P_ii + P_jj - 2*P_ij)`, for each node pair `ij`,
//! where P is the Laplacian pseudoinverse and E is the total number of unique edges.
//! Template type `T` can be `float` or `double`. Implementation is in commute.cpp
//!
//! @param n Number of nodes
//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
//! neighbors start, plus one at the end to indicate the full length of
//! `neighbors`
//! @param neighbors Concatenated array of all neighbors of all nodes, in order
//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
//! @param laplacian_pseudoinverse If empty, this will be filled with the pseudoinverse of the
//! graph Laplacian matrix, else its contents will be assumed to
//! contain the cached pseudoinverse of the graph Laplacian
//! @param matrix The output commute distances for all `n^2` node pairs
//! @param weights Optional array of edge weights, in the same order as `neighbors`.
//! If non-null, the distances will be scaled by the sum of all weights, instead
//! of `2*E`.
template<typename T>
void compute_commute_distances(
const uint32_t n,
Expand All @@ -18,17 +35,17 @@ void compute_commute_distances(
std::vector<T>& matrix,
const T* weights = nullptr);

extern template
void compute_commute_distances<float>(
// Instantiation declarations of `compute_commute_distances` for `float` and `double`
// The explicit instantiations are in commute.cpp
// `float`: declares that this instantiation is provided elsewhere (commute.cpp), so files
// including this header do not implicitly instantiate the template themselves.
extern template void compute_commute_distances<float>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
LaplacianData<float>& data,
std::vector<float>& laplacian_pseudoinverse,
std::vector<float>& matrix,
const float* weights);
extern template
void compute_commute_distances<double>(
extern template void compute_commute_distances<double>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
Expand Down
9 changes: 9 additions & 0 deletions graphium/graphium_cpp/electrostatic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <stdint.h>
#include <vector>

// Computes the pseudoinverse of the graph Laplacian, outputting to `matrix`.
// See the declaration in electrostatic.h for more details.
template<typename T>
void compute_laplacian_pseudoinverse(
const uint32_t n,
Expand All @@ -24,6 +26,8 @@ void compute_laplacian_pseudoinverse(
compute_laplacian_eigendecomp(n, row_starts, neighbors, Normalization::NONE, data, 1, nullptr, weights);
}

// Allocate the space for the output and initialize to zero.
// resize() only initializes the elements it appends, so clear() first to ensure
// that every element, not just any newly added ones, is set to zero.
matrix.clear();
matrix.resize(size_t(n) * n, T(0));
const T maxEigenvalue = data.eigenvalues.back();
Expand All @@ -49,6 +53,7 @@ void compute_laplacian_pseudoinverse(
}
}

// Explicit instantiations of `compute_laplacian_pseudoinverse` for `float` and `double`
template void compute_laplacian_pseudoinverse<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand All @@ -64,6 +69,8 @@ template void compute_laplacian_pseudoinverse<double>(
std::vector<double>& matrix,
const double* weights);

// Computes the "electrostatic interactions" between each pair of nodes, outputting to `matrix`.
// See the declaration in electrostatic.h for more details.
template<typename T>
void compute_electrostatic_interactions(
const uint32_t n,
Expand All @@ -78,6 +85,7 @@ void compute_electrostatic_interactions(
compute_laplacian_pseudoinverse(n, row_starts, neighbors, data, laplacian_pseudoinverse, weights);
}

// Allocate the memory for the output
matrix.resize(n * n);

// Subtract the diagonal value from each column
Expand All @@ -88,6 +96,7 @@ void compute_electrostatic_interactions(
}
}

// Explicit instantiations of `compute_electrostatic_interactions` for `float` and `double`
template void compute_electrostatic_interactions<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand Down
34 changes: 34 additions & 0 deletions graphium/graphium_cpp/electrostatic.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@
#include <stdint.h>
#include <vector>

//! Computes the pseudoinverse of the graph Laplacian matrix.
//! Template type `T` can be `float` or `double`. Implementation is in electrostatic.cpp
//!
//! @param n Number of nodes
//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
//! neighbors start, plus one at the end to indicate the full length of
//! `neighbors`
//! @param neighbors Concatenated array of all neighbors of all nodes, in order
//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
//! @param matrix The output pseudoinverse of the graph Laplacian matrix
//! @param weights Optional array of edge weights, in the same order as `neighbors`.
//! If null, the edge weights are all 1.
template<typename T>
void compute_laplacian_pseudoinverse(
const uint32_t n,
Expand All @@ -17,6 +29,8 @@ void compute_laplacian_pseudoinverse(
std::vector<T>& matrix,
const T* weights = nullptr);

// Instantiation declarations of `compute_laplacian_pseudoinverse` for `float` and `double`
// The explicit instantiations are in electrostatic.cpp
extern template void compute_laplacian_pseudoinverse<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand All @@ -32,6 +46,24 @@ extern template void compute_laplacian_pseudoinverse<double>(
std::vector<double>& matrix,
const double* weights);

//! Computes the "electrostatic interactions", `P_ij - P_jj`, for each node pair `ij`,
//! where P is the Laplacian pseudoinverse.
//! Template type `T` can be `float` or `double`. Implementation is in electrostatic.cpp
//!
//! @param n Number of nodes
//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
//! neighbors start, plus one at the end to indicate the full length of
//! `neighbors`
//! @param neighbors Concatenated array of all neighbors of all nodes, in order
//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
//! @param laplacian_pseudoinverse If empty, this will be filled with the pseudoinverse of the
//! graph Laplacian matrix, else its contents will be assumed to
//! contain the cached pseudoinverse of the graph Laplacian
//! @param matrix The output electrostatic interactions for all `n^2` node pairs, i.e. the
//! pseudoinverse of the graph Laplacian matrix, with the diagonal subtracted from
//! each column, stored in row-major order.
//! @param weights Optional array of edge weights, in the same order as `neighbors`.
//! If null, the edge weights are all 1.
template<typename T>
void compute_electrostatic_interactions(
const uint32_t n,
Expand All @@ -42,6 +74,8 @@ void compute_electrostatic_interactions(
std::vector<T>& matrix,
const T* weights = nullptr);

// Instantiation declarations of `compute_electrostatic_interactions` for `float` and `double`
// The explicit instantiations are in electrostatic.cpp
extern template void compute_electrostatic_interactions<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand Down
Loading

0 comments on commit e887176

Please sign in to comment.