Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reorder atoms in label data (Fixes 502) + documenting C++ #521

Merged
merged 21 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1d7bfeb
Reorder atoms in node-level and nodepair-level label data, when the s…
ndickson-nvidia Jul 22, 2024
b7d9fe7
Merging of equivalent molecules is now optional, but still defaults t…
ndickson-nvidia Jul 24, 2024
5932fd8
Fixed bug with recent change in smiles_to_brief_data
ndickson-nvidia Jul 25, 2024
020b08a
Fix graphium_cpp.prepare_and_save_data call in test_dataset.py to inc…
ndickson-nvidia Jul 25, 2024
6cd5e26
In MultitaskFromSmilesDataModule.get_data_hash, include options used …
ndickson-nvidia Jul 25, 2024
d1cad44
Linter fixes in python files already modified in this branch
ndickson-nvidia Jul 25, 2024
508abbd
Split prepare_and_save_data into get_task_data, get_indices_and_strin…
ndickson-nvidia Aug 1, 2024
044cd47
Added support for reordering edge label data if there are multiple ta…
ndickson-nvidia Aug 1, 2024
50adb1b
Changed parse_mol in graphium_cpp.cpp to order based only on explicit…
ndickson-nvidia Aug 10, 2024
c870af4
The datasets use 0-based indexing for explicit ordering via atom clas…
ndickson-nvidia Aug 12, 2024
90f2403
Started adding doxygen comments to the C++ code. Also changed comput…
ndickson-nvidia Sep 10, 2024
bafdfe8
Adding unit test for node ordering
WenkelF Sep 5, 2024
7fd40e7
Added doxygen comments for functions and enums related to one-hot fea…
ndickson-nvidia Sep 11, 2024
06b12b2
Added more doxygen comments
ndickson-nvidia Sep 11, 2024
5d798a5
Added and updated more comments
ndickson-nvidia Sep 17, 2024
7261279
Added comments to each function in features.cpp
ndickson-nvidia Sep 23, 2024
b82f582
Investigating failing unit tests
WenkelF Sep 23, 2024
4a152b2
Added more comments to labels.cpp
ndickson-nvidia Sep 23, 2024
618bbb1
Merge branch 'atom_order' of ssh://github.com/ndickson-nvidia/graphiu…
ndickson-nvidia Sep 23, 2024
92ab751
Build fix in features.cpp
ndickson-nvidia Sep 23, 2024
f123565
Skipping test_training.py for now
WenkelF Sep 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions graphium/data/datamodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
--------------------------------------------------------------------------------
"""


import tempfile
from contextlib import redirect_stderr, redirect_stdout
from typing import Type, List, Dict, Union, Any, Callable, Optional, Tuple, Iterable, Literal
Expand Down Expand Up @@ -918,10 +917,14 @@ def encode_feature_options(options, name, encoding_function):

explicit_H = featurization["explicit_H"] if "explicit_H" in featurization else False
add_self_loop = featurization["add_self_loop"] if "add_self_loop" in featurization else False
merge_equivalent_mols = (
featurization["merge_equivalent_mols"] if "merge_equivalent_mols" in featurization else True
)

# Save these for calling graphium_cpp.prepare_and_save_data later
self.add_self_loop = add_self_loop
self.explicit_H = explicit_H
self.merge_equivalent_mols = merge_equivalent_mols

self.preprocessing_n_jobs = preprocessing_n_jobs

Expand Down Expand Up @@ -1163,6 +1166,7 @@ def prepare_data(self):
self.add_self_loop,
self.explicit_H,
self.preprocessing_n_jobs,
self.merge_equivalent_mols,
)
self._len = self._get_len_from_cached_file()

Expand Down Expand Up @@ -1722,8 +1726,15 @@ def get_data_hash(self):
"""
Get a hash specific to a dataset.
Useful to cache the pre-processed data.
"""
args = {}
Don't include options only used at data loading time, such as
most featurization options, but include options used during
pre-processing, like merge_equivalent_mols.
"""
args = {
"add_self_loop": self.add_self_loop,
"explicit_H": self.explicit_H,
"merge_equivalent_mols": self.merge_equivalent_mols,
}
# pop epoch_sampling_fraction out when creating hash
# so that the data cache does not need to be regenerated
# when epoch_sampling_fraction has changed.
Expand Down
11 changes: 7 additions & 4 deletions graphium/graphium_cpp/commute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <stdint.h>
#include <vector>

// Computes the "commute distance" between each pair of nodes, outputting to `matrix`.
// See the declaration in commute.h for more details.
template<typename T>
void compute_commute_distances(
const uint32_t n,
Expand All @@ -35,8 +37,10 @@ void compute_commute_distances(
full_sum = T(row_starts[n]);
}

// Allocate the memory for the output
matrix.resize(n * n);

// Compute the distances from the pseudoinverse
for (size_t row = 0, row_diag_index = 0, i = 0; row < n; ++row, row_diag_index += (n + 1)) {
for (size_t col = 0, col_diag_index = 0; col < n; ++col, ++i, col_diag_index += (n + 1)) {
matrix[i] = full_sum * (
Expand All @@ -47,17 +51,16 @@ void compute_commute_distances(
}
}

template
void compute_commute_distances<float>(
// Explicit instantiations of `compute_commute_distances` for `float` and `double`
template void compute_commute_distances<float>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
LaplacianData<float>& data,
std::vector<float>& laplacian_pseudoinverse,
std::vector<float>& matrix,
const float* weights);
template
void compute_commute_distances<double>(
template void compute_commute_distances<double>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
Expand Down
25 changes: 21 additions & 4 deletions graphium/graphium_cpp/commute.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@
#include <stdint.h>
#include <vector>

//! Computes the "commute distance", `2*E*(P_ii + P_jj - 2*P_ij)`, for each node pair `ij`,
//! where `P` is the Laplacian pseudoinverse and `E` is the total number of unique edges.
//! Template type `T` can be `float` or `double`. The implementation is in commute.cpp.
//!
//! @param n Number of nodes
//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
//! neighbors start, plus one at the end to indicate the full length of
//! `neighbors`
//! @param neighbors Concatenated array of all neighbors of all nodes, in order
//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
//! @param laplacian_pseudoinverse If empty, this will be filled with the pseudoinverse of the
//! graph Laplacian matrix, else its contents will be assumed to
//! contain the cached pseudoinverse of the graph Laplacian
//! @param matrix The output commute distances for all `n^2` node pairs
//! @param weights Optional array of edge weights, in the same order as `neighbors`.
//! If non-null, the distances will be scaled by the sum of all weights, instead
//! of `2*E`.
template<typename T>
void compute_commute_distances(
const uint32_t n,
Expand All @@ -18,17 +35,17 @@ void compute_commute_distances(
std::vector<T>& matrix,
const T* weights = nullptr);

extern template
void compute_commute_distances<float>(
// Instantiation declarations of `compute_commute_distances` for `float` and `double`
// The explicit instantiations are in commute.cpp
extern template void compute_commute_distances<float>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
LaplacianData<float>& data,
std::vector<float>& laplacian_pseudoinverse,
std::vector<float>& matrix,
const float* weights);
extern template
void compute_commute_distances<double>(
extern template void compute_commute_distances<double>(
const uint32_t n,
const uint32_t* row_starts,
const uint32_t* neighbors,
Expand Down
9 changes: 9 additions & 0 deletions graphium/graphium_cpp/electrostatic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <stdint.h>
#include <vector>

// Computes the pseudoinverse of the graph Laplacian, outputting to `matrix`.
// See the declaration in electrostatic.h for more details.
template<typename T>
void compute_laplacian_pseudoinverse(
const uint32_t n,
Expand All @@ -24,6 +26,8 @@ void compute_laplacian_pseudoinverse(
compute_laplacian_eigendecomp(n, row_starts, neighbors, Normalization::NONE, data, 1, nullptr, weights);
}

// Allocate the space for the output and initialize it to zero.
// resize() only initializes elements it newly adds, so clear() is called first
// to ensure that every element is (re)initialized to zero.
matrix.clear();
matrix.resize(size_t(n) * n, T(0));
const T maxEigenvalue = data.eigenvalues.back();
Expand All @@ -49,6 +53,7 @@ void compute_laplacian_pseudoinverse(
}
}

// Explicit instantiations of `compute_laplacian_pseudoinverse` for `float` and `double`
template void compute_laplacian_pseudoinverse<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand All @@ -64,6 +69,8 @@ template void compute_laplacian_pseudoinverse<double>(
std::vector<double>& matrix,
const double* weights);

// Computes the "electrostatic interactions" between each pair of nodes, outputting to `matrix`.
// See the declaration in electrostatic.h for more details.
template<typename T>
void compute_electrostatic_interactions(
const uint32_t n,
Expand All @@ -78,6 +85,7 @@ void compute_electrostatic_interactions(
compute_laplacian_pseudoinverse(n, row_starts, neighbors, data, laplacian_pseudoinverse, weights);
}

// Allocate the memory for the output
matrix.resize(n * n);

// Subtract the diagonal value from each column
Expand All @@ -88,6 +96,7 @@ void compute_electrostatic_interactions(
}
}

// Explicit instantiations of `compute_electrostatic_interactions` for `float` and `double`
template void compute_electrostatic_interactions<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand Down
34 changes: 34 additions & 0 deletions graphium/graphium_cpp/electrostatic.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@
#include <stdint.h>
#include <vector>

//! Computes the pseudoinverse of the graph Laplacian matrix.
//! Template type `T` can be `float` or `double`. Implementation is in electrostatic.cpp
//!
//! @param n Number of nodes
//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
//! neighbors start, plus one at the end to indicate the full length of
//! `neighbors`
//! @param neighbors Concatenated array of all neighbors of all nodes, in order
//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
//! @param matrix The output pseudoinverse of the graph Laplacian matrix
//! @param weights Optional array of edge weights, in the same order as `neighbors`.
//! If null, the edge weights are all 1.
template<typename T>
void compute_laplacian_pseudoinverse(
const uint32_t n,
Expand All @@ -17,6 +29,8 @@ void compute_laplacian_pseudoinverse(
std::vector<T>& matrix,
const T* weights = nullptr);

// Instantiation declarations of `compute_laplacian_pseudoinverse` for `float` and `double`
// The explicit instantiations are in electrostatic.cpp
extern template void compute_laplacian_pseudoinverse<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand All @@ -32,6 +46,24 @@ extern template void compute_laplacian_pseudoinverse<double>(
std::vector<double>& matrix,
const double* weights);

//! Computes the "electrostatic interactions", `P_ij - P_jj`, for each node pair `ij`,
//! where P is the Laplacian pseudoinverse.
//! Template type `T` can be `float` or `double`. Implementation is in electrostatic.cpp
//!
//! @param n Number of nodes
//! @param row_starts Array of `n+1` indices into `neighbors`, indicating where each node's
//! neighbors start, plus one at the end to indicate the full length of
//! `neighbors`
//! @param neighbors Concatenated array of all neighbors of all nodes, in order
//! @param data Cache for the eigendecomposition of the graph Laplacian matrix
//! @param laplacian_pseudoinverse If empty, this will be filled with the pseudoinverse of the
//! graph Laplacian matrix, else its contents will be assumed to
//! contain the cached pseudoinverse of the graph Laplacian
//! @param matrix The output electrostatic interactions for all `n^2` node pairs, i.e. the
//! pseudoinverse of the graph Laplacian matrix, with the diagonal subtracted from
//! each column, stored in row-major order.
//! @param weights Optional array of edge weights, in the same order as `neighbors`.
//! If null, the edge weights are all 1.
template<typename T>
void compute_electrostatic_interactions(
const uint32_t n,
Expand All @@ -42,6 +74,8 @@ void compute_electrostatic_interactions(
std::vector<T>& matrix,
const T* weights = nullptr);

// Instantiation declarations of `compute_electrostatic_interactions` for `float` and `double`
// The explicit instantiations are in electrostatic.cpp
extern template void compute_electrostatic_interactions<float>(
const uint32_t n,
const uint32_t* row_starts,
Expand Down
Loading
Loading