Added and updated more comments

datamol-io · Sep 23, 2024 · 5d798a5 · 5d798a5
1 parent 06b12b2
commit 5d798a5
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 5 deletions.
diff --git a/graphium/graphium_cpp/features.h b/graphium/graphium_cpp/features.h
@@ -381,6 +381,8 @@ std::unique_ptr<RDKit::RWMol> parse_mol(
     bool ordered = true);
 
 //! Determines a canonical ordering of the atoms in `mol`
+//!
+//! This is implemented in graphium_cpp.cpp, to keep it near `parse_mol`
 void get_canonical_atom_order(
     const RDKit::ROMol& mol,
     std::vector<unsigned int>& atom_order);
diff --git a/graphium/graphium_cpp/graphium_cpp.cpp b/graphium/graphium_cpp/graphium_cpp.cpp
@@ -33,8 +33,8 @@
 #include <pybind11/stl.h>
 #include <torch/extension.h>
 
-// RDKit::SmilesToMol uses std::string, so until we replace it, lets use std::string here.
-// ("const char*" could avoid an extra allocation, if we do eventually replace use of SmilesToMol.)
+// Creates an RWMol from a SMILES string.
+// See the declaration in features.h for more details.
 std::unique_ptr<RDKit::RWMol> parse_mol(
     const std::string& smiles_string,
     bool explicit_H,
@@ -106,6 +106,8 @@ std::unique_ptr<RDKit::RWMol> parse_mol(
     return mol;
 }
 
+// Determines a canonical ordering of the atoms in `mol`
+// See the declaration in features.h for more details.
 void get_canonical_atom_order(const RDKit::ROMol& mol, std::vector<unsigned int>& atom_order) {
     RDKit::Canon::rankMolAtoms(mol, atom_order);
     assert(atom_order.size() == mol->getNumAtoms());
@@ -125,8 +127,8 @@ PYBIND11_MODULE(graphium_cpp, m) {
     m.def("extract_string", &extract_string, "Extracts a single string from a Tensor of contatenated strings.");
 
     // Functions in features.cpp
-    m.def("atom_float_feature_names_to_tensor", &atom_float_feature_names_to_tensor, "Accepts feature names and returns a tensor representing them as integers");
     m.def("atom_onehot_feature_names_to_tensor", &atom_onehot_feature_names_to_tensor, "Accepts feature names and returns a tensor representing them as integers");
+    m.def("atom_float_feature_names_to_tensor", &atom_float_feature_names_to_tensor, "Accepts feature names and returns a tensor representing them as integers");
     m.def("bond_feature_names_to_tensor", &bond_feature_names_to_tensor, "Accepts feature names and returns a tensor representing them as integers");
     m.def("positional_feature_options_to_tensor", &positional_feature_options_to_tensor, "Accepts feature names, levels, and options, and returns a tensor representing them as integers");
     m.def("featurize_smiles", &featurize_smiles, "Accepts a SMILES string and returns tensors representing the features");

diff --git a/graphium/graphium_cpp/labels.h b/graphium/graphium_cpp/labels.h
@@ -18,27 +18,140 @@
 #include <pybind11/pytypes.h>
 #include <pybind11/stl.h>
 
-// The following functions are in labels.cpp, and declared here so that
-// graphium_cpp.cpp can expose them to Python via pybind.
+//! Reads the number of columns and data type for each task, from the common label metadata
+//! file that was already saved by `prepare_and_save_data`, possibly on a previous run, in the
+//! directory `processed_graph_data_path/data_hash`.  Returns empty lists on failure.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
 std::tuple<
     std::vector<int64_t>,
     std::vector<int32_t>
 > load_num_cols_and_dtypes(
     const std::string& processed_graph_data_path,
     const std::string& data_hash);
 
+//! Reads data from the stage-specific label metadata files that were already saved by
+//! `prepare_and_save_data`, possibly on a previous run, in the directory
+//! `processed_graph_data_path/stage_data_hash`.  Returns an empty list on failure.
+//!
+//! On success, the returned tensors are:
+//!     0) All SMILES strings concatenated,
+//!     1) The beginning offsets of each SMILES string in the first tensor, and
+//!        one extra at the end equal to the length of the first tensor
+//!     2) The number of nodes (atoms) in each molecule
+//!     3) The number of edges (bonds) in each molecule
+//!     4) (Optional if only inference) The offset of each molecule's label data within the
+//!        label data files, plus an extra for the end of each file
+//! The first two tensors are used by `extract_string`.  The optional last tensor is
+//! used by `load_labels_from_index`.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
 std::vector<at::Tensor> load_metadata_tensors(
     const std::string processed_graph_data_path,
     const std::string stage,
     const std::string data_hash);
 
+//! Reads data from the task-specific stats file that was already saved by
+//! `prepare_and_save_data`, possibly on a previous run, in the directory
+//! `processed_graph_data_path/data_hash`.  Returns an empty list on failure.
+//!
+//! Each tensor's length is the number of columns for this task, and there are 4
+//! tensors total: minimum, maximum, mean, standard deviation.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
 std::vector<at::Tensor> load_stats(
     const std::string processed_graph_data_path,
     const std::string data_hash,
     const std::string task_name);
 
+//! Accepts a Numpy array of strings or Python list of strings, and returns a PyTorch tensor
+//! of all of the characters and another tensor containing indices into the other tensor
+//! indicating where each string begins, plus one extra index indicating the end.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
 std::pair<at::Tensor, at::Tensor> concatenate_strings(pybind11::handle handle);
 
+//! Merges label data for equivalent molecules from separate datasets,
+//! computes statistics, and caches the label data to files for efficient loading later.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
+//!
+//! @param task_names Python list of the names of the datasets to process.  These are used for
+//!                   looking up into the other parameters starting with `task_`, and the
+//!                   beginning of each name must be `graph_`, `node_`, `edge_`, or `nodepair_`
+//!                   to determine the level of the label data.
+//! @param task_dataset_args Python dict mapping task names to Python dicts for each dataset.
+//!                          Each task's dict must contain a mapping from `"smiles"` to a 1D
+//!                          Numpy array of objects, each of which is a Python string with a
+//!                          molecule's SMILES text.  If doing inference, each task's dict must
+//!                          also map from `"labels"` to a 2D Numpy array of float16, float32,
+//!                          or float64 type.  For node, edge, or node-pair level label data,
+//!                          the dict must also map from `"label_offsets"` to a 1D Numpy array
+//!                          of type int64, indicating the row in the `"labels"` array where
+//!                          each molecule's data begins, plus an extra for the end.  If
+//!                          `"label_offsets"` is not present, the `"labels"` array has one row
+//!                          per molecule, and if it is present, the `"labels"` array has one
+//!                          row per atom, bond, or pair of atoms, according to the label level.
+//! @param task_label_normalization Python dict mapping task names to Python dicts for each
+//!                                 dataset's normalization options.  Each task's dict must
+//!                                 contain a mapping from `"method"` to either `"none"`,
+//!                                 `"normal"`, or `"unit"`, and can optionally contain a
+//!                                 mapping from `"min_clipping"` and/or `"max_clipping"` to a
+//!                                 Python float or int to explicitly clip the range.
+//! @param processed_graph_data_path String containing the base directory to create
+//!                                  subdirectories for cached files in.  It can exist already,
+//!                                  or will be created if it does not already exist.
+//! @param data_hash String representing a hash of the label data options.  It will be used in
+//!                  the names of all subdirectories created under `processed_graph_data_path`.
+//! @param task_train_indices Python dict mapping task names to Python lists of ints, indicating
+//!                           indices into `task_dataset_args[task_name]["smiles"]` and other
+//!                           per-molecule-per-task arrays.  Only these molecules will be used
+//!                           for the "train" stage.
+//! @param task_val_indices Python dict mapping task names to Python lists of ints, indicating
+//!                         indices into `task_dataset_args[task_name]["smiles"]` and other
+//!                         per-molecule-per-task arrays.  Only these molecules will be used
+//!                         for the "val" stage.
+//! @param task_test_indices Python dict mapping task names to Python lists of ints, indicating
+//!                          indices into `task_dataset_args[task_name]["smiles"]` and other
+//!                          per-molecule-per-task arrays.  Only these molecules will be used
+//!                          for the "test" stage.
+//! @param add_self_loop If true (default is false), `num_atoms` is added to the number of
+//!                      directed edges (twice the number of bonds).  This is for consistency
+//!                      with `featurize_smiles` later.
+//! @param explicit_H If true (default is false), any implicit hydrogens will be made explicit,
+//!                   possibly increasing the number of atoms.
+//! @param max_threads If greater than zero, at most this many threads will be created for
+//!                    processing in parallel.  If zero (the default), at most one thread per
+//!                    logical CPU core will be created.  If less than zero, the limit is
+//!                    reduced by adding this negative amount to the number of logical CPU
+//!                    cores.
+//! @param merge_equivalent_mols If true (the default), label data for the same molecule in
+//!                              different datasets are collected together, even if the atoms
+//!                              or bonds are in a different order.  Duplicates of the same
+//!                              molecule within a single dataset will be ignored.  This is very
+//!                              slow, and changes the number and order of the molecules, so it
+//!                              can be set to false for inference, where there is no label data
+//!                              or only one dataset.
+//! @return Four objects:
+//!         - A dict mapping the stage names ("train", "val", "test") to a list of five 1D
+//!           PyTorch tensors:
+//!             0) SMILES strings all concatenated, one per unique molecule
+//!             1) Offsets into the previous tensor where the strings begin, one per unique
+//!                molecule, plus one extra for the end
+//!             2) Number of nodes (atoms) in each unique molecule
+//!             3) Number of edges (2*bonds) in each unique molecule
+//!             4) (Only if there is label data) `mol_file_data_offsets` to be passed to calls
+//!                to `load_labels_from_index`
+//!         - A dict mapping task names to a list of four 1D PyTorch tensors for column
+//!           normalization: minimum, maximum, mean, standard deviation
+//!         - A list of the number of columns in each task, in the same order as `task_names`
+//!         - A list of integers representing the Torch data type of each task, in the same
+//!           order as `task_names`
 std::tuple<
     std::unordered_map<std::string, std::vector<at::Tensor>>,
     std::unordered_map<std::string, std::vector<at::Tensor>>,
@@ -58,6 +171,16 @@ std::tuple<
     int max_threads = 0,
     bool merge_equivalent_mols = true);
 
+//! Loads label data associated with the molecule with index `mol_index` from the corresponding
+//! file in the directory `stage_directory`, and adds the data to `labels` dictionary using
+//! the strings from `label_names` to map to tensors.  The label data must be previously saved
+//! by `prepare_and_save_data`.  `mol_file_data_offsets` is used to determine how to find the
+//! data in the file, `label_data_types` is used for the type and size of each float, and
+//! `label_num_cols` is used to determine the layout of each output tensor, especially ones with
+//! multiple rows, such as node-level, edge-level, or node-pair-level label data.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
 void load_labels_from_index(
     const std::string stage_directory,
     const int64_t mol_index,
@@ -67,6 +190,14 @@ void load_labels_from_index(
     const pybind11::list& label_data_types,
     pybind11::dict& labels);
 
+//! Extracts a single string from `concat_strings`, a Tensor of contatenated strings,
+//! using offsets at the specified `index` in `string_offsets`.
+//!
+//! The tensors can be returned by `load_metadata_tensors`, `concatenate_strings`, or
+//! `prepare_and_save_data`.
+//!
+//! This is implemented in labels.cpp, and declared here so that graphium_cpp.cpp
+//! can expose it to Python via pybind.
 std::string extract_string(
     const at::Tensor& concat_strings,
     const at::Tensor& string_offsets,