|
//! Code for working with distance relationships in taxonomic trees
//! and tracking weights assigned to different nodes in the tree.
| 3 | +use std::collections::{BinaryHeap, HashMap, HashSet}; |
| 4 | +use std::fmt::{Debug, Display}; |
| 5 | +use std::hash::{BuildHasher, Hash}; |
| 6 | +use std::iter::Sum; |
| 7 | + |
| 8 | +use crate::taxonomy::Taxonomy; |
| 9 | +use crate::Result; |
| 10 | + |
| 11 | +/// Calculates the summed weight of every path to root for a set of node |
| 12 | +/// weights given a corresponding taxonomy. |
| 13 | +/// |
| 14 | +/// This is like `maximum_weighted_path`, but rather than just returning the |
| 15 | +/// _best_ path, it returns all of the paths. |
| 16 | +pub fn all_weighted_paths<'t, D: 't, T: 't, W: 't, S: BuildHasher>( |
| 17 | + taxonomy: &'t impl Taxonomy<'t, T, D>, |
| 18 | + weights: &HashMap<T, W, S>, |
| 19 | +) -> Result<Vec<(T, W)>> |
| 20 | +where |
| 21 | + D: Debug + PartialOrd + Sum, |
| 22 | + T: Clone + Debug + Display + Hash + PartialEq + Ord, |
| 23 | + W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum, |
| 24 | +{ |
| 25 | + // TODO: should only return leaf nodes |
| 26 | + let mut taxs_by_score: BinaryHeap<(W, T)> = BinaryHeap::new(); |
| 27 | + let mut stem_ids: HashSet<T> = HashSet::new(); |
| 28 | + for tax_id in weights.keys() { |
| 29 | + if stem_ids.contains(&tax_id) { |
| 30 | + continue; |
| 31 | + } |
| 32 | + let score = taxonomy |
| 33 | + .lineage((*tax_id).clone())? |
| 34 | + .iter() |
| 35 | + .filter_map(|t| { |
| 36 | + if t != tax_id { |
| 37 | + stem_ids.insert((*t).clone()); |
| 38 | + } |
| 39 | + weights.get(t).cloned() |
| 40 | + }) |
| 41 | + .sum(); |
| 42 | + |
| 43 | + taxs_by_score.push((score, tax_id.clone())); |
| 44 | + } |
| 45 | + // nodes with higher scores should be first so we need to reverse |
| 46 | + let mut sorted_taxs = taxs_by_score.into_sorted_vec(); |
| 47 | + sorted_taxs.reverse(); |
| 48 | + // switch the weight/tax_id order and remove remaining stems |
| 49 | + let taxs = sorted_taxs |
| 50 | + .into_iter() |
| 51 | + .filter_map(|(weight, tax_id)| { |
| 52 | + if stem_ids.contains(&tax_id) { |
| 53 | + None |
| 54 | + } else { |
| 55 | + Some((tax_id, weight)) |
| 56 | + } |
| 57 | + }) |
| 58 | + .collect(); |
| 59 | + Ok(taxs) |
| 60 | +} |
| 61 | + |
| 62 | +/// Find the lineage that has the greatest summed weight from all of the |
| 63 | +/// weights and a corresponding taxonomy. |
| 64 | +/// |
| 65 | +/// Note that this implementation doesn't use the "distances" in the taxonomy |
| 66 | +/// itself, but only uses the weights provided. This can greatly speed up the |
| 67 | +/// calculation because it reduces the number of possible paths that need to |
| 68 | +/// be checked. |
| 69 | +pub fn maximum_weighted_path<'t, D: 't, T: 't, W: 't, S: BuildHasher>( |
| 70 | + taxonomy: &'t impl Taxonomy<'t, T, D>, |
| 71 | + weights: &HashMap<T, W, S>, |
| 72 | + take_first_in_tie: bool, |
| 73 | +) -> Result<(Option<T>, W)> |
| 74 | +where |
| 75 | + D: Debug + PartialOrd + Sum, |
| 76 | + T: Clone + Debug + Display + Hash + PartialEq + Ord, |
| 77 | + W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum, |
| 78 | +{ |
| 79 | + let mut max_taxes: Vec<T> = Vec::new(); |
| 80 | + // this is gross, but there's no "Zero" trait we can define to init this |
| 81 | + let mut max_score: W = Vec::new().into_iter().sum(); |
| 82 | + for tax_id in weights.keys() { |
| 83 | + let mut tax_node = tax_id.clone(); |
| 84 | + let mut scores = Vec::new(); |
| 85 | + loop { |
| 86 | + if let Some(score) = weights.get(&tax_node) { |
| 87 | + scores.push((*score).clone()); |
| 88 | + } |
| 89 | + match taxonomy.parent(tax_node)? { |
| 90 | + Some(p) => tax_node = p.0, |
| 91 | + None => break, |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + let score: W = scores.into_iter().sum(); |
| 96 | + if score > max_score { |
| 97 | + max_score = score.clone(); |
| 98 | + max_taxes.clear(); |
| 99 | + } |
| 100 | + |
| 101 | + if score >= max_score { |
| 102 | + max_taxes.push(tax_id.clone()); |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + if take_first_in_tie { |
| 107 | + return Ok((max_taxes.into_iter().next(), max_score)); |
| 108 | + } |
| 109 | + |
| 110 | + let first_child = max_taxes.pop(); |
| 111 | + let ancestor = |
| 112 | + max_taxes |
| 113 | + .into_iter() |
| 114 | + .try_fold(first_child, |ancestor, child| match ancestor { |
| 115 | + None => Ok(None), |
| 116 | + Some(a) => taxonomy.lca(a, child).map(Some), |
| 117 | + })?; |
| 118 | + Ok((ancestor, max_score)) |
| 119 | + // is max_score going to be a little low here because we're not counting |
| 120 | + // all the leaf nodes? |
| 121 | +} |
| 122 | + |
| 123 | +/// Coverts a set of weights into the set of weights including their children |
| 124 | +/// using the corresponding taxonomy. |
| 125 | +/// |
| 126 | +/// For example, for a taxonomic classification of a set of sequencing reads |
| 127 | +/// where a classification may be non-specific to a leaf node, this would |
| 128 | +/// turn the raw read counts at each taxonomic node into the set of read |
| 129 | +/// counts including all of the children of that node (which is probably more |
| 130 | +/// useful for most use cases). |
| 131 | +pub fn rollup_weights<'t, D: 't, T: 't, W: 't, S: BuildHasher>( |
| 132 | + taxonomy: &'t impl Taxonomy<'t, T, D>, |
| 133 | + weights: &HashMap<T, W, S>, |
| 134 | +) -> Result<Vec<(T, W)>> |
| 135 | +where |
| 136 | + D: Debug + PartialOrd + Sum, |
| 137 | + T: Clone + Debug + Display + Hash + PartialEq + Ord, |
| 138 | + W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum, |
| 139 | +{ |
| 140 | + let mut all_weights: HashMap<T, Vec<W>> = HashMap::new(); |
| 141 | + for (leaf_id, weight) in weights { |
| 142 | + for tax_id in taxonomy.lineage(leaf_id.clone())? { |
| 143 | + let tax_weights = all_weights.entry(tax_id).or_insert_with(Vec::new); |
| 144 | + tax_weights.push(weight.clone()); |
| 145 | + } |
| 146 | + } |
| 147 | + Ok(all_weights |
| 148 | + .into_iter() |
| 149 | + .map(|(tax_id, all_weights)| (tax_id, all_weights.into_iter().sum())) |
| 150 | + .collect()) |
| 151 | +} |
| 152 | + |
#[test]
fn test_all_weighted_path() -> Result<()> {
    use crate::taxonomy::test::MockTax;
    let tax = MockTax;
    // raw per-node hit counts, keyed by tax id
    let hits: HashMap<u32, u16> = vec![
        (765909, 41),
        (1, 25),
        (131567, 233),
        (2, 512),
        (1224, 33),
        (1236, 275),
        (135622, 59),
        (22, 270),
        (62322, 49),
        (56812, 1),
    ]
    .into_iter()
    .collect();
    let weights = all_weighted_paths(&tax, &hits)?;
    // only the two path-terminating nodes survive, heaviest path first
    assert_eq!(weights, vec![(56812, 1457), (765909, 1119)]);
    Ok(())
}
| 172 | + |
#[test]
fn test_maximum_weighted_path() -> Result<()> {
    use crate::taxonomy::test::MockTax;
    let tax = MockTax;
    // raw per-node hit counts, keyed by tax id
    let hits: HashMap<u32, u16> = vec![
        (765909, 41),
        (1, 25),
        (131567, 233),
        (2, 512),
        (1224, 33),
        (1236, 275),
        (135622, 59),
        (22, 270),
        (62322, 49),
        (56812, 1),
    ]
    .into_iter()
    .collect();
    let (node, weight) = maximum_weighted_path(&tax, &hits, false)?;
    // the winning lineage terminates at 56812...
    assert_eq!(node, Some(56812));
    // ...and its score is the sum of every weight along that lineage
    assert_eq!(weight, 25 + 233 + 512 + 33 + 275 + 59 + 270 + 49 + 1);
    Ok(())
}
| 193 | + |
#[test]
fn test_rollup() -> Result<()> {
    use crate::taxonomy::test::MockTax;
    let tax = MockTax;
    // raw per-node hit counts, keyed by tax id
    let hits: HashMap<u32, u16> = vec![(1, 25), (2, 512), (1224, 33), (56812, 1), (765909, 41)]
        .into_iter()
        .collect();
    // rollup_weights already yields a Vec of pairs; sort for a stable compare
    let mut rolled_hits: Vec<(u32, u16)> = rollup_weights(&tax, &hits)?;
    rolled_hits.sort();
    // each node's rolled-up count is its own weight plus its descendants'
    assert_eq!(
        rolled_hits,
        vec![
            (1, 25 + 512 + 33 + 1 + 41),
            (2, 512 + 33 + 1 + 41),
            (22, 1),
            (1046, 41),
            (1224, 33 + 1 + 41),
            (1236, 1 + 41),
            (53452, 41),
            (56812, 1),
            (61598, 41),
            (62322, 1),
            (131567, 587),
            (135613, 41),
            (135622, 1),
            (765909, 41),
        ]
    );
    Ok(())
}
0 commit comments