|
//! Code for working with distance relationships in taxonomic trees
//! and tracking weights assigned to different nodes in the tree.
| 3 | +use std::collections::{BinaryHeap, HashMap, HashSet}; |
| 4 | +use std::fmt::{Debug, Display}; |
| 5 | +use std::hash::{BuildHasher, Hash}; |
| 6 | +use std::iter::Sum; |
| 7 | + |
| 8 | +use crate::taxonomy::Taxonomy; |
| 9 | +use crate::Result; |
| 10 | + |
| 11 | +/// Calculates the summed weight of every path to root for a set of node |
| 12 | +/// weights given a corresponding taxonomy. |
| 13 | +/// |
| 14 | +/// This is like `maximum_weighted_path`, but rather than just returning the |
| 15 | +/// _best_ path, it returns all of the paths. |
| 16 | +pub fn all_weighted_paths<'t, D: 't, T: 't, W: 't, S: BuildHasher>( |
| 17 | + taxonomy: &'t impl Taxonomy<'t, T, D>, |
| 18 | + weights: &HashMap<T, W, S>, |
| 19 | +) -> Result<Vec<(T, W)>> |
| 20 | +where |
| 21 | + D: Debug + PartialOrd + Sum, |
| 22 | + T: Clone + Debug + Display + Hash + PartialEq + Ord, |
| 23 | + W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum, |
| 24 | +{ |
| 25 | + // TODO: should only return leaf nodes |
| 26 | + let mut taxs_by_score: BinaryHeap<(W, T)> = BinaryHeap::new(); |
| 27 | + let mut stem_ids: HashSet<T> = HashSet::new(); |
| 28 | + for tax_id in weights.keys() { |
| 29 | + if stem_ids.contains(&tax_id) { |
| 30 | + continue; |
| 31 | + } |
| 32 | + let score = taxonomy |
| 33 | + .lineage((*tax_id).clone())? |
| 34 | + .iter() |
| 35 | + .filter_map(|t| { |
| 36 | + if t != tax_id { |
| 37 | + stem_ids.insert((*t).clone()); |
| 38 | + } |
| 39 | + weights.get(t).cloned() |
| 40 | + }) |
| 41 | + .sum(); |
| 42 | + |
| 43 | + taxs_by_score.push((score, tax_id.clone())); |
| 44 | + } |
| 45 | + // nodes with higher scores should be first so we need to reverse |
| 46 | + let mut sorted_taxs = taxs_by_score.into_sorted_vec(); |
| 47 | + sorted_taxs.reverse(); |
| 48 | + // switch the weight/tax_id order and remove remaining stems |
| 49 | + let taxs = sorted_taxs |
| 50 | + .into_iter() |
| 51 | + .filter_map(|(weight, tax_id)| { |
| 52 | + if stem_ids.contains(&tax_id) { |
| 53 | + None |
| 54 | + } else { |
| 55 | + Some((tax_id, weight)) |
| 56 | + } |
| 57 | + }) |
| 58 | + .collect(); |
| 59 | + Ok(taxs) |
| 60 | +} |
| 61 | + |
| 62 | +/// Find the lineage that has the greatest summed weight from all of the |
| 63 | +/// weights and a corresponding taxonomy. |
| 64 | +/// |
| 65 | +/// Note that this implementation doesn't use the "distances" in the taxonomy |
| 66 | +/// itself, but only uses the weights provided. This can greatly speed up the |
| 67 | +/// calculation because it reduces the number of possible paths that need to |
| 68 | +/// be checked. |
| 69 | +pub fn maximum_weighted_path<'t, D: 't, T: 't, W: 't, S: BuildHasher>( |
| 70 | + taxonomy: &'t impl Taxonomy<'t, T, D>, |
| 71 | + weights: &HashMap<T, W, S>, |
| 72 | + take_first_in_tie: bool, |
| 73 | +) -> Result<(Option<T>, W)> |
| 74 | +where |
| 75 | + D: Debug + PartialOrd + Sum, |
| 76 | + T: Clone + Debug + Display + Hash + PartialEq + Ord, |
| 77 | + W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum, |
| 78 | +{ |
| 79 | + let mut max_taxes: Vec<T> = Vec::new(); |
| 80 | + // this is gross, but there's no "Zero" trait we can define to init this |
| 81 | + let mut max_score: W = Vec::new().into_iter().sum(); |
| 82 | + for tax_id in weights.keys() { |
| 83 | + let mut tax_node = tax_id.clone(); |
| 84 | + let mut scores = Vec::new(); |
| 85 | + loop { |
| 86 | + if let Some(score) = weights.get(&tax_node) { |
| 87 | + scores.push((*score).clone()); |
| 88 | + } |
| 89 | + match taxonomy.parent(tax_node)? { |
| 90 | + Some(p) => tax_node = p.0, |
| 91 | + None => break, |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + let score: W = scores.into_iter().sum(); |
| 96 | + if score > max_score { |
| 97 | + max_score = score.clone(); |
| 98 | + max_taxes.clear(); |
| 99 | + } |
| 100 | + |
| 101 | + if score >= max_score { |
| 102 | + max_taxes.push(tax_id.clone()); |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + if take_first_in_tie { |
| 107 | + return Ok((max_taxes.into_iter().next(), max_score)); |
| 108 | + } |
| 109 | + |
| 110 | + let first_child = max_taxes.pop(); |
| 111 | + let ancestor = |
| 112 | + max_taxes |
| 113 | + .into_iter() |
| 114 | + .try_fold(first_child, |ancestor, child| match ancestor { |
| 115 | + None => Ok(None), |
| 116 | + Some(a) => taxonomy.lca(a, child).map(Some), |
| 117 | + })?; |
| 118 | + Ok((ancestor, max_score)) |
| 119 | + // is max_score going to be a little low here because we're not counting |
| 120 | + // all the leaf nodes? |
| 121 | +} |
| 122 | + |
| 123 | +/// Coverts a set of weights into the set of weights including their children |
| 124 | +/// using the corresponding taxonomy. |
| 125 | +/// |
| 126 | +/// For example, for a taxonomic classification of a set of sequencing reads |
| 127 | +/// where a classification may be non-specific to a leaf node, this would |
| 128 | +/// turn the raw read counts at each taxonomic node into the set of read |
| 129 | +/// counts including all of the children of that node (which is probably more |
| 130 | +/// useful for most use cases). |
| 131 | +pub fn rollup_weights<'t, D: 't, T: 't, W: 't, S: BuildHasher>( |
| 132 | + taxonomy: &'t impl Taxonomy<'t, T, D>, |
| 133 | + weights: &HashMap<T, W, S>, |
| 134 | +) -> Result<Vec<(T, W)>> |
| 135 | +where |
| 136 | + D: Debug + PartialOrd + Sum, |
| 137 | + T: Clone + Debug + Display + Hash + PartialEq + Ord, |
| 138 | + W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum, |
| 139 | +{ |
| 140 | + let mut all_weights: HashMap<T, Vec<W>> = HashMap::new(); |
| 141 | + for (leaf_id, weight) in weights { |
| 142 | + for tax_id in taxonomy.lineage(leaf_id.clone())? { |
| 143 | + let tax_weights = all_weights.entry(tax_id).or_insert_with(Vec::new); |
| 144 | + tax_weights.push(weight.clone()); |
| 145 | + } |
| 146 | + } |
| 147 | + Ok(all_weights |
| 148 | + .into_iter() |
| 149 | + .map(|(tax_id, all_weights)| (tax_id, all_weights.into_iter().sum())) |
| 150 | + .collect()) |
| 151 | +} |
| 152 | + |
#[test]
fn test_all_weighted_path() -> Result<()> {
    use crate::taxonomy::test::MockTax;
    let tax = MockTax;
    // raw per-node hit counts, keyed by tax id
    let hits: HashMap<u32, u16> = vec![
        (765909, 41),
        (1, 25),
        (131567, 233),
        (2, 512),
        (1224, 33),
        (1236, 275),
        (135622, 59),
        (22, 270),
        (62322, 49),
        (56812, 1),
    ]
    .into_iter()
    .collect();
    let weights = all_weighted_paths(&tax, &hits)?;
    // only the two path-terminating nodes survive, heaviest path first
    assert_eq!(weights, vec![(56812, 1457), (765909, 1119)]);
    Ok(())
}
| 172 | + |
#[test]
fn test_maximum_weighted_path() -> Result<()> {
    use crate::taxonomy::test::MockTax;
    let tax = MockTax;
    // raw per-node hit counts, keyed by tax id
    let hits: HashMap<u32, u16> = vec![
        (765909, 41),
        (1, 25),
        (131567, 233),
        (2, 512),
        (1224, 33),
        (1236, 275),
        (135622, 59),
        (22, 270),
        (62322, 49),
        (56812, 1),
    ]
    .into_iter()
    .collect();
    let (node, weight) = maximum_weighted_path(&tax, &hits, false)?;
    // the winning lineage terminates at 56812...
    assert_eq!(node, Some(56812));
    // ...and its score is the sum of every weight along that lineage
    assert_eq!(weight, 25 + 233 + 512 + 33 + 275 + 59 + 270 + 49 + 1);
    Ok(())
}
| 193 | + |
#[test]
fn test_rollup() -> Result<()> {
    use crate::taxonomy::test::MockTax;
    let tax = MockTax;
    // raw per-node hit counts, keyed by tax id
    let hits: HashMap<u32, u16> = vec![(1, 25), (2, 512), (1224, 33), (56812, 1), (765909, 41)]
        .into_iter()
        .collect();
    // rollup_weights already yields a Vec of pairs; sort for a stable compare
    let mut rolled_hits: Vec<(u32, u16)> = rollup_weights(&tax, &hits)?;
    rolled_hits.sort();
    // each node's rolled-up count is its own weight plus its descendants'
    assert_eq!(
        rolled_hits,
        vec![
            (1, 25 + 512 + 33 + 1 + 41),
            (2, 512 + 33 + 1 + 41),
            (22, 1),
            (1046, 41),
            (1224, 33 + 1 + 41),
            (1236, 1 + 41),
            (53452, 41),
            (56812, 1),
            (61598, 41),
            (62322, 1),
            (131567, 587),
            (135613, 41),
            (135622, 1),
            (765909, 41),
        ]
    );
    Ok(())
}
0 commit comments