Skip to content

Commit 75ecdb9

Browse files
author
Roderick Bovee
committed
Update distance code & add tests
1 parent b237452 commit 75ecdb9

File tree

4 files changed

+234
-131
lines changed

4 files changed

+234
-131
lines changed

src/distances.rs

Lines changed: 0 additions & 123 deletions
This file was deleted.

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ pub enum TaxonomyError {
3232
}
3333

3434
mod base;
35-
pub mod distances;
3635
pub mod edit;
3736
pub mod formats;
3837
#[cfg(any(feature = "python", feature = "python_test"))]
3938
pub mod python;
4039
mod rank;
4140
mod taxonomy;
41+
pub mod weights;
4242

4343
pub use crate::base::GeneralTaxonomy;
4444
pub use crate::rank::TaxRank;

src/taxonomy.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
//! The Taxonomy trait defines a large suite of methods that can be used
2-
//! by implementing a limited subset of required methods.
1+
//! The Taxonomy trait defines a large suite of methods that can be used by
2+
//! implementing a limited subset of required methods.
33
use std::collections::VecDeque;
44
use std::fmt::{Debug, Display};
55
use std::iter::Sum;
@@ -41,8 +41,9 @@ where
4141
/// have access to ranks in a standardized fashion.
4242
fn rank(&self, tax_id: T) -> Result<Option<TaxRank>>;
4343

44-
/// Returns a [Vec] of taxonomy nodes from the one provided
45-
/// back to root.
44+
/// Returns a [Vec] of taxonomy nodes from the one provided back to root.
45+
/// This method must return the node itself as the first entry in the list
46+
/// and the root node as the last entry in the list.
4647
fn lineage(&'t self, tax_id: T) -> Result<Vec<T>> {
4748
// make a vec of parents of id1
4849
let mut parents = Vec::new();
@@ -135,9 +136,8 @@ where
135136
self.traverse(self.root()).unwrap().count() / 2
136137
}
137138

138-
/// Convenience function for determining if there are any nodes at all
139-
/// in the taxonomy. This should almost always be implemented for performance
140-
/// reasons.
139+
/// Determines if there are any nodes at all in the taxonomy. This should
140+
/// almost always be implemented for performance reasons.
141141
fn is_empty(&'t self) -> bool
142142
where
143143
Self: Sized,

src/weights.rs

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
//! Code for working with distance relationships in taxonomic trees
2+
//! and tracking weights assigned to different nodes in the tree.
3+
use std::collections::{BinaryHeap, HashMap, HashSet};
4+
use std::fmt::{Debug, Display};
5+
use std::hash::{BuildHasher, Hash};
6+
use std::iter::Sum;
7+
8+
use crate::taxonomy::Taxonomy;
9+
use crate::Result;
10+
11+
/// Calculates the summed weight of every path to root for a set of node
12+
/// weights given a corresponding taxonomy.
13+
///
14+
/// This is like `maximum_weighted_path`, but rather than just returning the
15+
/// _best_ path, it returns all of the paths.
16+
pub fn all_weighted_paths<'t, D: 't, T: 't, W: 't, S: BuildHasher>(
17+
taxonomy: &'t impl Taxonomy<'t, T, D>,
18+
weights: &HashMap<T, W, S>,
19+
) -> Result<Vec<(T, W)>>
20+
where
21+
D: Debug + PartialOrd + Sum,
22+
T: Clone + Debug + Display + Hash + PartialEq + Ord,
23+
W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum,
24+
{
25+
// TODO: should only return leaf nodes
26+
let mut taxs_by_score: BinaryHeap<(W, T)> = BinaryHeap::new();
27+
let mut stem_ids: HashSet<T> = HashSet::new();
28+
for tax_id in weights.keys() {
29+
if stem_ids.contains(&tax_id) {
30+
continue;
31+
}
32+
let score = taxonomy
33+
.lineage((*tax_id).clone())?
34+
.iter()
35+
.filter_map(|t| {
36+
if t != tax_id {
37+
stem_ids.insert((*t).clone());
38+
}
39+
weights.get(t).cloned()
40+
})
41+
.sum();
42+
43+
taxs_by_score.push((score, tax_id.clone()));
44+
}
45+
// nodes with higher scores should be first so we need to reverse
46+
let mut sorted_taxs = taxs_by_score.into_sorted_vec();
47+
sorted_taxs.reverse();
48+
// switch the weight/tax_id order and remove remaining stems
49+
let taxs = sorted_taxs
50+
.into_iter()
51+
.filter_map(|(weight, tax_id)| {
52+
if stem_ids.contains(&tax_id) {
53+
None
54+
} else {
55+
Some((tax_id, weight))
56+
}
57+
})
58+
.collect();
59+
Ok(taxs)
60+
}
61+
62+
/// Find the lineage that has the greatest summed weight from all of the
63+
/// weights and a corresponding taxonomy.
64+
///
65+
/// Note that this implementation doesn't use the "distances" in the taxonomy
66+
/// itself, but only uses the weights provided. This can greatly speed up the
67+
/// calculation because it reduces the number of possible paths that need to
68+
/// be checked.
69+
pub fn maximum_weighted_path<'t, D: 't, T: 't, W: 't, S: BuildHasher>(
70+
taxonomy: &'t impl Taxonomy<'t, T, D>,
71+
weights: &HashMap<T, W, S>,
72+
take_first_in_tie: bool,
73+
) -> Result<(Option<T>, W)>
74+
where
75+
D: Debug + PartialOrd + Sum,
76+
T: Clone + Debug + Display + Hash + PartialEq + Ord,
77+
W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum,
78+
{
79+
let mut max_taxes: Vec<T> = Vec::new();
80+
// this is gross, but there's no "Zero" trait we can define to init this
81+
let mut max_score: W = Vec::new().into_iter().sum();
82+
for tax_id in weights.keys() {
83+
let mut tax_node = tax_id.clone();
84+
let mut scores = Vec::new();
85+
loop {
86+
if let Some(score) = weights.get(&tax_node) {
87+
scores.push((*score).clone());
88+
}
89+
match taxonomy.parent(tax_node)? {
90+
Some(p) => tax_node = p.0,
91+
None => break,
92+
}
93+
}
94+
95+
let score: W = scores.into_iter().sum();
96+
if score > max_score {
97+
max_score = score.clone();
98+
max_taxes.clear();
99+
}
100+
101+
if score >= max_score {
102+
max_taxes.push(tax_id.clone());
103+
}
104+
}
105+
106+
if take_first_in_tie {
107+
return Ok((max_taxes.into_iter().next(), max_score));
108+
}
109+
110+
let first_child = max_taxes.pop();
111+
let ancestor =
112+
max_taxes
113+
.into_iter()
114+
.try_fold(first_child, |ancestor, child| match ancestor {
115+
None => Ok(None),
116+
Some(a) => taxonomy.lca(a, child).map(Some),
117+
})?;
118+
Ok((ancestor, max_score))
119+
// is max_score going to be a little low here because we're not counting
120+
// all the leaf nodes?
121+
}
122+
123+
/// Coverts a set of weights into the set of weights including their children
124+
/// using the corresponding taxonomy.
125+
///
126+
/// For example, for a taxonomic classification of a set of sequencing reads
127+
/// where a classification may be non-specific to a leaf node, this would
128+
/// turn the raw read counts at each taxonomic node into the set of read
129+
/// counts including all of the children of that node (which is probably more
130+
/// useful for most use cases).
131+
pub fn rollup_weights<'t, D: 't, T: 't, W: 't, S: BuildHasher>(
132+
taxonomy: &'t impl Taxonomy<'t, T, D>,
133+
weights: &HashMap<T, W, S>,
134+
) -> Result<Vec<(T, W)>>
135+
where
136+
D: Debug + PartialOrd + Sum,
137+
T: Clone + Debug + Display + Hash + PartialEq + Ord,
138+
W: Clone + Debug + Ord + PartialOrd + PartialEq + Sum,
139+
{
140+
let mut all_weights: HashMap<T, Vec<W>> = HashMap::new();
141+
for (leaf_id, weight) in weights {
142+
for tax_id in taxonomy.lineage(leaf_id.clone())? {
143+
let tax_weights = all_weights.entry(tax_id).or_insert_with(Vec::new);
144+
tax_weights.push(weight.clone());
145+
}
146+
}
147+
Ok(all_weights
148+
.into_iter()
149+
.map(|(tax_id, all_weights)| (tax_id, all_weights.into_iter().sum()))
150+
.collect())
151+
}
152+
153+
#[test]
154+
fn test_all_weighted_path() -> Result<()> {
155+
use crate::taxonomy::test::MockTax;
156+
let tax = MockTax;
157+
let mut hits: HashMap<u32, u16> = HashMap::new();
158+
hits.insert(765909, 41);
159+
hits.insert(1, 25);
160+
hits.insert(131567, 233);
161+
hits.insert(2, 512);
162+
hits.insert(1224, 33);
163+
hits.insert(1236, 275);
164+
hits.insert(135622, 59);
165+
hits.insert(22, 270);
166+
hits.insert(62322, 49);
167+
hits.insert(56812, 1);
168+
let weights = all_weighted_paths(&tax, &hits)?;
169+
assert_eq!(weights, vec![(56812, 1457), (765909, 1119)]);
170+
Ok(())
171+
}
172+
173+
#[test]
174+
fn test_maximum_weighted_path() -> Result<()> {
175+
use crate::taxonomy::test::MockTax;
176+
let tax = MockTax;
177+
let mut hits: HashMap<u32, u16> = HashMap::new();
178+
hits.insert(765909, 41);
179+
hits.insert(1, 25);
180+
hits.insert(131567, 233);
181+
hits.insert(2, 512);
182+
hits.insert(1224, 33);
183+
hits.insert(1236, 275);
184+
hits.insert(135622, 59);
185+
hits.insert(22, 270);
186+
hits.insert(62322, 49);
187+
hits.insert(56812, 1);
188+
let (node, weight) = maximum_weighted_path(&tax, &hits, false)?;
189+
assert_eq!(node, Some(56812));
190+
assert_eq!(weight, 25 + 233 + 512 + 33 + 275 + 59 + 270 + 49 + 1);
191+
Ok(())
192+
}
193+
194+
#[test]
195+
fn test_rollup() -> Result<()> {
196+
use crate::taxonomy::test::MockTax;
197+
let tax = MockTax;
198+
let mut hits: HashMap<u32, u16> = HashMap::new();
199+
hits.insert(1, 25);
200+
hits.insert(2, 512);
201+
hits.insert(1224, 33);
202+
hits.insert(56812, 1);
203+
hits.insert(765909, 41);
204+
let mut rolled_hits: Vec<(u32, u16)> = rollup_weights(&tax, &hits)?.into_iter().collect();
205+
rolled_hits.sort();
206+
assert_eq!(
207+
rolled_hits,
208+
vec![
209+
(1, 25 + 512 + 33 + 1 + 41),
210+
(2, 512 + 33 + 1 + 41),
211+
(22, 1),
212+
(1046, 41),
213+
(1224, 33 + 1 + 41),
214+
(1236, 1 + 41),
215+
(53452, 41),
216+
(56812, 1),
217+
(61598, 41),
218+
(62322, 1),
219+
(131567, 587),
220+
(135613, 41),
221+
(135622, 1),
222+
(765909, 41),
223+
]
224+
);
225+
Ok(())
226+
}

0 commit comments

Comments
 (0)