Skip to content

Commit

Permalink
Fix #13 - pmd-mask now handles unmapped reads positionned beyond the …
Browse files Browse the repository at this point in the history
…chromosome's length
  • Loading branch information
MaelLefeuvre committed Jul 19, 2023
1 parent f7b97c9 commit 7bcd765
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 15 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# version 0.3.0 (2023-03-19à
# version 0.3.1 (2023-07-19)
## Bugfixes

- Fixes (#13) `mask_sequence()` function now correctly handles out of index errors when attempting to retrieve positions that lie beyond the chromosome's length. A `ReferenceOutOfIndexError` is now raised to the `mask_5p()` and `mask_3p()` functions, while `apply_pmd_mask()` now recover from such an error if and only if the read happens to be labeled as 'segment unmapped' (`0x4`).
# version 0.3.0 (2023-03-19)
## Features:
Additional `-M`|`--metrics-file` argument allows to optionally specify an output summary file, where the software will keep a record of which positions were selected as masking thresholds. This file is headed, tab-separated, lexicographically ordered and structured according to four fields:
- `Chr`: Chromosome name (string)
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "pmd-mask"
version = "0.3.0"
version = "0.3.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
2 changes: 2 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,7 @@ pub enum RuntimeError {
#[error("Failed to write masking thresholds within the provided metrics file path. [{0}]")]
WriteMasksMetrics(#[source] std::io::Error),

#[error("Length of the retrieved reference sequence does not match the length of the read")]
ReferenceOutOfIndexError,
}

42 changes: 30 additions & 12 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,37 +12,39 @@ pub use mask::{Masks, MaskEntry, MaskThreshold};

use anyhow::Result;
use rust_htslib::{faidx, bam};
use log::{debug, trace};
use log::{debug, trace, warn};


use rust_htslib::bam::ext::BamRecordExtensions;


#[inline]
fn mask_sequence(range: Range<usize>, reference: &[u8], seq: &mut [u8], quals: &mut [u8], target_nucleotide: u8, positions: &[[usize; 2]]) {
fn mask_sequence(range: Range<usize>, reference: &[u8], seq: &mut [u8], quals: &mut [u8], target_nucleotide: u8, positions: &[[usize; 2]]) -> Result<(), RuntimeError> {
'mask: for [readpos, refpos] in positions.iter().copied().skip(range.start).take_while(|[readpos, _]| *readpos < range.end ) {
if readpos >= seq.len() { break 'mask}
if reference[refpos] == target_nucleotide {
if readpos >= seq.len() { break 'mask }
let reference_nucleotide = reference.get(refpos).ok_or_else(|| RuntimeError::ReferenceOutOfIndexError)?;
if *reference_nucleotide == target_nucleotide {
seq[readpos] = b'N';
quals[readpos] = 0;
}
}
Ok(())
}

#[inline]
fn mask_5p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) {
fn mask_5p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) -> Result<(), RuntimeError> {
// Unwrap cause we have previously validated the struct. [Code smell]
let mask_5p_threshold = thresholds.get_threshold(&Orientation::FivePrime).unwrap().inner();
let mask_5p_range = 0..mask_5p_threshold -1;
mask_sequence(mask_5p_range, reference, seq, quals, b'C', positions);
mask_sequence(mask_5p_range, reference, seq, quals, b'C', positions)
}

#[inline]
fn mask_3p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) {
fn mask_3p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) -> Result<(), RuntimeError> {
// Unwrap cause we have previously validated the struct. [Code smell]
let mask_3p_threshold = thresholds.get_threshold(&Orientation::ThreePrime).unwrap().inner();
let mask_3p_range = seq.len().saturating_sub(mask_3p_threshold-1)..seq.len();
mask_sequence(mask_3p_range, reference, seq, quals, b'G', positions);
mask_sequence(mask_3p_range, reference, seq, quals, b'G', positions)
}


Expand Down Expand Up @@ -98,10 +100,17 @@ where B: bam::Read,
trace!("Sequence : {}", unsafe { str::from_utf8_unchecked(&sequence) });

// ---- Mask 5p' positions
mask_5p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos);
if let Err(e) = mask_5p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos) {
match bam_record.is_unmapped() {
true => warn!("{e}"),
false => return Err(e)
}
};

// ---- Mask 3p' positions
mask_3p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos);
if let Err(e) = mask_3p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos) {
if ! bam_record.is_unmapped() { return Err(e) } else { warn!("{e}")}
};

// SAFETY: samtools performs UTF8 sanity checks on the raw sequence. So we're ok.
trace!("Masked : {}", unsafe{ std::str::from_utf8_unchecked(&new_seq) });
Expand Down Expand Up @@ -212,7 +221,7 @@ mod test {
// Mask 5p
let pair_indices = cigar2paired_indices(cigar);
println!("---- 5p masking with threshold set at {threshold_len}");
mask_5p(&threshold, reference, &mut seq, &mut quals, &pair_indices);
mask_5p(&threshold, reference, &mut seq, &mut quals, &pair_indices)?;
print_align!(reference, seq, quals);

// ---- Validate 5p masking.
Expand All @@ -236,7 +245,7 @@ mod test {

// Mask 3p
println!("---- 3p masking with threshold set at {threshold_len}");
mask_3p(&threshold, reference, &mut seq, &mut quals, &pair_indices);
mask_3p(&threshold, reference, &mut seq, &mut quals, &pair_indices)?;
print_align!(reference, seq, quals);

// ---- Validate 3p masking
Expand Down Expand Up @@ -370,6 +379,15 @@ mod test {
}
}
}

#[test]
fn mask_spurious_unmapped_sequence() {
let reference = "N";
let seq = "GGATCACAGGTCTATCACCCTATTAACCACTCACGG";
let quals = "AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ";
let cigar = vec![Cigar::Match(seq.len())];
println!("{:?}", mask_and_validate(10, reference, seq, quals, &cigar));
}
}


0 comments on commit 7bcd765

Please sign in to comment.