Skip to content

Commit

Permalink
Merge pull request #14 from MaelLefeuvre/dev
Browse files Browse the repository at this point in the history
Bump to v0.3.1 - various bugfixes
  • Loading branch information
MaelLefeuvre authored Aug 8, 2023
2 parents 66b7e0c + 83cfaee commit 168bbfe
Show file tree
Hide file tree
Showing 12 changed files with 119 additions and 41 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
# version 0.3.0 (2023-03-19à
# version 0.3.1 (2023-07-19)
## Bugfixes
- Fixes issue #13 : `mask_sequence()` function now correctly handles out of index errors when attempting to retrieve positions that lie beyond the chromosome's length. A `ReferenceOutOfIndexError` is now raised to the `mask_5p()` and `mask_3p()` functions, while `apply_pmd_mask()` now recover from such an error if and only if the read happens to be labeled as 'segment unmapped' (`0x4`).
- Temporary workaround to issue #15 : The current patch directly uses `libc::free` to manually clean up memory between each iteration. A more elegant solution may come in the future, if issue rust-bio/rust-htslib#401 ever resolves..
- Use of [`atty`](https://docs.rs/atty/latest/atty/) ensures users are actually feeding the program with an input using piping, when they did not use `--bam` argument. This ensures a comprehensive error message gets printed, instead of letting the program endlessly hanging.
- Improved error handling: `pmd-mask` now outputs a comprehensive error message when users forget to provide with a valid misincorporation file path.

# version 0.3.0 (2023-03-19)
## Features:
Additional `-M`|`--metrics-file` argument allows to optionally specify an output summary file, where the software will keep a record of which positions were selected as masking thresholds. This file is headed, tab-separated, lexicographically ordered and structured according to four fields:
- `Chr`: Chromosome name (string)
Expand Down
12 changes: 7 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "pmd-mask"
version = "0.3.0"
version = "0.3.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -18,7 +18,7 @@ name = "apply_pmd_mask"
harness = false

[dependencies]
rust-htslib = { version = "0.41", features = ["bzip2", "lzma"] }
rust-htslib = { version = "0.44.1", features = ["bzip2", "lzma"]}
clap = { version = "4.1", features = ["derive", "wrap_help", "color"] }
serde = { version = "1.0", features = ["derive"] }
thiserror = "1.0"
Expand All @@ -28,6 +28,8 @@ harness = false
chrono = "0.4"
csv = "1.2"
num_cpus = "1.15"
libc = "0.2"
atty = "0.2.14"
[dev-dependencies]
serde_test = "1.0"
assert_cmd = "2.0.8"
Expand Down
10 changes: 9 additions & 1 deletion src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ pub enum RuntimeError {

#[error(transparent)]
ParseMisincorporation(#[from] crate::mask::MasksError),


#[error("Both the output and input alignment files appears to be the same file! Exiting.")]
InputIsOutput,
Expand All @@ -21,5 +20,14 @@ pub enum RuntimeError {
#[error("Failed to write masking thresholds within the provided metrics file path. [{0}]")]
WriteMasksMetrics(#[source] std::io::Error),

#[error("Length of the retrieved reference sequence does not match the length of the read")]
ReferenceOutOfIndexError,

#[error(
"Neither '--bam', nor the standard input is being sollicited at this point. \
Please provide pmd-mask with an input to work from, either through piping, or with the --bam argument"
)]
// #[error("Error")]
NoStdin,
}

70 changes: 51 additions & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,44 +10,46 @@ use error::RuntimeError;
use genome::Orientation;
pub use mask::{Masks, MaskEntry, MaskThreshold};

use anyhow::Result;
use anyhow::{Result, Context};
use rust_htslib::{faidx, bam};
use log::{debug, trace};
use log::{debug, trace, warn};


use rust_htslib::bam::ext::BamRecordExtensions;


#[inline]
fn mask_sequence(range: Range<usize>, reference: &[u8], seq: &mut [u8], quals: &mut [u8], target_nucleotide: u8, positions: &[[usize; 2]]) {
fn mask_sequence(range: Range<usize>, reference: &[u8], seq: &mut [u8], quals: &mut [u8], target_nucleotide: u8, positions: &[[usize; 2]]) -> Result<(), RuntimeError> {
'mask: for [readpos, refpos] in positions.iter().copied().skip(range.start).take_while(|[readpos, _]| *readpos < range.end ) {
if readpos >= seq.len() { break 'mask}
if reference[refpos] == target_nucleotide {
if readpos >= seq.len() { break 'mask }
let reference_nucleotide = reference.get(refpos).ok_or_else(|| RuntimeError::ReferenceOutOfIndexError)?;
if *reference_nucleotide == target_nucleotide {
seq[readpos] = b'N';
quals[readpos] = 0;
}
}
Ok(())
}

#[inline]
fn mask_5p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) {
fn mask_5p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) -> Result<(), RuntimeError> {
// Unwrap cause we have previously validated the struct. [Code smell]
let mask_5p_threshold = thresholds.get_threshold(&Orientation::FivePrime).unwrap().inner();
let mask_5p_range = 0..mask_5p_threshold -1;
mask_sequence(mask_5p_range, reference, seq, quals, b'C', positions);
mask_sequence(mask_5p_range, reference, seq, quals, b'C', positions)
}

#[inline]
fn mask_3p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) {
fn mask_3p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) -> Result<(), RuntimeError> {
// Unwrap cause we have previously validated the struct. [Code smell]
let mask_3p_threshold = thresholds.get_threshold(&Orientation::ThreePrime).unwrap().inner();
let mask_3p_range = seq.len().saturating_sub(mask_3p_threshold-1)..seq.len();
mask_sequence(mask_3p_range, reference, seq, quals, b'G', positions);
mask_sequence(mask_3p_range, reference, seq, quals, b'G', positions)
}


#[inline]
pub fn apply_pmd_mask<B>(bam: &mut B, reference: &faidx::Reader, masks: &Masks, writer: &mut bam::Writer) -> Result<(), RuntimeError>
#[inline(never)]
pub fn apply_pmd_mask<B>(bam: &mut B, reference: &faidx::Reader, masks: &Masks, writer: &mut bam::Writer) -> Result<()>
where B: bam::Read,
{
// ---- Get header template
Expand Down Expand Up @@ -77,9 +79,10 @@ where B: bam::Read,
};


// ---- Get the reference's position
let reference = reference.fetch_seq(current_record.chromosome.inner(), bam_record.reference_start() as usize, bam_record.reference_end() as usize -1 ) ?;

// ---- Get the reference's position
// Memory leak here...
let refseq = reference.fetch_seq(current_record.chromosome.inner(), bam_record.reference_start() as usize, bam_record.reference_end() as usize -1 ) ?;

let aligned_pos = bam_record.aligned_pairs().map(|[readpos, refpos]| [readpos as usize, (refpos - bam_record.pos()) as usize]).collect::<Vec<[usize; 2]>>();

trace!("-----------------------");
Expand All @@ -94,14 +97,31 @@ where B: bam::Read,
// @ TODO: use this?: static DECODE_BASE: &[u8] = b"=ACMGRSVTWYHKDBN";
let sequence = bam_record.seq().as_bytes();
// @ SAFETY: Sequence is only parsed for TRACE logging, displaying potentially invalid UTF8-character is the whole point..
trace!("Reference: {}", unsafe { str::from_utf8_unchecked(reference) });
trace!("Reference: {}", unsafe { str::from_utf8_unchecked(refseq) });
trace!("Sequence : {}", unsafe { str::from_utf8_unchecked(&sequence) });

const UNMAPPED_CONTEXT: &str = "(This is merely a warning because this record was already set as UnMapped (0x4)";
let err_msg = |e: &RuntimeError , end: Orientation | {
let position = bam_record.pos();
format!("While attempting to mask the {end} end of record [{current_record} {position}]: {e}")
};
// ---- Mask 5p' positions
mask_5p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos);
if let Err(e) = mask_5p(relevant_thresholds, refseq, &mut new_seq, &mut new_quals, &aligned_pos) {
let context = err_msg(&e, Orientation::FivePrime);
match bam_record.is_unmapped() {
true => warn!("@ {context} {UNMAPPED_CONTEXT}"),
false => return Err(e).with_context(|| format!("{current_record}"))
}
};

// ---- Mask 3p' positions
mask_3p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos);
if let Err(e) = mask_3p(relevant_thresholds, refseq, &mut new_seq, &mut new_quals, &aligned_pos) {
let context = err_msg(&e, Orientation::ThreePrime);
match bam_record.is_unmapped() {
true => warn!("{context} {UNMAPPED_CONTEXT}"),
false => return Err(e).context(context)
}
};

// SAFETY: samtools performs UTF8 sanity checks on the raw sequence. So we're ok.
trace!("Masked : {}", unsafe{ std::str::from_utf8_unchecked(&new_seq) });
Expand All @@ -110,6 +130,9 @@ where B: bam::Read,
bam_record.clone_into(&mut out_record);
out_record.set(bam_record.qname(), Some(&bam_record.cigar().take()), &new_seq, &new_quals);
writer.write(&out_record)?;

// Manually remove rust-htslib fetch_seq leak.
unsafe {libc::free(refseq.as_ptr() as *mut std::ffi::c_void)}
}
Ok(())
}
Expand Down Expand Up @@ -212,7 +235,7 @@ mod test {
// Mask 5p
let pair_indices = cigar2paired_indices(cigar);
println!("---- 5p masking with threshold set at {threshold_len}");
mask_5p(&threshold, reference, &mut seq, &mut quals, &pair_indices);
mask_5p(&threshold, reference, &mut seq, &mut quals, &pair_indices)?;
print_align!(reference, seq, quals);

// ---- Validate 5p masking.
Expand All @@ -236,7 +259,7 @@ mod test {

// Mask 3p
println!("---- 3p masking with threshold set at {threshold_len}");
mask_3p(&threshold, reference, &mut seq, &mut quals, &pair_indices);
mask_3p(&threshold, reference, &mut seq, &mut quals, &pair_indices)?;
print_align!(reference, seq, quals);

// ---- Validate 3p masking
Expand Down Expand Up @@ -370,6 +393,15 @@ mod test {
}
}
}

#[test]
fn mask_spurious_unmapped_sequence() {
let reference = "N";
let seq = "GGATCACAGGTCTATCACCCTATTAACCACTCACGG";
let quals = "AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ";
let cigar = vec![Cigar::Match(seq.len())];
println!("{:?}", mask_and_validate(10, reference, seq, quals, &cigar));
}
}


9 changes: 7 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,22 @@ where F: AsRef<Path>
}


fn run(args: &Cli) -> Result<(), RuntimeError> {
fn run(args: &Cli) -> Result<()> {

// ---- Ensure Input bam and output bam are not the same.
if let Some(ref input_file) = args.bam{
if let Some(ref output_file) = args.output {
if *input_file == *output_file {
return Err(RuntimeError::InputIsOutput)
anyhow::bail!(RuntimeError::InputIsOutput)
}
}
}

// ---- Ensure the stdin is being sollicited if there are no specified input bams.
if atty::is(atty::Stream::Stdin) && args.bam.is_none() {
anyhow::bail!(RuntimeError::NoStdin)
}



// ---- define threadpool if required:
Expand Down
2 changes: 1 addition & 1 deletion src/mask/entry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub mod dummy {
let mut header = Header::new();
let mut chr_record = HeaderRecord::new("SQ".as_bytes());
chr_record.push_tag("SN".as_bytes(), &"chr1".to_string());
chr_record.push_tag("LN".as_bytes(), &249250621);
chr_record.push_tag("LN".as_bytes(), 249250621);
header.push_record(&chr_record);

let mut record = Record::new();
Expand Down
11 changes: 10 additions & 1 deletion src/mask/error.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
use thiserror::Error;

use crate::misincorporation::MisincorporationsError;

use super::{entry::MaskEntry, threshold::MaskThresholdError};

#[derive(Debug, Error, PartialEq)]
#[derive(Debug, Error)]
pub enum MasksError {
#[error("Invalid MaskThreshold within MaskEntry {0}. Got {1}")]
ValidateThresholds(MaskEntry, #[source] MaskThresholdError),

#[error("Failed to open misincorporation file [{source}]")]
OpenFile{#[source] source: std::io::Error},

#[error("Failed to obtain Misincorporations from misincorporation file")]
GenerateMisincorporations(#[from] MisincorporationsError)

}
25 changes: 19 additions & 6 deletions src/mask/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::fs::File;
use std::io::Write;
use std::path::Path;


use log::{warn, debug, trace};

pub mod entry;
Expand Down Expand Up @@ -50,14 +51,14 @@ impl TryFrom<&Misincorporations> for Masks {
impl Masks {

pub fn from_path(misincorporations: impl AsRef<Path>, threshold: f32) -> Result<Self, MasksError> {
let file = File::open(&misincorporations).unwrap();
//.map_err(|e| MisincorporationsError::OpenFile(path.as_ref().display().to_string(), e))?;
let file = File::open(&misincorporations)
.map_err(|e| MasksError::OpenFile{source: e})?;
Self::from_reader(file, threshold)
}

pub fn from_reader<R: std::io::Read>(misincorporations: R, threshold: f32) -> Result<Self, MasksError> {

let mut threshold_positions = Misincorporations::from_reader(misincorporations, threshold).unwrap();
let mut threshold_positions = Misincorporations::from_reader(misincorporations, threshold)?;

// ---- Validate misincorporation and issue warnings for any 'abnormal' frequency found.
let mut abnormal_frequencies = threshold_positions
Expand All @@ -76,7 +77,7 @@ impl Masks {
})
);

Ok(Masks::try_from(&threshold_positions).unwrap())
Masks::try_from(&threshold_positions)
}

pub fn get(&self, entry: &MaskEntry) -> Option<&MaskThreshold> {
Expand Down Expand Up @@ -173,13 +174,25 @@ mod test {
#[test]
fn validate() {
let mut masks = Masks::try_from(&dummy_misincorporations()).expect("Invalid Misincorporations");
assert_eq!(masks.validate(), Ok(()));
match masks.validate() {
Ok(()) => {},
_ => panic!("Invalid mask")
};

let bad_entry = MaskEntry{chromosome: ChrName::new("chr1"), strand: Reverse};
let funky_threshold = masks.inner.get_mut(&bad_entry).expect("Entry not found");
*funky_threshold = MaskThreshold{inner: HashMap::new()};

assert_eq!(masks.validate(), Err(MasksError::ValidateThresholds(bad_entry, MaskThresholdError::ValidateThresh{got:0, want:2})))
match masks.validate() {
Ok(()) => panic!("Masks should be invalid at this point."),
Err(e) => match e {
MasksError::ValidateThresholds(entry, treshold_err) => {
assert_eq!(bad_entry, entry);
assert_eq!(treshold_err, MaskThresholdError::ValidateThresh { got: 0, want: 2 })
},
_ => panic!("Invalid error type.")
}
}
}

}
2 changes: 1 addition & 1 deletion src/misincorporation/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use thiserror::Error;
#[derive(Debug, Error)]
pub enum MisincorporationsError {
#[error("@line {0}: Failed to deserialize record in misincorporation file. Got {1}")]
DeserializeRecord(usize, Box<dyn std::error::Error>),
DeserializeRecord(usize, String),

#[error("Failed to open {0}. Got {1}")]
OpenFile(String, #[source] std::io::Error)
Expand Down
2 changes: 1 addition & 1 deletion src/misincorporation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ impl Misincorporations {
let mut threshold_positions = Vec::with_capacity(32 * 2 * 2);

'nextline: for (line, result) in reader.deserialize::<MisincorporationRecord>().enumerate() {
let record = result.map_err(|e|DeserializeRecord(line, e.into()))?;
let record = result.map_err(|e|DeserializeRecord(line, e.to_string()))?;
// Once we've found a position at which the threshold is met, we
// don't need to parse entries which belong from the same chromosome.
if let Some(chromosome_to_skip) = skip_chromosome {
Expand Down
2 changes: 1 addition & 1 deletion src/misincorporation/record/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{fmt::{self, Display, Formatter}};
use std::fmt::{self, Display, Formatter};
use crate::genome::{ChrName, Orientation, Strand, Position};

use serde::Deserialize;
Expand Down

0 comments on commit 168bbfe

Please sign in to comment.