Skip to content

Commit

Permalink
Log and propagate orientation filter reasoning to output
Browse files Browse the repository at this point in the history
  • Loading branch information
hextraza committed Feb 16, 2024
1 parent bcc65c6 commit 959c8b6
Show file tree
Hide file tree
Showing 10 changed files with 175 additions and 392 deletions.
216 changes: 85 additions & 131 deletions src/align.rs

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions src/parse/bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ pub const BAM_FIELDS_TO_REPORT: [&'static str; 37] = [
"MAPQ",
"POS",
"MATE_POS",
//"SEQ",
"SEQ", // This gets filtered out before getting written to disk in the bam processingm it's just used for associating reads to filter info
"SEQ_LEN",
"INSERT_SIZE",
"CIGAR",
//"CIGAR",
"QUALITY_FAILED",
"SECONDARY",
"DUPLICATE",
Expand Down Expand Up @@ -216,10 +216,7 @@ impl UMIReader {
"MAPQ" => record.mapq().to_string(),
"POS" => record.pos().to_string(),
"MATE_POS" => record.mpos().to_string(),
"SEQ" => String::from_utf8(record.seq().as_bytes()).unwrap_or_else(|e| {
eprintln!("Error: {}", e);
String::new()
}),
"SEQ" => seq.to_string(),
"SEQ_LEN" => record.seq_len().to_string(),
"INSERT_SIZE" => record.insert_size().to_string(),
"CIGAR" => record.cigar().to_string(),
Expand Down
86 changes: 73 additions & 13 deletions src/process/bam.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::align::{AlignDebugInfo, AlignFilterConfig, PseudoAligner};
use crate::align::{AlignDebugInfo, AlignFilterConfig, PseudoAligner, FilterReason, AlignmentDirection};
use crate::parse::bam::BAM_FIELDS_TO_REPORT;
use crate::parse::bam::UMIReader;
use crate::reference_library::ReferenceMetadata;
Expand All @@ -8,6 +8,7 @@ use debruijn::dna_string::DnaString;
use flate2::write::GzEncoder;
use flate2::Compression;

use std::collections::HashMap;
use std::sync::mpsc::{self};
use std::sync::{Arc, Mutex};
use std::thread;
Expand Down Expand Up @@ -37,8 +38,9 @@ pub fn process(
output_paths: Vec<String>,
num_cores: usize,
) {
// associated with r2, r1, r2 forward, r1 forward, r2 reverse, r1 reverse, both
let (log_sender, log_receiver) =
mpsc::channel::<((Vec<String>, (i32, Vec<String>, Vec<String>)), usize)>();
mpsc::channel::<((Vec<String>, (i32, Vec<String>, Vec<String>, FilterReason, FilterReason, FilterReason, FilterReason, FilterReason, AlignmentDirection)), usize)>();

let log_thread = thread::spawn(move || {
println!("Spawning logging thread.");
Expand All @@ -64,21 +66,28 @@ pub fn process(
if first_write[index] {
writeln!(
file_handle,
"nimble_features\tnimble_score\t{}\t{}",
"nimble_features\tnimble_score\t{}\t{}\t{}",
bam_data_header("r1"),
bam_data_header("r2")
bam_data_header("r2"),
"r1_filter_forward\tr1_filter_reverse\tr2_filter_forward\tr2_filter_reverse\t",
)
.unwrap();
first_write[index] = false;
}

writeln!(
file_handle,
"{}\t{}\t{}\t{}",
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
msg.0.join(","),
msg.1 .0,
bam_data_values(&msg.1 .2), // r1
bam_data_values(&msg.1 .1) // r2
bam_data_values(&msg.1 .1), // r2. these are reversed from the order they take in the data structures, hence the weird numbering
&msg.1.4,
&msg.1.6,
&msg.1.3,
&msg.1.5,
//&msg.1.7,
//&msg.1.8,
)
.unwrap();
}
Expand Down Expand Up @@ -193,6 +202,7 @@ fn get_score<'a>(
) -> (
Vec<(Vec<String>, (i32, Vec<String>, Vec<String>))>,
Vec<(Vec<String>, String, f64, usize, String)>,
HashMap<String, (FilterReason, FilterReason, FilterReason, FilterReason, FilterReason, AlignmentDirection)>
) {
let sequences: Box<dyn Iterator<Item = Result<DnaString, Error>> + 'a> = Box::new(
current_umi_group
Expand Down Expand Up @@ -249,11 +259,11 @@ fn align_umi_to_libraries(
reference_metadata: &Vec<ReferenceMetadata>,
align_configs: &Vec<AlignFilterConfig>,
_thread_num: usize,
) -> Vec<Vec<(Vec<String>, (i32, Vec<String>, Vec<String>))>> {
) -> Vec<Vec<(Vec<String>, (i32, Vec<String>, Vec<String>, FilterReason, FilterReason, FilterReason, FilterReason, FilterReason, AlignmentDirection))>> {
let mut results = vec![];

for (i, reference_index) in reference_indices.iter().enumerate() {
let (mut s, _) = get_score(
let (mut s, _, filter_reasons) = get_score(
&umi,
&current_metadata_group,
reference_index,
Expand All @@ -263,11 +273,7 @@ fn align_umi_to_libraries(
&current_metadata_group
.clone()
.into_iter()
.map(|v| match v[3].as_str() {
"true" => true,
"false" => false,
_ => panic!("Could not parse revcomp field \"{}\" as boolean", v[3])
})
.map(|v| parse_str_as_bool(&v[3]))
.collect::<Vec<bool>>(),
);

Expand Down Expand Up @@ -296,6 +302,52 @@ fn align_umi_to_libraries(
}

s.extend(non_matching_reads);

let mut transformed_scores = Vec::new();
for score in s.into_iter() {
let r1_key_part = check_reverse_comp((&DnaString::from_dna_string(&score.1.1[15]), &parse_str_as_bool(&score.1.1[3]))).to_string();
let r2_key_part = check_reverse_comp((&DnaString::from_dna_string(&score.1.2[15]), &parse_str_as_bool(&score.1.2[3]))).to_string();
let filter_result = filter_reasons.get(&(r1_key_part.clone() + &r2_key_part));

let new_score = match filter_result {
Some(v) => {
(
score.0,
(
score.1.0,
score.1.1,
score.1.2,
v.0,
v.1,
v.2,
v.3,
v.4,
v.5,
),
)
},
None => {
(
score.0,
(
score.1.0,
score.1.1,
score.1.2,
FilterReason::None,
FilterReason::None,
FilterReason::None,
FilterReason::None,
FilterReason::None,
AlignmentDirection::None,
),
)
},
};
transformed_scores.push(new_score);
}

let s = transformed_scores;

results.push(s);
}
}
Expand All @@ -311,4 +363,12 @@ fn check_reverse_comp(rec: (&DnaString, &bool)) -> DnaString {
} else {
seq.to_owned()
}
}

fn parse_str_as_bool(v: &str) -> bool {
match v {
"true" => true,
"false" => false,
_ => panic!("Could not parse revcomp field \"{}\" as boolean", v)
}
}
2 changes: 1 addition & 1 deletion src/process/fastq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pub fn process(
_num_cores: usize,
) {
for (i, index) in reference_indices.into_iter().enumerate() {
let (results, _alignment_metadata) = score(
let (results, _alignment_metadata, _) = score(
get_error_checked_fastq_readers(input_files[0].clone()),
if input_files.len() > 1 { Some(get_error_checked_fastq_readers(input_files[1].clone())) } else { None },
&Vec::new(),
Expand Down
6 changes: 5 additions & 1 deletion src/score.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use crate::align;
use crate::align::{FilterReason, AlignmentDirection};
use crate::reference_library;
use crate::utils;
use reference_library::ReferenceMetadata;

use debruijn::dna_string::DnaString;
use std::collections::HashMap;
use std::io::Error;

/* Takes a list of sequences and optionally reverse sequences, a reference library index, reference library metadata,
Expand All @@ -26,9 +28,10 @@ pub fn score<'a>(
) -> (
Vec<(Vec<String>, (i32, Vec<String>, Vec<String>))>,
Vec<(Vec<String>, String, f64, usize, String)>,
HashMap<String, (FilterReason, FilterReason, FilterReason, FilterReason, FilterReason, AlignmentDirection)>
) {
// Perform filtered pseudoalignment
let (reference_scores, alignment_metadata) = align::score(
let (reference_scores, alignment_metadata, filter_reasons) = align::score(
sequences,
reverse_sequences,
current_metadata_group,
Expand All @@ -41,5 +44,6 @@ pub fn score<'a>(
(
utils::sort_score_vector(reference_scores),
alignment_metadata,
filter_reasons
)
}
Loading

0 comments on commit 959c8b6

Please sign in to comment.