Skip to content

Commit

Permalink
Input can be a path prefix (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 authored Nov 27, 2022
1 parent af2f697 commit ddc5928
Show file tree
Hide file tree
Showing 7 changed files with 606 additions and 45 deletions.
24 changes: 22 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ log = "0.4.14"
mimalloc = { version = "0.1.27", default-features = false }
num_cpus = "1.13.0"
parking_lot = "0.11.2"
path-absolutize = "3.0.14"
pooled-writer = "0.2.0"
rayon = "1.5.1"
read-structure = "0.1.0"
regex = "1.7.0"
seq_io = { git = "https://github.com/fulcrumgenomics/seq_io.git", rev = "3d461a3" }
serde = { version = "1.0.130", features = ["derive"] }
strum = { version = "0.24.0", features = ["derive"] }
Expand Down
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This repository is home to the `sgdemux` tool for demultiplexing sequencing data
* [Usage](#usage)
* [Inputs](#inputs)
* [FASTQ Files](#fastq-files)
* [Auto-detecting FASTQS from a Path Prefix](#auto-detecting-fastqs-from-a-path-prefix)
* [Read Structures](#read-structures)
* [Sample Sheet](#sample-sheet)
* [Specifying Demultiplexing Command Line Options](#specifying-demultiplexing-command-line-options)
Expand Down Expand Up @@ -126,9 +127,30 @@ for read in R1 R2 I1 I2; do cat L*/${read}.fastq.gz > ./${read}.fastq.gz; done

FASTQ files _must_ be BGZF compressed.

###### Auto-detecting FASTQS from a Path Prefix

Alternatively, the FASTQS can be auto-detected when a path prefix is given to `--fastqs <dir>/<prefix>`.
The FASTQs must be named `<dir>/<prefix>_L00<lane>_<kind><kind-number>_001.fastq.gz`, where `kind` is
one of R (read/template), I (index/sample barcode), or U (umi/molecular barcode).

The Read Structure must not be given on the the command line or Sample Sheet. Instead, the Read
Structure will be derived file names (kind and kind number), with the full read length used for the given kind.
E.g. if the following FASTQs are present with path prefix `/path/to/prefix`:

```
/path/to/prefix_L001_I1_001.fasztq.gz
/path/to/prefix_L001_R1_001.fasztq.gz
/path/to/prefix_L001_R2_001.fasztq.gz
/path/to/prefix_L001_I2_001.fasztq.gz
```

then the `+B +T +T +B` read structure will be used. Since this tool requires all sample barcode
segments to have a fixed length, the first read in any index/sample-barcode FASTQ will be examined
and its length used as the expected sample barcode length.

##### Read Structures

Read Structures are short strings that describe the origin and/or purpose of bases within sequencing reads. They are made up of a sequence of `<number><operator>` pairs. Four kinds of operators are recognized:
Read Structures are short strings that describe the origin and/or purpose of bases within sequencing reads. They are made up of a sequence of `<number><operator>` pairs (segments). Four kinds of operators are recognized:

1. **T** identifies template reads/bases
2. **B** identifies sample barcode reads/bases
Expand All @@ -146,6 +168,8 @@ One Read Structure must be provided for each input FASTQ file, in the same order
--read-structures +T +T 8B 8B
```

All sample barocde segments must be a fixed length. E.g. `8B+T` is allowed but `10S+B` is not.

##### Sample Sheet

Information about the sample(s) to demultiplex is specified within a Sample Sheet.
Expand Down
213 changes: 205 additions & 8 deletions src/lib/opts.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#![forbid(unsafe_code)]

use std::{num::NonZeroUsize, path::PathBuf, vec::Vec};
use std::{cmp::Ordering, num::NonZeroUsize, path::PathBuf, str::FromStr, vec::Vec};

use anyhow::{anyhow, Result};
use anyhow::{anyhow, bail, Result};
use clap::Parser;
use env_logger::Env;
use read_structure::{ReadStructure, SegmentType};
use read_structure::{ReadSegment, ReadStructure, SegmentType};

use crate::{
demux::DemuxReadFilterConfig,
matcher::{MatcherKind, UNDETERMINED_NAME},
utils::built_info,
utils::{built_info, infer_fastq_sequence_length},
};

pub static LOGO: &str = "
Expand Down Expand Up @@ -78,19 +78,19 @@ For support please contact: care@singulargenomics.com
#[derive(Parser, Debug, Clone)]
#[clap(name = TOOL_NAME, version = built_info::VERSION.as_str(), about=SHORT_USAGE, long_about=LONG_USAGE, term_width=0)]
pub struct Opts {
/// Path to the input FASTQs.
/// Path to the input FASTQs, or path prefix if not a file.
#[clap(long, short = 'f', display_order = 1, required = true, multiple_values = true)]
pub fastqs: Vec<PathBuf>,

/// Path to the sample metadata.
#[structopt(long, short = 's', display_order = 2)]
pub sample_metadata: PathBuf,

/// Read structures, one per input FASTQ.
#[clap(long, short = 'r', display_order = 3, required = true, multiple_values = true)]
/// Read structures, one per input FASTQ. Do not provide when using a path prefix for FASTQs.
#[clap(long, short = 'r', display_order = 3, multiple_values = true)]
pub read_structures: Vec<ReadStructure>,

/// The directory to write outputs.
/// The directory to write outputs to.
///
/// This tool will overwrite existing files.
#[clap(long, short, display_order = 4)]
Expand Down Expand Up @@ -249,6 +249,74 @@ impl Opts {
}
Ok(output_types_to_write)
}

/// Builds a new copy of this `Opt` where read structures with variable length sample barcodes
/// are replaced with fixed length sample barcodes. The fixed length of each sample barcode
/// is inferred from the FASTQ corresponding to the given read structure by examining the
/// length of the first read, then removing any other fixed length segments, with the remaining
/// length used as the fixed sample barcode length.
///
/// If there is no read structure with a variable length sample barcode, the `Opt` itself is
/// returned.
pub fn with_fixed_sample_barcodes(self) -> Result<Self> {
if self
.read_structures
.iter()
.all(|s| s.sample_barcodes().all(read_structure::ReadSegment::has_length))
{
Ok(self)
} else {
// Go through each read structure and try to infer the length of any variable length
// sample barcode
let mut read_structures: Vec<ReadStructure> = vec![];
for (read_structure, fastq) in self.read_structures.iter().zip(self.fastqs.iter()) {
if read_structure
.iter()
.all(|s| s.kind != SegmentType::SampleBarcode || s.has_length())
{
// No variable length, so just clone the current read structure
read_structures.push(read_structure.clone());
} else {
// Get the read length from the FASTQ, subtract all non variable length
// segments (must be a sample barcode if we've gone this far).
let read_length: usize = infer_fastq_sequence_length(&fastq)?;
let fixed_length: usize =
read_structure.iter().map(|s| s.length().unwrap_or(0)).sum();
match fixed_length.cmp(&read_length) {
Ordering::Greater => bail!(
"Read length ({}) is too short ({}) for the read structure {} in {:?}",
read_length,
fixed_length,
read_structure,
fastq
),
Ordering::Equal => bail!("Variable length sample barcode would be zero (read length: {}, sum of fixed segment lengths: {}) in FASTQ: {:?}",
read_length, fixed_length, fastq
),
Ordering::Less => {
let read_segments: Vec<ReadSegment> = read_structure
.iter()
.map(|s| {
if s.has_length() {
*s
} else {
ReadSegment::from_str(&format!(
"{}{}",
read_length - fixed_length,
s.kind.value()
))
.unwrap()
}
})
.collect();
read_structures.push(ReadStructure::new(read_segments)?);
}
}
}
}
Ok(Opts { read_structures, ..self })
}
}
}

/// Implement defaults that match the CLI options to allow for easier testing.
Expand Down Expand Up @@ -294,3 +362,132 @@ pub fn setup() -> Opts {

Opts::parse()
}

#[cfg(test)]
mod test {
use std::{
fs::File,
io::{BufWriter, Write},
path::PathBuf,
str::FromStr,
};

use gzp::{BgzfSyncWriter, Compression};
use read_structure::ReadStructure;
use tempfile::tempdir;

use super::Opts;

struct FastqDef {
read_structure: ReadStructure,
fastq: PathBuf,
read_length: usize,
expected: ReadStructure,
}

impl FastqDef {
fn write(&self) {
let writer = BufWriter::new(File::create(&self.fastq).unwrap());
let mut gz_writer = BgzfSyncWriter::new(writer, Compression::new(3));
let bases = String::from_utf8(vec![b'A'; self.read_length]).unwrap();
let quals = String::from_utf8(vec![b'I'; self.read_length]).unwrap();
let data = format!("@NAME\n{}\n+\n{}\n", bases, quals);
gz_writer.write_all(data.as_bytes()).unwrap();
gz_writer.flush().unwrap();
}
}

#[test]
fn test_with_fixed_sample_barcodes_ok() {
let dir = tempdir().unwrap();

let fastq_defs = vec![
// no change: no sample barcode, template is variable length
FastqDef {
read_structure: ReadStructure::from_str("+T").unwrap(),
fastq: dir.path().join("1.fastq.gz"),
read_length: 150,
expected: ReadStructure::from_str("+T").unwrap(),
},
// no change: no sample barcode, fixed molecluar barcode, template is variable length
FastqDef {
read_structure: ReadStructure::from_str("24M+T").unwrap(),
fastq: dir.path().join("3.fastq.gz"),
read_length: 150,
expected: ReadStructure::from_str("24M+T").unwrap(),
},
// no change: fixed sample barcode length
FastqDef {
read_structure: ReadStructure::from_str("+B").unwrap(),
fastq: dir.path().join("2.fastq.gz"),
read_length: 20,
expected: ReadStructure::from_str("20B").unwrap(),
},
// updated: one fixed length and one variable length sample barcode
FastqDef {
read_structure: ReadStructure::from_str("8B10S+B").unwrap(),
fastq: dir.path().join("4.fastq.gz"),
read_length: 30,
expected: ReadStructure::from_str("8B10S12B").unwrap(),
},
];

// Create output FASTQ files
for fastq_def in &fastq_defs {
fastq_def.write();
}

// Create the opt, and update it
let opt = Opts {
read_structures: fastq_defs.iter().map(|f| f.read_structure.clone()).collect(),
fastqs: fastq_defs.iter().map(|f| f.fastq.clone()).collect(),
..Opts::default()
}
.with_fixed_sample_barcodes()
.unwrap();

// Check the new read structures
let expected_read_structures: Vec<ReadStructure> =
fastq_defs.iter().map(|f| f.expected.clone()).collect();
assert_eq!(opt.read_structures.len(), expected_read_structures.len());
for (actual, expected) in opt.read_structures.iter().zip(expected_read_structures.iter()) {
assert_eq!(actual.to_string(), expected.to_string());
}
}

#[test]
fn test_with_fixed_sample_barcodes_error() {
let dir = tempdir().unwrap();

let fastq_defs = vec![
// error: read length is short enough that variable length sample barcode is zero
FastqDef {
read_structure: ReadStructure::from_str("8B10S+B").unwrap(),
fastq: dir.path().join("5.fastq.gz"),
read_length: 18,
expected: ReadStructure::from_str("+B").unwrap(), // ignore
},
// error: read length is short enough that leading fixed lengths are too short
FastqDef {
read_structure: ReadStructure::from_str("8B10S+B").unwrap(),
fastq: dir.path().join("5.fastq.gz"),
read_length: 17,
expected: ReadStructure::from_str("+B").unwrap(), // ignore
},
];

// Create output FASTQ files
for fastq_def in &fastq_defs {
fastq_def.write();

// Create the opt, and update it
let result = Opts {
read_structures: vec![fastq_def.read_structure.clone()],
fastqs: vec![fastq_def.fastq.clone()],
..Opts::default()
}
.with_fixed_sample_barcodes();
assert!(result.is_err());
}
}
}
Loading

0 comments on commit ddc5928

Please sign in to comment.