[feat] Check that read pairs have the same name #27

Merged (4 commits, Dec 7, 2022)
README.md (1 addition, 0 deletions)
@@ -276,6 +276,7 @@ then the `Sample_Barcode` field for each sample should be composed as follows:
|--writer-threads |No |5 | The number of threads to use to write compressed FASTQ data to disk. |
|--override-matcher |No |n/a | The algorithm to use for matching, either `CachedHammingDistance` or `PreCompute`. By default if barcodes are 12bp or shorter `PreCompute` is used which pre-computes all possible matches, or if barcodes are longer than 12bp `CachedHammingDistance` is used which calculates matches when needed then caches the results. |
|--most-unmatched-to-output|No |1000 | Report on the top N most frequently observed unmatched barcode sequences. |
|--skip-read-name-check|No |False | If true, read names are not checked for equality across FASTQs. This may be useful when the read names are known to match and performance matters. Regardless, the first read name in each FASTQ will always be checked. |
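For illustration, the flag is passed like any other command-line option. The invocation below is a sketch: the `fqtk demux` binary and subcommand names are assumptions not shown in this diff, and `...` stands for the tool's required input/output options.

```console
# Assumed invocation; "..." elides the required I/O options.
# Disables the per-read name check (the first read name per FASTQ is still checked).
fqtk demux ... --skip-read-name-check
```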


#### Performance Considerations
src/lib/demux.rs (50 additions, 0 deletions)
@@ -3,6 +3,7 @@
use std::{borrow::Cow, iter::IntoIterator, vec::Vec};

use anyhow::{ensure, Context, Result};
use bstr::ByteSlice;
use itertools::Itertools;
use log::debug;
use read_structure::{ReadSegment, ReadStructure, SegmentType};
@@ -187,6 +188,9 @@ pub struct Demultiplexer<'a, M: Matcher> {
/// If false, unmatched barcodes will not be collected, since there is no receiving thread to send
/// them to.
collect_unmatched: bool,
/// If true, read names across FASTQs will not be checked for equality. This may be useful when the
/// read names are known to match and performance matters.
skip_read_name_check: bool,
}

impl<'a, M> Demultiplexer<'a, M>
@@ -203,6 +207,7 @@ where
read_filter_config: &'a DemuxReadFilterConfig,
matcher: M,
collect_unmatched: bool,
skip_read_name_check: bool,
) -> Result<Self> {
// TODO: use display instead of debug formatting
ensure!(
@@ -238,6 +243,7 @@ where
matcher,
sample_barcode_hop_checker,
collect_unmatched,
skip_read_name_check,
})
}

@@ -421,6 +427,23 @@ where
let mut output_reads = vec![];
let mut qual_counter = BaseQualCounter::new();

Collaborator (Author): My only concern is that this may be expensive, since this will be done for every read.

Contributor: I totally agree and would be hesitant to merge this as is without some benchmarking. A couple of alternative suggestions for @jiangweiyao and @omicsorama to weigh in on:

  1. Would it be sufficient to check either just the first record, or the first and last record in each RecordSet, either here or perhaps upon construction of the PerFastqRecordSet (where we already validate that the record sets are the same length)?
  2. If (1) is not sufficient, we could check every nth record here; e.g., checking every 50th or 100th record would catch the vast majority of issues with far less performance overhead (see the sketch below).

More generally: what's the problem we are trying to solve? Catching a user who mixes up FASTQ files can be done by checking the first/last records. But catching broken upstream software that might occasionally emit reads in a differing order would dictate checking every read on the way through.
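For concreteness, here is a minimal, self-contained sketch of option (2), sampling every nth record; the function names, header layout, and sampling interval are illustrative assumptions, not code from this PR:

```rust
/// Return the read name, i.e. the FASTQ header bytes up to the first space.
fn read_name(head: &[u8]) -> &[u8] {
    match head.iter().position(|&c| c == b' ') {
        Some(i) => &head[..i],
        None => head,
    }
}

/// Check that paired headers carry the same read name, but only for every
/// `every`-th record, trading some safety for throughput.
fn check_sampled(r1_heads: &[&[u8]], r2_heads: &[&[u8]], every: usize) -> Result<(), String> {
    for (i, (h1, h2)) in r1_heads.iter().zip(r2_heads).enumerate() {
        if i % every != 0 {
            continue; // sample: skip all but every `every`-th record
        }
        if read_name(h1) != read_name(h2) {
            return Err(format!(
                "read name mismatch at record {i}: {:?} != {:?}",
                String::from_utf8_lossy(h1),
                String::from_utf8_lossy(h2)
            ));
        }
    }
    Ok(())
}

fn main() {
    let r1: Vec<&[u8]> = vec![b"frag1 1:N:0:ACGT", b"frag2 1:N:0:ACGT"];
    let r2: Vec<&[u8]> = vec![b"frag1 2:N:0:ACGT", b"fragX 2:N:0:ACGT"];
    assert!(check_sampled(&r1, &r2, 1).is_err()); // checking every record catches the mismatch
    assert!(check_sampled(&r1, &r2, 2).is_ok()); // sampling every 2nd record misses it
}
```

As the comment above notes, sampling bounds the per-read overhead but only catches systematic mix-ups, not occasional out-of-order records.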

Collaborator (Author): See #27 where I added checking just the first read in each file (this implicitly disallows empty input FASTQs).

if !self.skip_read_name_check {
    // Compare read names only up to the first space: the leading token is the
    // name proper; anything after it is a comment that may legitimately differ
    // across mates (e.g. "1:N:0:..." on R1 vs "2:N:0:..." on R2).
    let first_head = zipped_reads[0].head();
    let end_index = first_head.find_byte(b' ').unwrap_or(first_head.len());
    for read in zipped_reads.iter().dropping(1) {
        let cur_head = read.head();
        // The candidate name must end at the same boundary: either the header
        // ends there or a space follows...
        let ok = cur_head.len() == end_index
            || (cur_head.len() > end_index && cur_head[end_index] == b' ');
        // ...and the bytes up to that boundary must be identical.
        let ok = ok && first_head[0..end_index] == cur_head[0..end_index];
        ensure!(
            ok,
            "Read names did not match: {:?} != {:?}",
            String::from_utf8_lossy(first_head),
            String::from_utf8_lossy(cur_head)
        );
    }
}
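The boundary handling above is subtle enough to deserve a worked illustration. The following standalone sketch mirrors, rather than reproduces, the rule the PR implements (names compare equal up to the first space, and the boundary must fall on a space or at end-of-header):

```rust
/// Two headers name the same read when the bytes before the first space of
/// the first header are an exact prefix of the second header, and the second
/// header either ends at that boundary or has a space there.
fn same_read_name(first: &[u8], other: &[u8]) -> bool {
    let end = first.iter().position(|&c| c == b' ').unwrap_or(first.len());
    let boundary_ok = other.len() == end || (other.len() > end && other[end] == b' ');
    boundary_ok && first[..end] == other[..end]
}

fn main() {
    // Same name, different mate comments: accepted.
    assert!(same_read_name(b"frag1 1:N:0:ACGT", b"frag1 2:N:0:ACGT"));
    // Same name, comment present on only one side: also accepted.
    assert!(same_read_name(b"frag1 1:N:0:ACGT", b"frag1"));
    // One name a strict prefix of the other: rejected (no space at the boundary).
    assert!(!same_read_name(b"frag1", b"frag12"));
    // Different names of equal length: rejected.
    assert!(!same_read_name(b"frag1", b"frag2"));
}
```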

// If any filtering was specified, check it here by peeking at the header of the first read in the set of reads.
match self.apply_filters(&zipped_reads)? {
Some(ReadFailedFilterResult::ControlFilter) => {
@@ -692,6 +715,7 @@ mod test {
&self.read_filter,
matcher,
false,
false,
)
.unwrap(),
)
@@ -711,6 +735,7 @@
&self.read_filter,
matcher,
false,
false,
)
.unwrap(),
)
@@ -1619,6 +1644,7 @@ mod test {
&filter_config,
matcher,
false,
false,
)
.unwrap();
let per_fastq_record_set =
@@ -1655,4 +1681,28 @@
"Sample2 R2 reads should match expected"
);
}

#[rstest]
#[rustfmt::skip]
fn test_demux_paired_reads_different_read_names(
#[values(MatcherKind::CachedHammingDistance, MatcherKind::PreCompute)]
matcher: MatcherKind
) {
let read_structures = vec![
ReadStructure::from_str("8B100T").unwrap(),
ReadStructure::from_str("9B100T").unwrap(),
];

let mut fq1: OwnedRecord = Fq { name: "frag1", bases: &[&SAMPLE_BARCODE_1[0..8], &[b'A'; 100]].concat(), ..Fq::default() }.to_owned_record();
let fq2 = Fq { name: "frag2", bases: &[&SAMPLE_BARCODE_1[8..], &[b'T'; 100]].concat(), ..Fq::default() }.to_owned_record();

// mess with the FQ1 read name: prepend a byte so it becomes "ffrag2" rather than "frag2"
fq1.head = vec![b'f'];
fq1.head.append(&mut fq2.head.clone());

let context = DemuxTestContext::demux_structures(vec![fq1, fq2], read_structures, matcher);
let demuxer = context.demux();
let result = demuxer.demultiplex(&context.per_fastq_record_set);
assert!(result.is_err());
}
}
src/lib/opts.rs (7 additions, 0 deletions)
@@ -216,6 +216,12 @@ pub struct Opts {
/// [default: None]
#[clap(long, possible_values=MatcherKind::possible_values(), display_order = 31)]
pub override_matcher: Option<MatcherKind>,

/// If true, read names across FASTQs will not be checked for equality.
/// This may be useful when the read names are known to match and performance matters.
/// Regardless, the first read name in each FASTQ will always be checked.
#[clap(long, display_order = 31)]
pub skip_read_name_check: bool,
}

impl Opts {
@@ -466,6 +472,7 @@ impl Default for Opts {
decompression_threads_per_reader: 4,
override_matcher: None,
output_dir: PathBuf::default(),
skip_read_name_check: false,
}
}
}
src/lib/run.rs (2 additions, 0 deletions)
@@ -187,6 +187,7 @@ pub fn run(opts: Opts) -> Result<()> {
&read_filter_config,
matcher,
true,
opts.skip_read_name_check,
)?;
Box::new(demuxer)
} else {
@@ -204,6 +205,7 @@
&read_filter_config,
matcher,
true,
opts.skip_read_name_check,
)?;
Box::new(demuxer)
};