Support additional FASTQ sequence designed for UMI file

adamrtalbot · adamrtalbot · commit d3777d798735 · 2024-05-15T16:49:07.000+01:00
Changes:
 - Parse input subworkflow to support 3rd FASTQ in addition to R1 and R2
 - Check that the number of FASTQ files matches the number of read structures
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -27,13 +27,20 @@
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
+            "fastq_3": {
+                "type": "string",
+                "format": "file-path",
+                "exists": true,
+                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
+                "errorMessage": "FastQ file for reads 3 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+            },
             "read_structure": {
                 "type": "string",
                 "pattern": "^.*$",
                 "errorMessage": "Read structure must be provided; For format, see: https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures",
                 "meta": ["read_structure"]
             }
         },
-        "required": ["sample", "fastq_1", "fastq_2", "read_structure"]
+        "required": ["sample", "fastq_1", "read_structure"]
     }
 }
diff --git a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
@@ -92,13 +92,18 @@ workflow PIPELINE_INITIALISATION {
     Channel
         .fromSamplesheet("input")
         .map {
-            meta, fastq_1, fastq_2 ->
-                if (!fastq_2) {
-                    return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
-                } else {
+            meta, fastq_1, fastq_2, fastq_3 ->
+                if (fastq_3) {
+                    return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2, fastq_3 ] ]
+                } else if (fastq_2) {
                     return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
+                } else {
+                    return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
                 }
         }
+        .map {
+            validateReadStructure(it)
+        }
         .groupTuple()
         .map {
             validateInputSamplesheet(it)
@@ -109,6 +114,8 @@ workflow PIPELINE_INITIALISATION {
         }
         .set { ch_samplesheet }
 
+    ch_samplesheet.view()
+
     emit:
     samplesheet = ch_samplesheet
     versions    = ch_versions
@@ -163,6 +170,21 @@ def validateInputParameters() {
     genomeExistsError()
 }
 
+// Check that a samplesheet row supplies exactly one read structure per FASTQ file
+def validateReadStructure(input) {
+    def id           = input[0]
+    def meta         = input[1]
+    def fastqs       = input[2]
+    // input is an [ id, meta, fastqs ] tuple; meta.read_structure is split on single spaces
+    def num_fastqs     = fastqs.size()
+    def num_structures = meta.read_structure.tokenize(" ").size()
+    // A count mismatch is reported through error(); otherwise the same three values are returned
+    if (num_fastqs != num_structures) {
+        error("Please check input samplesheet -> Number of fastq files (${num_fastqs}) does not match the number of read structures (${num_structures}): ${id}, '${meta.read_structure}'")
+    }
+    return [ id, meta, fastqs ]
+}
+
 //
 // Validate channels from input samplesheet
 //
@@ -177,6 +199,7 @@ def validateInputSamplesheet(input) {
 
     return [ metas[0], fastqs ]
 }
+
 //
 // Get attribute from genome config file e.g. fasta
 //