Merge pull request #96 from reichan1998/edit_cram_input

Add input types `fastq.gz` and `fq.gz` for Illumina and HiC reads
sanger-tol · Jul 22, 2024 · 313a325 · 313a325
2 parents 67badb4 + aa800f8
commit 313a325
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 7 deletions.
diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv
@@ -1,6 +1,6 @@
 sample,datatype,datafile,library
 mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3%231.subset.cram,
-mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.cram,
+mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz,
 mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram,
 mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz,
 mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,

diff --git a/docs/usage.md b/docs/usage.md
@@ -42,7 +42,7 @@ sample1_T5,pacbio,pacbio2.bam,pacbio2
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).                                                 |
 | `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`.                                                                                                                                                        |
-| `datafile` | Full path to read data file. Must be `bam` or `cram` for `hic` and `illumina`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                                                                   |
+| `datafile` | Full path to read data file. Must be `bam`, `cram`, `fastq.gz`, or `fq.gz` for `illumina` and `hic`. Must be `bam` for `pacbio`. Must be `fastq.gz` or `fq.gz` for `ont`.                                                             |
 | `library`  | (Optional) The library value is a unique identifier which is assigned to read group (`@RG`) ID. If the library name is not specified, the pipeline will auto-create library name using the data filename provided in the samplesheet. |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf
@@ -18,14 +18,28 @@ workflow ALIGN_SHORT {
     main:
     ch_versions = Channel.empty()
 
+    // Check file types and branch
+    reads
+    | branch {
+        meta, reads ->
+            fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }
+            cram : true
+    }
+    | set { ch_reads }
+
 
-    // Convert from CRAM to FASTQ
-    SAMTOOLS_FASTQ ( reads, false )
+    // Convert from CRAM to FASTQ only if CRAM files were provided as input
+    SAMTOOLS_FASTQ ( ch_reads.cram, false )
     ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
+
+
+    SAMTOOLS_FASTQ.out.fastq
+    | mix ( ch_reads.fastq )
+    | set { ch_reads_fastq }
 
 
-    // Align Fastq to Genome and output sorted BAM
-    BWAMEM2_MEM ( SAMTOOLS_FASTQ.out.fastq, index, true )
+    // Align Fastq to Genome and output sorted BAM
+    BWAMEM2_MEM ( ch_reads_fastq, index, true )
     ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
 
 

diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
@@ -118,7 +118,7 @@ workflow READMAPPING {
     //
     ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic )
     ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions )
-
+    
     ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
     ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )