Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1357 grouping strategy applied by counting number of FASTQ files generated by FASTP #1364

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#1335](https://github.com/nf-core/sarek/pull/1335) - Add docs and validation for bcftools annotation parameters
- [#1345](https://github.com/nf-core/sarek/pull/1345) - Preserve STDERR for easier debugging
- [#1351](https://github.com/nf-core/sarek/pull/1351) - Fix params name for test profiles (`bcftools_annotations`)
- [#1357](https://github.com/nf-core/sarek/pull/1364) - Fixed bug where samples were dropped while reconstituting BAM files

### Removed

Expand Down
13 changes: 12 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,22 @@
"default": "",
"properties": {
"split_fastq": {
"oneOf": [
{
"type": "integer",
"minimum": 250
},
{
"type": "integer",
"minimum": 0,
"maximum": 0
}
],
"type": "integer",
"default": 50000000,
"fa_icon": "fas fa-clock",
"description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn off splitting at all.",
"help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. "
"help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. Note although the minimum value is 250 reads, if you have fewer than 250 reads a single FASTQ shard will still be created."
},
"wes": {
"type": "boolean",
Expand Down
75 changes: 51 additions & 24 deletions workflows/sarek.nf
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ workflow SAREK {
if (params.split_fastq) {
reads_for_alignment = FASTP.out.reads.map{ meta, reads ->
read_files = reads.sort(false) { a,b -> a.getName().tokenize('.')[0] <=> b.getName().tokenize('.')[0] }.collate(2)
[ meta + [ size:read_files.size() ], read_files ]
[ meta + [ n_fastq: read_files.size() ], read_files ]
}.transpose()
} else reads_for_alignment = FASTP.out.reads

Expand All @@ -482,34 +482,61 @@ workflow SAREK {
}

// STEP 1: MAPPING READS TO REFERENCE GENOME
// reads will be sorted
reads_for_alignment = reads_for_alignment.map{ meta, reads ->
// Update meta.id to meta.sample no multiple lanes or splitted fastqs
if (meta.size * meta.num_lanes == 1) [ meta + [ id:meta.sample ], reads ]
else [ meta, reads ]
}
// First, we must calculate number of lanes for each sample (meta.n_fastq)
// This is needed to group reads from the same sample together using groupKey to avoid stalling the workflow
// when reads from different samples are mixed together
reads_for_alignment.map { meta, reads ->
[ meta.subMap('patient', 'sample', 'sex', 'status'), reads ]
}
.groupTuple()
.map { meta, reads ->
meta + [ n_fastq: reads.size() ] // We can drop the FASTQ files now that we know how many there are
}
.set { reads_grouping_key }

// reads will be sorted
sort_bam = true
FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON(reads_for_alignment, index_alignement, sort_bam, fasta, fasta_fai)

// Grouping the bams from the same samples not to stall the workflow
bam_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bam.map{ meta, bam ->

// Update meta.id to be meta.sample, ditching sample-lane that is not needed anymore
// Update meta.data_type
// Remove no longer necessary fields:
// read_group: Now in the BAM header
// num_lanes: only needed for mapping
// size: only needed for mapping

// Use groupKey to make sure that the correct group can advance as soon as it is complete
// and not stall the workflow until all reads from all channels are mapped
[ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ]
}.groupTuple()

bai_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bai.map{ meta, bai ->
[ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bai', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bai ]
}.groupTuple()
// Use groupKey to make sure that the correct group can advance as soon as it is complete
// and not stall the workflow until all reads from all channels are mapped
bam_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bam
.combine(reads_grouping_key) // Creates a tuple of [ meta, bam, reads_grouping_key ]
.filter { meta1, bam, meta2 -> meta1.sample == meta2.sample }
// Add n_fastq and other variables to meta
.map { meta1, bam, meta2 ->
[ meta1 + meta2, bam ]
}
// Manipulate meta map to remove old fields and add new ones
.map { meta, bam ->
[ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'bam', id: meta.sample ], bam ]
}
// Create groupKey from meta map
.map { meta, bam ->
[ groupKey( meta, meta.n_fastq), bam ]
}
// Group
.groupTuple()

bai_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bai
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bai at only produced/tested with sentieon, but since it is the same, should work

.combine(reads_grouping_key) // Creates a tuple of [ meta, bai, reads_grouping_key ]
.filter { meta1, bai, meta2 -> meta1.sample == meta2.sample }
// Add n_fastq and other variables to meta
.map { meta1, bai, meta2 ->
[ meta1 + meta2, bai ]
}
// Manipulate meta map to remove old fields and add new ones
.map { meta, bai ->
[ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'bai', id: meta.sample ], bai ]
}
// Create groupKey from meta map
.map { meta, bai ->
[ groupKey( meta, meta.n_fastq), bai ]
}
// Group
.groupTuple()


// gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here
// Except if and only if save_mapped or (skipping markduplicates and sentieon-dedup)
Expand Down