Skip to content

Commit

Permalink
Merge pull request #95 from tkchafin/samtools_reheader
Browse files Browse the repository at this point in the history
Option to incorporate additional metadata from provided SAM header file
  • Loading branch information
tkchafin authored Jul 12, 2024
2 parents f9be6aa + 3aca264 commit 67badb4
Show file tree
Hide file tree
Showing 12 changed files with 107 additions and 18 deletions.
6 changes: 6 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ process {
ext.prefix = { "${meta.id}.merge" }
}

// If custom header provided, this is inserted in place of existing
// @HD and @SQ lines, while preserving any other header entries
withName: SAMTOOLS_REHEADER {
ext.prefix = { "${meta.id}.reheader" }
}

withName: SAMTOOLS_COLLATETOFASTA {
ext.args = { (params.use_work_dir_as_temp ? "-T." : "") }
}
Expand Down
1 change: 1 addition & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ params {

// Fasta references
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
header = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.header.sam"
}
4 changes: 4 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ The filtered PacBio reads are aligned with `MINIMAP2_ALIGN`. The sorted and merg

## Alignment post-processing

### External metadata

If provided using the `--header` option, all output alignments (`*.cram`) will include any additional metadata supplied as a SAM header template, replacing the existing _@HD_ and _@SD_ entries (note that this behaviour can be altered by modifying the `ext.args` for `SAMTOOLS_REHEADER` in `modules.config`.

### Statistics

The output alignments, along with the index, are used to calculate mapping statistics. Output files are generated using `SAMTOOLS_STATS`, `SAMTOOLS_FLAGSTAT` and `SAMTOOLS_IDXSTATS`.
Expand Down
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ work # Directory containing the nextflow working files
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```

You can also optionally supply a template SAM header using the `--header` option to add or modify metadata associated with the assembly, which will be incorporated into the output alignments.

### Updating the pipeline

When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
Expand Down
60 changes: 60 additions & 0 deletions modules/local/samtools_replaceheader.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
process SAMTOOLS_REHEADER {
tag "$meta.id"
label 'process_single'

conda "bioconda::samtools=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
'biocontainers/samtools:1.17--h00cdaf9_0' }"

input:
tuple val(meta), path(file)
path(header)

output:
tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam
tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
prefix = task.ext.prefix ?: "${meta.id}"
suffix = file.getExtension()

if ("$file" == "${prefix}.${suffix}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
# Replace SQ lines with those from external template
( samtools view --no-PG --header-only ${file} | \\
grep -v ^@SQ && grep ^@SQ ${header} ) > temp.header.sam
# custom sort for readability (retain order of insertion but sort groups by tag)
( grep ^@HD temp.header.sam || true && \
grep ^@SQ temp.header.sam || true && \
grep ^@RG temp.header.sam || true && \
grep ^@PG temp.header.sam || true && \
grep -v -E '^@HD|^@SQ|^@RG|^@PG' temp.header.sam || true; \
) > temp.sorted.header.sam
# Insert new header into file
samtools reheader temp.sorted.header.sam ${file} > ${prefix}.${suffix}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""

stub:
prefix = task.ext.prefix ?: "${meta.id}"
suffix = file.getExtension()
"""
touch ${prefix}.${suffix}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
END_VERSIONS
"""
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ params {
vector_db = "${projectDir}/assets/vectorDB.tar.gz"
bwamem2_index = null
fasta = null
header = null

// Execution options
use_work_dir_as_temp = false
Expand Down
9 changes: 8 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
"type": "string",
"format": "directory-path",
"description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
"fa_icon": "fas fa-folder-open"
"fa_icon": "fas fa-folder-open",
"default": "./results"
},
"vector_db": {
"type": "string",
Expand Down Expand Up @@ -64,6 +65,12 @@
"description": "Path to directory or tar.gz archive for pre-built BWAMEM2 index.",
"format": "path",
"fa_icon": "fas fa-bezier-curve"
},
"header": {
"type": "string",
"format": "path",
"description": "Optional template header file for BAM/CRAM outputs",
"fa_icon": "far fa-file-code"
}
}
},
Expand Down
1 change: 0 additions & 1 deletion subworkflows/local/align_ont.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ workflow ALIGN_ONT {
// Convert merged BAM to CRAM and calculate indices and statistics
SAMTOOLS_MERGE.out.bam
| mix ( ch_bams.single_bam )
| map { meta, bam -> [ meta, bam, [] ] }
| set { ch_sort }


Expand Down
1 change: 0 additions & 1 deletion subworkflows/local/align_pacbio.nf
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ workflow ALIGN_PACBIO {
// Convert merged BAM to CRAM and calculate indices and statistics
SAMTOOLS_MERGE.out.bam
| mix ( ch_bams.single_bam )
| map { meta, bam -> [ meta, bam, [] ] }
| set { ch_sort }


Expand Down
9 changes: 1 addition & 8 deletions subworkflows/local/align_short.nf
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,7 @@ workflow ALIGN_SHORT {
SAMTOOLS_SORMADUP ( ch_bam, fasta )
ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )


// Convert merged BAM to CRAM and calculate indices and statistics
SAMTOOLS_SORMADUP.out.bam
| map { meta, bam -> [ meta, bam, [] ] }
| set { ch_stat }


emit:
bam = ch_stat // channel: [ val(meta), /path/to/bam ]
bam = SAMTOOLS_SORMADUP.out.bam // channel: [ val(meta), /path/to/bam ]
versions = ch_versions // channel: [ versions.yml ]
}
5 changes: 2 additions & 3 deletions subworkflows/local/convert_stats.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,8 @@ workflow CONVERT_STATS {
// Compress the quality scores of Illumina and PacBio CCS alignments
bam
| branch {
meta, bam, bai ->
meta, bam ->
run_crumble : meta.datatype == "hic" || meta.datatype == "illumina" || meta.datatype == "pacbio"
[meta, bam]
no_crumble: true
}
| set { ch_bams }
Expand All @@ -34,8 +33,8 @@ workflow CONVERT_STATS {

// Convert BAM to CRAM
CRUMBLE.out.bam
| map { meta, bam -> [meta, bam, []] }
| mix ( ch_bams.no_crumble )
| map { meta, bam -> [meta, bam, []] }
| set { ch_bams_for_conversion }

SAMTOOLS_VIEW ( ch_bams_for_conversion, fasta, [] )
Expand Down
26 changes: 22 additions & 4 deletions workflows/readmapping.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,21 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
// Check mandatory parameters
if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Genome fasta file not specified!' }

if (params.header) { ch_header = Channel.fromPath(params.header) }

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT LOCAL MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

//
// MODULE: Local modules
//

include { SAMTOOLS_REHEADER } from '../modules/local/samtools_replaceheader'


//
// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
//
Expand Down Expand Up @@ -48,7 +55,6 @@ include { CONVERT_STATS } from '../subworkflows/local/convert_st
include { UNTAR } from '../modules/nf-core/untar/main'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'


/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
Expand Down Expand Up @@ -125,14 +131,26 @@ workflow READMAPPING {
ALIGN_ONT ( PREPARE_GENOME.out.fasta, ch_reads.ont )
ch_versions = ch_versions.mix ( ALIGN_ONT.out.versions )


// gather alignments
ch_aligned_bams = Channel.empty()
| mix( ALIGN_HIC.out.bam )
| mix( ALIGN_ILLUMINA.out.bam )
| mix( ALIGN_HIFI.out.bam )
| mix( ALIGN_CLR.out.bam )
| mix( ALIGN_ONT.out.bam )
CONVERT_STATS ( ch_aligned_bams, PREPARE_GENOME.out.fasta )

// Optionally insert params.header information to bams
ch_reheadered_bams = Channel.empty()
if ( params.header ) {
SAMTOOLS_REHEADER( ch_aligned_bams, ch_header.first() )
ch_reheadered_bams = SAMTOOLS_REHEADER.out.bam
ch_versions = ch_versions.mix ( SAMTOOLS_REHEADER.out.versions )
} else {
ch_reheadered_bams = ch_aligned_bams
}

// convert to cram and gather stats
CONVERT_STATS ( ch_reheadered_bams, PREPARE_GENOME.out.fasta )
ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions )


Expand Down

0 comments on commit 67badb4

Please sign in to comment.