Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LAST alignment statistics for MultiQC #5902

Merged
merged 12 commits into from
Jul 11, 2024
Merged
28 changes: 24 additions & 4 deletions modules/nf-core/last/lastal/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ process LAST_LASTAL {

output:
tuple val(meta), path("*.maf.gz"), emit: maf
tuple val(meta), path("*.tsv") , emit: multiqc
path "versions.yml" , emit: versions

when:
Expand All @@ -25,15 +26,33 @@ process LAST_LASTAL {
"""
INDEX_NAME=\$(basename \$(ls $index/*.des) .des)
set -o pipefail

# Summarise PSL records read from standard input into a two-line,
# tab-separated table for MultiQC: a header line, then one data line with
# the sample name, the total alignment length and the percent similarity.
# Columns 1, 2, 3, 6 and 8 of each input line are used as matches,
# misMatches, repMatches, qBaseInsert and tBaseInsert respectively.
# Percent similarity is reported as 0 when the total alignment length is 0,
# which avoids a division by zero on empty input.
# NOTE(review): the sample name and the tab escapes appear to be filled in by
# the workflow template before the shell runs - confirm when editing quoting.
function calculate_psl_metrics() {
awk 'BEGIN {
FS="\t"; # Set field separator as tab
totalMatches = 0;
totalAlignmentLength = 0;
print "Sample\tTotalAlignmentLength\tPercentSimilarity"; # Header for MultiQC
}
{
totalMatches += \$1 + \$3; # Sum matches and repMatches
totalAlignmentLength += \$1 + \$2 + \$3 + \$6 + \$8; # Sum matches, misMatches, repMatches, qBaseInsert, and tBaseInsert
}
END {
percentSimilarity = (totalAlignmentLength > 0) ? (totalMatches / totalAlignmentLength * 100) : 0;
print "$meta.id" "\t" totalAlignmentLength "\t" percentSimilarity; # Data in TSV format
}'
}

lastal \\
-P $task.cpus \\
$trained_params \\
$args \\
${index}/\$INDEX_NAME \\
$fastx \\
| gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz
# gzip needs --no-name otherwise it puts a timestamp in the file,
# which makes its checksum non-reproducible.
$fastx |
tee >(gzip --no-name > ${prefix}.maf.gz) |
maf-convert psl |
calculate_psl_metrics > ${prefix}.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -48,6 +67,7 @@ process LAST_LASTAL {
"""
INDEX_NAME=STUB
echo stub | gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz
touch ${prefix}.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 4 additions & 0 deletions modules/nf-core/last/lastal/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ output:
type: file
description: Gzipped MAF (Multiple Alignment Format) file
pattern: "*.{maf.gz}"
- multiqc:
type: file
description: Alignment summary for MultiQC
pattern: "*.tsv"
authors:
- "@charles-plessy"
maintainers:
Expand Down
68 changes: 61 additions & 7 deletions modules/nf-core/last/lastal/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,19 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
"contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
]
],
"1": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74"
]
],
"2": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
],
"maf": [
Expand All @@ -20,7 +29,16 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
"contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
]
],
"multiqc": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74"
]
],
"versions": [
Expand All @@ -32,7 +50,7 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T23:11:59.764152"
"timestamp": "2024-07-02T17:57:48.589408"
},
"sarscov2 - contigs - genome - stub": {
"content": [
Expand All @@ -47,6 +65,15 @@
]
],
"1": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"2": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
],
"maf": [
Expand All @@ -58,6 +85,15 @@
"contigs.STUB.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260"
]
],
"multiqc": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
]
Expand All @@ -67,7 +103,7 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T23:12:43.028075"
"timestamp": "2024-07-02T17:58:30.521811"
},
"sarscov2 - contigs - genome - withparams": {
"content": [
Expand All @@ -78,10 +114,19 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
"contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
]
],
"1": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200"
]
],
"2": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
],
"maf": [
Expand All @@ -90,7 +135,16 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
"contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
]
],
"multiqc": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200"
]
],
"versions": [
Expand All @@ -102,6 +156,6 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T23:12:21.536568"
"timestamp": "2024-07-02T17:58:09.677672"
}
}
26 changes: 25 additions & 1 deletion modules/nf-core/last/split/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ process LAST_SPLIT {

output:
tuple val(meta), path("*.maf.gz"), emit: maf
tuple val(meta), path("*.tsv") , emit: multiqc
path "versions.yml" , emit: versions

when:
Expand All @@ -23,7 +24,29 @@ process LAST_SPLIT {
if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
set -o pipefail
zcat < $maf | last-split $args | gzip --no-name > ${prefix}.maf.gz

# Summarise PSL records read from standard input into a two-line,
# tab-separated table for MultiQC: a header line, then one data line with
# the sample name, the total alignment length and the percent similarity.
# Columns 1, 2, 3, 6 and 8 of each input line are used as matches,
# misMatches, repMatches, qBaseInsert and tBaseInsert respectively.
# Percent similarity is reported as 0 when the total alignment length is 0,
# which avoids a division by zero on empty input.
# NOTE(review): the sample name and the tab escapes appear to be filled in by
# the workflow template before the shell runs - confirm when editing quoting.
function calculate_psl_metrics() {
awk 'BEGIN {
FS="\t"; # Set field separator as tab
totalMatches = 0;
totalAlignmentLength = 0;
print "Sample\tTotalAlignmentLength\tPercentSimilarity"; # Header for MultiQC
}
{
totalMatches += \$1 + \$3; # Sum matches and repMatches
totalAlignmentLength += \$1 + \$2 + \$3 + \$6 + \$8; # Sum matches, misMatches, repMatches, qBaseInsert, and tBaseInsert
}
END {
percentSimilarity = (totalAlignmentLength > 0) ? (totalMatches / totalAlignmentLength * 100) : 0;
print "$meta.id" "\t" totalAlignmentLength "\t" percentSimilarity; # Data in TSV format
}'
}

zcat < $maf |
last-split $args |
tee >(gzip --no-name > ${prefix}.maf.gz) |
maf-convert psl |
calculate_psl_metrics > ${prefix}.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -37,6 +60,7 @@ process LAST_SPLIT {
if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
echo stub | gzip --no-name > ${prefix}.maf.gz
touch ${prefix}.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 4 additions & 0 deletions modules/nf-core/last/split/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ output:
type: file
description: Multiple Alignment Format (MAF) file, compressed with gzip
pattern: "*.{maf.gz}"
- multiqc:
type: file
description: Alignment summary for MultiQC
pattern: "*.tsv"
authors:
- "@aleksandrabliznina"
- "@charles-plessy"
Expand Down
36 changes: 34 additions & 2 deletions modules/nf-core/last/split/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
]
],
"1": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8"
]
],
"2": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
],
"maf": [
Expand All @@ -21,6 +29,14 @@
"sarscov.contigs.genome.maf.gz:md5,689cb18ff7098ff90eaf87017f590208"
]
],
"multiqc": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8"
]
],
"versions": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
]
Expand All @@ -30,7 +46,7 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T17:49:24.045661"
"timestamp": "2024-07-02T11:45:00.535348"
},
"sarscov2 - contigs_genome - stub": {
"content": [
Expand All @@ -44,6 +60,14 @@
]
],
"1": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"2": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
],
"maf": [
Expand All @@ -54,6 +78,14 @@
"sarscov.contigs.genome.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260"
]
],
"multiqc": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
]
Expand All @@ -63,6 +95,6 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T17:50:20.139442"
"timestamp": "2024-07-02T11:45:21.243325"
}
}
12 changes: 12 additions & 0 deletions modules/nf-core/last/train/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ process LAST_TRAIN {

output:
tuple val(meta), path("*.train"), emit: param_file
tuple val(meta), path("*.tsv") , emit: multiqc
path "versions.yml" , emit: versions

when:
Expand All @@ -31,6 +32,16 @@ process LAST_TRAIN {
$fastx \\
> ${prefix}.\$INDEX_NAME.train

echo "id\tsubstitution_percent_identity\tlast -t\tlast -a\tlast -A\tlast -b\tlast -B\tlast -S" > ${prefix}.train.tsv
printf "\$(basename ${prefix}.\$INDEX_NAME.train .target.train)\t" >> ${prefix}.train.tsv
grep 'substitution percent identity' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$5}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -t' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$2}' | sed -e 's/-t//' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -a' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -A' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -b' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -B' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -S' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' >> ${prefix}.train.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
last: \$(lastdb --version | sed 's/lastdb //')
Expand All @@ -43,6 +54,7 @@ process LAST_TRAIN {
"""
INDEX_NAME=STUB
touch ${prefix}.\$INDEX_NAME.train
touch ${prefix}.train.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
5 changes: 5 additions & 0 deletions modules/nf-core/last/train/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,13 @@ output:
type: file
description: Trained parameter file
pattern: "*.train"
- multiqc:
type: file
description: Alignment parameter summary for MultiQC
pattern: "*.tsv"
authors:
- "@aleksandrabliznina"
- "@charles-plessy"
- "@U13bs1125"
maintainers:
- "@charles-plessy"
Loading
Loading