Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gene coverage columns during ingest workflow #36

Merged
merged 15 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions ingest/bin/calculate-gene-converage-from-nextclade-translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#! /usr/bin/env python3

import argparse
from sys import stderr

from Bio import SeqIO
import re


def parse_args():
parser = argparse.ArgumentParser(
description="Calculate gene coverage from amino acid sequence. The output will be in TSV format to stdout.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--fasta",
type=str,
help="FASTA file of CDS translations from Nextclade.",
)
parser.add_argument(
"--out-col",
type=str,
default="gene_coverage",
help="Output column name.",
)
return parser.parse_args()


def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
"""
Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade.
"""
print(f"genbank_accession\t{out_col}")
# Iterate over the sequences in the FASTA file
for record in SeqIO.parse(fasta, "fasta"):
sequence_id = record.id
sequence = str(record.seq)

# Calculate gene coverage
results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper())
j23414 marked this conversation as resolved.
Show resolved Hide resolved
gene_coverage = round(len(results) / len(sequence), 3)

# Print the results
print(f"{sequence_id}\t{gene_coverage}")


def main():
args = parse_args()

calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col)


if __name__ == "__main__":
main()
10 changes: 9 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,12 @@ curate:
nextclade:
min_length: 1000 # E gene length is approximately 1400nt
min_seed_cover: 0.1
nextclade_field: 'nextclade_subtype'
gene: ['E','NS1']
# Nextclade Fields to rename to metadata field names.
field_map:
seqName: genbank_accession # ID field used to merge annotations
clade: nextclade_subtype
alignmentStart: alignmentStart
alignmentEnd: alignmentEnd
coverage: genome_coverage
failedCdses: failedCdses
84 changes: 75 additions & 9 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,23 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)
rule nextclade_denvX:
"""
For each type, classify into the appropriate subtype
1. Capture the alignment
2. Capture the translations of gene(s) of interest

Note: If using --cds-selection, only the thoese genes are reported in the failedCdses column
"""
input:
sequences="results/sequences_{serotype}.fasta",
dataset="../nextclade_data/{serotype}",
output:
nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv",
nextclade_alignment="results/aligned_{serotype}.fasta",
nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]),
threads: 4
params:
min_length=config["nextclade"]["min_length"],
min_seed_cover=config["nextclade"]["min_seed_cover"],
output_translations = lambda wildcards: f"data/translations/{wildcards.serotype}/{{cds}}/seqs.gene.fasta",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS
shell:
Expand All @@ -40,6 +47,8 @@ rule nextclade_denvX:
--min-length {params.min_length} \
--min-seed-cover {params.min_seed_cover} \
--silent \
--output-fasta {output.nextclade_alignment} \
--output-translations {params.output_translations} \
{input.sequences}
"""

Expand All @@ -48,19 +57,19 @@ rule concat_nextclade_subtype_results:
Concatenate all the nextclade results for dengue subtype classification
"""
input:
expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
nextclade_subtypes="results/nextclade_subtypes.tsv",
params:
id_field=config["curate"]["id_field"],
nextclade_field=config["nextclade"]["nextclade_field"],
input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]),
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]),
shell:
"""
echo "{params.id_field},{params.nextclade_field}" \
echo "{params.output_nextclade_fields}" \
| tr ',' '\t' \
> {output.nextclade_subtypes}

tsv-select -H -f "seqName,clade" {input} \
tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \
| awk 'NR>1 {{print}}' \
>> {output.nextclade_subtypes}
"""
Expand All @@ -73,21 +82,78 @@ rule append_nextclade_columns:
metadata="data/metadata_all.tsv",
nextclade_subtypes="results/nextclade_subtypes.tsv",
output:
metadata_all="results/metadata_all.tsv",
metadata_all="data/metadata_nextclade.tsv",
params:
id_field=config["curate"]["id_field"],
nextclade_field=config["nextclade"]["nextclade_field"],
id_field=list(config["nextclade"]["field_map"].values())[0],
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()][1:]),
shell:
"""
tsv-join -H \
--filter-file {input.nextclade_subtypes} \
--key-fields {params.id_field} \
--append-fields {params.nextclade_field} \
--append-fields {params.output_nextclade_fields} \
--write-all ? \
{input.metadata} \
> {output.metadata_all}
"""

rule calculate_gene_coverage:
"""
Calculate the coverage of the gene of interest
"""
input:
nextclade_translation="data/translations/{serotype}/{gene}/seqs.gene.fasta",
output:
gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS,
shell:
"""
python bin/calculate-gene-converage-from-nextclade-translation.py \
--fasta {input.nextclade_translation} \
--out-col {wildcards.gene}_coverage \
> {output.gene_coverage}
"""

rule aggregate_gene_coverage_by_gene:
"""
Aggregate the gene coverage results by gene
"""
input:
gene_coverage=expand("data/translations/{serotype}/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
gene_coverage_all="results/{gene}/gene_coverage_all.tsv",
shell:
"""
tsv-append -H {input.gene_coverage} > {output.gene_coverage_all}
"""

rule append_gene_coverage_columns:
"""
Append the gene coverage results to the metadata
"""
input:
metadata="data/metadata_nextclade.tsv",
gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"])
output:
metadata_all="results/metadata_all.tsv",
params:
id_field=config["curate"]["id_field"],
shell:
"""
cp {input.metadata} {output.metadata_all}
for FILE in {input.gene_coverage}; do
tsv-join -H \
--filter-file $FILE \
--key-fields {params.id_field} \
--append-fields '*_coverage' \
--write-all ? \
{output.metadata_all} \
> results/temp_aggregate_gene_coverage.tsv
mv results/temp_aggregate_gene_coverage.tsv {output.metadata_all}
done
"""

rule split_metadata_by_serotype:
"""
Split the metadata by serotype
Expand Down