Skip to content

Commit

Permalink
Update nextclade dataset part 2/2
Browse files Browse the repository at this point in the history
  • Loading branch information
jameshadfield committed Aug 22, 2023
1 parent 1240f5c commit 4d9bbb8
Show file tree
Hide file tree
Showing 8 changed files with 272,909 additions and 150 deletions.
119 changes: 67 additions & 52 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,6 @@ include: "ingest/ingest.smk"

BUILDS = ["all", "dev", "nextclade-tree"]

GENOTYPES = ["A", "B", "C", "D"]

ROOT = {
"all": "HQ603073", # NHP-HBV isolate
# genotype roots chosen by examining the entire tree and picking a suitably close isolate
"A": "MK534669", # root is genotype I (I is A/C/G recombinant)
"B": "MK534669", # root is genotype I (I is A/C/G recombinant)
"C": "MK534669", # root is genotype I (I is A/C/G recombinant)
"D": "KX186584", # root is genotype E
}

# rule all:
# input:
Expand Down Expand Up @@ -87,19 +77,27 @@ ROOT = {
# #

def get_root(wildcards):
if wildcards.build in ROOT:
return ROOT[wildcards.build]
return ROOT['all']
if wildcards.build in config['roots']:
return config['roots'][wildcards.build]
return config['roots']['all']

## For "all-HBV" like builds - i.e. those which don't downsample to a single genotype
## We deliberatly include some outliers so we can prune out non-human sequences
## And we include the reference genome
rule include_file:
output:
file = "results/{build}/include.txt",
params:
root = get_root
shell:
"""
echo {params.root:q} > {output.file}
"""
root = get_root,
outgroups = lambda w: config['outgroups'] if w.build in BUILDS else [],
ref = lambda w: [config['reference_accession']] if w.build in BUILDS else [],
run:
with open(output[0], 'w') as fh:
print(params[0], file=fh)
for name in params[1]:
print(name, file=fh)
for name in params[2]:
print(name, file=fh)

def filter_params(wildcards):
if wildcards.build == "all":
Expand All @@ -110,7 +108,7 @@ def filter_params(wildcards):
return "--group-by genotype_genbank --subsample-max-sequences 2000"
elif wildcards.build == "nextclade-sequences":
return "--group-by genotype_genbank --subsample-max-sequences 25"
elif wildcards.build in GENOTYPES:
elif wildcards.build in config['genotypes']:
if wildcards.build == "C":
query = f"--query \"(clade_nextclade=='C') | (clade_nextclade=='C_re')\""
else:
Expand Down Expand Up @@ -187,58 +185,45 @@ rule refine:
--output-tree {output.tree} --output-node-data {output.node_data}
"""


## see https://github.com/nextstrain/augur/issues/340#issuecomment-545184212
rule prune_outgroup:
input:
tree = "results/{build}/tree.refined.nwk"
output:
tree = "results/{build}/tree.nwk"
params:
root = get_root
run:
from Bio import Phylo
T = Phylo.read(input[0], "newick")
outgroup = [c for c in T.find_clades() if str(c.name) == params[0]][0]
T.prune(outgroup)
Phylo.write(T, output[0], "newick")
outgroups = lambda w: [get_root(w), *(config['outgroups'] if w.build in BUILDS else [])]
shell:
"""
python scripts/remove_outgroups.py --names {params.outgroups} --tree {input.tree} --output {output.tree}
"""


rule ancestral:
input:
tree = "results/{build}/tree.nwk",
alignment = "results/{build}/filtered.fasta",
translations = rules.align_everything.output.translations,
output:
node_data = "results/{build}/nt_muts.json"
node_data = "results/{build}/mutations.json",
translations = expand("results/{{build}}/mutations_{gene}.translation.fasta", gene=config['genes'])
params:
inference = "joint"
inference = "joint",
genes = config['genes'],
annotation = config['temporary_genemap_for_augur_ancestral'],
translation_pattern = 'results/{build}/mutations_%GENE.translation.fasta'
shell:
"""
augur ancestral \
--tree {input.tree} \
--alignment {input.alignment} \
--inference {params.inference} \
--annotation {params.annotation} \
--translations ingest/results/nextclade_gene_%GENE.translation.fasta \
--genes {params.genes} \
--output-node-data {output.node_data} \
--inference {params.inference}
--output-translations {params.translation_pattern}
"""

# rule translate:
# message: "Translating amino acid sequences"
# input:
# tree = rules.refine.output.tree,
# node_data = rules.ancestral.output.node_data,
# reference = files.reference
# output:
# node_data = "results/aa_muts_hepatitisB_{lineage}.json"
# shell:
# """
# augur translate \
# --tree {input.tree} \
# --ancestral-sequences {input.node_data} \
# --reference-sequence {input.reference} \
# --output {output.node_data} \
# """


## Clades are hard to predict using mutational signatures for HBV due to the amount
## of recombination and the inherit difference in tree reconstruction.
## They are only run for the nextclade dataset build, which we manually check
Expand All @@ -247,7 +232,7 @@ rule clades:
input:
tree = "results/{build}/tree.nwk",
# aa_muts = rules.translate.output.node_data,
nuc_muts = "results/{build}/nt_muts.json",
nuc_muts = "results/{build}/mutations.json",
clades = "config/clades-genotypes.tsv"
output:
clade_data = "results/{build}/clades-genotypes.json",
Expand All @@ -262,13 +247,43 @@ rule clades:
--output {output.clade_data}
"""

# make sure all differences between the alignment reference and the root are attached as mutations to the root.
# This is almost possible via `augur ancestral --root-sequence` however that functionality does not work for
# CDSs which wrap the origin.
rule attach_root_mutations:
input:
mutations = "results/{build}/mutations.json",
tree = "results/{build}/tree.nwk",
translations = expand("results/{{build}}/mutations_{gene}.translation.fasta", gene=config['genes'])
output:
mutations = "results/{build}/mutations.reference-mutations-on-root.json",
params:
genes = config['genes'],
reference = config['reference_accession'],
translation_pattern = 'results/{build}/mutations_%GENE.translation.fasta',
shell:
"""
python3 scripts/attach_root_mutations.py \
--tree {input.tree} \
--reference-name {params.reference} \
--translations {params.translation_pattern} \
--genes {params.genes} \
--mutations-in {input.mutations} \
--mutations-out {output.mutations}
"""

def node_data_files(wildcards):
patterns = [
"results/{build}/branch_lengths.json",
"results/{build}/nt_muts.json",
## TODO XXX translated AA json
]

# For nextclade dataset trees we need to annotate all differences between the root & the nextclade reference
# But for everything else we don't want this!
if wildcards.build == "nextclade-tree":
patterns.append("results/{build}/mutations.reference-mutations-on-root.json")
else:
patterns.append("results/{build}/mutations.json")

# Only infer genotypes via `augur clades` for manually curated nextclade dataset builds
if wildcards.build == "nextclade-tree" or wildcards.build == "dev":
patterns.append("results/{build}/clades-genotypes.json")
Expand Down
45 changes: 17 additions & 28 deletions config/clades-genotypes.tsv
Original file line number Diff line number Diff line change
@@ -1,40 +1,29 @@
clade gene site alt

A nuc 667 T
A nuc 2044 C
A nuc 2996 T
A nuc 2551 G

# Note that B is a subclade of C in the full 11k tree
B nuc 841 C
B nuc 3072 A
B nuc 410 G

C nuc 346 T
C nuc 791 T
C nuc 1135 C
C nuc 2747 G
C nuc 117 A
C nuc 2981 A
C nuc 484 C

C_re nuc 287 A
C_re nuc 957 G
C_re nuc 45 A
C_re nuc 1043 A
C_re nuc 2136 C

D nuc 724 T
D nuc 1059 C
D nuc 113 C
D nuc 2689 G

E nuc 346 T
E nuc 505 T
E nuc 1060 C
E nuc 520 C

F nuc 572 T
F nuc 909 A
F nuc 1332 A
F nuc 1700 A
F nuc 2979 C
F nuc 1694 T

G nuc 1008 G
G nuc 2038 T
G nuc 1953 G
G nuc 2494 G
G nuc 2770 G

H nuc 529 C
H nuc 1467 T
H nuc 2619 G

I nuc 181 C
I nuc 852 G
I nuc 1000 T
I nuc 1052 G
27 changes: 25 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@


# Used to re-circularise genomes. This should be the same as the reference in the nextclade dataset!
# Also force-included for a number of datasets
reference_accession: "NC_003977"


# Path to the nextclade dataset, which will be used to align all sequences
nextclade_dataset: "nextclade_datasets/references/NC_003977/versions/2023-08-22"

nextclade_binary: "bin/nextclade"
nextclade_binary: "bin/nextclade"

# technically CDSs, but we use these terms rather interchangeably.
# These should be the entire list of CDSs in the nextclade dataset's genemap
genes: ['envS', 'envM', 'envL', 'X', 'pre-capsid', 'capsid', 'pol']

# Augur cannot parse the (correct) nextclade dataset genemap, so we make a temporary one
# Luckily, while many CDSs wrap, `augur ancestral` does the right thing (perhaps by chance not design)
temporary_genemap_for_augur_ancestral: "config/temp_genemap.gff"

genotypes: ["A", "B", "C", "D"]

roots: {
"all": "HQ603073", # NHP-HBV isolate to root the tree
# genotype roots chosen by examining the entire tree and picking a suitably close isolate
"A": "MK534669", # root is genotype I (I is A/C/G recombinant)
"B": "MK534669", # root is genotype I (I is A/C/G recombinant)
"C": "MK534669", # root is genotype I (I is A/C/G recombinant)
"D": "KX186584", # root is genotype E
}

# candidate non-human sequences which fall in outgroups (the entire outgroup will be pruned)
outgroups: ["FM209514", "HQ603059", "FJ798097", "HQ603068"] # "HQ603059", "FJ798097", "AY330914", "AY781182", "AB823660", "AY330914", "FJ798098"]

12 changes: 12 additions & 0 deletions config/temp_genemap.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##gff-version 3
##Created by Nextstrain (james hadfield) for use with `augur ancestral` which can only parse a very specific format of GFF files
##sequence-region NC_003977.2 1 3182
##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10407
NC_003977.2 RefSeq region 1 3182 . + . ID=NC_003977.2:1..3182;Dbxref=taxon:10407;Is_circular=true;gbkey=Src;genome=genomic;mol_type=genomic DNA;strain=ayw
NC_003977.2 RefSeq gene 1376 1840 . + 0 Name=X;gene=X
NC_003977.2 RefSeq gene 1816 2454 . + 0 Name=pre-capsid;gene=pre-capsid
NC_003977.2 RefSeq gene 1903 2454 . + 0 Name=capsid;gene=capsid
NC_003977.2 RefSeq gene 2309 4807 . + 0 Name=pol;gene=pol
NC_003977.2 RefSeq gene 2850 4019 . + 0 Name=envL;gene=envL
NC_003977.2 RefSeq gene 3174 4019 . + 0 Name=envM;gene=envM
NC_003977.2 RefSeq gene 157 837 . + 0 Name=envS;gene=envS
3 changes: 2 additions & 1 deletion ingest/ingest.smk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ rule align_everything:
output:
# Note that these outputs require `--output-basename nextclade`
alignment = "ingest/results/nextclade.aligned.fasta",
summary = "ingest/results/nextclade.tsv"
summary = "ingest/results/nextclade.tsv",
translations = expand("ingest/results/nextclade_gene_{gene}.translation.fasta", gene=config['genes'])
params:
dataset = config['nextclade_dataset'],
nextclade = config['nextclade_binary']
Expand Down
Loading

0 comments on commit 4d9bbb8

Please sign in to comment.