Update nextclade dataset part 2/2

nextstrain · Aug 22, 2023 · 4d9bbb8 · 4d9bbb8
1 parent 1240f5c
commit 4d9bbb8
Show file tree

Hide file tree

Showing 8 changed files with 272,909 additions and 150 deletions.
diff --git a/Snakefile b/Snakefile
@@ -3,16 +3,6 @@ include: "ingest/ingest.smk"
 
 BUILDS = ["all", "dev", "nextclade-tree"]
 
-GENOTYPES = ["A", "B", "C", "D"]
-
-ROOT = {
-    "all": "HQ603073", # NHP-HBV isolate
-    # genotype roots chosen by examining the entire tree and picking a suitably close isolate
-    "A": "MK534669", # root is genotype I (I is A/C/G recombinant)
-    "B": "MK534669", # root is genotype I (I is A/C/G recombinant)
-    "C": "MK534669", # root is genotype I (I is A/C/G recombinant)
-    "D": "KX186584", # root is genotype E
-}
 
 # rule all:
 #     input:
@@ -87,19 +77,27 @@ ROOT = {
 # #
 
 def get_root(wildcards):
-    if wildcards.build in ROOT:
-        return ROOT[wildcards.build]
-    return ROOT['all']
+    if wildcards.build in config['roots']:
+        return config['roots'][wildcards.build]
+    return config['roots']['all']
 
+## For "all-HBV" like builds - i.e. those which don't downsample to a single genotype
+## We deliberatly include some outliers so we can prune out non-human sequences
+## And we include the reference genome
 rule include_file:
     output:
         file = "results/{build}/include.txt",
     params:
-        root = get_root
-    shell:
-        """
-        echo {params.root:q} > {output.file}
-        """
+        root = get_root,
+        outgroups = lambda w: config['outgroups'] if w.build in BUILDS else [],
+        ref = lambda w: [config['reference_accession']] if w.build in BUILDS else [],
+    run:
+        with open(output[0], 'w') as fh:
+            print(params[0], file=fh)
+            for name in params[1]:
+                print(name, file=fh)
+            for name in params[2]:
+                print(name, file=fh)
 
 def filter_params(wildcards):
     if wildcards.build == "all":
@@ -110,7 +108,7 @@ def filter_params(wildcards):
         return "--group-by genotype_genbank --subsample-max-sequences 2000"
     elif wildcards.build == "nextclade-sequences":
         return "--group-by genotype_genbank --subsample-max-sequences 25"
-    elif wildcards.build in GENOTYPES:
+    elif wildcards.build in config['genotypes']:
         if wildcards.build == "C":
             query = f"--query \"(clade_nextclade=='C') | (clade_nextclade=='C_re')\""
         else:
@@ -187,58 +185,45 @@ rule refine:
             --output-tree {output.tree} --output-node-data {output.node_data}
         """
 
-
-## see https://github.com/nextstrain/augur/issues/340#issuecomment-545184212
 rule prune_outgroup:
     input:
         tree = "results/{build}/tree.refined.nwk"
     output:
         tree = "results/{build}/tree.nwk"
     params:
-        root = get_root
-    run:
-        from Bio import Phylo
-        T = Phylo.read(input[0], "newick")
-        outgroup = [c for c in T.find_clades() if str(c.name) == params[0]][0]
-        T.prune(outgroup)
-        Phylo.write(T, output[0], "newick")
+        outgroups = lambda w: [get_root(w), *(config['outgroups'] if w.build in BUILDS else [])]
+    shell:
+        """
+        python scripts/remove_outgroups.py --names {params.outgroups} --tree {input.tree} --output {output.tree}
+        """
 
 
 rule ancestral:
     input:
         tree = "results/{build}/tree.nwk",
         alignment = "results/{build}/filtered.fasta",
+        translations = rules.align_everything.output.translations,
     output:
-        node_data = "results/{build}/nt_muts.json"
+        node_data = "results/{build}/mutations.json",
+        translations = expand("results/{{build}}/mutations_{gene}.translation.fasta", gene=config['genes'])
     params:
-        inference = "joint"
+        inference = "joint",
+        genes = config['genes'],
+        annotation = config['temporary_genemap_for_augur_ancestral'],
+        translation_pattern = 'results/{build}/mutations_%GENE.translation.fasta'
     shell:
         """
         augur ancestral \
             --tree {input.tree} \
             --alignment {input.alignment} \
+            --inference {params.inference} \
+            --annotation {params.annotation} \
+            --translations ingest/results/nextclade_gene_%GENE.translation.fasta \
+            --genes {params.genes} \
             --output-node-data {output.node_data} \
-            --inference {params.inference}
+            --output-translations {params.translation_pattern}
         """
 
-# rule translate:
-#     message: "Translating amino acid sequences"
-#     input:
-#         tree = rules.refine.output.tree,
-#         node_data = rules.ancestral.output.node_data,
-#         reference = files.reference
-#     output:
-#         node_data = "results/aa_muts_hepatitisB_{lineage}.json"
-#     shell:
-#         """
-#         augur translate \
-#             --tree {input.tree} \
-#             --ancestral-sequences {input.node_data} \
-#             --reference-sequence {input.reference} \
-#             --output {output.node_data} \
-#         """
-
-
 ## Clades are hard to predict using mutational signatures for HBV due to the amount
 ## of recombination and the inherit difference in tree reconstruction.
 ## They are only run for the nextclade dataset build, which we manually check
@@ -247,7 +232,7 @@ rule clades:
     input:
         tree =  "results/{build}/tree.nwk",
         # aa_muts = rules.translate.output.node_data,
-        nuc_muts = "results/{build}/nt_muts.json",
+        nuc_muts = "results/{build}/mutations.json",
         clades = "config/clades-genotypes.tsv"
     output:
         clade_data = "results/{build}/clades-genotypes.json",
@@ -262,13 +247,43 @@ rule clades:
             --output {output.clade_data}
         """
 
+# make sure all differences between the alignment reference and the root are attached as mutations to the root.
+# This is almost possible via `augur ancestral --root-sequence` however that functionality does not work for
+# CDSs which wrap the origin.
+rule attach_root_mutations:
+    input:
+        mutations = "results/{build}/mutations.json",
+        tree = "results/{build}/tree.nwk",
+        translations = expand("results/{{build}}/mutations_{gene}.translation.fasta", gene=config['genes'])
+    output:
+        mutations = "results/{build}/mutations.reference-mutations-on-root.json",
+    params:
+        genes = config['genes'],
+        reference = config['reference_accession'],
+        translation_pattern = 'results/{build}/mutations_%GENE.translation.fasta',
+    shell:
+        """
+        python3 scripts/attach_root_mutations.py \
+            --tree {input.tree} \
+            --reference-name {params.reference} \
+            --translations {params.translation_pattern} \
+            --genes {params.genes} \
+            --mutations-in {input.mutations} \
+            --mutations-out {output.mutations}
+        """
+
 def node_data_files(wildcards):
     patterns = [
         "results/{build}/branch_lengths.json",
-        "results/{build}/nt_muts.json",
-        ## TODO XXX translated AA json
     ]
 
+    # For nextclade dataset trees we need to annotate all differences between the root & the nextclade reference
+    # But for everything else we don't want this!
+    if wildcards.build == "nextclade-tree":
+        patterns.append("results/{build}/mutations.reference-mutations-on-root.json")
+    else:
+        patterns.append("results/{build}/mutations.json")
+
     # Only infer genotypes via `augur clades` for manually curated nextclade dataset builds
     if wildcards.build == "nextclade-tree" or wildcards.build == "dev":
         patterns.append("results/{build}/clades-genotypes.json")

diff --git a/config/clades-genotypes.tsv b/config/clades-genotypes.tsv
@@ -1,40 +1,29 @@
 clade	gene	site	alt
 
-A	nuc	667	T
-A	nuc	2044	C
-A	nuc	2996	T
+A	nuc	2551	G
 
 # Note that B is a subclade of C in the full 11k tree
-B	nuc	841	C
-B	nuc	3072	A
+B	nuc	410	G
 
-C	nuc	346	T
-C	nuc	791	T
-C	nuc	1135	C
-C	nuc	2747	G
+C	nuc	117	A
+C	nuc	2981	A
+C	nuc	484	C
 
-C_re	nuc	287	A
-C_re	nuc	957	G
+C_re	nuc	45	A
+C_re	nuc	1043	A
+C_re	nuc	2136	C
 
-D	nuc	724	T
-D	nuc	1059	C
+D	nuc	113	C
+D	nuc	2689	G
 
-E	nuc	346	T
-E	nuc	505	T
-E	nuc	1060	C
+E	nuc	520	C
 
-F	nuc	572	T
-F	nuc	909	A
-F	nuc	1332	A
-F	nuc	1700	A
-F	nuc	2979	C
+F	nuc	1694	T
 
-G	nuc	1008	G
-G	nuc	2038	T
+G	nuc	1953	G
+G	nuc	2494	G
+G	nuc	2770	G
 
-H	nuc	529	C
-H	nuc	1467	T
+H	nuc	2619	G
 
-I	nuc	181	C
-I	nuc	852	G
-I	nuc	1000	T
+I	nuc	1052	G
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,10 +1,33 @@
 
 
 # Used to re-circularise genomes. This should be the same as the reference in the nextclade dataset!
+# Also force-included for a number of datasets
 reference_accession: "NC_003977"
 
-
 # Path to the nextclade dataset, which will be used to align all sequences
 nextclade_dataset: "nextclade_datasets/references/NC_003977/versions/2023-08-22"
 
-nextclade_binary: "bin/nextclade"
+nextclade_binary: "bin/nextclade"
+
+# technically CDSs, but we use these terms rather interchangeably.
+# These should be the entire list of CDSs in the nextclade dataset's genemap
+genes: ['envS', 'envM', 'envL', 'X', 'pre-capsid', 'capsid', 'pol']
+
+# Augur cannot parse the (correct) nextclade dataset genemap, so we make a temporary one
+# Luckily, while many CDSs wrap, `augur ancestral` does the right thing (perhaps by chance not design)
+temporary_genemap_for_augur_ancestral: "config/temp_genemap.gff"
+
+genotypes: ["A", "B", "C", "D"]
+
+roots: {
+    "all": "HQ603073", # NHP-HBV isolate to root the tree
+    # genotype roots chosen by examining the entire tree and picking a suitably close isolate
+    "A": "MK534669", # root is genotype I (I is A/C/G recombinant)
+    "B": "MK534669", # root is genotype I (I is A/C/G recombinant)
+    "C": "MK534669", # root is genotype I (I is A/C/G recombinant)
+    "D": "KX186584", # root is genotype E
+}
+
+# candidate non-human sequences which fall in outgroups (the entire outgroup will be pruned)
+outgroups: ["FM209514", "HQ603059", "FJ798097", "HQ603068"] # "HQ603059", "FJ798097", "AY330914", "AY781182", "AB823660", "AY330914", "FJ798098"]
+
diff --git a/config/temp_genemap.gff b/config/temp_genemap.gff
@@ -0,0 +1,12 @@
+##gff-version 3
+##Created by Nextstrain (james hadfield) for use with `augur ancestral` which can only parse a very specific format of GFF files
+##sequence-region NC_003977.2 1 3182
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10407
+NC_003977.2	RefSeq	region	1	3182	.	+	.	ID=NC_003977.2:1..3182;Dbxref=taxon:10407;Is_circular=true;gbkey=Src;genome=genomic;mol_type=genomic DNA;strain=ayw
+NC_003977.2	RefSeq	gene	1376	1840	.	+	0	Name=X;gene=X
+NC_003977.2	RefSeq	gene	1816	2454	.	+	0	Name=pre-capsid;gene=pre-capsid
+NC_003977.2	RefSeq	gene	1903	2454	.	+	0	Name=capsid;gene=capsid
+NC_003977.2	RefSeq	gene	2309	4807	.	+	0	Name=pol;gene=pol
+NC_003977.2	RefSeq	gene	2850	4019	.	+	0	Name=envL;gene=envL
+NC_003977.2	RefSeq	gene	3174	4019	.	+	0	Name=envM;gene=envM
+NC_003977.2	RefSeq	gene	157	837	.	+	0	Name=envS;gene=envS
diff --git a/ingest/ingest.smk b/ingest/ingest.smk
@@ -65,7 +65,8 @@ rule align_everything:
     output:
         # Note that these outputs require `--output-basename nextclade`
         alignment = "ingest/results/nextclade.aligned.fasta",
-        summary = "ingest/results/nextclade.tsv"
+        summary = "ingest/results/nextclade.tsv",
+        translations = expand("ingest/results/nextclade_gene_{gene}.translation.fasta", gene=config['genes'])
     params:
         dataset = config['nextclade_dataset'],
         nextclade = config['nextclade_binary']