Working all-HBV build

Including `augur clades` inference of genotypes derived from inspection of where genbank-annotated genotypes fell across the tree, which were quite messy. Still includes some basal non-human HBV viruses, and a clade of <10 non-human samples which are extremely diverged (either bad alignment or recombination)
nextstrain · Jun 16, 2023 · 80bdf40 · 80bdf40
1 parent 5468242
commit 80bdf40
Show file tree

Hide file tree

Showing 11 changed files with 733 additions and 134 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,7 @@ logs/
 results/
 build/
 auspice/
-data/*
+data/
 defaults/colors.tsv
 defaults/colors_*.tsv
 stats.json

diff --git a/README.md b/README.md
@@ -4,19 +4,27 @@ Currently a WIP
 
 Based on Katie Kistler's work in [blab/adaptive-evolution](https://github.com/blab/adaptive-evolution)
 
-## Ingest
+## Obtain data
 
 1. Obtain `./ingest/genbank_sequences.gb`
     - example search: [(Hepatitis B virus) AND (complete genome)](https://www.ncbi.nlm.nih.gov/nuccore/?term=(Hepatitis+B+virus)+AND+(complete+genome)), which returns ~11k samples
       - Could also explore `"Hepatitis B virus"[porgn:__txid10407]`
     - download as "Complete Record" / "File" / "GenBank (full)", a ~100Mb file.
 
-2. `./ingest/curate_genbank_to_fasta.ipynb`
-    - produces `./ingest/hepatitisB_all.fasta`
+
+## Ingest
+
+You can run the ingest part of the snakemake pipeline via:
+
+```
+snakemake --cores 4 -npf ingest/results/aligned.fasta ingest/results/sequences.fasta ingest/results/metadata.tsv
+```
 
 
 ## Phylo
 
-1. `snakemake --cores 4 -pf auspice/hepatitisB_all.json`
+For a small ~800-tip dev tree: `snakemake --cores 4 -pf auspice/hbv_dev.json`
+For the entire human-HBV tree, with 11k tips (takes ~15min on a 4-core M1 machine): `snakemake --cores 4 -pf auspice/hbv_all.json`
 
+(If you haven't run the ingest section of the pipeline then any phylo build will run it.)
 
diff --git a/Snakefile b/Snakefile
@@ -1,97 +1,118 @@
-# LINEAGES = ['all']
-LINEAGES = ['A2']
+
+include: "ingest/ingest.smk"
+
+BUILDS = ["all", "dev"]
 
 # rule all:
 #     input:
 #         auspice_json = expand("auspice/hepatitisB_{lineage}.json", lineage=LINEAGES)
 
-rule files:
-    params:
-        reference = "config/reference_hepatitisB_{lineage}.gb",
-        auspice_config = "config/auspice_config_{lineage}.json",
-        dropped_strains = "config/dropped_strains_hepatitisB_{lineage}.txt"
+# rule files:
+#     params:
+#         reference = "config/reference_hepatitisB_{lineage}.gb",
+#         auspice_config = "config/auspice_config_all.json",
+#         dropped_strains = "config/dropped_strains_hepatitisB_{lineage}.txt"
 
 
-files = rules.files.params
+# files = rules.files.params
 
 
-rule parse:
-    message: "Parsing fasta into sequences and metadata"
-    input:
-        sequences = 'ingest/hepatitisB_{lineage}.fasta'
-    output:
-        sequences = "results/sequences_hepatitisB_{lineage}.fasta",
-        metadata = "results/metadata_hepatitisB_{lineage}.tsv"
-    params:
-        #strain will be accession number
-        fasta_fields = "strain strain_name date country host genotype subgenotype",
-    shell:
-        """
-        augur parse \
-            --sequences {input.sequences} \
-            --output-sequences {output.sequences} \
-            --output-metadata {output.metadata} \
-            --fields {params.fasta_fields} \
-        """
+
+# rule filter:
+#     message:
+#         """
+#         Filtering to
+#           - {params.sequences_per_group} sequence(s) per {params.group_by!s}
+#           - minimum genome length of {params.min_length}
+#         """
+#     input:
+#         sequences = rules.parse.output.sequences,
+#         metadata = rules.parse.output.metadata,
+#         exclude = files.dropped_strains,
+#         include = "config/include.txt", # TESTING ONLY
+#     output:
+#         sequences = "results/filtered_hepatitisB_{lineage}.fasta"
+#     params:
+#         group_by = "country year",
+#         sequences_per_group = 50,
+#         min_length = 3000
+#     shell:
+#         """
+#         augur filter \
+#             --sequences {input.sequences} \
+#             --metadata {input.metadata} \
+#             --exclude {input.exclude} \
+#             --exclude-all --include {input.include} \
+#             --output {output.sequences}
+#         """
+
+# --group-by {params.group_by} \
+# --sequences-per-group {params.sequences_per_group} \
+# --min-length {params.min_length}
+
+# rule align:
+#     message:
+#         """
+#         Aligning sequences to {input.reference}
+#           - filling gaps with N
+#         """
+#     input:
+#         sequences = rules.filter.output.sequences,
+#         reference = files.reference
+#     threads: 8
+#     output:
+#         alignment = "results/aligned_hepatitisB_{lineage}.fasta"
+#     shell:
+#         """
+#         augur align \
+#             --sequences {input.sequences} \
+#             --output {output.alignment} \
+#             --reference-name JN182318 \
+#             --nthreads {threads} \
+#             --fill-gaps
+#         """
+
+#                   #  --reference-sequence {input.reference} \
+# #
+
+
+### TODO XXX
+# iqtree writes log files etc to the input path + some suffix
+
+
+def filter_params(wildcards):
+    if wildcards.build == "all":
+        return ""
+    elif wildcards.build == "dev":
+        return "--group-by genotype_genbank --subsample-max-sequences 800"
+    raise Exception("Unknown build parameter")
+
 
 rule filter:
-    message:
-        """
-        Filtering to
-          - {params.sequences_per_group} sequence(s) per {params.group_by!s}
-          - minimum genome length of {params.min_length}
-        """
     input:
-        sequences = rules.parse.output.sequences,
-        metadata = rules.parse.output.metadata,
-        exclude = files.dropped_strains
+        sequences = "ingest/results/aligned.fasta",
+        metadata = "ingest/results/metadata.tsv",
+        include = "config/include.txt",
     output:
-        sequences = "results/filtered_hepatitisB_{lineage}.fasta"
+        sequences = "results/{build}/filtered.fasta"
     params:
-        group_by = "country year",
-        sequences_per_group = 50,
-        min_length = 3000
+        args = filter_params
     shell:
         """
         augur filter \
-            --sequences {input.sequences} \
-            --metadata {input.metadata} \
-            --exclude {input.exclude} \
-            --output {output.sequences} \
-            --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
-            --min-length {params.min_length}
+            --sequences {input.sequences} --metadata {input.metadata} \
+            --include {input.include} \
+            {params.args} \
+            --output {output.sequences}
         """
 
-rule align:
-    message:
-        """
-        Aligning sequences to {input.reference}
-          - filling gaps with N
-        """
-    input:
-        sequences = rules.filter.output.sequences,
-        reference = files.reference
-    threads: 8
-    output:
-        alignment = "results/aligned_hepatitisB_{lineage}.fasta"
-    shell:
-        """
-        augur align \
-            --sequences {input.sequences} \
-            --reference-sequence {input.reference} \
-            --output {output.alignment} \
-            --remove-reference \
-            --nthreads {threads} \
-            --fill-gaps
-        """
 
 rule tree:
     message: "Building tree"
     input:
-        alignment = rules.align.output.alignment
+        alignment = "results/{build}/filtered.fasta"
     output:
-        tree = "results/tree_raw_hepatitisB_{lineage}.nwk"
+        tree = "results/{build}/tree_raw.nwk"
     threads: 8
     shell:
         """
@@ -108,12 +129,12 @@ rule refine:
         NO TIMETREE CURRENTLY
         """
     input:
-        tree = rules.tree.output.tree,
-        alignment = rules.align.output,
-        metadata = rules.parse.output.metadata
+        tree = "results/{build}/tree_raw.nwk",
+        alignment = "results/{build}/filtered.fasta",
+        metadata = "ingest/results/metadata.tsv"
     output:
-        tree = "results/tree_hepatitisB_{lineage}.nwk",
-        node_data = "results/branch_lengths_hepatitisB_{lineage}.json"
+        tree = "results/{build}/tree.nwk",
+        node_data = "results/{build}/branch_lengths.json"
     params:
         coalescent = "opt",
         date_inference = "marginal",
@@ -126,7 +147,7 @@ rule refine:
             --metadata {input.metadata} \
             --output-tree {output.tree} \
             --output-node-data {output.node_data} \
-            --keep-root \
+            --root HQ603073 \
             --coalescent {params.coalescent} \
             --date-confidence \
             --date-inference {params.date_inference} \
@@ -135,12 +156,11 @@ rule refine:
 
 
 rule ancestral:
-    message: "Reconstructing ancestral sequences and mutations"
     input:
-        tree = rules.refine.output.tree,
-        alignment = rules.align.output
+        tree = "results/{build}/tree.nwk",
+        alignment = "results/{build}/filtered.fasta",
     output:
-        node_data = "results/nt_muts_hepatitisB_{lineage}.json"
+        node_data = "results/{build}/nt_muts.json"
     params:
         inference = "joint"
     shell:
@@ -170,53 +190,57 @@ rule ancestral:
 #         """
 
 
-# rule clades:
-#     message: "Labeling clades as specified in config/clades.tsv"
-#     input:
-#         tree = rules.refine.output.tree,
-#         aa_muts = rules.translate.output.node_data,
-#         nuc_muts = rules.ancestral.output.node_data,
-#         clades = files.clades
-#     output:
-#         clade_data = "results/clades_hepatitisB_{lineage}.json"
-#     shell:
-#         """
-#         augur clades \
-#             --tree {input.tree} \
-#             --mutations {input.nuc_muts} {input.aa_muts} \
-#             --clades {input.clades} \
-#             --output {output.clade_data}
-#         """
-
+rule clades:
+    message: "Labeling clades as specified in config/clades.tsv"
+    input:
+        tree =  "results/{build}/tree.nwk",
+        # aa_muts = rules.translate.output.node_data,
+        nuc_muts = "results/{build}/nt_muts.json",
+        clades = "config/clades-genotypes.tsv"
+    output:
+        clade_data = "results/{build}/clades-genotypes.json",
+    shell:
+        """
+        augur clades \
+            --tree {input.tree} \
+            --mutations {input.nuc_muts} \
+            --membership-name genotype_inferred \
+            --label-name genotype_inferred \
+            --clades {input.clades} \
+            --output {output.clade_data}
+        """
 
 rule export:
-    message: "Exporting data files for for auspice"
     input:
-        tree = rules.refine.output.tree,
-        metadata = rules.parse.output.metadata,
-        branch_lengths = rules.refine.output.node_data,
-        nt_muts = rules.ancestral.output.node_data,
+        tree = "results/{build}/tree.nwk",
+        metadata = "ingest/results/metadata.tsv",
+        branch_lengths = "results/{build}/branch_lengths.json",
+        nt_muts = "results/{build}/nt_muts.json",
+        clades = "results/{build}/clades-genotypes.json",
         # aa_muts = rules.translate.output.node_data,
-        auspice_config = files.auspice_config
+        auspice_config = "config/auspice_config_all.json", ### TODO XXX parameterise when necessary
     output:
-        auspice_json = "auspice/hepatitisB_{lineage}.json"
+        auspice_json = "auspice/hbv_{build}.json"
     shell:
         """
         export AUGUR_RECURSION_LIMIT=10000;
         augur export v2 \
             --tree {input.tree} \
             --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.nt_muts} \
-            --include-root-sequence \
+            --node-data {input.branch_lengths} {input.nt_muts} {input.clades} \
             --auspice-config {input.auspice_config} \
-            --include-root-sequence \
-            --output {output.auspice_json}
+            --output {output.auspice_json} \
+            --validation-mode skip
         """
 
-rule clean:
-    message: "Removing directories: {params}"
-    params:
-        "results ",
-        "auspice"
-    shell:
-        "rm -rfv {params}"
+## AUGUR BUG TODO XXX
+## we skip validation because
+##   .display_defaults.branch_label "genotype_inferred" failed pattern validation for "^(none|[a-zA-Z0-9]+)$"
+
+# rule clean:
+#     message: "Removing directories: {params}"
+#     params:
+#         "results ",
+#         "auspice"
+#     shell:
+#         "rm -rfv {params}"