Skip to content

Commit

Permalink
Working all-HBV build
Browse files Browse the repository at this point in the history
Including `augur clades` inference of genotypes derived from inspection
of where genbank-annotated genotypes fell across the tree, which were
quite messy.

Still includes some basal non-human HBV viruses, and a clade of <10
non-human samples which are extremely diverged (either bad alignment
or recombination)
  • Loading branch information
jameshadfield committed Jun 16, 2023
1 parent 5468242 commit 80bdf40
Show file tree
Hide file tree
Showing 11 changed files with 733 additions and 134 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ logs/
results/
build/
auspice/
data/*
data/
defaults/colors.tsv
defaults/colors_*.tsv
stats.json
Expand Down
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,27 @@ Currently a WIP

Based on Katie Kistler's work in [blab/adaptive-evolution](https://github.com/blab/adaptive-evolution)

## Ingest
## Obtain data

1. Obtain `./ingest/genbank_sequences.gb`
- example search: [(Hepatitis B virus) AND (complete genome)](https://www.ncbi.nlm.nih.gov/nuccore/?term=(Hepatitis+B+virus)+AND+(complete+genome)), which returns ~11k samples
- Could also explore `"Hepatitis B virus"[porgn:__txid10407]`
- download as "Complete Record" / "File" / "GenBank (full)", a ~100Mb file.

2. `./ingest/curate_genbank_to_fasta.ipynb`
- produces `./ingest/hepatitisB_all.fasta`

## Ingest

You can run the ingest part of the snakemake pipeline via:

```
snakemake --cores 4 -npf ingest/results/aligned.fasta ingest/results/sequences.fasta ingest/results/metadata.tsv
```


## Phylo

1. `snakemake --cores 4 -pf auspice/hepatitisB_all.json`
For a small ~800-tip dev tree: `snakemake --cores 4 -pf auspice/hbv_dev.json`
For the entire human-HBV tree, with 11k tips (takes ~15min on a 4-core M1 machine): `snakemake --cores 4 -pf auspice/hbv_all.json`

(If you haven't run the ingest section of the pipeline then any phylo build will run it.)

256 changes: 140 additions & 116 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,97 +1,118 @@
# LINEAGES = ['all']
LINEAGES = ['A2']

include: "ingest/ingest.smk"

BUILDS = ["all", "dev"]

# rule all:
# input:
# auspice_json = expand("auspice/hepatitisB_{lineage}.json", lineage=LINEAGES)

rule files:
params:
reference = "config/reference_hepatitisB_{lineage}.gb",
auspice_config = "config/auspice_config_{lineage}.json",
dropped_strains = "config/dropped_strains_hepatitisB_{lineage}.txt"
# rule files:
# params:
# reference = "config/reference_hepatitisB_{lineage}.gb",
# auspice_config = "config/auspice_config_all.json",
# dropped_strains = "config/dropped_strains_hepatitisB_{lineage}.txt"


files = rules.files.params
# files = rules.files.params


rule parse:
message: "Parsing fasta into sequences and metadata"
input:
sequences = 'ingest/hepatitisB_{lineage}.fasta'
output:
sequences = "results/sequences_hepatitisB_{lineage}.fasta",
metadata = "results/metadata_hepatitisB_{lineage}.tsv"
params:
#strain will be accession number
fasta_fields = "strain strain_name date country host genotype subgenotype",
shell:
"""
augur parse \
--sequences {input.sequences} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--fields {params.fasta_fields} \
"""

# rule filter:
# message:
# """
# Filtering to
# - {params.sequences_per_group} sequence(s) per {params.group_by!s}
# - minimum genome length of {params.min_length}
# """
# input:
# sequences = rules.parse.output.sequences,
# metadata = rules.parse.output.metadata,
# exclude = files.dropped_strains,
# include = "config/include.txt", # TESTING ONLY
# output:
# sequences = "results/filtered_hepatitisB_{lineage}.fasta"
# params:
# group_by = "country year",
# sequences_per_group = 50,
# min_length = 3000
# shell:
# """
# augur filter \
# --sequences {input.sequences} \
# --metadata {input.metadata} \
# --exclude {input.exclude} \
# --exclude-all --include {input.include} \
# --output {output.sequences}
# """

# --group-by {params.group_by} \
# --sequences-per-group {params.sequences_per_group} \
# --min-length {params.min_length}

# rule align:
# message:
# """
# Aligning sequences to {input.reference}
# - filling gaps with N
# """
# input:
# sequences = rules.filter.output.sequences,
# reference = files.reference
# threads: 8
# output:
# alignment = "results/aligned_hepatitisB_{lineage}.fasta"
# shell:
# """
# augur align \
# --sequences {input.sequences} \
# --output {output.alignment} \
# --reference-name JN182318 \
# --nthreads {threads} \
# --fill-gaps
# """

# # --reference-sequence {input.reference} \
# #


### TODO XXX
# iqtree writes log files etc to the input path + some suffix


def filter_params(wildcards):
if wildcards.build == "all":
return ""
elif wildcards.build == "dev":
return "--group-by genotype_genbank --subsample-max-sequences 800"
raise Exception("Unknown build parameter")


rule filter:
message:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- minimum genome length of {params.min_length}
"""
input:
sequences = rules.parse.output.sequences,
metadata = rules.parse.output.metadata,
exclude = files.dropped_strains
sequences = "ingest/results/aligned.fasta",
metadata = "ingest/results/metadata.tsv",
include = "config/include.txt",
output:
sequences = "results/filtered_hepatitisB_{lineage}.fasta"
sequences = "results/{build}/filtered.fasta"
params:
group_by = "country year",
sequences_per_group = 50,
min_length = 3000
args = filter_params
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--min-length {params.min_length}
--sequences {input.sequences} --metadata {input.metadata} \
--include {input.include} \
{params.args} \
--output {output.sequences}
"""

rule align:
message:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences = rules.filter.output.sequences,
reference = files.reference
threads: 8
output:
alignment = "results/aligned_hepatitisB_{lineage}.fasta"
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--remove-reference \
--nthreads {threads} \
--fill-gaps
"""

rule tree:
message: "Building tree"
input:
alignment = rules.align.output.alignment
alignment = "results/{build}/filtered.fasta"
output:
tree = "results/tree_raw_hepatitisB_{lineage}.nwk"
tree = "results/{build}/tree_raw.nwk"
threads: 8
shell:
"""
Expand All @@ -108,12 +129,12 @@ rule refine:
NO TIMETREE CURRENTLY
"""
input:
tree = rules.tree.output.tree,
alignment = rules.align.output,
metadata = rules.parse.output.metadata
tree = "results/{build}/tree_raw.nwk",
alignment = "results/{build}/filtered.fasta",
metadata = "ingest/results/metadata.tsv"
output:
tree = "results/tree_hepatitisB_{lineage}.nwk",
node_data = "results/branch_lengths_hepatitisB_{lineage}.json"
tree = "results/{build}/tree.nwk",
node_data = "results/{build}/branch_lengths.json"
params:
coalescent = "opt",
date_inference = "marginal",
Expand All @@ -126,7 +147,7 @@ rule refine:
--metadata {input.metadata} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--keep-root \
--root HQ603073 \
--coalescent {params.coalescent} \
--date-confidence \
--date-inference {params.date_inference} \
Expand All @@ -135,12 +156,11 @@ rule refine:


rule ancestral:
message: "Reconstructing ancestral sequences and mutations"
input:
tree = rules.refine.output.tree,
alignment = rules.align.output
tree = "results/{build}/tree.nwk",
alignment = "results/{build}/filtered.fasta",
output:
node_data = "results/nt_muts_hepatitisB_{lineage}.json"
node_data = "results/{build}/nt_muts.json"
params:
inference = "joint"
shell:
Expand Down Expand Up @@ -170,53 +190,57 @@ rule ancestral:
# """


# rule clades:
# message: "Labeling clades as specified in config/clades.tsv"
# input:
# tree = rules.refine.output.tree,
# aa_muts = rules.translate.output.node_data,
# nuc_muts = rules.ancestral.output.node_data,
# clades = files.clades
# output:
# clade_data = "results/clades_hepatitisB_{lineage}.json"
# shell:
# """
# augur clades \
# --tree {input.tree} \
# --mutations {input.nuc_muts} {input.aa_muts} \
# --clades {input.clades} \
# --output {output.clade_data}
# """

rule clades:
message: "Labeling clades as specified in config/clades.tsv"
input:
tree = "results/{build}/tree.nwk",
# aa_muts = rules.translate.output.node_data,
nuc_muts = "results/{build}/nt_muts.json",
clades = "config/clades-genotypes.tsv"
output:
clade_data = "results/{build}/clades-genotypes.json",
shell:
"""
augur clades \
--tree {input.tree} \
--mutations {input.nuc_muts} \
--membership-name genotype_inferred \
--label-name genotype_inferred \
--clades {input.clades} \
--output {output.clade_data}
"""

rule export:
message: "Exporting data files for for auspice"
input:
tree = rules.refine.output.tree,
metadata = rules.parse.output.metadata,
branch_lengths = rules.refine.output.node_data,
nt_muts = rules.ancestral.output.node_data,
tree = "results/{build}/tree.nwk",
metadata = "ingest/results/metadata.tsv",
branch_lengths = "results/{build}/branch_lengths.json",
nt_muts = "results/{build}/nt_muts.json",
clades = "results/{build}/clades-genotypes.json",
# aa_muts = rules.translate.output.node_data,
auspice_config = files.auspice_config
auspice_config = "config/auspice_config_all.json", ### TODO XXX parameterise when necessary
output:
auspice_json = "auspice/hepatitisB_{lineage}.json"
auspice_json = "auspice/hbv_{build}.json"
shell:
"""
export AUGUR_RECURSION_LIMIT=10000;
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.nt_muts} \
--include-root-sequence \
--node-data {input.branch_lengths} {input.nt_muts} {input.clades} \
--auspice-config {input.auspice_config} \
--include-root-sequence \
--output {output.auspice_json}
--output {output.auspice_json} \
--validation-mode skip
"""

rule clean:
message: "Removing directories: {params}"
params:
"results ",
"auspice"
shell:
"rm -rfv {params}"
## AUGUR BUG TODO XXX
## we skip validation because
## .display_defaults.branch_label "genotype_inferred" failed pattern validation for "^(none|[a-zA-Z0-9]+)$"

# rule clean:
# message: "Removing directories: {params}"
# params:
# "results ",
# "auspice"
# shell:
# "rm -rfv {params}"
Loading

0 comments on commit 80bdf40

Please sign in to comment.