Skip to content

Commit

Permalink
Move to stand-alone ingest workflow
Browse files Browse the repository at this point in the history
This includes a stand-alone README and folder structure largely based on
<https://github.com/nextstrain/zika/tree/a91e575bff38f154390c9eb11a44a89abf95a55b>
  • Loading branch information
jameshadfield committed Mar 4, 2024
1 parent 7cc0e1d commit e4aa322
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 38 deletions.
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ reference_accession: "NC_003977"
# Path to the nextclade dataset, which will be used to align all sequences
nextclade_dataset: "nextclade_datasets/references/NC_003977/versions/2023-08-22"

nextclade_binary: "bin/nextclade"
nextclade_binary: "../bin/nextclade"

# technically CDSs, but we use these terms rather interchangeably.
# These should be the entire list of CDSs in the nextclade dataset's genemap
Expand Down
44 changes: 44 additions & 0 deletions ingest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# nextstrain.org/hbv/ingest

This is the ingest pipeline for Hepatitis B (HBV) virus sequences

## Software requirements

Follow the [standard installation instructions](https://docs.nextstrain.org/en/latest/install.html) for Nextstrain's suite of software tools.

## Usage

> NOTE: These command examples assume you are within the `ingest` directory.
```sh
snakemake --cores 4
```

This produces a number of intermediate files in `data/` as well as three files in `results/` for downstream analysis:

- `results/metadata.tsv`
- `results/sequences.fasta`
- `results/aligned.fasta`

## Configuration

Configuration parameters are in `defaults/config.yaml`. These may be overridden by using Snakemake's `--configfile` or `--config` options.

### Environment Variables

None currently required

## Input data

### GenBank data

GenBank sequences and metadata are fetched via a NCBI Entrez query.
As of mid 2024 there are around ~11.5k genomes and the full GenBank file is ~150Mb.


## `ingest/vendored`

This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies of ingest scripts in [ingest/vendored](./vendored), from [nextstrain/ingest](https://github.com/nextstrain/ingest).

See [vendored/README.md](vendored/README.md#vendoring) for instructions on how to update
the vendored scripts.
84 changes: 47 additions & 37 deletions ingest/ingest.smk → ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

rule all:
input:
"results/aligned.fasta",
"results/sequences.fasta",
"results/metadata.tsv",


rule vendor_nextclade3_x86:
"""
Nextclade v3 is unreleased. This repo includes an arch64 version but for GitHub
Expand All @@ -21,13 +31,13 @@ rule vendor_nextclade3_x86:

rule fetch_genbank:
params:
term = 'Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]'
term = config['entrez_query']
output:
genbank = "ingest/data/genbank.gb",
genbank = "data/entrez/genbank.gb",
retries: 1 # Requires snakemake 7.7.0 or later
shell:
"""
ingest/vendored/fetch-from-ncbi-entrez --term {params.term:q} --output {output.genbank}
vendored/fetch-from-ncbi-entrez --term {params.term:q} --output {output.genbank}
"""

rule add_extra_genomes:
Expand All @@ -37,39 +47,39 @@ rule add_extra_genomes:
to add this genome.
"""
input:
entrez = "ingest/data/genbank.gb",
ref = "ingest/config/NC_003977.gb"
entrez = "data/entrez/genbank.gb",
ref = config['reference_genbank'],
output:
genbank = "ingest/data/genbank.extended.gb",
genbank = "data/entrez/genbank.with-reference.gb",
shell:
"""
cat {input.ref:q} {input.entrez:q} > {output.genbank:q}
"""

rule parse_genbank:
input:
genbank = "ingest/data/genbank.extended.gb",
genbank = "data/entrez/genbank.with-reference.gb",
output:
sequences = "ingest/results/genbank.fasta",
metadata = "ingest/results/genbank.tsv",
sequences = "data/genbank.fasta",
metadata = "data/genbank.tsv",
shell:
"""
ingest/scripts/parse-genbank.py --genbank {input.genbank} \
scripts/parse-genbank.py --genbank {input.genbank} \
--output-sequences {output.sequences} --output-metadata {output.metadata}
"""

rule re_circularise:
input:
metadata = "ingest/results/genbank.tsv",
sequences = "ingest/results/genbank.fasta",
metadata = "data/genbank.tsv",
sequences = "data/genbank.fasta",
output:
sequences = "ingest/results/genbank.recircular.fasta",
metadata = "ingest/results/genbank.recircular.tsv",
sequences = "data/genbank.recircular.fasta",
metadata = "data/genbank.recircular.tsv",
params:
reference = config['reference_accession']
shell:
"""
ingest/scripts/re-circularise.py \
scripts/re-circularise.py \
--seqs-in {input.sequences} --meta-in {input.metadata} \
--seqs-out {output.sequences} --meta-out {output.metadata} \
--reference {params.reference}
Expand All @@ -81,13 +91,13 @@ rule align_everything:
Note that the minimum seed match rate is specified in the dataset itself.
"""
input:
sequences = "ingest/results/genbank.recircular.fasta",
metadata = "ingest/results/genbank.recircular.tsv",
sequences = "data/genbank.recircular.fasta",
metadata = "data/genbank.recircular.tsv",
output:
# Note that these outputs require `--output-basename nextclade`
alignment = "ingest/results/nextclade.aligned.fasta",
summary = "ingest/results/nextclade.tsv",
translations = expand("ingest/results/nextclade_gene_{gene}.translation.fasta", gene=config['genes'])
alignment = "data/nextclade/nextclade.aligned.fasta",
summary = "data/nextclade/nextclade.tsv",
translations = expand("data/nextclade/nextclade_gene_{gene}.translation.fasta", gene=config['genes'])
params:
dataset = config['nextclade_dataset'],
nextclade = config['nextclade_binary']
Expand All @@ -97,51 +107,51 @@ rule align_everything:
{params.nextclade} run \
-j {threads} --silent --replace-unknown \
--input-dataset {params.dataset} \
--output-all ingest/results \
--output-all data/nextclade \
--output-basename nextclade \
{input.sequences}
"""

rule join_nextclade_metadata:
input:
metadata = "ingest/results/genbank.recircular.tsv",
nextclade = "ingest/results/nextclade.tsv"
metadata = "data/genbank.recircular.tsv",
nextclade = "data/nextclade/nextclade.tsv"
output:
metadata = "ingest/results/metadata_nextclade.tsv",
summary = "ingest/results/metadata.summary.txt",
metadata = "data/metadata.tsv",
summary = "data/metadata.summary.txt",
shell:
"""
ingest/scripts/join-nextclade-metadata.py \
scripts/join-nextclade-metadata.py \
--metadata {input.metadata} --nextclade {input.nextclade} \
--output {output.metadata} --summary {output.summary}
"""

rule transform_metadata:
input:
metadata = "ingest/results/metadata_nextclade.tsv"
metadata = "data/metadata.tsv"
output:
metadata = "ingest/results/metadata.tsv"
metadata = "results/metadata.tsv"
params:
metadata_columns = ['name', 'accesion', "strain_name", "date", "year", "region", "country", "host", "genotype_genbank", "subgenotype_genbank", \
"circularise", "circularise_shift_bp","clade_nextclade","QC_overall_score","QC_overall_status","total_substitutions","total_deletions", \
"total_insertions","total_frame_shifts","total_missing","alignment_score","coverage","QC_missing_data","QC_mixed_sites","QC_rare_mutations", \
"QC_frame_shifts","QC_stop_codons"]
shell:
"""
ingest/scripts/tsv-to-ndjson.py < {input.metadata} |
ingest/scripts/fix_country_field.py |
ingest/vendored/apply-geolocation-rules --geolocation-rules ingest/config/geoLocationRules.tsv |
ingest/scripts/add-year.py |
ingest/scripts/ndjson-to-tsv.py --metadata-columns {params.metadata_columns} --metadata {output.metadata}
scripts/tsv-to-ndjson.py < {input.metadata} |
scripts/fix_country_field.py |
vendored/apply-geolocation-rules --geolocation-rules defaults/geoLocationRules.tsv |
scripts/add-year.py |
scripts/ndjson-to-tsv.py --metadata-columns {params.metadata_columns} --metadata {output.metadata}
"""

rule copy_ingest_files:
input:
sequences = "ingest/results/genbank.recircular.fasta",
aligned = "ingest/results/nextclade.aligned.fasta"
sequences = "data/genbank.recircular.fasta",
aligned = "data/nextclade/nextclade.aligned.fasta",
output:
sequences = "ingest/results/sequences.fasta",
aligned = "ingest/results/aligned.fasta"
sequences = "results/sequences.fasta",
aligned = "results/aligned.fasta"
shell:
"""
cp {input.sequences} {output.sequences}
Expand Down
File renamed without changes.
18 changes: 18 additions & 0 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run to completion.

# Used to re-circularise genomes. This should be the same as the reference in the nextclade dataset!
# Also force-included for a number of datasets
reference_accession: "NC_003977"
reference_genbank: "defaults/NC_003977.gb"

entrez_query: 'Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]'

# Path to the nextclade dataset, which will be used to align all sequences
nextclade_dataset: "../nextclade_datasets/references/NC_003977/versions/2023-08-22"

nextclade_binary: "../bin/nextclade"

# technically CDSs, but we use these terms rather interchangeably.
# These should be the entire list of CDSs in the nextclade dataset's genemap
genes: ['envS', 'envM', 'envL', 'X', 'pre-capsid', 'capsid', 'pol']
File renamed without changes.

0 comments on commit e4aa322

Please sign in to comment.