Move to stand-alone ingest workflow

This includes a stand-alone README and folder structure largely based on <https://github.com/nextstrain/zika/tree/a91e575bff38f154390c9eb11a44a89abf95a55b>
nextstrain · Mar 4, 2024 · e4aa322 · e4aa322
1 parent 7cc0e1d
commit e4aa322
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 38 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -7,7 +7,7 @@ reference_accession: "NC_003977"
 # Path to the nextclade dataset, which will be used to align all sequences
 nextclade_dataset: "nextclade_datasets/references/NC_003977/versions/2023-08-22"
 
-nextclade_binary: "bin/nextclade"
+nextclade_binary: "../bin/nextclade"
 
 # technically CDSs, but we use these terms rather interchangeably.
 # These should be the entire list of CDSs in the nextclade dataset's genemap

diff --git a/ingest/README.md b/ingest/README.md
@@ -0,0 +1,44 @@
+# nextstrain.org/hbv/ingest
+
+This is the ingest pipeline for Hepatitis B (HBV) virus sequences
+
+## Software requirements
+
+Follow the [standard installation instructions](https://docs.nextstrain.org/en/latest/install.html) for Nextstrain's suite of software tools.
+
+## Usage
+
+> NOTE: These command examples assume you are within the `ingest` directory.
+
+```sh
+snakemake --cores 4
+```
+
+This produces a number of intermediate files in `data/` as well as three files in `results/` for downstream analysis:
+
+- `results/metadata.tsv`
+- `results/sequences.fasta`
+- `results/aligned.fasta`
+
+## Configuration
+
+Configuration parameters are in `defaults/config.yaml`. These may be overridden by using Snakemake's `--configfile` or `--config` options.
+
+### Environment Variables
+
+None currently required
+
+## Input data
+
+### GenBank data
+
+GenBank sequences and metadata are fetched via a NCBI Entrez query.
+As of mid 2024 there are around ~11.5k genomes and the full GenBank file is ~150Mb.
+
+
+## `ingest/vendored`
+
+This repository uses [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies of ingest scripts in [ingest/vendored](./vendored), from [nextstrain/ingest](https://github.com/nextstrain/ingest).
+
+See [vendored/README.md](vendored/README.md#vendoring) for instructions on how to update
+the vendored scripts.
diff --git a/ingest/ingest.smk → ingest/Snakefile b/ingest/ingest.smk → ingest/Snakefile
@@ -1,3 +1,13 @@
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "defaults/config.yaml"
+
+rule all:
+    input:
+        "results/aligned.fasta",
+        "results/sequences.fasta",
+        "results/metadata.tsv",
+
+
 rule vendor_nextclade3_x86:
     """
     Nextclade v3 is unreleased. This repo includes an arch64 version but for GitHub
@@ -21,13 +31,13 @@ rule vendor_nextclade3_x86:
 
 rule fetch_genbank:
     params:
-        term = 'Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]'
+        term = config['entrez_query']
     output:
-        genbank = "ingest/data/genbank.gb",
+        genbank = "data/entrez/genbank.gb",
     retries: 1  # Requires snakemake 7.7.0 or later
     shell:
         """
-        ingest/vendored/fetch-from-ncbi-entrez --term {params.term:q} --output {output.genbank}
+        vendored/fetch-from-ncbi-entrez --term {params.term:q} --output {output.genbank}
         """
 
 rule add_extra_genomes:
@@ -37,39 +47,39 @@ rule add_extra_genomes:
     to add this genome.
     """
     input:
-        entrez = "ingest/data/genbank.gb",
-        ref = "ingest/config/NC_003977.gb"
+        entrez = "data/entrez/genbank.gb",
+        ref = config['reference_genbank'],
     output:
-        genbank = "ingest/data/genbank.extended.gb",
+        genbank = "data/entrez/genbank.with-reference.gb",
     shell:
         """
         cat {input.ref:q} {input.entrez:q} > {output.genbank:q}
         """
 
 rule parse_genbank:
     input:
-        genbank = "ingest/data/genbank.extended.gb",
+        genbank = "data/entrez/genbank.with-reference.gb",
     output:
-        sequences = "ingest/results/genbank.fasta",
-        metadata = "ingest/results/genbank.tsv",
+        sequences = "data/genbank.fasta",
+        metadata = "data/genbank.tsv",
     shell:
         """
-        ingest/scripts/parse-genbank.py --genbank {input.genbank} \
+        scripts/parse-genbank.py --genbank {input.genbank} \
             --output-sequences {output.sequences} --output-metadata {output.metadata}
         """
 
 rule re_circularise:
     input:
-        metadata = "ingest/results/genbank.tsv",
-        sequences = "ingest/results/genbank.fasta",
+        metadata = "data/genbank.tsv",
+        sequences = "data/genbank.fasta",
     output:
-        sequences = "ingest/results/genbank.recircular.fasta",
-        metadata = "ingest/results/genbank.recircular.tsv",
+        sequences = "data/genbank.recircular.fasta",
+        metadata = "data/genbank.recircular.tsv",
     params:
         reference = config['reference_accession']
     shell:
         """
-        ingest/scripts/re-circularise.py \
+        scripts/re-circularise.py \
             --seqs-in {input.sequences} --meta-in {input.metadata} \
             --seqs-out {output.sequences} --meta-out {output.metadata} \
             --reference {params.reference}
@@ -81,13 +91,13 @@ rule align_everything:
     Note that the minimum seed match rate is specified in the dataset itself.
     """
     input:
-        sequences = "ingest/results/genbank.recircular.fasta",
-        metadata = "ingest/results/genbank.recircular.tsv",
+        sequences = "data/genbank.recircular.fasta",
+        metadata = "data/genbank.recircular.tsv",
     output:
         # Note that these outputs require `--output-basename nextclade`
-        alignment = "ingest/results/nextclade.aligned.fasta",
-        summary = "ingest/results/nextclade.tsv",
-        translations = expand("ingest/results/nextclade_gene_{gene}.translation.fasta", gene=config['genes'])
+        alignment = "data/nextclade/nextclade.aligned.fasta",
+        summary = "data/nextclade/nextclade.tsv",
+        translations = expand("data/nextclade/nextclade_gene_{gene}.translation.fasta", gene=config['genes'])
     params:
         dataset = config['nextclade_dataset'],
         nextclade = config['nextclade_binary']
@@ -97,51 +107,51 @@ rule align_everything:
         {params.nextclade} run \
             -j {threads} --silent --replace-unknown \
             --input-dataset {params.dataset} \
-            --output-all ingest/results \
+            --output-all data/nextclade \
             --output-basename nextclade \
             {input.sequences}
         """
 
 rule join_nextclade_metadata:
     input:
-        metadata = "ingest/results/genbank.recircular.tsv",
-        nextclade = "ingest/results/nextclade.tsv"
+        metadata = "data/genbank.recircular.tsv",
+        nextclade = "data/nextclade/nextclade.tsv"
     output:
-        metadata = "ingest/results/metadata_nextclade.tsv",
-        summary = "ingest/results/metadata.summary.txt",
+        metadata = "data/metadata.tsv",
+        summary = "data/metadata.summary.txt",
     shell:
         """
-        ingest/scripts/join-nextclade-metadata.py \
+        scripts/join-nextclade-metadata.py \
              --metadata {input.metadata} --nextclade {input.nextclade} \
              --output {output.metadata} --summary {output.summary}
         """
 
 rule transform_metadata:
     input:
-        metadata = "ingest/results/metadata_nextclade.tsv"
+        metadata = "data/metadata.tsv"
     output:
-        metadata = "ingest/results/metadata.tsv"
+        metadata = "results/metadata.tsv"
     params:
         metadata_columns = ['name', 'accesion', "strain_name", "date", "year", "region", "country", "host", "genotype_genbank", "subgenotype_genbank", \
         "circularise", "circularise_shift_bp","clade_nextclade","QC_overall_score","QC_overall_status","total_substitutions","total_deletions", \
         "total_insertions","total_frame_shifts","total_missing","alignment_score","coverage","QC_missing_data","QC_mixed_sites","QC_rare_mutations", \
         "QC_frame_shifts","QC_stop_codons"]
     shell:
         """
-        ingest/scripts/tsv-to-ndjson.py < {input.metadata} |
-            ingest/scripts/fix_country_field.py |
-            ingest/vendored/apply-geolocation-rules --geolocation-rules ingest/config/geoLocationRules.tsv |
-            ingest/scripts/add-year.py |
-            ingest/scripts/ndjson-to-tsv.py --metadata-columns {params.metadata_columns} --metadata {output.metadata}
+        scripts/tsv-to-ndjson.py < {input.metadata} |
+            scripts/fix_country_field.py |
+            vendored/apply-geolocation-rules --geolocation-rules defaults/geoLocationRules.tsv |
+            scripts/add-year.py |
+            scripts/ndjson-to-tsv.py --metadata-columns {params.metadata_columns} --metadata {output.metadata}
         """
 
 rule copy_ingest_files:
     input:
-        sequences = "ingest/results/genbank.recircular.fasta",
-        aligned = "ingest/results/nextclade.aligned.fasta"
+        sequences = "data/genbank.recircular.fasta",
+        aligned = "data/nextclade/nextclade.aligned.fasta",
     output:
-        sequences = "ingest/results/sequences.fasta",
-        aligned = "ingest/results/aligned.fasta"
+        sequences = "results/sequences.fasta",
+        aligned = "results/aligned.fasta"
     shell:
         """
         cp {input.sequences} {output.sequences}

diff --git a/ingest/config/NC_003977.gb → ingest/defaults/NC_003977.gb b/ingest/config/NC_003977.gb → ingest/defaults/NC_003977.gb
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -0,0 +1,18 @@
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run to completion.
+
+# Used to re-circularise genomes. This should be the same as the reference in the nextclade dataset!
+# Also force-included for a number of datasets
+reference_accession: "NC_003977"
+reference_genbank: "defaults/NC_003977.gb"
+
+entrez_query: 'Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]'
+
+# Path to the nextclade dataset, which will be used to align all sequences
+nextclade_dataset: "../nextclade_datasets/references/NC_003977/versions/2023-08-22"
+
+nextclade_binary: "../bin/nextclade"
+
+# technically CDSs, but we use these terms rather interchangeably.
+# These should be the entire list of CDSs in the nextclade dataset's genemap
+genes: ['envS', 'envM', 'envL', 'X', 'pre-capsid', 'capsid', 'pol']
diff --git a/ingest/config/geoLocationRules.tsv → ingest/defaults/geoLocationRules.tsv b/ingest/config/geoLocationRules.tsv → ingest/defaults/geoLocationRules.tsv