Skip to content

Commit

Permalink
Harmonize with pathogen repo guide #31
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Feb 6, 2024
2 parents c0b9a6d + f66a276 commit c710ca1
Show file tree
Hide file tree
Showing 12 changed files with 4,796 additions and 129 deletions.
4 changes: 2 additions & 2 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ rule all:
_get_all_targets,


include: "rules/fetch_sequences.smk"
include: "rules/transform.smk"
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"


if config.get("upload", False):
Expand Down
102 changes: 54 additions & 48 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run to completion.
#
# Define optional config parameters with their default values here so that users
# do not have to dig through the workflows to figure out the default values

# Sources of sequences to include in the ingest run
sources: ['genbank']
# Pathogen NCBI Taxonomy ID
ncbi_taxon_id: '64320'

# Required to fetch from NCBI Datasets
ncbi_taxon_id: "64320"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
Expand All @@ -24,68 +32,67 @@ ncbi_datasets_fields:
- submitter-affiliation
- submitter-country

# Params for the transform rule
transform:
# NCBI fields to rename to Nextstrain field names.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: [
'accession=genbank_accession',
'accession-rev=genbank_accession_rev',
'isolate-lineage=strain',
'sourcedb=database',
'geo-region=region',
'geo-location=location',
'host-name=host',
'isolate-collection-date=date',
'release-date=release_date',
'update-date=update_date',
'sra-accs=sra_accessions',
'submitter-names=authors',
'submitter-affiliations=institution',
]
# Config parameters related to the curate pipeline
curate:
# URL pointed to public generalized geolocation rules
# For the Nextstrain team, this is currently
# 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: 'config/geolocation-rules.tsv'
# List of field names to change where the key is the original field name and the value is the new field name
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
isolate-lineage: strain
sourcedb: database
geo-region: region
geo-location: location
host-name: host
isolate-collection-date: date
release-date: release_date
update-date: update_date
sra-accs: sra_accessions
submitter-names: authors
submitter-affiliations: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
# Back up strain name field if 'strain' doesn't match regex above
# Back up strain name field to use if 'strain' doesn't match regex above
strain_backup_fields: ['genbank_accession']
# List of date fields to standardize
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ['date', 'release_date', 'update_date']
# Expected date formats present in date fields
# List of expected date formats that are present in the date fields provided above
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
# Titlecase rules
titlecase:
# Abbreviations not cast to titlecase, keeps uppercase
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# List of abbreviations not cast to titlecase, keeps uppercase
abbreviations: ['USA']
# Articles that should not be cast to titlecase
articles: [
'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
]
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# Authors field name
# Metadata field that contains the list of authors associated with the sequence
authors_field: 'authors'
# Authors default value if authors value is empty
# Default value to use if the authors field is empty
authors_default_value: '?'
# Field name for the generated abbreviated authors
abbr_authors_field: 'abbr_authors'
# General geolocation rules to apply to geolocation fields
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# Local geolocation rules that are only applicable to Zika data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'config/geolocation-rules.tsv'
# User annotations file
annotations: 'config/annotations.tsv'
# ID field used to merge annotations
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "config/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: 'genbank_accession'
# Field to use as the sequence ID in the FASTA file
id_field: 'genbank_accession'
# Field to use as the sequence in the FASTA file
sequence_field: 'sequence'
# Final output columns for the metadata TSV
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: 'genbank_accession'
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: 'sequence'
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'genbank_accession',
'genbank_accession_rev',
Expand All @@ -103,4 +110,3 @@ transform:
'authors',
'institution',
]

50 changes: 28 additions & 22 deletions ingest/rules/transform.smk → ingest/rules/curate.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This part of the workflow handles transforming the data into standardized
This part of the workflow handles curating the data into standardized
formats and expects input file
sequences_ndjson = "data/sequences.ndjson"
Expand All @@ -9,15 +9,15 @@ This will produce output files as
metadata = "results/metadata.tsv"
sequences = "results/sequences.fasta"
Parameters are expected to be defined in `config.transform`.
Parameters are expected to be defined in `config.curate`.
"""


rule fetch_general_geolocation_rules:
output:
general_geolocation_rules="data/general-geolocation-rules.tsv",
params:
geolocation_rules_url=config["transform"]["geolocation_rules_url"],
geolocation_rules_url=config["curate"]["geolocation_rules_url"],
shell:
"""
curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
Expand All @@ -27,7 +27,7 @@ rule fetch_general_geolocation_rules:
rule concat_geolocation_rules:
input:
general_geolocation_rules="data/general-geolocation-rules.tsv",
local_geolocation_rules=config["transform"]["local_geolocation_rules"],
local_geolocation_rules=config["curate"]["local_geolocation_rules"],
output:
all_geolocation_rules="data/all-geolocation-rules.tsv",
shell:
Expand All @@ -36,32 +36,38 @@ rule concat_geolocation_rules:
"""


rule transform:
def format_field_map(field_map: dict[str, str]) -> str:
"""
Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.
"""
return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])


rule curate:
input:
sequences_ndjson="data/sequences.ndjson",
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["transform"]["annotations"],
annotations=config["curate"]["annotations"],
output:
metadata="results/metadata.tsv",
sequences="results/sequences.fasta",
log:
"logs/transform.txt",
"logs/curate.txt",
params:
field_map=config["transform"]["field_map"],
strain_regex=config["transform"]["strain_regex"],
strain_backup_fields=config["transform"]["strain_backup_fields"],
date_fields=config["transform"]["date_fields"],
expected_date_formats=config["transform"]["expected_date_formats"],
articles=config["transform"]["titlecase"]["articles"],
abbreviations=config["transform"]["titlecase"]["abbreviations"],
titlecase_fields=config["transform"]["titlecase"]["fields"],
authors_field=config["transform"]["authors_field"],
authors_default_value=config["transform"]["authors_default_value"],
abbr_authors_field=config["transform"]["abbr_authors_field"],
annotations_id=config["transform"]["annotations_id"],
metadata_columns=config["transform"]["metadata_columns"],
id_field=config["transform"]["id_field"],
sequence_field=config["transform"]["sequence_field"],
field_map=format_field_map(config["curate"]["field_map"]),
strain_regex=config["curate"]["strain_regex"],
strain_backup_fields=config["curate"]["strain_backup_fields"],
date_fields=config["curate"]["date_fields"],
expected_date_formats=config["curate"]["expected_date_formats"],
articles=config["curate"]["titlecase"]["articles"],
abbreviations=config["curate"]["titlecase"]["abbreviations"],
titlecase_fields=config["curate"]["titlecase"]["fields"],
authors_field=config["curate"]["authors_field"],
authors_default_value=config["curate"]["authors_default_value"],
annotations_id=config["curate"]["annotations_id"],
metadata_columns=config["curate"]["metadata_columns"],
id_field=config["curate"]["output_id_field"],
sequence_field=config["curate"]["output_sequence_field"],
shell:
"""
(cat {input.sequences_ndjson} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ defined in the config. If adding other sources, add a new rule upstream
of rule `fetch_all_sequences` to create the file `data/{source}.ndjson` or the
file must exist as a static file in the repo.
Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
- requires `ncbi_taxon_id` config
- Only returns metadata fields that are available through NCBI Datasets
Produces final output as
sequences_ndjson = "data/sequences.ndjson"
Expand Down
2 changes: 1 addition & 1 deletion phylogenetic/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ rule all:
input:
auspice_json = "auspice/zika.json",

include: "rules/usvi.smk"
include: "rules/prepare_sequences.smk"
include: "rules/merge_sequences_usvi.smk"
include: "rules/construct_phylogeny.smk"
include: "rules/annotate_phylogeny.smk"
include: "rules/export.smk"
Expand Down
101 changes: 101 additions & 0 deletions phylogenetic/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Zika Data

The primary source of data for this build is GenBank. However, there are instances where certain records, crucial for phylogenetic analysis, are not yet present in GenBank. This is particularly true for some USVI records, as detailed in a previous analysis documented at https://github.com/blab/zika-usvi/.

### Integration of USVI data

This Zika build incorporates data from https://github.com/blab/zika-usvi/. The sequences and metadata for USVI from that GitHub repository have undergone curation and were uploaded to https://github.com/nextstrain/fauna. Subsequently, they were downloaded as sequences and metadata, and a filter was applied to include only those records not yet submitted to NCBI GenBank. The resulting records are now available as a pair of metadata and sequences files in this directory.

The process of merging the USVI data into the GenBank dataset is facilitated through the `append_usvi` rule.

Steps to create the `metadata_usvi.tsv` and `sequences_usvi.fasta` files were as follows:

1. Sequences were uploaded to the fauna database following [these instructions](https://github.com/nextstrain/fauna/blob/f9e7955cb4381d5e881c337e005778ed43b7c56c/builds/ZIKA.md#fred-hutch-sequences).
2. Sequences were downloaded from the fauna database following [these instructions](https://github.com/nextstrain/fauna/blob/5d5a1f3faf06805a5f31e91df2c76b06e6f3bf6a/builds/ZIKA.md#download-from-fauna-parse-compress-and-push-to-s3) and saved as `zika.fasta`
3. Sequences were ingested from GenBank following [these instructions](../README.md) and saved as `sequences.fasta`
4. [NCBI Blastn](https://www.ncbi.nlm.nih.gov/books/NBK279690/) was used to identify fauna sequences that were not one hundred percent identical to GenBank sequences using the following commands:


```bash
GENBANK_SEQUENCES=sequences.fasta
FAUNA_SEQUENCES=zika.fasta

# Create a local blast database
makeblastdb \
  -in ${GENBANK_SEQUENCES} \
  -dbtype nucl

# Blast fauna against GenBank
blastn \
-db ${GENBANK_SEQUENCES} \
-query ${FAUNA_SEQUENCES} \
-num_alignments 1 \
-outfmt 6 \
-out blast_output.txt

# USVI strains that
# + match at 100%
# + match at least a 5000nt region (to filter out short substring matches)
cat blast_output.txt \
| awk -F'\t' '$1~"USVI" && $3>=100 && $4>5000 , OFS="\t" {print $1}' \
> USVI_100_match.txt

less USVI_100_match.txt
# USVI/5/2016|zika|MW165881|2016-10-17|north_america|usvi|saint_thomas|saint_thomas|genbank|genome|Santiago
# USVI/43/2016|zika|MW165884|2016-07-19|north_america|usvi|saint_thomas|saint_thomas|genbank|genome|Santiago
# USVI/4/2016|zika|MW165880|2016-10-14|north_america|usvi|saint_thomas|saint_thomas|genbank|genome|Santiago
# USVI/35/2016|zika|MW165883|2016-09-08|north_america|usvi|saint_thomas|saint_thomas|genbank|genome|Santiago
# USVI/25/2016|zika|MW165882|2016-09-27|north_america|usvi|saint_thomas|saint_thomas|genbank|genome|Santiago

# USVI strains that are not in the 100 match list
cat blast_output.txt \
| awk -F'\t' '$1~"USVI" , OFS="\t" {print}' \
| grep -Fvf USVI_100_match.txt \
| awk -F'\t' '{print $1}' \
| sort \
| uniq \
> USVI_not_match.txt

head USVI_not_match.txt
# USVI/1/2016|zika|VI1_1d|2016-09-28|north_america|usvi|saint_croix|saint_croix|fh|genome|Black
# USVI/11/2016|zika|VI11|2016-03-22|north_america|usvi|saint_thomas|saint_thomas|fh|genome|Black
# USVI/12/2016|zika|VI12|2016-11-04|north_america|usvi|saint_croix|saint_croix|fh|genome|Black
# USVI/13/2016|zika|VI13|2016-08-13|north_america|usvi|saint_thomas|saint_thomas|fh|genome|Black
# USVI/19/2016|zika|VI19_12plex|2016-11-21|north_america|usvi|saint_croix|saint_croix|fh|genome|Black
# ...
```

5. Pull out the corresponding `metadata_usvi.tsv` and `sequences_usvi.fasta` using a combination of [smof](https://github.com/incertae-sedis/smof) and [augur parse](https://docs.nextstrain.org/projects/augur/en/stable/usage/cli/parse.html)

```bash
# Pulls out sequences based on a match against header strings
smof grep -f USVI_not_match.txt zika.fasta > usvi.fasta

# Splits file into metadata_usvi.tsv and sequences_usvi.fasta
augur parse \
--sequences usvi.fasta \
--output-sequences sequences_usvi.fasta \
--output-metadata raw_metadata_usvi.tsv \
--fields strain virus accession date region country division location institution segment authors url title journal paper_url \
--prettify-fields region country division location

augur parse \
--sequences usvi.fasta \
--output-sequences sequences_usvi.fasta \
--output-metadata no.tsv \
--fields a b strain c d e f g h i j k l m n

# Add sequence lengths to metadata
echo "accession|length" | tr '|' '\t' > lengths_usvi.tsv
smof stat --length --byseq sequences_usvi.fasta >> lengths_usvi.tsv

tsv-join -H \
--filter-file lengths_usvi.tsv\
--key-fields accession \
--append-fields length \
raw_metadata_usvi.tsv \
| tsv-select -H \
--fields accession,strain,date,region,country,division,location,length,authors,institution,url \
> metadata_usvi.tsv
```

26 changes: 26 additions & 0 deletions phylogenetic/data/metadata_usvi.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
accession strain date region country division location length authors institution url
VI44 USVI/44/2016 2016-10-17 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI20_12plex USVI/20/2016 2016-10-13 North America Usvi Saint Croix Saint Croix 10792 Black et al fh https://github.com/blab/zika-usvi/
VI41 USVI/41/2016 2016-11-10 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI32_12plex USVI/32/2016 2016-08-11 North America Usvi Saint Croix Saint Croix 10792 Black et al fh https://github.com/blab/zika-usvi/
VI12 USVI/12/2016 2016-11-04 North America Usvi Saint Croix Saint Croix 10636 Black et al fh https://github.com/blab/zika-usvi/
VI46 USVI/46/2016 2016-07-15 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI23_12plex USVI/23/2016 2016-07-12 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI28_1d USVI/28/2016 2016-11-28 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI45 USVI/45/2016 2016-08-03 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI39_12plex USVI/39/2016 2016-11-09 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI36 USVI/36/2016 2016-09-13 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI34 USVI/34/2016 2016-08-01 North America Usvi Saint Thomas Saint Thomas 10792 Black et al fh https://github.com/blab/zika-usvi/
VI11 USVI/11/2016 2016-03-22 North America Usvi Saint Thomas Saint Thomas 10636 Black et al fh https://github.com/blab/zika-usvi/
VI6 USVI/6/2016 2016-10-19 North America Usvi Saint John Saint John 10636 Black et al fh https://github.com/blab/zika-usvi/
VI3 USVI/3/2016 2016-09-26 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI27_1d USVI/27/2016 2016-08-19 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI13 USVI/13/2016 2016-08-13 North America Usvi Saint Thomas Saint Thomas 10636 Black et al fh https://github.com/blab/zika-usvi/
VI7 USVI/7/2016 2016-10-27 North America Usvi Saint Thomas Saint Thomas 10636 Black et al fh https://github.com/blab/zika-usvi/
VI38 USVI/38/2016 2016-10-25 North America Usvi Saint Thomas Saint Thomas 10807 Black et al fh https://github.com/blab/zika-usvi/
VI19_12plex USVI/19/2016 2016-11-21 North America Usvi Saint Croix Saint Croix 10792 Black et al fh https://github.com/blab/zika-usvi/
VI1_1d USVI/1/2016 2016-09-28 North America Usvi Saint Croix Saint Croix 10792 Black et al fh https://github.com/blab/zika-usvi/
VI42 USVI/42/2016 2016-10-26 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI30_1d USVI/30/2016 2016-08-07 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
VI2 USVI/2/2016 2016-09-28 North America Usvi Saint Croix Saint Croix 10807 Black et al fh https://github.com/blab/zika-usvi/
Loading

0 comments on commit c710ca1

Please sign in to comment.