Skip to content

Commit

Permalink
Harmonize ingest/config/defaults.yaml with the pathogen repo template
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Jan 30, 2024
1 parent 5b1369e commit 446047e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 37 deletions.
73 changes: 39 additions & 34 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run to completion.
#
# Define optional config parameters with their default values here so that users
# do not have to dig through the workflows to figure out the default values

# Sources of sequences to include in the ingest run
sources: ['genbank']
# Pathogen NCBI Taxonomy ID
ncbi_taxon_id: '64320'

# Required to fetch from NCBI Datasets
ncbi_taxon_id: "64320"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the mnemonics of the NCBI Datasets fields; see the docs for the full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
Expand All @@ -24,13 +32,18 @@ ncbi_datasets_fields:
- submitter-affiliation
- submitter-country

# Params for the curate rule
# Config parameters related to the curate pipeline
curate:
# NCBI fields to rename to Nextstrain field names.
# List of field names to change where the key is the original field name and
# the value is the new field name
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
# URL pointing to the public generalized geolocation rules
# For the Nextstrain team, this is currently
# 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: 'config/geolocation-rules.tsv'
# List of field names to change where the key is the original field name and the value is the new field name
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
Expand All @@ -46,47 +59,40 @@ curate:
submitter-names: authors
submitter-affiliations: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
# Back up strain name field if 'strain' doesn't match regex above
# Backup strain name field to use if 'strain' doesn't match the regex above
strain_backup_fields: ['genbank_accession']
# List of date fields to standardize
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ['date', 'release_date', 'update_date']
# Expected date formats present in date fields
# List of expected date formats that are present in the date fields provided above
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
# Titlecase rules
titlecase:
# Abbreviations not cast to titlecase, keeps uppercase
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# List of abbreviations not cast to titlecase, keeps uppercase
abbreviations: ['USA']
# Articles that should not be cast to titlecase
articles: [
'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
]
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# Authors field name
# Metadata field that contains the list of authors associated with the sequence
authors_field: 'authors'
# Authors default value if authors value is empty
# Default value to use if the authors field is empty
authors_default_value: '?'
# Field name for the generated abbreviated authors
abbr_authors_field: 'abbr_authors'
# General geolocation rules to apply to geolocation fields
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# Local geolocation rules that are only applicable to zika data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'config/geolocation-rules.tsv'
# User annotations file
annotations: 'config/annotations.tsv'
# ID field used to merge annotations
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "config/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: 'genbank_accession'
# Field to use as the sequence ID in the FASTA file
id_field: 'genbank_accession'
# Field to use as the sequence in the FASTA file
sequence_field: 'sequence'
# Final output columns for the metadata TSV
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: 'genbank_accession'
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: 'sequence'
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'genbank_accession',
'genbank_accession_rev',
Expand All @@ -104,4 +110,3 @@ curate:
'authors',
'institution',
]

5 changes: 2 additions & 3 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,10 @@ rule curate:
titlecase_fields=config["curate"]["titlecase"]["fields"],
authors_field=config["curate"]["authors_field"],
authors_default_value=config["curate"]["authors_default_value"],
abbr_authors_field=config["curate"]["abbr_authors_field"],
annotations_id=config["curate"]["annotations_id"],
metadata_columns=config["curate"]["metadata_columns"],
id_field=config["curate"]["id_field"],
sequence_field=config["curate"]["sequence_field"],
id_field=config["curate"]["output_id_field"],
sequence_field=config["curate"]["output_sequence_field"],
shell:
"""
(cat {input.sequences_ndjson} \
Expand Down

0 comments on commit 446047e

Please sign in to comment.