Skip to content

Commit

Permalink
parameterize genbank-url by NCBI Taxon ID
Browse files Browse the repository at this point in the history
Parameterized the genbank-url script and subsequent calls by NCBI Taxon ID.
This change generalizes the scripts and rules such that swapping in a different virus should only require a different Taxon ID.
Co-authored-by: Jover Lee <joverlee521@gmail.com>
  • Loading branch information
j23414 committed Nov 17, 2022
1 parent f777113 commit 5d3281d
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 55 deletions.
7 changes: 4 additions & 3 deletions ingest/bin/fetch-from-genbank
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory containing this script; used below to locate the sibling helper
# programs that are invoked as "$bin"/<name>.
bin="$(dirname "$0")"
# First positional argument. The ${1:?...} expansion aborts the script with the
# given message when the argument is missing or empty.
TAXID="${1:?NCBI taxon id is required.}"

# Entry point: fetch the records for $TAXID and pipe the downloaded stream
# through the csv-to-ndjson helper, which writes to stdout.
#
# The taxon ID is passed to fetch exactly once; calling fetch without an
# argument would leave its $1 unset, which is fatal under `set -u`.
main() {
    fetch "$TAXID" | "$bin"/csv-to-ndjson
}

# Download the data at the URL that genbank-url generates for one taxon and
# write the response body to stdout.
#
# Arguments:
#   $1  NCBI Taxon ID, forwarded to genbank-url as --taxonid.
#
# curl flags: --fail turns HTTP error responses into a non-zero exit status,
# --silent together with --show-error hides the progress meter but still
# reports errors, and --http1.1 pins the HTTP protocol version.
fetch() {
    curl "$("$bin"/genbank-url --taxonid "$1")" \
        --fail --silent --show-error --http1.1 \
        --header 'User-Agent: https://github.com/nextstrain/dengue (hello@nextstrain.org)'
}

# Invoke the entry point, forwarding every command-line argument unchanged.
main "$@"
139 changes: 88 additions & 51 deletions ingest/bin/genbank-url
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
Generate URL to download all Monkeypox sequences and their curated metadata
Generate URL to download all Pathogen sequences and their curated metadata
from GenBank via NCBI Virus.
The URL this program builds is based on the URL for SARS-CoV-2 constructed with
Expand All @@ -13,54 +13,91 @@ and observing the network activity at
https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Dengue%20virus,%20taxid:12637
"""
from urllib.parse import urlencode
import argparse

# Base address of the NCBI Virus search/download service.
endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"

# Query-string parameters. Keys are emitted in insertion order, and the list
# under 'fq' becomes one repeated `fq=` parameter per entry (doseq below).
params = {
    # Search filters.
    'fq': [
        # Restrict to nucleotide records (rather than protein).
        '{!tag=SeqType_s}SeqType_s:("Nucleotide")',
        # NCBI Taxon id for Monkeypox.
        'VirusLineageId_ss:(10244)',
    ],

    # Purpose unclear, but the request appears to need it.
    'q': '*:*',

    # Ask for a CSV download.
    'cmd': 'download',
    'dlfmt': 'csv',

    # Column list, written as comma-separated "output_column:source_field".
    'fl': ','.join(
        f"{column}:{field}" for column, field in (
            ('genbank_accession', 'id'),
            ('genbank_accession_rev', 'AccVer_s'),
            ('database', 'SourceDB_s'),
            ('strain', 'Isolate_s'),
            ('region', 'Region_s'),
            ('location', 'CountryFull_s'),
            ('collected', 'CollectionDate_s'),
            ('submitted', 'CreateDate_dt'),
            ('length', 'SLen_i'),
            ('host', 'Host_s'),
            ('isolation_source', 'Isolation_csv'),
            ('bioproject_accession', 'BioProject_s'),
            ('biosample_accession', 'BioSample_s'),
            ('sra_accession', 'SRALink_csv'),
            ('title', 'Definition_s'),
            ('authors', 'Authors_csv'),
            ('submitting_organization', 'SubmitterAffilFull_s'),
            ('publications', 'PubMed_csv'),
            ('sequence', 'Nucleotide_seq'),
        )
    ),

    # Order rows by GenBank accession so the output is stable. The sort key is
    # a source data field, not one of the output column names.
    'sort': 'id asc',

    # Not an Entrez endpoint, but pass the email parameter Entrez requires as
    # a courtesy.
    'email': 'hello@nextstrain.org',
}

query = urlencode(params, doseq=True, encoding="utf-8")

print(endpoint + "?" + query)

def parse_args():
    """
    Read this script's command-line options.

    Returns the argparse namespace; its `taxonid` attribute holds the value of
    the required `--taxonid` option, as the string given on the command line.
    """
    description = (
        "Given an NCBI taxon ID, generate URL to download "
        "all viral sequences and their curated metadata from GenBank via NCBI Virus."
    )
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("--taxonid", required=True, help="NCBI Taxon ID.")
    return cli.parse_args()


def build_query_url(ncbi_id: str) -> str:
    """
    Build the NCBI Virus URL that downloads all sequences, together with
    their curated metadata, for one taxon from GenBank.

    The URL is printed to stdout and also returned.

    Parameters
    ----------
    ncbi_id
        NCBI Taxon ID to filter on. It is inserted as-is into the
        `VirusLineageId_ss` filter and is not validated here.

    Returns
    -------
    str
        The endpoint followed by the URL-encoded query string.
    """
    endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
    params = {
        # Search criteria
        'fq': [
            # Nucleotide sequences (as opposed to protein)
            '{!tag=SeqType_s}SeqType_s:("Nucleotide")',
            # NCBI Taxon id for virus. This has to be an f-string: a plain
            # string would send the literal text "{ncbi_id}" as the filter.
            f'VirusLineageId_ss:({ncbi_id})',
        ],

        # Unclear, but seems necessary.
        'q': '*:*',

        # Response format
        'cmd': 'download',
        'dlfmt': 'csv',
        'fl': ','.join(
            ':'.join(names) for names in [
                # Pairs of (output column name, source data field).
                ('genbank_accession', 'id'),
                ('genbank_accession_rev', 'AccVer_s'),
                ('database', 'SourceDB_s'),
                ('strain', 'Isolate_s'),
                ('region', 'Region_s'),
                ('location', 'CountryFull_s'),
                ('collected', 'CollectionDate_s'),
                ('submitted', 'CreateDate_dt'),
                ('length', 'SLen_i'),
                ('host', 'Host_s'),
                ('isolation_source', 'Isolation_csv'),
                ('bioproject_accession', 'BioProject_s'),
                ('biosample_accession', 'BioSample_s'),
                ('sra_accession', 'SRALink_csv'),
                ('title', 'Definition_s'),
                ('authors', 'Authors_csv'),
                ('submitting_organization', 'SubmitterAffilFull_s'),
                ('publications', 'PubMed_csv'),
                ('sequence', 'Nucleotide_seq'),
            ]
        ),

        # Stable sort with GenBank accessions.
        # Columns are source data fields, not our output columns.
        'sort': 'id asc',

        # This isn't Entrez, but include the same email parameter it requires
        # just to be nice.
        'email': 'hello@nextstrain.org',
    }
    # doseq=True expands the 'fq' list into one `fq=` parameter per entry.
    query = urlencode(params, doseq=True, encoding="utf-8")
    url = f"{endpoint}?{query}"

    print(url)
    return url


def main():
    """
    Script entry point: parse the command line, then hand the requested
    Taxon ID to build_query_url.
    """
    cli_args = parse_args()
    build_query_url(cli_args.taxonid)


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()
3 changes: 2 additions & 1 deletion ingest/workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ rule fetch_from_genbank:
output:
genbank_ndjson="data/genbank_{serotype}.ndjson",
params:
serotype_tax_id=download_serotype,
csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson",
shell:
"""
Expand All @@ -39,7 +40,7 @@ rule fetch_from_genbank:
chmod 755 *
cd ..
fi
./bin/fetch-from-genbank > {output.genbank_ndjson}
./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
"""


Expand Down

0 comments on commit 5d3281d

Please sign in to comment.