nextstrain · joverlee521 · Aug 29, 2023 · Aug 17, 2023 · Aug 17, 2023 · Aug 17, 2023
diff --git a/README.md b/README.md
@@ -73,6 +73,9 @@ NCBI interaction scripts that are useful for fetching public metadata and sequen
 
 - [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file.
   Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/) or [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs.
+- [fetch-from-ncbi-virus](fetch-from-ncbi-virus) - Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) and output NDJSON records to stdout.
+- [ncbi-virus-url](ncbi-virus-url) - Generates the URL to download metadata and sequences from NCBI Virus as a single CSV file.
+- [csv-to-ndjson](csv-to-ndjson) - Converts CSV on stdin to NDJSON on stdout, with a hard-coded 200 MiB field size limit to accommodate sequences in the NCBI Virus download.
 
 Potential Nextstrain CLI scripts
 

diff --git a/csv-to-ndjson b/csv-to-ndjson
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""
+Convert CSV on stdin to NDJSON on stdout.
+usage: `cat dummy.csv | ./csv-to-ndjson > dummy.ndjson`
+"""
+import csv
+import json
+from sys import stdin, stdout
+
+# 200 MiB; default is 128 KiB
+csv.field_size_limit(200 * 1024 * 1024)
+
+for row in csv.DictReader(stdin):
+    json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:')
+    print()
diff --git a/docs/ncbi-virus-all-fields-example.json b/docs/ncbi-virus-all-fields-example.json
@@ -0,0 +1,292 @@
+{
+    "ExportDate_dt": "2023-08-08T21:02:01.475Z",
+    "QualNum_i": 0,
+    "QualPct_d": 0.0,
+    "IncompleteCdsCnt_i": 0,
+    "gi_l": 1798174254,
+    "Host_s": "Homo sapiens",
+    "HostSpecies_s": "Homo sapiens (human), taxid:9606|",
+    "HostLineage_ss": [
+        "cellular organisms, taxid:131567| biota",
+        "Eukaryota (eucaryotes), taxid:2759| eukaryotes Eucarya Eucaryotae Eukarya Eukaryotae",
+        "Opisthokonta, taxid:33154| Fungi/Metazoa group opisthokonts",
+        "Metazoa (metazoans), taxid:33208| multicellular animals Animalia animals",
+        "Eumetazoa, taxid:6072|",
+        "Bilateria, taxid:33213|",
+        "Deuterostomia (deuterostomes), taxid:33511|",
+        "Chordata (chordates), taxid:7711|",
+        "Craniata, taxid:89593|",
+        "Vertebrata (vertebrates), taxid:7742|",
+        "Gnathostomata (jawed vertebrates), taxid:7776|",
+        "Teleostomi, taxid:117570|",
+        "Euteleostomi (bony vertebrates), taxid:117571|",
+        "Sarcopterygii, taxid:8287|",
+        "Dipnotetrapodomorpha, taxid:1338369|",
+        "Tetrapoda (tetrapods), taxid:32523|",
+        "Amniota (amniotes), taxid:32524|",
+        "Mammalia (mammals), taxid:40674|",
+        "Theria, taxid:32525|",
+        "Eutheria (placentals), taxid:9347| eutherian mammals placental mammals Placentalia",
+        "Boreoeutheria, taxid:1437010| Boreotheria",
+        "Euarchontoglires, taxid:314146|",
+        "Primates, taxid:9443| Primata primates",
+        "Haplorrhini, taxid:376913|",
+        "Simiiformes, taxid:314293| Anthropoidea",
+        "Catarrhini, taxid:9526|",
+        "Hominoidea (apes), taxid:314295| ape",
+        "Hominidae (great apes), taxid:9604| Pongidae",
+        "Homininae, taxid:207598| Homo/Pan/Gorilla group",
+        "Homo (humans), taxid:9605|",
+        "Homo sapiens (human), taxid:9606|"
+    ],
+    "HostLineageId_ss": [
+        "131567",
+        "2759",
+        "33154",
+        "33208",
+        "6072",
+        "33213",
+        "33511",
+        "7711",
+        "89593",
+        "7742",
+        "7776",
+        "117570",
+        "117571",
+        "8287",
+        "1338369",
+        "32523",
+        "32524",
+        "40674",
+        "32525",
+        "9347",
+        "1437010",
+        "314146",
+        "9443",
+        "376913",
+        "314293",
+        "9526",
+        "314295",
+        "9604",
+        "207598",
+        "9605",
+        "9606"
+    ],
+    "Locus_s": "NC_045512",
+    "OrgId_i": 2697049,
+    "VirusFamily_s": "Coronaviridae",
+    "VirusGenus_s": "Betacoronavirus",
+    "VirusSpecies_s": "Severe acute respiratory syndrome-related coronavirus",
+    "VirusSpeciesId_i": 694009,
+    "VirusLineage_ss": [
+        "Viruses, taxid:10239| Vira Viridae viruses",
+        "Riboviria (RNA viruses), taxid:2559587| RNA viruses and viroids",
+        "Orthornavirae, taxid:2732396|",
+        "Pisuviricota, taxid:2732408|",
+        "Pisoniviricetes, taxid:2732506|",
+        "Nidovirales, taxid:76804|",
+        "Cornidovirineae, taxid:2499399|",
+        "Coronaviridae, taxid:11118|",
+        "Orthocoronavirinae, taxid:2501931|",
+        "Betacoronavirus, taxid:694002| Coronavirus",
+        "Sarbecovirus, taxid:2509511|",
+        "Severe acute respiratory syndrome-related coronavirus, taxid:694009| HCoV-SARS SARS SARSr-CoV SARSrCoV",
+        "Severe acute respiratory syndrome coronavirus 2, taxid:2697049| SARS-CoV-2",
+        "RNA viruses"
+    ],
+    "VirusLineageId_ss": [
+        "10239",
+        "2559587",
+        "2732396",
+        "2732408",
+        "2732506",
+        "76804",
+        "2499399",
+        "11118",
+        "2501931",
+        "694002",
+        "2509511",
+        "694009",
+        "2697049"
+    ],
+    "VirusL0_s": "RNA viruses",
+    "VirusL1_s": "Orthornavirae, taxid:2732396",
+    "VirusL2_s": "Pisuviricota, taxid:2732408",
+    "VirusL3_s": "Pisoniviricetes, taxid:2732506",
+    "VirusL4_s": "Nidovirales, taxid:76804",
+    "VirusL5_s": "Cornidovirineae, taxid:2499399",
+    "VirusL6_s": "Coronaviridae, taxid:11118",
+    "VirusL7_s": "Orthocoronavirinae, taxid:2501931",
+    "VirusL8_s": "Betacoronavirus, taxid:694002",
+    "VirusL9_s": "Sarbecovirus, taxid:2509511",
+    "VirusL10_s": "Severe acute respiratory syndrome-related coronavirus, taxid:694009",
+    "ViralHost_ss": [
+        "human",
+        "vertebrates"
+    ],
+    "GenomicMoltype_s": "ssRNA(+)",
+    "SLen_i": 29903,
+    "Flags_ss": [
+        "refseq",
+        "complete"
+    ],
+    "Flags_csv": "refseq, complete",
+    "FlagsCount_i": 2,
+    "SetAcc_s": "GCF_009858895.2",
+    "Authors_ss": [
+        "Wu,F.",
+        "Zhao,S.",
+        "Yu,B.",
+        "Chen,Y.M.",
+        "Wang,W.",
+        "Song,Z.G.",
+        "Hu,Y.",
+        "Tao,Z.W.",
+        "Tian,J.H.",
+        "Pei,Y.Y.",
+        "Yuan,M.L.",
+        "Zhang,Y.L.",
+        "Dai,F.H.",
+        "Liu,Y.",
+        "Wang,Q.M.",
+        "Zheng,J.J.",
+        "Xu,L.",
+        "Holmes,E.C.",
+        "Zhang,Y.Z.",
+        "Baranov,P.V.",
+        "Henderson,C.M.",
+        "Anderson,C.B.",
+        "Gesteland,R.F.",
+        "Atkins,J.F.",
+        "Howard,M.T.",
+        "Robertson,M.P.",
+        "Igel,H.",
+        "Baertsch,R.",
+        "Haussler,D.",
+        "Ares,M. Jr.",
+        "Scott,W.G.",
+        "Williams,G.D.",
+        "Chang,R.Y.",
+        "Brian,D.A.",
+        "Chen,Y.-M.",
+        "Song,Z.-G.",
+        "Tao,Z.-W.",
+        "Tian,J.-H.",
+        "Pei,Y.-Y.",
+        "Zhang,Y.-L.",
+        "Dai,F.-H.",
+        "Wang,Q.-M.",
+        "Zheng,J.-J.",
+        "Zhang,Y.-Z."
+    ],
+    "Authors_csv": "Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C., Zhang,Y.Z., Baranov,P.V., Henderson,C.M., Anderson,C.B., Gesteland,R.F., Atkins,J.F., Howard,M.T., Robertson,M.P., Igel,H., Baertsch,R., Haussler,D., Ares,M. Jr., Scott,W.G., Williams,G.D., Chang,R.Y., Brian,D.A., Chen,Y.-M., Song,Z.-G., Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Zhang,Y.-L., Dai,F.-H., Wang,Q.-M., Zheng,J.-J., Zhang,Y.-Z.",
+    "AuthorsCount_i": 44,
+    "Country_s": "China",
+    "Isolate_s": "Wuhan-Hu-1",
+    "Lineage_s": "B",
+    "Division_s": "VRL",
+    "Keywords_ss": [
+        "RefSeq"
+    ],
+    "KeywordsCount_i": 1,
+    "TaxName_s": "Severe acute respiratory syndrome coronavirus 2",
+    "Region_s": "Asia",
+    "ParentAcc_s": "set:NC_045512",
+    "SetPosition_i": 0,
+    "SourceDB_s": "RefSeq",
+    "Definition_s": "Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome",
+    "HostId_i": 9606,
+    "CreateDate_dt": "2020-01-13T00:00:00Z",
+    "CreateYear_i": 2020,
+    "Genome_js": "[{\"id\": \"NC_045512.2\", \"segment\": null, \"proteins\": [{\"id\": \"YP_009724389.1\", \"name\": \"ORF1ab polyprotein\", \"location\": \"join(266..13468,13468..21555)\"}, {\"id\": \"YP_009725295.1\", \"name\": \"ORF1a polyprotein\", \"location\": \"266..13483\"}, {\"id\": \"YP_009724390.1\", \"name\": \"surface glycoprotein\", \"location\": \"21563..25384\"}, {\"id\": \"YP_009724391.1\", \"name\": \"ORF3a protein\", \"location\": \"25393..26220\"}, {\"id\": \"YP_009724392.1\", \"name\": \"envelope protein\", \"location\": \"26245..26472\"}, {\"id\": \"YP_009724393.1\", \"name\": \"membrane glycoprotein\", \"location\": \"26523..27191\"}, {\"id\": \"YP_009724394.1\", \"name\": \"ORF6 protein\", \"location\": \"27202..27387\"}, {\"id\": \"YP_009724395.1\", \"name\": \"ORF7a protein\", \"location\": \"27394..27759\"}, {\"id\": \"YP_009725318.1\", \"name\": \"ORF7b\", \"location\": \"27756..27887\"}, {\"id\": \"YP_009724396.1\", \"name\": \"ORF8 protein\", \"location\": \"27894..28259\"}, {\"id\": \"YP_009724397.2\", \"name\": \"nucleocapsid phosphoprotein\", \"location\": \"28274..29533\"}, {\"id\": \"YP_009725255.1\", \"name\": \"ORF10 protein\", \"location\": \"29558..29674\"}]}]",
+    "MolType_s": "RNA",
+    "ProtAcc_ss": [
+        "YP_009724389",
+        "YP_009725295",
+        "YP_009724390",
+        "YP_009724391",
+        "YP_009724392",
+        "YP_009724393",
+        "YP_009724394",
+        "YP_009724395",
+        "YP_009725318",
+        "YP_009724396",
+        "YP_009724397",
+        "YP_009725255"
+    ],
+    "ProtAccCount_i": 12,
+    "UpdateDate_dt": "2020-07-18T00:00:00Z",
+    "UpdateYear_i": 2020,
+    "PubMed_ss": [
+        "32015508",
+        "15680415",
+        "15630477",
+        "10482585"
+    ],
+    "PubMed_csv": "32015508, 15680415, 15630477, 10482585",
+    "PubMedCount_i": 4,
+    "Completeness_s": "complete",
+    "CountryFull_s": "China",
+    "ProtNames_ss": [
+        "ORF1ab polyprotein",
+        "ORF1a polyprotein",
+        "surface glycoprotein",
+        "ORF3a protein",
+        "envelope protein",
+        "membrane glycoprotein",
+        "ORF6 protein",
+        "ORF7a protein",
+        "ORF7b protein",
+        "ORF8 protein",
+        "nucleocapsid phosphoprotein",
+        "ORF10 protein"
+    ],
+    "ProtNamesCount_i": 12,
+    "IsolateParsed_s": "Wuhan-Hu-1",
+    "NuclAcc_ss": [
+        "NC_045512"
+    ],
+    "NuclAccCount_i": 1,
+    "CollectionDate_dr": "2019-12",
+    "CollectionYear_i": 2019,
+    "SubmitterAffil_s": "National Center for Biotechnology Information, NIH",
+    "BioProject_ss": [
+        "PRJNA485481"
+    ],
+    "BioProject_csv": "PRJNA485481",
+    "BioProjectCount_i": 1,
+    "AccVer_s": "NC_045512.2",
+    "CollectionDate_s": "2019-12",
+    "SubmitterCountry_s": "USA",
+    "CollectionDate_dt": "2019-12-01T00:00:00Z",
+    "GenomeCompleteness_s": "complete",
+    "SubmitterAffilFull_s": "National Center for Biotechnology Information, NIH",
+    "BioProject_s": "PRJNA485481",
+    "AccNV_s": "NC_045512",
+    "id": "NC_045512",
+    "SeqType_s": "Nucleotide",
+    "FastaMD5_s": "4928f859a1822d291e0225206a0068c8",
+    "live_i": 1,
+    "ids_ss": [
+        "GCF_009858895",
+        "GCF_009858895.2",
+        "NC_045512",
+        "NC_045512.2",
+        "PRJNA485481",
+        "YP_009724389",
+        "YP_009724390",
+        "YP_009724391",
+        "YP_009724392",
+        "YP_009724393",
+        "YP_009724394",
+        "YP_009724395",
+        "YP_009724396",
+        "YP_009724397",
+        "YP_009725255",
+        "YP_009725295",
+        "YP_009725318",
+        "set:NC_045512"
+    ],
+    "gi_i": 1798174254,
+    "_version_": 1773711315042304000
+}
diff --git a/fetch-from-ncbi-virus b/fetch-from-ncbi-virus
@@ -0,0 +1,55 @@
+#!/bin/bash
+# usage: fetch-from-ncbi-virus [options] <ncbi_taxon_id> <github_repo>
+#
+# Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/)
+# and output NDJSON records to stdout.
+#
+# options:
+#
+#   --filter=<filter_query>                                 Filter criteria to add as `fq` param values for the NCBI Virus URL
+#                                                           May be specified multiple times.
+#
+#   --field=<output_column_name>:<ncbi_virus_field_name>    Metadata fields to add as `fl` param values for the NCBI Virus URL
+#                                                           May be specified multiple times.
+#
+# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest:
+#   https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank
+#
+set -euo pipefail  # exit on command failure, unset variables, and failed pipeline stages
+
+bin="$(dirname "$0")"  # directory part of the path this script was invoked as
+
+
+main() {
+    declare -a filters
+    declare -a fields
+
+    for arg; do
+        case "$arg" in
+            --filter=*)
+                filters+=("${arg#*=}")
+                shift;;
+            --field=*)
+                fields+=("${arg#*=}")
+                shift;;
+            *)
+                break;;
+        esac
+    done
+
+    local ncbi_taxon_id="${1:?NCBI taxon id is required.}"
+    local github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}"
+
+    local ncbi_virus_url
+    ncbi_virus_url="$("$bin"/ncbi-virus-url --ncbi-taxon-id "$ncbi_taxon_id" --filters "${filters[@]}" --fields "${fields[@]}")"
+
+    fetch "$ncbi_virus_url" "$github_repo" | "$bin"/csv-to-ndjson
+}
+
+fetch() {  # fetch <url> <owner/repo>: download <url> with curl, writing the body to stdout
+    local url="$1" repo="$2"
+    curl --fail --silent --show-error --http1.1 \
+        --header "User-Agent: https://github.com/$repo (hello@nextstrain.org)" "$url"
+}
+
+main "$@"  # pass every command-line argument through to main, word boundaries intact