git subrepo pull (merge) ingest/vendored
subrepo:
  subdir:   "ingest/vendored"
  merged:   "d141c04"
upstream:
  origin:   "https://github.com/nextstrain/ingest"
  branch:   "main"
  commit:   "d141c04"
git-subrepo:
  version:  "0.4.6"
  origin:   "https://github.com/ingydotnet/git-subrepo"
  commit:   "110b9eb"
victorlin committed Aug 18, 2023
1 parent 1f38f62 commit 80fc4c5
Showing 5 changed files with 83 additions and 8 deletions.
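A commit like this is produced by running git-subrepo from the repository root; a minimal sketch of the command involved, assuming git-subrepo 0.4.6 and an existing ingest/vendored/.gitrepo that records the upstream remote and branch:

    git subrepo pull ingest/vendored

git-subrepo fetches the upstream main branch, merges it into the ingest/vendored subdirectory (per the `method = merge` setting), and rewrites the `commit` and `parent` pins in .gitrepo, which is exactly what the first file in the diff below shows.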
4 changes: 2 additions & 2 deletions ingest/vendored/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 remote = https://github.com/nextstrain/ingest
 branch = main
-commit = 5d908187d13cbce27253af5163b4803d6bac03a6
-parent = 1cdd1970924fed3fca67db43e8cb8f3de010ec37
+commit = d141c04ac38796cd26366207f43454f75b3d638b
+parent = 1f38f623d493bbafc25a4ab60226a4bd59ef8f6d
 method = merge
 cmdver = 0.4.6
5 changes: 5 additions & 0 deletions ingest/vendored/README.md
@@ -69,6 +69,11 @@ Scripts for supporting ingest workflow automation that don’t really belong in
 - [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message`
   A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated.
 
+NCBI interaction scripts that are useful for fetching public metadata and sequences.
+
+- [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file.
+  Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/) or [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs.
+
 Potential Nextstrain CLI scripts
 
 - [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts.
70 changes: 70 additions & 0 deletions ingest/vendored/fetch-from-ncbi-entrez
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+Fetch metadata and nucleotide sequences from NCBI Entrez and output to a GenBank file.
+"""
+import json
+import argparse
+from Bio import SeqIO, Entrez
+
+# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request
+# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+# However, in my testing with HepB, the max records returned was 9,999
+# - Jover, 16 August 2023
+BATCH_SIZE = 9999
+
+Entrez.email = "hello@nextstrain.org"
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--term', required=True, type=str,
+        help='Genbank search term. Replace spaces with "+", e.g. "Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]"')
+    parser.add_argument('--output', required=True, type=str, help='Output file (Genbank)')
+    return parser.parse_args()
+
+
+def get_esearch_history(term):
+    """
+    Search for the provided *term* via ESearch and store the results using the
+    Entrez history server.¹
+
+    Returns the total count of returned records, query key, and web env needed
+    to access the records from the server.
+
+    ¹ https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Using_the_Entrez_History_Server
+    """
+    handle = Entrez.esearch(db="nucleotide", term=term, retmode="json", usehistory="y", retmax=0)
+    esearch_result = json.loads(handle.read())['esearchresult']
+    print(f"Search term {term!r} returned {esearch_result['count']} IDs.")
+    return {
+        "count": int(esearch_result["count"]),
+        "query_key": esearch_result["querykey"],
+        "web_env": esearch_result["webenv"]
+    }
+
+
+def fetch_from_esearch_history(count, query_key, web_env):
+    """
+    Fetch records in batches from Entrez history server using the provided
+    *query_key* and *web_env* and yields them as a BioPython SeqRecord iterator.
+    """
+    print(f"Fetching GenBank records in batches of n={BATCH_SIZE}")
+
+    for start in range(0, count, BATCH_SIZE):
+        handle = Entrez.efetch(
+            db="nucleotide",
+            query_key=query_key,
+            webenv=web_env,
+            retstart=start,
+            retmax=BATCH_SIZE,
+            rettype="gb",
+            retmode="text")
+
+        yield SeqIO.parse(handle, "genbank")
+
+
+if __name__=="__main__":
+    args = parse_args()
+
+    with open(args.output, "w") as output_handle:
+        for batch_results in fetch_from_esearch_history(**get_esearch_history(args.term)):
+            SeqIO.write(batch_results, output_handle, "genbank")
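A usage sketch for the new script, assuming Biopython is installed and borrowing the Hepatitis B search term from the script's own --help example (the output filename is illustrative):

    ./fetch-from-ncbi-entrez \
        --term "Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]" \
        --output hepatitisB.gb

Because fetch_from_esearch_history yields one SeqIO.parse iterator per batch and each batch is written to the output file immediately, memory use stays bounded even when a query matches far more than 9,999 records.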
8 changes: 4 additions & 4 deletions ingest/vendored/trigger
@@ -3,7 +3,7 @@ set -euo pipefail
 
 : "${PAT_GITHUB_DISPATCH:=}"
 
-repo="${1:?A repository name is required as the first argument.}"
+github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}"
 event_type="${2:?An event type is required as the second argument.}"
 shift 2
 
@@ -12,7 +12,7 @@ if [[ $# -eq 0 && -z $PAT_GITHUB_DISPATCH ]]; then
 You must specify options to curl for your GitHub credentials. For example, you
 can specify your GitHub username, and will be prompted for your password:
 
-  $0 $repo $event_type --user <your-github-username>
+  $0 $github_repo $event_type --user <your-github-username>
 
 Be sure to enter a personal access token¹ as your password since GitHub has
 discontinued password authentication to the API starting on November 13, 2020².
@@ -26,7 +26,7 @@ file³:
 
 and then tell curl to use it:
 
-  $0 $repo $event_type --netrc
+  $0 $github_repo $event_type --netrc
 
 which will then not require you to type your password every time.
 
@@ -42,7 +42,7 @@ if [[ -n $PAT_GITHUB_DISPATCH ]]; then
     auth="Authorization: Bearer ${PAT_GITHUB_DISPATCH}"
 fi
 
-if curl -fsS "https://api.github.com/repos/nextstrain/${repo}/dispatches" \
+if curl -fsS "https://api.github.com/repos/${github_repo}/dispatches" \
     -H 'Accept: application/vnd.github.v3+json' \
     -H 'Content-Type: application/json' \
     -H "$auth" \
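With this change, callers pass a full owner/repo slug instead of a bare repository name that the script used to prefix with nextstrain/. A sketch of the new calling convention, using a hypothetical repository and event type:

    ./trigger nextstrain/zika rebuild --netrc

The equivalent call before this commit would have been `./trigger zika rebuild --netrc`, since the owner was hard-coded into the dispatches URL.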
4 changes: 2 additions & 2 deletions ingest/vendored/trigger-on-new-data
@@ -5,7 +5,7 @@ set -euo pipefail
 
 bin="$(dirname "$0")"
 
-repo="${1:?A repository name is required as the first argument.}"
+github_repo="${1:?A GitHub repository with owner and repository name is required as the first argument.}"
 event_type="${2:?An event type is required as the second argument.}"
 metadata="${3:?A metadata upload output file is required as the third argument.}"
 sequences="${4:?A sequence FASTA upload output file is required as the fourth argument.}"
@@ -19,7 +19,7 @@ slack_message=""
 # grep exit status 0 for found match, 1 for no match, 2 if an error occurred
 if [[ $new_metadata -eq 1 || $new_sequences -eq 1 ]]; then
     slack_message="Triggering new builds due to updated metadata and/or sequences"
-    "$bin"/trigger "$repo" "$event_type"
+    "$bin"/trigger "$github_repo" "$event_type"
 elif [[ $new_metadata -eq 0 && $new_sequences -eq 0 ]]; then
     slack_message="Skipping trigger of rebuild: Both metadata TSV and sequences FASTA are identical to S3 files."
 else
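The wrapper script takes the same new owner/repo form, plus the two upload output files it checks for the identical_file_message. A sketch with hypothetical arguments (the file paths are illustrative, not part of this commit):

    ./trigger-on-new-data nextstrain/zika rebuild \
        data/metadata-upload.done data/sequences-upload.done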
