From 8ecd4eb49cd37114922e7e8458471feb3c84adec Mon Sep 17 00:00:00 2001
From: John SJ Anderson <janders4@fredhutch.org>
Date: Mon, 1 Jul 2024 10:22:39 -0700
Subject: [PATCH 1/3] cp ingest/transform-genbank-location
 augur/curate/parse_genbank_location [#1485]

---
 augur/curate/parse_genbank_location.py | 59 ++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 augur/curate/parse_genbank_location.py
diff --git a/augur/curate/parse_genbank_location.py b/augur/curate/parse_genbank_location.py
new file mode 100644
index 000000000..010955aea
--- /dev/null
+++ b/augur/curate/parse_genbank_location.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate
+fields: 'country', 'division', and 'location'. Checks that a record is from
+GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq".
+
+Outputs the modified record to stdout.
+"""
+import json
+from sys import stdin, stderr, stdout
+
+
+def parse_location(record: dict) -> dict:
+    # Expected pattern for the location field is "<country_value>[:<region>][, <locality>]"
+    # See GenBank docs for their "country" field:
+    # https://www.ncbi.nlm.nih.gov/genbank/collab/country/
+    location_field = record.get("location", "")
+    if not location_field:
+        print(
+            "`transform-genbank-location` requires a `location` field; this record does not have one.",
+            file=stderr,
+        )
+        # bail early because we're not gonna make any changes
+        return record
+
+    geographic_data = location_field.split(':')
+
+    country = geographic_data[0]
+    division = ''
+    location = ''
+
+    if len(geographic_data) == 2:
+        division , _ , location = geographic_data[1].partition(',')
+
+    record['country'] = country.strip()
+    record['division'] = division.strip()
+    record['location'] = location.strip()
+
+    return record
+
+
+if __name__ == '__main__':
+
+    for record in stdin:
+        record = json.loads(record)
+
+        database = record.get('database', '')
+        if database in {'GenBank', 'RefSeq'}:
+            parse_location(record)
+        else:
+            if database:
+                error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq"."""
+            else:
+                error_msg = "Record must contain `database` field to use `transform-genbank-location.`"
+
+            print(error_msg, file=stderr)
+
+        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
+        print()

From 2100393f7842c906e36390e4bab530d77c191da5 Mon Sep 17 00:00:00 2001
From: John SJ Anderson <janders4@fredhutch.org>
Date: Mon, 1 Jul 2024 12:14:10 -0700
Subject: [PATCH 2/3] Port `parse-genbank-location` into `augur curate` style
 [#1485]

* Convert script over to expected sub-command style
* Add `--location-field` argument to preserve back-compat with older
  data files
* Add type hints throughout
* Add tests
---
 augur/curate/__init__.py                      |  3 +-
 augur/curate/parse_genbank_location.py        | 85 ++++++++++++-------
 .../parse-genbank-location/default-behavior.t | 33 +++++++
 .../cram/parse-genbank-location/errors.t      | 31 +++++++
 4 files changed, 122 insertions(+), 30 deletions(-)
 create mode 100644 tests/functional/curate/cram/parse-genbank-location/default-behavior.t
 create mode 100644 tests/functional/curate/cram/parse-genbank-location/errors.t

diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
index 211e093ef..fce520e6b 100644
--- a/augur/curate/__init__.py
+++ b/augur/curate/__init__.py
@@ -12,7 +12,7 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors
+from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location
 
 
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
@@ -24,6 +24,7 @@
     apply_geolocation_rules,
     apply_record_annotations,
     abbreviate_authors,
+    parse_genbank_location,
 ]
 
 
diff --git a/augur/curate/parse_genbank_location.py b/augur/curate/parse_genbank_location.py
index 010955aea..35c55aa2d 100644
--- a/augur/curate/parse_genbank_location.py
+++ b/augur/curate/parse_genbank_location.py
@@ -1,59 +1,86 @@
-#!/usr/bin/env python3
 """
-Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate
-fields: 'country', 'division', and 'location'. Checks that a record is from
-GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq".
+Parses GenBank's 'location' field of the NDJSON record to 3 separate
+fields: 'country', 'division', and 'location'.
 
-Outputs the modified record to stdout.
+Checks that a record is from GenBank by verifying that the 'database'
+field has a value of "GenBank" or "RefSeq".
 """
-import json
-from sys import stdin, stderr, stdout
 
+import argparse
+from typing import Generator, List
+from augur.io.print import print_err
+from augur.utils import first_line
 
-def parse_location(record: dict) -> dict:
-    # Expected pattern for the location field is "<country_value>[:<region>][, <locality>]"
+
+def parse_location(
+    record: dict,
+    location_field_name: str,
+) -> dict:
+    # Expected pattern for the location field is
+    # "<country_value>[:<region>][, <locality>]"
+    #
     # See GenBank docs for their "country" field:
     # https://www.ncbi.nlm.nih.gov/genbank/collab/country/
-    location_field = record.get("location", "")
+    location_field = record.get(location_field_name, "")
     if not location_field:
-        print(
-            "`transform-genbank-location` requires a `location` field; this record does not have one.",
-            file=stderr,
+        print_err(
+            f"`parse-genbank-location` requires a `{location_field_name}` field; this record does not have one.",
         )
         # bail early because we're not gonna make any changes
         return record
 
-    geographic_data = location_field.split(':')
+    geographic_data = location_field.split(":")
 
     country = geographic_data[0]
-    division = ''
-    location = ''
+    division = ""
+    location = ""
 
     if len(geographic_data) == 2:
-        division , _ , location = geographic_data[1].partition(',')
+        division, _, location = geographic_data[1].partition(",")
 
-    record['country'] = country.strip()
-    record['division'] = division.strip()
-    record['location'] = location.strip()
+    record["country"] = country.strip()
+    record["division"] = division.strip()
+    record["location"] = location.strip()
 
     return record
 
 
-if __name__ == '__main__':
+def register_parser(
+    parent_subparsers: argparse._SubParsersAction,
+) -> argparse._SubParsersAction:
+    parser = parent_subparsers.add_parser(
+        "parse-genbank-location",
+        parents=[parent_subparsers.shared_parser],  # type: ignore
+        help=first_line(__doc__),
+    )
+
+    parser.add_argument(
+        "--location-field",
+        default="geo_loc_name",
+        help="The field containing the location, "
+        + "in the format `<geo_loc_name>[:<region>][, <locality>]`",
+    )
+
+    return parser
 
-    for record in stdin:
-        record = json.loads(record)
 
-        database = record.get('database', '')
-        if database in {'GenBank', 'RefSeq'}:
-            parse_location(record)
+def run(
+    args: argparse.Namespace,
+    records: List[dict],
+) -> Generator[dict, None, None]:
+    for record in records:
+        database = record.get("database", "")
+        if database in {"GenBank", "RefSeq"}:
+            parse_location(
+                record,
+                args.location_field,
+            )
         else:
             if database:
                 error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq"."""
             else:
                 error_msg = "Record must contain `database` field to use `transform-genbank-location.`"
 
-            print(error_msg, file=stderr)
+            print_err(error_msg)
 
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
-        print()
+        yield record
diff --git a/tests/functional/curate/cram/parse-genbank-location/default-behavior.t b/tests/functional/curate/cram/parse-genbank-location/default-behavior.t
new file mode 100644
index 000000000..4925aaaab
--- /dev/null
+++ b/tests/functional/curate/cram/parse-genbank-location/default-behavior.t
@@ -0,0 +1,33 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Running the command with no arguments produces the expected output
+
+  $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"GenBank"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  {"geo_loc_name": "Canada:Vancouver", "database": "GenBank", "country": "Canada", "division": "Vancouver", "location": ""}
+
+`--location-field` can be used to specify a different field name
+
+  $ echo '{"location":"Canada:Vancouver", "database":"GenBank"}' \
+  >   | ${AUGUR} curate parse-genbank-location --location-field location
+  {"location": "", "database": "GenBank", "country": "Canada", "division": "Vancouver"}
+
+`RefSeq` works as a database value
+
+  $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"RefSeq"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  {"geo_loc_name": "Canada:Vancouver", "database": "RefSeq", "country": "Canada", "division": "Vancouver", "location": ""}
+
+Record with only a `country` value results in expected output
+
+  $ echo '{"geo_loc_name":"Canada", "database":"GenBank"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  {"geo_loc_name": "Canada", "database": "GenBank", "country": "Canada", "division": "", "location": ""}
+
+Record with `country`, `region`, and `location` values results in expected output
+
+  $ echo '{"geo_loc_name":"France:Cote d'\''Azur, Antibes", "database":"GenBank"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  {"geo_loc_name": "France:Cote d'Azur, Antibes", "database": "GenBank", "country": "France", "division": "Cote d'Azur", "location": "Antibes"}
diff --git a/tests/functional/curate/cram/parse-genbank-location/errors.t b/tests/functional/curate/cram/parse-genbank-location/errors.t
new file mode 100644
index 000000000..fa01784e7
--- /dev/null
+++ b/tests/functional/curate/cram/parse-genbank-location/errors.t
@@ -0,0 +1,31 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Records without a `database` field result in the expected warning
+
+  $ echo '{"geo_loc_name":"Canada:Vancouver"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  Record must contain `database` field to use `transform-genbank-location.`
+  {"geo_loc_name": "Canada:Vancouver"}
+
+Records with a `database` field with an unsupported value result in the expected warning
+
+  $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"database"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  Database value of database not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".
+  {"geo_loc_name": "Canada:Vancouver", "database": "database"}
+
+Records without a `location` field result in the expected warning
+
+  $ echo '{"database":"GenBank"}' \
+  >   | ${AUGUR} curate parse-genbank-location
+  `parse-genbank-location` requires a `geo_loc_name` field; this record does not have one.
+  {"database": "GenBank"}
+
+Records without a `location` field and a custom `--location-field` result in the expected warning
+
+  $ echo '{"database":"GenBank"}' \
+  >   | ${AUGUR} curate parse-genbank-location --location-field location
+  `parse-genbank-location` requires a `location` field; this record does not have one.
+  {"database": "GenBank"}

From ee3be712642df7cb20da39815b09c1734c990b94 Mon Sep 17 00:00:00 2001
From: John SJ Anderson <janders4@fredhutch.org>
Date: Mon, 1 Jul 2024 13:56:03 -0700
Subject: [PATCH 3/3] Update CHANGES.md

---
 CHANGES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 4a7c19f7a..48a91c47d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,7 +8,7 @@
 * Added a default color for the "Asia" region that will be used in `augur export` is no custom colors are provided. [#1490][] (@joverlee521)
 * Added a new sub-command `augur curate apply-record-annotations` to apply user curated annotations to existing fields in a metadata file. Previously, this was available as a `merge-user-metadata` in the nextstrain/ingest repo. [#1495][] (@joverlee521)
 * Added a new sub-command `augur curate abbreviate-authors` to abbreviate lists of authors to "<first author> et al." Previously, this was avaliable as the `transform-authors` script within the nextstrain/ingest repo. [#1483][] (@genehack)
-
+* Added a new sub-command `augur curate parse-genbank-location` to parse the `geo_loc_name` field from GenBank reconds. Previously, this was available as the `translate-genbank-location` script within the nextstrain/ingest repo. [#1485][] (@genehack)
 ### Bug Fixes
 
 * filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin)