From 8ecd4eb49cd37114922e7e8458471feb3c84adec Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Mon, 1 Jul 2024 10:22:39 -0700 Subject: [PATCH 1/3] cp ingest/transform-genbank-location augur/curate/parse_genbank_location [#1485] --- augur/curate/parse_genbank_location.py | 59 ++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 augur/curate/parse_genbank_location.py diff --git a/augur/curate/parse_genbank_location.py b/augur/curate/parse_genbank_location.py new file mode 100644 index 000000000..010955aea --- /dev/null +++ b/augur/curate/parse_genbank_location.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +""" +Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate +fields: 'country', 'division', and 'location'. Checks that a record is from +GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". + +Outputs the modified record to stdout. +""" +import json +from sys import stdin, stderr, stdout + + +def parse_location(record: dict) -> dict: + # Expected pattern for the location field is "[:][, ]" + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + location_field = record.get("location", "") + if not location_field: + print( + "`transform-genbank-location` requires a `location` field; this record does not have one.", + file=stderr, + ) + # bail early because we're not gonna make any changes + return record + + geographic_data = location_field.split(':') + + country = geographic_data[0] + division = '' + location = '' + + if len(geographic_data) == 2: + division , _ , location = geographic_data[1].partition(',') + + record['country'] = country.strip() + record['division'] = division.strip() + record['location'] = location.strip() + + return record + + +if __name__ == '__main__': + + for record in stdin: + record = json.loads(record) + + database = record.get('database', '') + if database in {'GenBank', 'RefSeq'}: + parse_location(record) + else: + if database: + error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".""" + else: + error_msg = "Record must contain `database` field to use `transform-genbank-location.`" + + print(error_msg, file=stderr) + + json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') + print() From 2100393f7842c906e36390e4bab530d77c191da5 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Mon, 1 Jul 2024 12:14:10 -0700 Subject: [PATCH 2/3] Port `parse-genbank-location` into `augur curate` style [#1485] * Convert script over to expected sub-command style * Add `--location-field` argument to preserve back-compat with older data files * Add type hints throughout * Add tests --- augur/curate/__init__.py | 3 +- augur/curate/parse_genbank_location.py | 85 ++++++++++++------- .../parse-genbank-location/default-behavior.t | 33 +++++++ .../cram/parse-genbank-location/errors.t | 31 +++++++ 4 files changed, 122 insertions(+), 30 deletions(-) create mode 100644 tests/functional/curate/cram/parse-genbank-location/default-behavior.t create mode 100644 tests/functional/curate/cram/parse-genbank-location/errors.t diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index 211e093ef..fce520e6b 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -12,7 +12,7 @@ from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.io.sequences import write_records_to_fasta from augur.types import DataErrorMethod -from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors +from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location SUBCOMMAND_ATTRIBUTE = '_curate_subcommand' @@ -24,6 +24,7 @@ apply_geolocation_rules, apply_record_annotations, abbreviate_authors, + parse_genbank_location, ] diff --git a/augur/curate/parse_genbank_location.py b/augur/curate/parse_genbank_location.py index 010955aea..35c55aa2d 100644 --- a/augur/curate/parse_genbank_location.py +++ b/augur/curate/parse_genbank_location.py @@ -1,59 +1,86 @@ -#!/usr/bin/env python3 """ -Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate -fields: 'country', 'division', and 'location'. Checks that a record is from -GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". +Parses GenBank's 'location' field of the NDJSON record to 3 separate +fields: 'country', 'division', and 'location'. -Outputs the modified record to stdout. +Checks that a record is from GenBank by verifying that the 'database' +field has a value of "GenBank" or "RefSeq". """ -import json -from sys import stdin, stderr, stdout +import argparse +from typing import Generator, List +from augur.io.print import print_err +from augur.utils import first_line -def parse_location(record: dict) -> dict: - # Expected pattern for the location field is "[:][, ]" + +def parse_location( + record: dict, + location_field_name: str, +) -> dict: + # Expected pattern for the location field is + # "[:][, ]" + # # See GenBank docs for their "country" field: # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ - location_field = record.get("location", "") + location_field = record.get(location_field_name, "") if not location_field: - print( - "`transform-genbank-location` requires a `location` field; this record does not have one.", - file=stderr, + print_err( + f"`parse-genbank-location` requires a `{location_field_name}` field; this record does not have one.", ) # bail early because we're not gonna make any changes return record - geographic_data = location_field.split(':') + geographic_data = location_field.split(":") country = geographic_data[0] - division = '' - location = '' + division = "" + location = "" if len(geographic_data) == 2: - division , _ , location = geographic_data[1].partition(',') + division, _, location = geographic_data[1].partition(",") - record['country'] = country.strip() - record['division'] = division.strip() - record['location'] = location.strip() + record["country"] = country.strip() + record["division"] = division.strip() + record["location"] = location.strip() return record -if __name__ == '__main__': +def register_parser( + parent_subparsers: argparse._SubParsersAction, +) -> argparse._SubParsersAction: + parser = parent_subparsers.add_parser( + "parse-genbank-location", + parents=[parent_subparsers.shared_parser], # type: ignore + help=first_line(__doc__), + ) + + parser.add_argument( + "--location-field", + default="geo_loc_name", + help="The field containing the location, " + + "in the format `[:][, ]`", + ) + + return parser - for record in stdin: - record = json.loads(record) - database = record.get('database', '') - if database in {'GenBank', 'RefSeq'}: - parse_location(record) +def run( + args: argparse.Namespace, + records: List[dict], +) -> Generator[dict, None, None]: + for record in records: + database = record.get("database", "") + if database in {"GenBank", "RefSeq"}: + parse_location( + record, + args.location_field, + ) else: if database: error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".""" else: error_msg = "Record must contain `database` field to use `transform-genbank-location.`" - print(error_msg, file=stderr) + print_err(error_msg) - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() + yield record diff --git a/tests/functional/curate/cram/parse-genbank-location/default-behavior.t b/tests/functional/curate/cram/parse-genbank-location/default-behavior.t new file mode 100644 index 000000000..4925aaaab --- /dev/null +++ b/tests/functional/curate/cram/parse-genbank-location/default-behavior.t @@ -0,0 +1,33 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Running the command with no arguments produces the expected output + + $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "Canada:Vancouver", "database": "GenBank", "country": "Canada", "division": "Vancouver", "location": ""} + +`--location-field` can be used to specify a different field name + + $ echo '{"location":"Canada:Vancouver", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location --location-field location + {"location": "", "database": "GenBank", "country": "Canada", "division": "Vancouver"} + +`RefSeq` works as a database value + + $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"RefSeq"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "Canada:Vancouver", "database": "RefSeq", "country": "Canada", "division": "Vancouver", "location": ""} + +Record with only a `country` value results in expected output + + $ echo '{"geo_loc_name":"Canada", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "Canada", "database": "GenBank", "country": "Canada", "division": "", "location": ""} + +Record with `country`, `region`, and `location` values results in expected output + + $ echo '{"geo_loc_name":"France:Cote d'\''Azur, Antibes", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "France:Cote d'Azur, Antibes", "database": "GenBank", "country": "France", "division": "Cote d'Azur", "location": "Antibes"} diff --git a/tests/functional/curate/cram/parse-genbank-location/errors.t b/tests/functional/curate/cram/parse-genbank-location/errors.t new file mode 100644 index 000000000..fa01784e7 --- /dev/null +++ b/tests/functional/curate/cram/parse-genbank-location/errors.t @@ -0,0 +1,31 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Records without a `database` field result in the expected warning + + $ echo '{"geo_loc_name":"Canada:Vancouver"}' \ + > | ${AUGUR} curate parse-genbank-location + Record must contain `database` field to use `transform-genbank-location.` + {"geo_loc_name": "Canada:Vancouver"} + +Records with a `database` field with an unsupported value result in the expected warning + + $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"database"}' \ + > | ${AUGUR} curate parse-genbank-location + Database value of database not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq". + {"geo_loc_name": "Canada:Vancouver", "database": "database"} + +Records without a `location` field result in the expected warning + + $ echo '{"database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + `parse-genbank-location` requires a `geo_loc_name` field; this record does not have one. + {"database": "GenBank"} + +Records without a `location` field and a custom `--location-field` result in the expected warning + + $ echo '{"database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location --location-field location + `parse-genbank-location` requires a `location` field; this record does not have one. + {"database": "GenBank"} From ee3be712642df7cb20da39815b09c1734c990b94 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Mon, 1 Jul 2024 13:56:03 -0700 Subject: [PATCH 3/3] Update CHANGES.md --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 4a7c19f7a..48a91c47d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,7 +8,7 @@ * Added a default color for the "Asia" region that will be used in `augur export` is no custom colors are provided. [#1490][] (@joverlee521) * Added a new sub-command `augur curate apply-record-annotations` to apply user curated annotations to existing fields in a metadata file. Previously, this was available as a `merge-user-metadata` in the nextstrain/ingest repo. [#1495][] (@joverlee521) * Added a new sub-command `augur curate abbreviate-authors` to abbreviate lists of authors to " et al." Previously, this was avaliable as the `transform-authors` script within the nextstrain/ingest repo. [#1483][] (@genehack) - +* Added a new sub-command `augur curate parse-genbank-location` to parse the `geo_loc_name` field from GenBank reconds. Previously, this was available as the `translate-genbank-location` script within the nextstrain/ingest repo. [#1485][] (@genehack) ### Bug Fixes * filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin)