diff --git a/CHANGES.md b/CHANGES.md index 4a7c19f7a..48a91c47d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,7 +8,7 @@ * Added a default color for the "Asia" region that will be used in `augur export` is no custom colors are provided. [#1490][] (@joverlee521) * Added a new sub-command `augur curate apply-record-annotations` to apply user curated annotations to existing fields in a metadata file. Previously, this was available as a `merge-user-metadata` in the nextstrain/ingest repo. [#1495][] (@joverlee521) * Added a new sub-command `augur curate abbreviate-authors` to abbreviate lists of authors to " et al." Previously, this was avaliable as the `transform-authors` script within the nextstrain/ingest repo. [#1483][] (@genehack) - +* Added a new sub-command `augur curate parse-genbank-location` to parse the `geo_loc_name` field from GenBank reconds. Previously, this was available as the `translate-genbank-location` script within the nextstrain/ingest repo. [#1485][] (@genehack) ### Bug Fixes * filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin) diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index 211e093ef..fce520e6b 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -12,7 +12,7 @@ from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.io.sequences import write_records_to_fasta from augur.types import DataErrorMethod -from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors +from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location SUBCOMMAND_ATTRIBUTE = '_curate_subcommand' @@ -24,6 +24,7 @@ apply_geolocation_rules, apply_record_annotations, abbreviate_authors, + parse_genbank_location, ] diff --git a/augur/curate/parse_genbank_location.py b/augur/curate/parse_genbank_location.py new file mode 100644 index 000000000..35c55aa2d --- /dev/null +++ b/augur/curate/parse_genbank_location.py @@ -0,0 +1,86 @@ +""" +Parses GenBank's 'location' field of the NDJSON record to 3 separate +fields: 'country', 'division', and 'location'. + +Checks that a record is from GenBank by verifying that the 'database' +field has a value of "GenBank" or "RefSeq". +""" + +import argparse +from typing import Generator, List +from augur.io.print import print_err +from augur.utils import first_line + + +def parse_location( + record: dict, + location_field_name: str, +) -> dict: + # Expected pattern for the location field is + # "[:][, ]" + # + # See GenBank docs for their "country" field: + # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ + location_field = record.get(location_field_name, "") + if not location_field: + print_err( + f"`parse-genbank-location` requires a `{location_field_name}` field; this record does not have one.", + ) + # bail early because we're not gonna make any changes + return record + + geographic_data = location_field.split(":") + + country = geographic_data[0] + division = "" + location = "" + + if len(geographic_data) == 2: + division, _, location = geographic_data[1].partition(",") + + record["country"] = country.strip() + record["division"] = division.strip() + record["location"] = location.strip() + + return record + + +def register_parser( + parent_subparsers: argparse._SubParsersAction, +) -> argparse._SubParsersAction: + parser = parent_subparsers.add_parser( + "parse-genbank-location", + parents=[parent_subparsers.shared_parser], # type: ignore + help=first_line(__doc__), + ) + + parser.add_argument( + "--location-field", + default="geo_loc_name", + help="The field containing the location, " + + "in the format `[:][, ]`", + ) + + return parser + + +def run( + args: argparse.Namespace, + records: List[dict], +) -> Generator[dict, None, None]: + for record in records: + database = record.get("database", "") + if database in {"GenBank", "RefSeq"}: + parse_location( + record, + args.location_field, + ) + else: + if database: + error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".""" + else: + error_msg = "Record must contain `database` field to use `transform-genbank-location.`" + + print_err(error_msg) + + yield record diff --git a/tests/functional/curate/cram/parse-genbank-location/default-behavior.t b/tests/functional/curate/cram/parse-genbank-location/default-behavior.t new file mode 100644 index 000000000..4925aaaab --- /dev/null +++ b/tests/functional/curate/cram/parse-genbank-location/default-behavior.t @@ -0,0 +1,33 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Running the command with no arguments produces the expected output + + $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "Canada:Vancouver", "database": "GenBank", "country": "Canada", "division": "Vancouver", "location": ""} + +`--location-field` can be used to specify a different field name + + $ echo '{"location":"Canada:Vancouver", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location --location-field location + {"location": "", "database": "GenBank", "country": "Canada", "division": "Vancouver"} + +`RefSeq` works as a database value + + $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"RefSeq"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "Canada:Vancouver", "database": "RefSeq", "country": "Canada", "division": "Vancouver", "location": ""} + +Record with only a `country` value results in expected output + + $ echo '{"geo_loc_name":"Canada", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "Canada", "database": "GenBank", "country": "Canada", "division": "", "location": ""} + +Record with `country`, `region`, and `location` values results in expected output + + $ echo '{"geo_loc_name":"France:Cote d'\''Azur, Antibes", "database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + {"geo_loc_name": "France:Cote d'Azur, Antibes", "database": "GenBank", "country": "France", "division": "Cote d'Azur", "location": "Antibes"} diff --git a/tests/functional/curate/cram/parse-genbank-location/errors.t b/tests/functional/curate/cram/parse-genbank-location/errors.t new file mode 100644 index 000000000..fa01784e7 --- /dev/null +++ b/tests/functional/curate/cram/parse-genbank-location/errors.t @@ -0,0 +1,31 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Records without a `database` field result in the expected warning + + $ echo '{"geo_loc_name":"Canada:Vancouver"}' \ + > | ${AUGUR} curate parse-genbank-location + Record must contain `database` field to use `transform-genbank-location.` + {"geo_loc_name": "Canada:Vancouver"} + +Records with a `database` field with an unsupported value result in the expected warning + + $ echo '{"geo_loc_name":"Canada:Vancouver", "database":"database"}' \ + > | ${AUGUR} curate parse-genbank-location + Database value of database not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq". + {"geo_loc_name": "Canada:Vancouver", "database": "database"} + +Records without a `location` field result in the expected warning + + $ echo '{"database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location + `parse-genbank-location` requires a `geo_loc_name` field; this record does not have one. + {"database": "GenBank"} + +Records without a `location` field and a custom `--location-field` result in the expected warning + + $ echo '{"database":"GenBank"}' \ + > | ${AUGUR} curate parse-genbank-location --location-field location + `parse-genbank-location` requires a `location` field; this record does not have one. + {"database": "GenBank"}