-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1508 from nextstrain/parse-gb-loc-1485
Port `translate-genbank-location` to `augur curate parse-genbank-location` [#1485]
- Loading branch information
Showing
5 changed files
with
153 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
""" | ||
Parses GenBank's 'location' field of the NDJSON record to 3 separate | ||
fields: 'country', 'division', and 'location'. | ||
Checks that a record is from GenBank by verifying that the 'database' | ||
field has a value of "GenBank" or "RefSeq". | ||
""" | ||
|
||
import argparse | ||
from typing import Generator, List | ||
from augur.io.print import print_err | ||
from augur.utils import first_line | ||
|
||
|
||
def parse_location( | ||
record: dict, | ||
location_field_name: str, | ||
) -> dict: | ||
# Expected pattern for the location field is | ||
# "<country_value>[:<region>][, <locality>]" | ||
# | ||
# See GenBank docs for their "country" field: | ||
# https://www.ncbi.nlm.nih.gov/genbank/collab/country/ | ||
location_field = record.get(location_field_name, "") | ||
if not location_field: | ||
print_err( | ||
f"`parse-genbank-location` requires a `{location_field_name}` field; this record does not have one.", | ||
) | ||
# bail early because we're not gonna make any changes | ||
return record | ||
|
||
geographic_data = location_field.split(":") | ||
|
||
country = geographic_data[0] | ||
division = "" | ||
location = "" | ||
|
||
if len(geographic_data) == 2: | ||
division, _, location = geographic_data[1].partition(",") | ||
|
||
record["country"] = country.strip() | ||
record["division"] = division.strip() | ||
record["location"] = location.strip() | ||
|
||
return record | ||
|
||
|
||
def register_parser( | ||
parent_subparsers: argparse._SubParsersAction, | ||
) -> argparse._SubParsersAction: | ||
parser = parent_subparsers.add_parser( | ||
"parse-genbank-location", | ||
parents=[parent_subparsers.shared_parser], # type: ignore | ||
help=first_line(__doc__), | ||
) | ||
|
||
parser.add_argument( | ||
"--location-field", | ||
default="geo_loc_name", | ||
help="The field containing the location, " | ||
+ "in the format `<geo_loc_name>[:<region>][, <locality>]`", | ||
) | ||
|
||
return parser | ||
|
||
|
||
def run( | ||
args: argparse.Namespace, | ||
records: List[dict], | ||
) -> Generator[dict, None, None]: | ||
for record in records: | ||
database = record.get("database", "") | ||
if database in {"GenBank", "RefSeq"}: | ||
parse_location( | ||
record, | ||
args.location_field, | ||
) | ||
else: | ||
if database: | ||
error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".""" | ||
else: | ||
error_msg = "Record must contain `database` field to use `transform-genbank-location.`" | ||
|
||
print_err(error_msg) | ||
|
||
yield record |
33 changes: 33 additions & 0 deletions
33
tests/functional/curate/cram/parse-genbank-location/default-behavior.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
Running the command with no arguments produces the expected output | ||
|
||
$ echo '{"geo_loc_name":"Canada:Vancouver", "database":"GenBank"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
{"geo_loc_name": "Canada:Vancouver", "database": "GenBank", "country": "Canada", "division": "Vancouver", "location": ""} | ||
|
||
`--location-field` can be used to specify a different field name | ||
|
||
$ echo '{"location":"Canada:Vancouver", "database":"GenBank"}' \ | ||
> | ${AUGUR} curate parse-genbank-location --location-field location | ||
{"location": "", "database": "GenBank", "country": "Canada", "division": "Vancouver"} | ||
|
||
`RefSeq` works as a database value | ||
|
||
$ echo '{"geo_loc_name":"Canada:Vancouver", "database":"RefSeq"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
{"geo_loc_name": "Canada:Vancouver", "database": "RefSeq", "country": "Canada", "division": "Vancouver", "location": ""} | ||
|
||
Record with only a `country` value results in expected output | ||
|
||
$ echo '{"geo_loc_name":"Canada", "database":"GenBank"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
{"geo_loc_name": "Canada", "database": "GenBank", "country": "Canada", "division": "", "location": ""} | ||
|
||
Record with `country`, `region`, and `location` values results in expected output | ||
|
||
$ echo '{"geo_loc_name":"France:Cote d'\''Azur, Antibes", "database":"GenBank"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
{"geo_loc_name": "France:Cote d'Azur, Antibes", "database": "GenBank", "country": "France", "division": "Cote d'Azur", "location": "Antibes"} |
31 changes: 31 additions & 0 deletions
31
tests/functional/curate/cram/parse-genbank-location/errors.t
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
Records without a `database` field result in the expected warning | ||
|
||
$ echo '{"geo_loc_name":"Canada:Vancouver"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
Record must contain `database` field to use `transform-genbank-location.` | ||
{"geo_loc_name": "Canada:Vancouver"} | ||
|
||
Records with a `database` field with an unsupported value result in the expected warning | ||
|
||
$ echo '{"geo_loc_name":"Canada:Vancouver", "database":"database"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
Database value of database not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq". | ||
{"geo_loc_name": "Canada:Vancouver", "database": "database"} | ||
|
||
Records without a `location` field result in the expected warning | ||
|
||
$ echo '{"database":"GenBank"}' \ | ||
> | ${AUGUR} curate parse-genbank-location | ||
`parse-genbank-location` requires a `geo_loc_name` field; this record does not have one. | ||
{"database": "GenBank"} | ||
|
||
Records without a `location` field and a custom `--location-field` result in the expected warning | ||
|
||
$ echo '{"database":"GenBank"}' \ | ||
> | ${AUGUR} curate parse-genbank-location --location-field location | ||
`parse-genbank-location` requires a `location` field; this record does not have one. | ||
{"database": "GenBank"} |