From ea69f00a09b876d11c3ee0a6ad5b0c114644ac6a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 1 Jul 2024 11:11:49 -0700 Subject: [PATCH 1/4] Add tests for augur curate format-dates Shows current behavior when the date field is empty or whitespace only. --- .../cram/format-dates/empty-date-field.t | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tests/functional/curate/cram/format-dates/empty-date-field.t diff --git a/tests/functional/curate/cram/format-dates/empty-date-field.t b/tests/functional/curate/cram/format-dates/empty-date-field.t new file mode 100644 index 000000000..7fde801ca --- /dev/null +++ b/tests/functional/curate/cram/format-dates/empty-date-field.t @@ -0,0 +1,20 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Test empty date value. +This currently has the unexpected behavior of returning the empty string. + + $ echo '{"record": 1, "date": ""}' \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" + {"record": 1, "date": ""} + +Test whitespace only date value. +This currently raises an error. + + $ echo '{"record": 1, "date": " "}' \ + > | ${AUGUR} curate format-dates \ + > --date-fields "date" + ERROR: Unable to format date string ' ' in field 'date' of record 0. + [2] From 06030e8211c2e266abdbb7d61e67a1c5750cfcdc Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 1 Jul 2024 11:22:28 -0700 Subject: [PATCH 2/4] format-dates: Return fully masked dates for empty date fields If a date field is empty, it should be completely masked to indicate that the date is unknown. 
Resolves --- augur/curate/format_dates.py | 12 +++++++++++- .../curate/cram/format-dates/empty-date-field.t | 11 ++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index 4fc94595a..7e39830cd 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -117,6 +117,10 @@ def format_date(date_string, expected_formats): >>> expected_formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d'] + >>> format_date("", expected_formats) + 'XXXX-XX-XX' + >>> format_date(" ", expected_formats) + 'XXXX-XX-XX' >>> format_date("01-01", expected_formats) 'XXXX-XX-XX' >>> format_date("2020", expected_formats) @@ -133,6 +137,10 @@ def format_date(date_string, expected_formats): '2020-01-15' """ + date_string = date_string.strip() + if date_string == '': + return 'XXXX-XX-XX' + for date_format in expected_formats: try: parsed_date = datetime.strptime(date_string, date_format) @@ -180,7 +188,9 @@ def run(args, records): for field in args.date_fields: date_string = record.get(field) - if not date_string: + # TODO: This should raise an error if the expected date field does + # not exist in the the record + if date_string is None: continue formatted_date_string = format_date(date_string, args.expected_date_formats) diff --git a/tests/functional/curate/cram/format-dates/empty-date-field.t b/tests/functional/curate/cram/format-dates/empty-date-field.t index 7fde801ca..da33d5d01 100644 --- a/tests/functional/curate/cram/format-dates/empty-date-field.t +++ b/tests/functional/curate/cram/format-dates/empty-date-field.t @@ -2,19 +2,16 @@ Setup $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" -Test empty date value. -This currently has the unexpected behavior of returning the empty string. +Test empty date value, which should be returned as a fully masked date. 
$ echo '{"record": 1, "date": ""}' \ > | ${AUGUR} curate format-dates \ > --date-fields "date" - {"record": 1, "date": ""} + {"record": 1, "date": "XXXX-XX-XX"} -Test whitespace only date value. -This currently raises an error. +Test whitespace only date value, which should be returned as a fully masked date. $ echo '{"record": 1, "date": " "}' \ > | ${AUGUR} curate format-dates \ > --date-fields "date" - ERROR: Unable to format date string ' ' in field 'date' of record 0. - [2] + {"record": 1, "date": "XXXX-XX-XX"} From ff7c859f4cb5f755d51865eeed4a4c5ac227c5d9 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 1 Jul 2024 13:29:11 -0700 Subject: [PATCH 3/4] format-dates: Error on date fields that do not exist The user should be providing date fields that exist in the record. Raise a loud error if a field does not exist! --- augur/curate/format_dates.py | 4 +--- .../cram/format-dates/date-field-not-found-error.t | 11 +++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 tests/functional/curate/cram/format-dates/date-field-not-found-error.t diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py index 7e39830cd..063096de4 100644 --- a/augur/curate/format_dates.py +++ b/augur/curate/format_dates.py @@ -188,10 +188,8 @@ def run(args, records): for field in args.date_fields: date_string = record.get(field) - # TODO: This should raise an error if the expected date field does - # not exist in the the record if date_string is None: - continue + raise AugurError(f"Expected date field {field!r} not found in record {record_id!r}.") formatted_date_string = format_date(date_string, args.expected_date_formats) if formatted_date_string is None: diff --git a/tests/functional/curate/cram/format-dates/date-field-not-found-error.t b/tests/functional/curate/cram/format-dates/date-field-not-found-error.t new file mode 100644 index 000000000..6d5162e13 --- /dev/null +++ b/tests/functional/curate/cram/format-dates/date-field-not-found-error.t 
@@ -0,0 +1,11 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Providing a date field that does not exist in the record should result in an error. + + $ echo '{"record": 1, "date": "2024-01-01"}' \ + > | ${AUGUR} curate format-dates \ + > --date-fields "bad-date-field" + ERROR: Expected date field 'bad-date-field' not found in record 0. + [2] From c2f29cf974cf69a37cfe8c4c91cafbd4db9231a0 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 1 Jul 2024 16:18:07 -0700 Subject: [PATCH 4/4] Update changelog --- CHANGES.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 37a7b6881..9feeb20d7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,10 @@ ## __NEXT__ +### Major changes + +* curate format-dates: Raises an error if a provided date field does not exist in a record. [#1509][] (@joverlee521) + ### Features * Added a new sub-command `augur curate apply-geolocation-rules` to apply user curated geolocation rules to the geolocation fields in a metadata file. Previously, this was available as a script within the nextstrain/ingest repo. [#1491][] (@victorlin) @@ -15,6 +19,7 @@ * filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin) * curate: Stop adding double quotes to the metadata TSV output when field values have internal quotes. [#1493][] (@joverlee521) +* curate format-dates: Mask empty or whitespace-only date values as `XXXX-XX-XX` to represent unknown dates. [#1509][] (@joverlee521) [#1466]: https://github.com/nextstrain/augur/pull/1466 [#1490]: https://github.com/nextstrain/augur/pull/1490 @@ -22,6 +27,7 @@ [#1493]: https://github.com/nextstrain/augur/pull/1493 [#1495]: https://github.com/nextstrain/augur/pull/1495 [#1501]: https://github.com/nextstrain/augur/pull/1501 +[#1509]: https://github.com/nextstrain/augur/pull/1509 ## 24.4.0 (15 May 2024)