-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1197: Curate titlecase
- Loading branch information
Showing
6 changed files
with
247 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
""" | ||
Applies titlecase to string fields in a metadata record | ||
""" | ||
import re | ||
from typing import Optional, Set, Union | ||
|
||
from augur.errors import AugurError | ||
from augur.io.print import print_err | ||
from augur.types import DataErrorMethod | ||
|
||
def register_parser(parent_subparsers):
    """
    Register the ``titlecase`` subcommand on *parent_subparsers*.

    The new parser inherits from ``parent_subparsers.shared_parser`` and uses
    this module's docstring as its help text. Returns the created parser.
    """
    parser = parent_subparsers.add_parser(
        "titlecase",
        parents=[parent_subparsers.shared_parser],
        help=__doc__,
    )

    # Options the user must always supply.
    required_group = parser.add_argument_group(title="REQUIRED")
    required_group.add_argument(
        "--titlecase-fields",
        nargs="*",
        required=True,
        help="List of fields to convert to titlecase.",
    )

    # Options that tune how individual words are cased or how failures are reported.
    optional_group = parser.add_argument_group(title="OPTIONAL")
    optional_group.add_argument(
        "--articles",
        nargs="*",
        help="List of articles that should not be converted to titlecase.",
    )
    optional_group.add_argument(
        "--abbreviations",
        nargs="*",
        help="List of abbreviations that should not be converted to titlecase, keeps uppercase.",
    )
    optional_group.add_argument(
        "--failure-reporting",
        type=DataErrorMethod.argtype,
        choices=list(DataErrorMethod),
        default=DataErrorMethod.ERROR_FIRST,
        help="How should failed titlecase formatting be reported.",
    )

    return parser
|
||
|
||
def titlecase(text: Union[str, None], articles: Optional[Set[str]] = None, abbreviations: Optional[Set[str]] = None) -> Optional[str]:
    """
    Originally from nextstrain/ncov-ingest

    Returns a title cased copy of *text*, or None if *text* is not a string.

    Each word of *text* is handled as follows:

    * if its uppercased form is in *abbreviations*, it is returned in uppercase;
    * otherwise, if its casefolded form is in *articles* and it is not the
      first word of *text*, it is returned in lowercase;
    * otherwise it is converted with ``str.title()``.

    *articles* and *abbreviations* default to empty sets when omitted or None.

    >>> articles = {'a', 'and', 'of', 'the', 'le'}
    >>> abbreviations = {'USA', 'DC'}
    >>> titlecase("the night OF THE LIVING DEAD", articles)
    'The Night of the Living Dead'
    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
    'Braine-le-Comte, France'
    >>> titlecase("auvergne-RHÔNE-alpes", articles)
    'Auvergne-Rhône-Alpes'
    >>> titlecase("washington DC, usa", articles, abbreviations)
    'Washington DC, USA'
    """
    if not isinstance(text, str):
        return None

    # Use None sentinels instead of mutable `set()` defaults.
    if articles is None:
        articles = set()
    if abbreviations is None:
        abbreviations = set()

    def changecase(index, word):
        upper = word.upper()
        if upper in abbreviations:
            return upper

        # Splitting on r'\b' yields alternating runs of non-word and word
        # characters, starting with a (possibly empty) non-word run, so
        # index 1 is always the first word. It is capitalized even when it
        # is an article.
        if index != 1 and word.casefold() in articles:
            return word.lower()

        return word.title()

    return ''.join(
        changecase(index, word)
        for index, word in enumerate(re.split(r'\b', text))
    )
|
||
|
||
def run(args, records):
    """
    Titlecase the requested fields of every record.

    Yields a copy of each record in *records* in which every field listed in
    ``args.titlecase_fields`` has been passed through ``titlecase`` together
    with ``args.articles`` and ``args.abbreviations``. Fields that are absent
    from the record or whose value is None are left as they are.

    When ``titlecase`` returns None for a value, the value is left unchanged
    and the failure is handled according to ``args.failure_reporting``:

    * ``ERROR_FIRST``: raise AugurError immediately.
    * ``ERROR_ALL``: print each error, then raise AugurError with a summary
      after all records have been yielded.
    * ``WARN``: print each warning, then print a summary warning after all
      records have been yielded.
    * ``SILENT``: report nothing.

    Raises ValueError at the end if failures were recorded under any other
    reporting method.
    """
    failures = []
    failure_reporting = args.failure_reporting

    articles = set(args.articles) if args.articles else set()
    abbreviations = set(args.abbreviations) if args.abbreviations else set()

    for record_id, record in enumerate(records):
        record = record.copy()

        for field in args.titlecase_fields:
            # Ignore non-existent fields but could change this to warnings if desired
            if field not in record:
                continue

            value = record[field]
            if value is None:
                continue

            titlecased_string = titlecase(value, articles, abbreviations)
            if titlecased_string is not None:
                record[field] = titlecased_string
                continue

            # Build the message only on failure so the success path does not
            # pay for formatting (and repr-ing) every value.
            failure_message = f"Failed to titlecase {field!r}:{value!r} in record {record_id!r} because the value is a {type(value).__name__!r} and is not a string."
            if failure_reporting is DataErrorMethod.ERROR_FIRST:
                raise AugurError(failure_message)
            if failure_reporting is DataErrorMethod.ERROR_ALL:
                print_err(f"ERROR: {failure_message}")
            if failure_reporting is DataErrorMethod.WARN:
                print_err(f"WARNING: {failure_message}")

            # Keep track of failures for final summary
            failures.append((record_id, field, value))

        yield record

    if failure_reporting is not DataErrorMethod.SILENT and failures:
        failure_message = (
            "Unable to change to titlecase for the following (record, field, field value):\n"
            + '\n'.join(map(repr, failures))
        )
        if failure_reporting is DataErrorMethod.ERROR_ALL:
            raise AugurError(failure_message)

        elif failure_reporting is DataErrorMethod.WARN:
            print_err(f"WARNING: {failure_message}")

        else:
            raise ValueError(f"Encountered unhandled failure reporting method: {failure_reporting!r}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
================= | ||
titlecase | ||
================= | ||
|
||
.. argparse:: | ||
:module: augur | ||
:func: make_parser | ||
:prog: augur | ||
:path: curate titlecase |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
Setup | ||
|
||
$ pushd "$TESTDIR" > /dev/null | ||
$ export AUGUR="${AUGUR:-../../../../bin/augur}" | ||
|
||
|
||
Test output with articles and a mixture of lower and uppercase letters. | ||
|
||
$ echo '{"title": "the night OF THE LIVING DEAD"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "title" --articles "a" "and" "of" "the" "le" | ||
{"title": "The Night of the Living Dead"} | ||
|
||
Test output with hyphenated location. | ||
|
||
$ echo '{"location": "BRAINE-LE-COMTE, FRANCE"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "location" --articles "a" "and" "of" "the" "le" | ||
{"location": "Braine-le-Comte, France"} | ||
|
||
Test output with unicode characters | ||
|
||
$ echo '{"location": "Auvergne-Rhône-Alpes" }' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "location" | ||
{"location": "Auvergne-Rh\u00f4ne-Alpes"} | ||
|
||
Test output with abbreviations | ||
|
||
$ echo '{"city": "Washington DC, USA"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "city" --abbreviations "USA" "DC" | ||
{"city": "Washington DC, USA"} | ||
|
||
Test output with numbers | ||
|
||
$ echo '{"title": "2021 SARS-CoV"}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "title" --abbreviations "SARS" | ||
{"title": "2021 SARS-Cov"} | ||
|
||
Test output with only numbers | ||
|
||
$ echo '{"int": "2021", "float": "2021.10", "address": "2021.20.30" }' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "int" "float" "address" | ||
{"int": "2021", "float": "2021.10", "address": "2021.20.30"} | ||
|
||
Test case that passes on empty or null values | ||
|
||
$ echo '{"empty": "", "null_entry":null }' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "empty" "null_entry" | ||
{"empty": "", "null_entry": null} | ||
|
||
|
||
Test case that fails on a non-string int | ||
|
||
$ echo '{"bare_int": 2021}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "bare_int" | ||
ERROR: Failed to titlecase 'bare_int':2021 in record 0 because the value is a 'int' and is not a string. | ||
[2] | ||
|
||
Test case that fails on complex types (e.g. arrays) | ||
|
||
$ echo '{"an_array": ["hello", "world"]}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "an_array" | ||
ERROR: Failed to titlecase 'an_array':['hello', 'world'] in record 0 because the value is a 'list' and is not a string. | ||
[2] | ||
|
||
Test case for fields that do not exist in the record. These are currently ignored; erroring out instead could affect ingest pipelines. | ||
|
||
$ echo '{"region":"europe", "country":"france" }' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "region" "country" "division" "location" "not exist" | ||
{"region": "Europe", "country": "France"} | ||
|
||
Test output with non-string value input with `ERROR_ALL` failure reporting. | ||
This reports a collection of all titlecase failures which is especially beneficial for automated pipelines. | ||
|
||
$ echo '{"bare_int": 2021, "bare_float": 1.2}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "bare_int" "bare_float" \ | ||
> --failure-reporting "error_all" 1> /dev/null | ||
ERROR: Failed to titlecase 'bare_int':2021 in record 0 because the value is a 'int' and is not a string. | ||
ERROR: Failed to titlecase 'bare_float':1.2 in record 0 because the value is a 'float' and is not a string. | ||
ERROR: Unable to change to titlecase for the following (record, field, field value): | ||
(0, 'bare_int', 2021) | ||
(0, 'bare_float', 1.2) | ||
[2] | ||
|
||
Test warning on failures such as when encountering a non-string value. | ||
|
||
$ echo '{"bare_int": 2021}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "bare_int" \ | ||
> --failure-reporting "warn" | ||
WARNING: Failed to titlecase 'bare_int':2021 in record 0 because the value is a 'int' and is not a string. | ||
WARNING: Unable to change to titlecase for the following (record, field, field value): | ||
(0, 'bare_int', 2021) | ||
{"bare_int": 2021} | ||
|
||
Test silencing on failures such as when encountering a non-string value | ||
|
||
$ echo '{"bare_int": 2021}' \ | ||
> | ${AUGUR} curate titlecase --titlecase-fields "bare_int" \ | ||
> --failure-reporting "silent" | ||
{"bare_int": 2021} | ||
|