Skip to content

Commit

Permalink
Keep @modifiers when parsing locales (#947)
Browse files Browse the repository at this point in the history
Locale modifiers ("@Variants") are described in the GNU gettext
documentation like this:

> The ‘@variant’ can denote any kind of characteristics that is not
> already implied by the language ll and the country CC. […] It can also
> denote a dialect of the language, …

Whereas Babel previously discarded these, this patch stores the
modifier information in the `Locale` objects, handling string
representation accordingly.

Resolves: #946
Signed-off-by: martin f. krafft <madduck@madduck.net>
Co-authored-by: Aarni Koskela <akx@iki.fi>
  • Loading branch information
madduck and akx authored Jan 26, 2023
1 parent 6bf793a commit d019ed1
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 33 deletions.
115 changes: 84 additions & 31 deletions babel/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def __init__(
territory: str | None = None,
script: str | None = None,
variant: str | None = None,
modifier: str | None = None,
) -> None:
"""Initialize the locale object from the given identifier components.
Expand All @@ -181,6 +182,7 @@ def __init__(
:param territory: the territory (country or region) code
:param script: the script code
:param variant: the variant code
:param modifier: a modifier (following the '@' symbol, sometimes called '@variant')
:raise `UnknownLocaleError`: if no locale data is available for the
requested locale
"""
Expand All @@ -192,10 +194,13 @@ def __init__(
self.script = script
#: the variant code
self.variant = variant
#: the modifier
self.modifier = modifier
self.__data = None

identifier = str(self)
if not localedata.exists(identifier):
identifier_without_modifier = identifier.partition('@')[0]
if not localedata.exists(identifier_without_modifier):
raise UnknownLocaleError(identifier)

@classmethod
Expand Down Expand Up @@ -290,6 +295,11 @@ def parse(
>>> Locale.parse('und_AT')
Locale('de', territory='AT')
Modifiers are optional, and always at the end, separated by "@":
>>> Locale.parse('de_AT@euro')
Locale('de', territory='AT', modifier='euro')
:param identifier: the locale identifier string
:param sep: optional component separator
:param resolve_likely_subtags: if this is specified then a locale will
Expand Down Expand Up @@ -348,7 +358,11 @@ def _try_load_reducing(parts):
# implement ICU like fuzzy locale objects and provide a way to
# maximize and minimize locale tags.

language, territory, script, variant = parts
if len(parts) == 5:
language, territory, script, variant, modifier = parts
else:
language, territory, script, variant = parts
modifier = None
language = get_global('language_aliases').get(language, language)
territory = get_global('territory_aliases').get(territory, (territory,))[0]
script = get_global('script_aliases').get(script, script)
Expand All @@ -359,7 +373,7 @@ def _try_load_reducing(parts):
if script == 'Zzzz':
script = None

parts = language, territory, script, variant
parts = language, territory, script, variant, modifier

# First match: try the whole identifier
new_id = get_locale_identifier(parts)
Expand All @@ -373,41 +387,49 @@ def _try_load_reducing(parts):
# simplified identifier that is just the language
likely_subtag = get_global('likely_subtags').get(language)
if likely_subtag is not None:
language2, _, script2, variant2 = parse_locale(likely_subtag)
locale = _try_load_reducing((language2, territory, script2, variant2))
parts2 = parse_locale(likely_subtag)
if len(parts2) == 5:
language2, _, script2, variant2, modifier2 = parse_locale(likely_subtag)
else:
language2, _, script2, variant2 = parse_locale(likely_subtag)
modifier2 = None
locale = _try_load_reducing((language2, territory, script2, variant2, modifier2))
if locale is not None:
return locale

raise UnknownLocaleError(input_id)

def __eq__(self, other: object) -> bool:
for key in ('language', 'territory', 'script', 'variant'):
for key in ('language', 'territory', 'script', 'variant', 'modifier'):
if not hasattr(other, key):
return False
return (
self.language == getattr(other, 'language') and # noqa: B009
self.territory == getattr(other, 'territory') and # noqa: B009
self.script == getattr(other, 'script') and # noqa: B009
self.variant == getattr(other, 'variant') # noqa: B009
self.variant == getattr(other, 'variant') and # noqa: B009
self.modifier == getattr(other, 'modifier') # noqa: B009
)

def __ne__(self, other: object) -> bool:
return not self.__eq__(other)

def __hash__(self) -> int:
return hash((self.language, self.territory, self.script, self.variant))
return hash((self.language, self.territory, self.script,
self.variant, self.modifier))

def __repr__(self) -> str:
parameters = ['']
for key in ('territory', 'script', 'variant'):
for key in ('territory', 'script', 'variant', 'modifier'):
value = getattr(self, key)
if value is not None:
parameters.append(f"{key}={value!r}")
return f"Locale({self.language!r}{', '.join(parameters)})"

def __str__(self) -> str:
return get_locale_identifier((self.language, self.territory,
self.script, self.variant))
self.script, self.variant,
self.modifier))

@property
def _data(self) -> localedata.LocaleDataDict:
Expand All @@ -424,6 +446,11 @@ def get_display_name(self, locale: Locale | str | None = None) -> str | None:
>>> Locale('zh', 'CN', script='Hans').get_display_name('en')
u'Chinese (Simplified, China)'
Modifiers are currently passed through verbatim:
>>> Locale('it', 'IT', modifier='euro').get_display_name('en')
u'Italian (Italy, euro)'
:param locale: the locale to use
"""
if locale is None:
Expand All @@ -438,6 +465,8 @@ def get_display_name(self, locale: Locale | str | None = None) -> str | None:
details.append(locale.territories.get(self.territory))
if self.variant:
details.append(locale.variants.get(self.variant))
if self.modifier:
details.append(self.modifier)
details = filter(None, details)
if details:
retval += f" ({', '.join(details)})"
Expand Down Expand Up @@ -1115,9 +1144,12 @@ def negotiate_locale(preferred: Iterable[str], available: Iterable[str], sep: st
return None


def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str | None, str | None]:
def parse_locale(
identifier: str,
sep: str = '_'
) -> tuple[str, str | None, str | None, str | None, str | None]:
"""Parse a locale identifier into a tuple of the form ``(language,
territory, script, variant)``.
territory, script, variant, modifier)``.
>>> parse_locale('zh_CN')
('zh', 'CN', None, None)
Expand All @@ -1129,12 +1161,22 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
('en', '150', None, None)
>>> parse_locale('en_us_posix')
('en', 'US', None, 'POSIX')
>>> parse_locale('it_IT@euro')
('it', 'IT', None, None, 'euro')
>>> parse_locale('it_IT@custom')
('it', 'IT', None, None, 'custom')
>>> parse_locale('it_IT@')
('it', 'IT', None, None)
The default component separator is "_", but a different separator can be
specified using the `sep` parameter:
specified using the `sep` parameter.
The optional modifier is always separated with "@" and at the end:
>>> parse_locale('zh-CN', sep='-')
('zh', 'CN', None, None)
>>> parse_locale('zh-CN@custom', sep='-')
('zh', 'CN', None, None, 'custom')
If the identifier cannot be parsed into a locale, a `ValueError` exception
is raised:
Expand All @@ -1144,14 +1186,13 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
...
ValueError: 'not_a_LOCALE_String' is not a valid locale identifier
Encoding information and locale modifiers are removed from the identifier:
Encoding information is removed from the identifier, while modifiers are
kept:
>>> parse_locale('it_IT@euro')
('it', 'IT', None, None)
>>> parse_locale('en_US.UTF-8')
('en', 'US', None, None)
>>> parse_locale('de_DE.iso885915@euro')
('de', 'DE', None, None)
('de', 'DE', None, None, 'euro')
See :rfc:`4646` for more information.
Expand All @@ -1161,13 +1202,10 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
:raise `ValueError`: if the string does not appear to be a valid locale
identifier
"""
identifier, _, modifier = identifier.partition('@')
if '.' in identifier:
# this is probably the charset/encoding, which we don't care about
identifier = identifier.split('.', 1)[0]
if '@' in identifier:
# this is a locale modifier such as @euro, which we don't care about
# either
identifier = identifier.split('@', 1)[0]

parts = identifier.split(sep)
lang = parts.pop(0).lower()
Expand All @@ -1193,22 +1231,37 @@ def parse_locale(identifier: str, sep: str = '_') -> tuple[str, str | None, str
if parts:
raise ValueError(f"{identifier!r} is not a valid locale identifier")

return lang, territory, script, variant


def get_locale_identifier(tup: tuple[str, str | None, str | None, str | None], sep: str = '_') -> str:
# TODO(3.0): always return a 5-tuple
if modifier:
return lang, territory, script, variant, modifier
else:
return lang, territory, script, variant


def get_locale_identifier(
tup: tuple[str]
| tuple[str, str | None]
| tuple[str, str | None, str | None]
| tuple[str, str | None, str | None, str | None]
| tuple[str, str | None, str | None, str | None, str | None],
sep: str = "_",
) -> str:
"""The reverse of :func:`parse_locale`. It creates a locale identifier out
of a ``(language, territory, script, variant)`` tuple. Items can be set to
of a ``(language, territory, script, variant, modifier)`` tuple. Items can be set to
``None`` and trailing ``None``\\s can also be left out of the tuple.
>>> get_locale_identifier(('de', 'DE', None, '1999'))
'de_DE_1999'
>>> get_locale_identifier(('de', 'DE', None, '1999', 'custom'))
'de_DE_1999@custom'
>>> get_locale_identifier(('fi', None, None, None, 'custom'))
'fi@custom'
.. versionadded:: 1.0
:param tup: the tuple as returned by :func:`parse_locale`.
:param sep: the separator for the identifier.
"""
tup = tuple(tup[:4])
lang, territory, script, variant = tup + (None,) * (4 - len(tup))
return sep.join(filter(None, (lang, script, territory, variant)))
tup = tuple(tup[:5])
lang, territory, script, variant, modifier = tup + (None,) * (5 - len(tup))
ret = sep.join(filter(None, (lang, script, territory, variant)))
return f'{ret}@{modifier}' if modifier else ret
6 changes: 4 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,10 +283,12 @@ def test_parse_locale():
assert (excinfo.value.args[0] ==
"'not_a_LOCALE_String' is not a valid locale identifier")

assert core.parse_locale('it_IT@euro') == ('it', 'IT', None, None)
assert core.parse_locale('it_IT@euro') == ('it', 'IT', None, None, 'euro')
assert core.parse_locale('it_IT@something') == ('it', 'IT', None, None, 'something')

assert core.parse_locale('en_US.UTF-8') == ('en', 'US', None, None)
assert (core.parse_locale('de_DE.iso885915@euro') ==
('de', 'DE', None, None))
('de', 'DE', None, None, 'euro'))


@pytest.mark.parametrize('filename', [
Expand Down

0 comments on commit d019ed1

Please sign in to comment.