Skip to content

Commit 8b7c5e1

Browse files
committed
Added basic likely-subtag resolving
1 parent 59f02c9 commit 8b7c5e1

File tree

4 files changed

+129
-9
lines changed

4 files changed

+129
-9
lines changed

ChangeLog

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ Version 1.0
7070
* Added experimental Python 3 support.
7171
* Added better support for returning timezone names.
7272
* Don't throw away a Catalog's obsolete messages when updating it.
73+
* Added basic likely-subtag resolving when parsing a locale and no
74+
match can be found.
7375

7476

7577
Version 0.9.6

babel/core.py

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def negotiate(cls, preferred, available, sep='_', aliases=LOCALE_ALIASES):
194194
return Locale.parse(identifier, sep=sep)
195195

196196
@classmethod
197-
def parse(cls, identifier, sep='_'):
197+
def parse(cls, identifier, sep='_', resolve_likely_subtags=True):
198198
"""Create a `Locale` instance for the given locale identifier.
199199
200200
>>> l = Locale.parse('de-DE', sep='-')
@@ -207,8 +207,22 @@ def parse(cls, identifier, sep='_'):
207207
>>> Locale.parse(l)
208208
Locale('de', territory='DE')
209209
210+
This can also resolve likely subtags, which it does
211+
by default.
212+
210213
:param identifier: the locale identifier string
211214
:param sep: optional component separator
215+
:param resolve_likely_subtags: if true (the default), a locale will
216+
have its likely subtag resolved if the
217+
locale otherwise does not exist. For
218+
instance ``zh_TW`` by itself is not a
219+
locale that exists but Babel can
220+
automatically expand it to the full
221+
form of ``zh_Hant_TW``. Note that this
222+
expansion only takes place if no
223+
locale exists otherwise. For instance
224+
there is a locale ``en`` that can exist
225+
by itself.
212226
:return: a corresponding `Locale` instance
213227
:rtype: `Locale`
214228
:raise `ValueError`: if the string does not appear to be a valid locale
@@ -217,9 +231,72 @@ def parse(cls, identifier, sep='_'):
217231
requested locale
218232
:see: `parse_locale`
219233
"""
220-
if isinstance(identifier, string_types):
221-
return cls(*parse_locale(identifier, sep=sep))
222-
return identifier
234+
if identifier is None:
235+
return None
236+
elif isinstance(identifier, Locale):
237+
return identifier
238+
elif not isinstance(identifier, string_types):
239+
raise TypeError('Unxpected value for identifier: %r' % (identifier,))
240+
241+
parts = parse_locale(identifier, sep=sep)
242+
243+
def _make_id(language, territory, script, variant):
244+
return '_'.join(filter(None, [language, script,
245+
territory, variant]))
246+
247+
input_id = _make_id(*parts)
248+
249+
def _try_load(parts):
250+
try:
251+
return cls(*parts)
252+
except UnknownLocaleError:
253+
return None
254+
255+
locale = _try_load(parts)
256+
if locale is not None:
257+
return locale
258+
if not resolve_likely_subtags:
259+
raise UnknownLocaleError(input_id)
260+
261+
# From here onwards is some very bad likely subtag resolving. This
262+
# whole logic is not entirely correct but good enough (tm) for the
263+
# time being. This has been added so that zh_TW does not cause
264+
# errors for people when they upgrade. Later we should properly
265+
# implement ICU-like fuzzy locale objects and provide a way to
266+
# maximize and minimize locale tags.
267+
268+
language, territory, script, variant = parts
269+
language = get_global('language_aliases').get(language, language)
270+
territory = get_global('territory_aliases').get(territory, territory)
271+
script = get_global('script_aliases').get(script, script)
272+
variant = get_global('variant_aliases').get(variant, variant)
273+
274+
if territory == 'ZZ':
275+
territory = None
276+
if script == 'Zzzz':
277+
script = None
278+
279+
parts = language, territory, script, variant
280+
281+
new_id = _make_id(*parts)
282+
likely_subtag = get_global('likely_subtags').get(new_id)
283+
if likely_subtag is None:
284+
raise UnknownLocaleError(input_id)
285+
286+
parts2 = parse_locale(likely_subtag)
287+
288+
# Success on first hit, return it.
289+
locale = _try_load(parts2)
290+
if locale is not None:
291+
return locale
292+
293+
# Now try without script and variant
294+
lcoale = _try_load(parts2[:2])
295+
if locale is not None:
296+
return locale
297+
298+
# Give up.
299+
raise UnknownLocaleError(input_id)
223300

224301
def __eq__(self, other):
225302
for key in ('language', 'territory', 'script', 'variant'):

scripts/import_cldr.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ def main():
109109
bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml'))
110110
sup_windows_zones = parse(os.path.join(srcdir, 'supplemental',
111111
'windowsZones.xml'))
112+
sup_metadata = parse(os.path.join(srcdir, 'supplemental',
113+
'supplementalMetadata.xml'))
114+
sup_likely = parse(os.path.join(srcdir, 'supplemental',
115+
'likelySubtags.xml'))
112116
sup = parse(sup_filename)
113117

114118
# Import global data from the supplemental files
@@ -119,11 +123,16 @@ def main():
119123
zone_aliases = global_data.setdefault('zone_aliases', {})
120124
zone_territories = global_data.setdefault('zone_territories', {})
121125
win_mapping = global_data.setdefault('windows_zone_mapping', {})
122-
123-
# create auxiliary zone->territory map from the windows zones (we don't set
124-
# the 'zones_territories' map directly here, because there are some zones
125-
# aliases listed and we defer the decision of which ones to choose to the
126-
# 'bcp47' data
126+
language_aliases = global_data.setdefault('language_aliases', {})
127+
territory_aliases = global_data.setdefault('territory_aliases', {})
128+
script_aliases = global_data.setdefault('script_aliases', {})
129+
variant_aliases = global_data.setdefault('variant_aliases', {})
130+
likely_subtags = global_data.setdefault('likely_subtags', {})
131+
132+
# create auxiliary zone->territory map from the windows zones (we don't set
133+
# the 'zone_territories' map directly here, because there are some zone
134+
# aliases listed and we defer the decision of which ones to choose to the
135+
# 'bcp47' data
127136
_zone_territory_map = {}
128137
for map_zone in sup_windows_zones.findall('.//windowsZones/mapTimezones/mapZone'):
129138
if map_zone.attrib.get('territory') == '001':
@@ -151,6 +160,32 @@ def main():
151160
if 'to' not in child.attrib: # FIXME: support old mappings
152161
meta_zones[elem.attrib['type']] = child.attrib['mzone']
153162

163+
# Language aliases
164+
for alias in sup_metadata.findall('.//alias/languageAlias'):
165+
# We don't have a use for those at the moment. They don't
166+
# pass our parser anyways.
167+
if '-' in alias.attrib['type']:
168+
continue
169+
language_aliases[alias.attrib['type']] = alias.attrib['replacement']
170+
171+
# Territory aliases
172+
for alias in sup_metadata.findall('.//alias/territoryAlias'):
173+
territory_aliases[alias.attrib['type']] = alias.attrib['replacement'].split()
174+
175+
# Script aliases
176+
for alias in sup_metadata.findall('.//alias/scriptAlias'):
177+
script_aliases[alias.attrib['type']] = alias.attrib['replacement']
178+
179+
# Variant aliases
180+
for alias in sup_metadata.findall('.//alias/variantAlias'):
181+
repl = alias.attrib.get('replacement')
182+
if repl:
183+
variant_aliases[alias.attrib['type']] = repl
184+
185+
# Likely subtags
186+
for likely_subtag in sup_likely.findall('.//likelySubtags/likelySubtag'):
187+
likely_subtags[likely_subtag.attrib['from']] = likely_subtag.attrib['to']
188+
154189
outfile = open(global_path, 'wb')
155190
try:
156191
pickle.dump(global_data, outfile, 2)

tests/test_core.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,12 @@ def test_parse(self):
9191
de_DE = Locale.parse(l)
9292
assert (de_DE.language, de_DE.territory) == ('de', 'DE')
9393

94+
def test_parse_likely_subtags(self):
95+
l = Locale.parse('zh-TW', sep='-')
96+
assert l.language == 'zh'
97+
assert l.territory == 'TW'
98+
assert l.script == 'Hant'
99+
94100
def test_get_display_name(self):
95101
zh_CN = Locale('zh', 'CN', script='Hans')
96102
assert zh_CN.get_display_name('en') == 'Chinese (Simplified, China)'

0 commit comments

Comments
 (0)