Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

bpo-37760: Convert from length-18 lists to a dataclass, in makeunicodedata. #15265

Merged
merged 5 commits into from
Sep 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
The :file:`Tools/unicode/makeunicodedata.py` script, which is used for
converting information from the Unicode Character Database into generated
code and data used by the methods of :class:`str` and by the
:mod:`unicodedata` module, now handles each character's data as a
``dataclass`` with named attributes, rather than a length-18 list of
different fields.
150 changes: 88 additions & 62 deletions Tools/unicode/makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,14 @@
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Tuple
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"
Expand Down Expand Up @@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
record = unicode.table[char]
if record:
# extract database properties
category = CATEGORY_NAMES.index(record[2])
combining = int(record[3])
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
normalizationquickcheck = record[17]
category = CATEGORY_NAMES.index(record.general_category)
combining = int(record.canonical_combining_class)
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
mirrored = record.bidi_mirrored == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
normalizationquickcheck = record.quick_check
item = (
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
Expand All @@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
if record:
if record[5]:
decomp = record[5].split()
if record.decomposition_type:
decomp = record.decomposition_type.split()
if len(decomp) > 19:
raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
# prefix
Expand All @@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
# Collect NFC pairs
if not prefix and len(decomp) == 3 and \
char not in unicode.exclusions and \
unicode.table[decomp[1]][3] == "0":
unicode.table[decomp[1]].canonical_combining_class == "0":
p, l, r = decomp
comp_first[l] = 1
comp_last[r] = 1
Expand Down Expand Up @@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
record = unicode.table[char]
if record:
# extract database properties
category = record[2]
bidirectional = record[4]
properties = record[16]
category = record.general_category
bidirectional = record.bidi_class
properties = record.binary_properties
flags = 0
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
Expand Down Expand Up @@ -434,16 +435,16 @@ def makeunicodetype(unicode, trace):
flags |= CASE_IGNORABLE_MASK
sc = unicode.special_casing.get(char)
cf = unicode.case_folding.get(char, [char])
if record[12]:
upper = int(record[12], 16)
if record.simple_uppercase_mapping:
upper = int(record.simple_uppercase_mapping, 16)
else:
upper = char
if record[13]:
lower = int(record[13], 16)
if record.simple_lowercase_mapping:
lower = int(record.simple_lowercase_mapping, 16)
else:
lower = char
if record[14]:
title = int(record[14], 16)
if record.simple_titlecase_mapping:
title = int(record.simple_titlecase_mapping, 16)
else:
title = upper
if sc is None and cf != [lower]:
Expand Down Expand Up @@ -480,16 +481,16 @@ def makeunicodetype(unicode, trace):
extra_casing.extend(sc[1])
# decimal digit, integer digit
decimal = 0
if record[6]:
if record.decomposition_mapping:
flags |= DECIMAL_MASK
decimal = int(record[6])
decimal = int(record.decomposition_mapping)
digit = 0
if record[7]:
if record.numeric_type:
flags |= DIGIT_MASK
digit = int(record[7])
if record[8]:
digit = int(record.numeric_type)
if record.numeric_value:
flags |= NUMERIC_MASK
numeric.setdefault(record[8], []).append(char)
numeric.setdefault(record.numeric_value, []).append(char)
item = (
upper, lower, title, decimal, digit, flags
)
Expand Down Expand Up @@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
if record:
name = record[1].strip()
name = record.name.strip()
if name and name[0] != "<":
names[char] = name + chr(0)

Expand Down Expand Up @@ -719,7 +720,7 @@ def word_key(a):
for char in unicode.chars:
record = unicode.table[char]
if record:
name = record[1].strip()
name = record.name.strip()
if name and name[0] != "<":
data.append((name, char))

Expand Down Expand Up @@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
continue
# check characters that differ
if old.table[i] != new.table[i]:
for k in range(len(old.table[i])):
if old.table[i][k] != new.table[i][k]:
value = old.table[i][k]
for k, field in enumerate(dataclasses.fields(UcdRecord)):
value = getattr(old.table[i], field.name)
new_value = getattr(new.table[i], field.name)
if value != new_value:
if k == 1 and i in PUA_15:
# the name is not set in the old.table, but in the
# new.table we are using it for aliases and named seq
assert value == ''
elif k == 2:
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
category_changes[i] = CATEGORY_NAMES.index(value)
elif k == 4:
#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
elif k == 5:
#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
# We assume that all normalization changes are in 1:1 mappings
assert " " not in value
normalization_changes.append((i, value))
elif k == 6:
#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
# we only support changes where the old value is a single digit
assert value in "0123456789"
decimal_changes[i] = int(value)
elif k == 8:
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change", the old value is better not 0
if not value:
numeric_changes[i] = -1
Expand Down Expand Up @@ -949,25 +946,60 @@ def expanded(self) -> Iterator[Tuple[int, List[str]]]:
yield char, rest


@dataclasses.dataclass
class UcdRecord:
    """All Unicode Character Database properties kept for one code point.

    Replaces the previous length-18 list representation: the first 15
    fields come straight from one semicolon-separated row of
    UnicodeData.txt (all kept as raw strings; callers parse them, e.g.
    ``int(..., 16)`` for the case mappings), and the last three fields
    are filled in later from other UCD data files.
    """

    # 15 fields from UnicodeData.txt . See:
    # https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    # None until the EastAsianWidth.txt pass assigns a value.
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    # https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    """Build a UcdRecord from one 15-field UnicodeData.txt row.

    The three properties not present in UnicodeData.txt start out with
    placeholder values (no East Asian width, an empty — and per-record
    distinct — binary-property set, and a zero quick-check bitmask);
    later passes over the other UCD files fill them in.
    """
    return UcdRecord(
        *row,
        east_asian_width=None,
        binary_properties=set(),
        quick_check=0,
    )


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
# Record structure:
# [ID, name, category, combining, bidi, decomp, (6)
# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17)
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned

def __init__(self, version, cjk_check=True):
self.changed = []
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = s
table[char] = from_row(s)

cjk_ranges_found = []

Expand All @@ -979,19 +1011,17 @@ def __init__(self, version, cjk_check=True):
# https://www.unicode.org/reports/tr44/#Code_Point_Ranges
s = table[i]
if s:
if s[1][-6:] == "First>":
s[1] = ""
field = s
elif s[1][-5:] == "Last>":
if s[1].startswith("<CJK Ideograph"):
if s.name[-6:] == "First>":
s.name = ""
field = dataclasses.astuple(s)[:15]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the sole reason for this tuple conversion line 1025? Can you use the replace method of dataclasses there? The hardcoded [:15] somewhat defeats the nice benefit of naming fields you've achieved here.

(Also, I wonder if the first two loops of this function can be merged but that's orthogonal.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the sole reason for this tuple conversion line 1025? Can you use the replace method of dataclasses there? The hardcoded [:15] somewhat defeats the nice benefit of naming fields you've achieved here.

Yeah. I think I originally did this with replace and that was too slow. Let's see if that reproduces...

Oh right, another bit is that we need a distinct set for the binary_properties attribute of each record, so the code ends up not quite as clean as you might hope anyway. In this version from_row is the place that knows that fact, and we use it in both these loops.

OK, the result is it makes the whole script about 20% slower: on my (fast) desktop, it's 8.3s in this version and 10.0s with .replace. Here's the patch:

             if s:
                 if s.name[-6:] == "First>":
                     s.name = ""
-                    field = dataclasses.astuple(s)[:15]
+                    field = s
                 elif s.name[-5:] == "Last>":
                     if s.name.startswith("<CJK Ideograph"):
-                        cjk_ranges_found.append((field[0],
+                        cjk_ranges_found.append((field.codepoint,
                                                  s.codepoint))
                     s.name = ""
                     field = None
             elif field:
-                table[i] = from_row(('%X' % i,) + field[1:])
+                table[i] = dataclasses.replace(field,
+                    codepoint='%X' % (i,), binary_properties=set())

Certainly cleaner code that way, but the 20% speed hit (~2 extra seconds) is noticeably painful when doing development on this script and consequently running it all the time to try changes.

This tuple is quite local -- it doesn't escape this loop of 20 lines or so -- so I'd prefer to accept the short stretch of lower-level fiddling in exchange for the speedup when working on the rest of the script.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I suppose [:15] is unlikely to break too in the future given the stability of UnicodeData.txt.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed :)

elif s.name[-5:] == "Last>":
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s[0]))
s[1] = ""
s.codepoint))
s.name = ""
field = None
elif field:
f2 = field[:]
f2[0] = "%X" % i
table[i] = f2
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

Expand All @@ -1012,7 +1042,7 @@ def __init__(self, version, cjk_check=True):
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index][1] = name
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)

Expand All @@ -1031,7 +1061,7 @@ def __init__(self, version, cjk_check=True):
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index][1] = name
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

Expand All @@ -1046,23 +1076,19 @@ def __init__(self, version, cjk_check=True):

for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(widths[i])

for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(set())
table[i].east_asian_width = widths[i]

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char][-1].add(p)
table[char].binary_properties.add(p)

for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
continue
for char in expand_range(char_range):
table[char][-1].add('Line_Break')
table[char].binary_properties.add('Line_Break')

# We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
Expand All @@ -1084,7 +1110,7 @@ def __init__(self, version, cjk_check=True):
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(quickchecks[i])
table[i].quick_check = quickchecks[i]

with open_data(UNIHAN, version) as file:
zip = zipfile.ZipFile(file)
Expand All @@ -1103,7 +1129,7 @@ def __init__(self, version, cjk_check=True):
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
table[i].numeric_value = value

sc = self.special_casing = {}
for data in UcdFile(SPECIAL_CASING, version):
Expand Down