Skip to content

Commit

Permalink
Bugfixes for zero-width characters (#91)
Browse files Browse the repository at this point in the history
Major
-----

Bugfix zero-width characters, closes #57, #47, #45, #39, #26, #25, #24, #22, #8, wow !

This is mostly achieved by replacing `ZERO_WIDTH_CF` with dynamic parsing by Category codes in bin/update-tables.py and putting those in the zero-wide tables.

Tests
-----

- `verify-table-integrity.py` exercises a "bug" of duplicated tables that has no effect, because wcswidth() first checks for zero-width, and that is preferred in cases of conflict. This PR also resolves that error of duplication.
- new automatic tests for balinese, kr jamo, zero-width emoji, devanagari, tamil, kannada.  
- added pytest-benchmark plugin, example use:

        # baseline
        tox -epy312 -- --verbose --benchmark-save=original
        # compare
        tox -epy312 -- --verbose --benchmark-compare=.benchmarks/Linux-CPython-3.12-64bit/0001_original.json
  • Loading branch information
jquast authored Oct 30, 2023
1 parent 4f41d0c commit 04d6d90
Show file tree
Hide file tree
Showing 18 changed files with 2,735 additions and 2,512 deletions.
202 changes: 123 additions & 79 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
JINJA_ENV = jinja2.Environment(
loader=jinja2.FileSystemLoader(os.path.join(PATH_UP, 'code_templates')),
keep_trailing_newline=True)
UTC_NOW = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")
UTC_NOW = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S UTC")

CONNECT_TIMEOUT = int(os.environ.get('CONNECT_TIMEOUT', '10'))
FETCH_BLOCKSIZE = int(os.environ.get('FETCH_BLOCKSIZE', '4096'))
Expand All @@ -72,7 +72,7 @@ class UnicodeVersion:
@classmethod
def parse(cls, version_str: str) -> UnicodeVersion:
"""
parse a version string.
Parse a version string.
>>> UnicodeVersion.parse("14.0.0")
UnicodeVersion(major=14, minor=0, micro=0)
Expand All @@ -90,21 +90,99 @@ def __str__(self) -> str:
@dataclass(frozen=True)
class TableEntry:
    """An entry of a unicode table."""
    # Half-open ``(start, stop)`` pair of codepoints (``stop`` is one past the
    # last codepoint of the entry), or None when the parsed line carries no
    # codepoints.
    code_range: tuple[int, int] | None
    # Property fields of the line; only ``properties[0]`` is examined here.
    properties: tuple[str, ...]
    # Trailing comment text of the line, used for name-based special cases.
    comment: str

    def filter_by_category(self, category_codes: str, wide: int) -> bool:
        """
        Return whether entry is displayed with the given width.

        Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table

        NOTE: ``category_codes`` is accepted for the callers' benefit but is
        not consulted; the decision is made only from ``properties[0]``,
        ``comment`` and ``wide``.

        :param category_codes: category codes requested by the caller (unused).
        :param wide: display width being collected: 0, 1, or 2.
        :returns: True when this entry has codepoints and its first property
            maps to display width ``wide``.
        """
        if self.code_range is None:
            return False
        elif self.properties[0] == 'Sk':
            if 'EMOJI MODIFIER' in self.comment:
                # These codepoints are fullwidth when used without emoji, 0-width with.
                # Generate code that expects the best case, that is always combined
                return wide == 0
            elif 'FULLWIDTH' in self.comment:
                # Some codepoints in 'Sk' categories are fullwidth(!)
                # at this time just 3, FULLWIDTH: CIRCUMFLEX ACCENT, GRAVE ACCENT, and MACRON
                return wide == 2
            else:
                # the rest are narrow
                return wide == 1
        # Me Enclosing Mark
        # Mn Nonspacing Mark
        # Mc Spacing Mark
        # Cf Format
        # Zl Line Separator
        # Zp Paragraph Separator
        if self.properties[0] in ('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp'):
            return wide == 0
        # F Fullwidth
        # W Wide
        if self.properties[0] in ('W', 'F'):
            return wide == 2
        return wide == 1

    @staticmethod
    def parse_category_values(category_codes: str,
                              table_iter: Iterator[TableEntry],
                              wide: int) -> set[int]:
        """
        Parse value ranges of unicode data files, by given category and width.

        :param category_codes: forwarded to :meth:`filter_by_category`.
        :param table_iter: entries to examine; consumed by this call.
        :param wide: display width to select: 0, 1, or 2.
        :returns: set of every individual codepoint of all matching entries.
        """
        return {n
                for entry in table_iter
                if entry.filter_by_category(category_codes, wide)
                # code_range is half-open, so it maps directly onto range()
                for n in range(entry.code_range[0], entry.code_range[1])}


@dataclass
class TableDef:
    """A set of codepoints parsed from one unicode data file, with its source citation."""
    # Source identification string, rendered as "Source:" by the table template.
    filename: str
    # Date string of the source file, rendered as "Date:" by the table template.
    date: str
    # Individual codepoints belonging to this table.
    values: set[int]

    def as_value_ranges(self) -> list[tuple[int, int]]:
        """
        Return a list of tuple of (start, end) ranges for given set of 'values'.

        Consecutive codepoints are merged into a single inclusive range, and
        ranges are returned in ascending order.  An empty set of values gives
        an empty list.
        """
        table: list[tuple[int, int]] = []
        for value in sorted(self.values):
            if table and table[-1][1] == value - 1:
                # continuation of existing range, extend its end
                table[-1] = (table[-1][0], value)
            else:
                # non-continuation (or very first value): start a new range
                table.append((value, value))
        return table

    @property
    def hex_range_descriptions(self) -> list[tuple[str, str, str]]:
        """
        Convert integers into string table of (hex_start, hex_end, txt_description).

        The description is the name of the range's first and last codepoint,
        each cut to 24 characters and joined by '..', or a single name cut to
        48 characters when both names are equal.  Unnamed codepoints are
        described as '(nil)'.
        """
        pytable_values: list[tuple[str, str, str]] = []
        for start, end in self.as_value_ranges():
            hex_start, hex_end = f'0x{start:05x}', f'0x{end:05x}'
            ucs_start, ucs_end = chr(start), chr(end)
            name_start = name_ucs(ucs_start) or '(nil)'
            name_end = name_ucs(ucs_end) or '(nil)'
            if name_start != name_end:
                txt_description = f'{name_start[:24].rstrip():24s}..{name_end[:24].rstrip()}'
            else:
                txt_description = f'{name_start[:48]}'
            pytable_values.append((hex_start, hex_end, txt_description))
        return pytable_values


@dataclass(frozen=True)
class RenderContext:

def to_dict(self) -> dict[str, Any]:
    """Return a shallow mapping of each dataclass field name to its current value."""
    field_names = (spec.name for spec in fields(self))
    return {name: getattr(self, name) for name in field_names}
Expand Down Expand Up @@ -145,11 +223,11 @@ def __post_init__(self) -> None:
}

def render(self) -> str:
    """Just like jinja2.Template.render."""
    # The render context held by this definition is always the one supplied.
    return self._template.render(self._render_context)

def generate(self) -> Iterator[str]:
    """Just like jinja2.Template.generate."""
    # The render context held by this definition is always the one supplied.
    return self._template.generate(self._render_context)


def fetch_table_wide_data() -> UnicodeTableRenderCtx:
    """
    Fetch and update east-asian tables.

    For every known unicode version, downloads (when newer) the
    EastAsianWidth and DerivedGeneralCategory data files and builds the table
    of wide (2-cell) codepoints from them.

    :returns: render context named 'WIDE_EASTASIAN' holding one table per version.
    """
    table: dict[UnicodeVersion, TableDef] = {}
    for version in fetch_unicode_versions():
        # parse typical 'wide' characters by categories 'W' and 'F',
        fname_eaw = os.path.join(PATH_DATA, f'EastAsianWidth-{version}.txt')
        do_retrieve(url=URL_EASTASIAN_WIDTH.format(version=version), fname=fname_eaw)
        table[version] = parse_category(fname=fname_eaw, category_codes=('W', 'F'), wide=2)

        # subtract(!) wide characters that are defined as 'W' category in EAW, but
        # as a zero-width category 'Mn' or 'Mc' in DGC, which is preferred.
        fname_dgc = os.path.join(PATH_DATA, f'DerivedGeneralCategory-{version}.txt')
        # the DerivedGeneralCategory file must come from its own URL, the same
        # one used when building the zero-width table
        do_retrieve(url=URL_DERIVED_CATEGORY.format(version=version), fname=fname_dgc)
        # set.discard() with a set argument looks for that set as a single
        # *member* and silently removes nothing; difference_update() removes
        # each of its codepoints.
        table[version].values.difference_update(
            parse_category(fname=fname_dgc, category_codes=('Mn', 'Mc'), wide=0).values)

        # join with some atypical 'wide' characters defined only by category
        # 'Sk' in DGC
        table[version].values.update(
            parse_category(fname=fname_dgc, category_codes=('Sk',), wide=2).values)
    return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)


def fetch_table_zero_data() -> UnicodeTableRenderCtx:
    """
    Fetch and update zero width tables.

    See also: https://unicode.org/L2/L2002/02368-default-ignorable.html

    :returns: render context named 'ZERO_WIDTH' holding one table per version.
    """
    table: dict[UnicodeVersion, TableDef] = {}
    for version in fetch_unicode_versions():
        # Determine values of zero-width character lookup table by the following category codes
        fname_dgc = os.path.join(PATH_DATA, f'DerivedGeneralCategory-{version}.txt')
        do_retrieve(url=URL_DERIVED_CATEGORY.format(version=version), fname=fname_dgc)
        table[version] = parse_category(
            fname=fname_dgc,
            category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
            wide=0)

        # And, include NULL
        table[version].values.add(0)
    return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand All @@ -277,54 +371,11 @@ def cite_source_description(filename: str) -> tuple[str, str]:
return fname, date


def make_table(values: Collection[int]) -> tuple[tuple[int, int], ...]:
    """
    Return a tuple of lookup tables for given values.

    Runs of consecutive integers are merged into inclusive (start, end) pairs.
    Values are taken in the order given and are not sorted here, so callers
    should pass them in ascending order.  Empty input gives an empty tuple.

    >>> make_table([0,1,2,5,6,7,9])
    ((0, 2), (5, 7), (9, 9))
    >>> make_table([])
    ()
    """
    table: list[tuple[int, int]] = []
    for value in values:
        if table and table[-1][1] == value - 1:
            # continuation of existing range, extend its end
            table[-1] = (table[-1][0], value)
        else:
            # first value, or a gap: start a new range
            table.append((value, value))
    return tuple(table)


def convert_values_to_string_table(
    values: Collection[tuple[int, int]],
) -> list[tuple[str, str, str]]:
    """
    Build printable rows of (hex_start, hex_end, txt_description) from integer ranges.

    Each description names the first and last character of the range, or a
    single name when both are equal; characters without a unicode name are
    shown as '(nil)'.
    """
    def _title_name(char: str) -> str:
        # unicodedata.name() raises ValueError for characters without a name
        try:
            return string.capwords(unicodedata.name(char))
        except ValueError:
            return '(nil)'

    rows: list[tuple[str, str, str]] = []
    for first, last in values:
        first_hex = f'0x{first:05x}'
        last_hex = f'0x{last:05x}'
        # convert both codepoints before any name lookup, so an out-of-range
        # value fails the same way regardless of which end it is
        first_char, last_char = chr(first), chr(last)
        first_name = _title_name(first_char)
        last_name = _title_name(last_char)
        if first_name == last_name:
            description = first_name[:48]
        else:
            description = f'{first_name[:24].rstrip():24s}..{last_name[:24].rstrip()}'
        rows.append((first_hex, last_hex, description))
    return rows
def name_ucs(ucs: str) -> str | None:
    """
    Return the capitalized unicode name of character ``ucs``.

    :param ucs: a single unicode character.
    :returns: the name with each word capitalized, or None when the character
        has no name in the unicodedata database.
    """
    try:
        return string.capwords(unicodedata.name(ucs))
    except ValueError:
        # unicodedata.name() raises ValueError for unnamed characters
        return None


def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
Expand All @@ -346,13 +397,12 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
start, end = code_points_str.split('..')
else:
start = end = code_points_str
code_range = range(int(start, base=16),
int(end, base=16) + 1)
code_range = (int(start, base=16), int(end, base=16) + 1)

yield TableEntry(code_range, tuple(properties), comment)


def parse_category(fname: str, category_codes: Container[str]) -> TableDef:
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname}: ', end='', flush=True)

Expand All @@ -363,16 +413,9 @@ def parse_category(fname: str, category_codes: Container[str]) -> TableDef:
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()

values: set[int] = set()
for entry in table_iter:
if (entry.code_range is not None
and entry.properties[0] in category_codes):
values.update(entry.code_range)

txt_values = convert_values_to_string_table(make_table(sorted(values)))
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
print('ok')
return TableDef(version, date, txt_values)
return TableDef(version, date, values)


@functools.cache
Expand Down Expand Up @@ -401,7 +444,7 @@ def is_url_newer(url: str, fname: str) -> bool:
def do_retrieve(url: str, fname: str) -> None:
"""Retrieve given url to target filepath fname."""
folder = os.path.dirname(fname)
if not os.path.exists(folder):
if folder and not os.path.exists(folder):
os.makedirs(folder)
if not is_url_newer(url, fname):
return
Expand Down Expand Up @@ -431,9 +474,9 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
yield UnicodeVersionPyRenderDef.new(
UnicodeVersionPyRenderCtx(fetch_unicode_versions())
)
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())

for render_def in get_codegen_definitions():
with open(render_def.output_filename, 'w', encoding='utf-8', newline='\n') as fout:
Expand All @@ -445,3 +488,4 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:

if __name__ == '__main__':
main()

4 changes: 2 additions & 2 deletions bin/wcwidth-browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,12 @@ class Style(object):
# Too few public methods (0/2)
@staticmethod
def attr_major(text):
"""non-stylized callable for "major" text, for non-ttys."""
"""Non-stylized callable for "major" text, for non-ttys."""
return text

@staticmethod
def attr_minor(text):
"""non-stylized callable for "minor" text, for non-ttys."""
"""Non-stylized callable for "minor" text, for non-ttys."""
return text

delimiter = '|'
Expand Down
2 changes: 1 addition & 1 deletion code_templates/python_table.py.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ This code generated by {{this_filepath}} on {{utc_now}}.
# Source: {{ table_def.filename }}
# Date: {{ table_def.date }}
#
{%- for hex_start, hex_end, txt_description in table_def.values %}
{%- for hex_start, hex_end, txt_description in table_def.hex_range_descriptions %}
({{ hex_start }}, {{ hex_end }},), # {{txt_description}}
{%- endfor %}
),
Expand Down
9 changes: 4 additions & 5 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
Public API
==========

This package follows SEMVER_ rules for version, therefore, for all of the
given functions signatures, at example version 1.1.1, you may use version
dependency ``>=1.1.1,<2.0`` for forward compatibility of future wcwidth
versions.
This package follows SEMVER_ rules. Therefore, for the functions of the below
list, you may safely use version dependency definition ``wcwidth<2`` in your
requirements.txt or equivalent. Their signatures will never change.

.. autofunction:: wcwidth.wcwidth

Expand All @@ -22,7 +21,7 @@ Private API
These functions should only be used for wcwidth development, and not used by
dependent packages except with care and by use of frozen version dependency,
as these functions may change names, signatures, or disappear entirely at any
time in the future, and not reflected by SEMVER rules.
time in the future, and not reflected by SEMVER_ rules!

If stable public API for any of the given functions is needed, please suggest a
Pull Request!
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static']
# html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ wcwidth

intro
unicode_version
specs
api

Indices and tables
Expand Down
6 changes: 6 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,11 @@ Other Languages
=======
History
=======
0.2.9 *2023-10-20*
* **Bugfix** zero-width characters used in Emoji ZWJ sequences, Balinese,
Jamo, Devanagari, Tamil, Kannada and others (`PR #91`).
* **Updated** to include `Specification <Specification_from_pypi>`_ of
character measurements.

0.2.8 *2023-09-30*
* Include requirements files in the source distribution (`PR #82`).
Expand Down Expand Up @@ -296,6 +301,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
* for any purpose and without fee is hereby granted. The author
* disclaims all warranties with regard to this software.

.. _`Specification_from_pypi`: https://wcwidth.readthedocs.io/en/latest/specs.html
.. _`tox`: https://tox.wiki/en/latest/
.. _`prospector`: https://github.com/landscapeio/prospector
.. _`combining`: https://en.wikipedia.org/wiki/Combining_character
Expand Down
Loading

0 comments on commit 04d6d90

Please sign in to comment.