Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Unicode 16 #74

Merged
merged 4 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 105 additions & 24 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from itertools import batched
from typing import Callable, Iterable

UNICODE_VERSION = "15.1.0"
UNICODE_VERSION = "16.0.0"
"""The version of the Unicode data files to download."""

NUM_CODEPOINTS = 0x110000
Expand Down Expand Up @@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
- 4th bit: whether to set top bit on emoji presentation.
If this is set but 3rd is not, the width mode is related to zwj sequences
- 5th from top: whether this is unaffected by ligature-transparent
(if set, should also set 3rd and 4th)
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
where no ZWJ has been encountered yet; encountering one flips this on"""
where no ZWJ has been encountered yet; encountering one flips this on
- 7th bit: set once VS1 (U+FE00, CJK mode) or VS2 (U+FE01, non-CJK mode) has been encountered
"""

# BASIC WIDTHS

Expand Down Expand Up @@ -264,8 +267,17 @@ class WidthState(enum.IntEnum):
TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
"(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`"

# Kirat Rai
KIRAT_RAI_VOWEL_SIGN_E = 0b0000_0000_0010_0000
"\\u16D67 (\\u16D67 \\u16D67)+ and canonical equivalents"
KIRAT_RAI_VOWEL_SIGN_AI = 0b0000_0000_0010_0001
"(\\u16D68)+ and canonical equivalents"

# VARIATION SELECTORS

VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
"\\uFE00 if CJK, or \\uFE01 otherwise"

# Text presentation sequences (not CJK)
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
"\\uFE0E (text presentation sequences)"
Expand Down Expand Up @@ -361,6 +373,7 @@ def width_alone(self) -> int:
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
| WidthState.VARIATION_SELECTOR_15
| WidthState.VARIATION_SELECTOR_16
| WidthState.VARIATION_SELECTOR_1_OR_2
):
return 0
case (
Expand Down Expand Up @@ -493,12 +506,6 @@ def load_zero_widths() -> list[bool]:
lambda cp: operator.setitem(zw_map, cp, True),
)

# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
# as they canonically decompose to two characters with this property,
# but they aren't.
for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
zw_map[c] = True

# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
# as zero-width. This matches the behavior of glibc `wcwidth`.
#
Expand Down Expand Up @@ -639,6 +646,8 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
([0xFE0F], WidthState.VARIATION_SELECTOR_16),
([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I),
([0x16D67], WidthState.KIRAT_RAI_VOWEL_SIGN_E),
([0x16D68], WidthState.KIRAT_RAI_VOWEL_SIGN_AI),
(emoji_presentation, WidthState.EMOJI_PRESENTATION),
(emoji_modifiers, WidthState.EMOJI_MODIFIER),
(regional_indicators, WidthState.REGIONAL_INDICATOR),
Expand All @@ -648,9 +657,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
ea[cp] = width

# East-Asian only
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY

# Not East Asian only
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15

return (not_ea, ea)
Expand Down Expand Up @@ -716,7 +727,7 @@ def load_solidus_transparent(
cjk_width_map: list[WidthState],
) -> list[tuple[Codepoint, Codepoint]]:
"""Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
"""

ccc_above_1 = set()
Expand Down Expand Up @@ -748,7 +759,7 @@ def load_solidus_transparent(
num_chars = len(ccc_above_1)

for cp in ccc_above_1:
if cp != 0xFE0F:
if cp not in [0xFE00, 0xFE0F]:
assert (
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
), f"U+{cp:X}"
Expand Down Expand Up @@ -1304,8 +1315,17 @@ def lookup_fns(
return (0, next_info.set_emoji_presentation());
}"""

if not is_cjk:
if is_cjk:
s += """
if c == '\\u{FE00}' {
return (0, next_info.set_vs1_2());
}
"""
else:
s += """
if c == '\\u{FE01}' {
return (0, next_info.set_vs1_2());
}
if c == '\\u{FE0E}' {
return (0, next_info.set_text_presentation());
}
Expand All @@ -1315,9 +1335,19 @@ def lookup_fns(
} else {
next_info = next_info.unset_text_presentation();
}
}"""
} else """

s += """
s += """if next_info.is_vs1_2() {
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
return ("""

s += str(2 - is_cjk)

s += """, WidthInfo::DEFAULT);
} else {
next_info = next_info.unset_vs1_2();
}
}
if next_info.is_ligature_transparent() {
if c == '\\u{200D}' {
return (0, next_info.set_zwj_bit());
Expand Down Expand Up @@ -1496,6 +1526,22 @@ def lookup_fns(
return (0, WidthInfo::EMOJI_PRESENTATION)
}}

(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D63}}') => {{
return (0, WidthInfo::DEFAULT);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D67}}') => {{
return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D68}}') => {{
return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D69}}') => {{
return (0, WidthInfo::DEFAULT);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\u{{16D63}}') => {{
return (0, WidthInfo::DEFAULT);
}}

// Fallback
_ => {{}}
}}
Expand Down Expand Up @@ -1562,6 +1608,8 @@ def emit_module(
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct WidthInfo(u16);

const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;

impl WidthInfo {
/// No special handling necessary
const DEFAULT: Self = Self(0);
Expand Down Expand Up @@ -1591,51 +1639,84 @@ def emit_module(

/// Has top bit set
fn is_emoji_presentation(self) -> bool {{
(self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
(self.0 & WidthInfo::VARIATION_SELECTOR_16.0) == WidthInfo::VARIATION_SELECTOR_16.0
}}

/// Has top bit set
fn is_zwj_emoji_presentation(self) -> bool {{
(self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
}}

/// Set top bit
fn set_emoji_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK
|| (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
{{
Self(self.0 | 0b1000_0000_0000_0000)
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_16.0
& !WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
)
}} else {{
Self::VARIATION_SELECTOR_16
}}
}}

/// Clear top bit
fn unset_emoji_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
Self(self.0 & 0b0111_1111_1111_1111)
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0)
}} else {{
Self::DEFAULT
}}
}}

/// Has 2nd bit set
fn is_text_presentation(self) -> bool {{
(self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
(self.0 & WidthInfo::VARIATION_SELECTOR_15.0) == WidthInfo::VARIATION_SELECTOR_15.0
}}

/// Set 2nd bit
fn set_text_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
Self(self.0 | 0b0100_0000_0000_0000)
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_16.0
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
)
}} else {{
Self(0b0100_0000_0000_0000)
Self(WidthInfo::VARIATION_SELECTOR_15.0)
}}
}}

/// Clear 2nd bit
fn unset_text_presentation(self) -> Self {{
Self(self.0 & 0b1011_1111_1111_1111)
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
}}

/// Has 7th bit set
fn is_vs1_2(self) -> bool {{
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
}}

/// Set 7th bit
fn set_vs1_2(self) -> Self {{
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
& !WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_16.0,
)
}} else {{
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
}}
}}

/// Clear 7th bit
fn unset_vs1_2(self) -> Self {{
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
}}
}}

Expand Down
46 changes: 21 additions & 25 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
//! - Has the [`Emoji_Presentation`] property, and
//! - Is not in the [Enclosed Ideographic Supplement] block.
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by `'\u{FE00}'`,
//! and width 2 when followed by `'\u{FE01}'`.
//! - Script-specific ligatures:
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G23126)s
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G23126)s
//! will not affect the width.
//! - **[Arabic]**: A character sequence consisting of one character with [`Joining_Group`]`=Lam`,
//! followed by any number of characters with [`Joining_Type`]`=Transparent`, followed by one character
Expand All @@ -75,6 +77,7 @@
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
//! have width 0.
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
Expand All @@ -96,15 +99,6 @@
//! with the [`Default_Ignorable_Code_Point`] property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
//! with the [`Grapheme_Extend`] property.
//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters:
//! - [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0),
//! - [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7),
//! - [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8),
//! - [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA),
//! - [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB),
//! - [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B),
//! - [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
//! - The following [`Prepended_Concatenation_Mark`]s:
Expand All @@ -130,18 +124,18 @@
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
//!
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50313
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G50313
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40095
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
//! [`General_Category`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-4/#G124142
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G52443
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G45593
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
//! [`Joining_Type`]: http://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
//! [`Joining_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G50009
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G37908
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
//!
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
Expand All @@ -150,22 +144,24 @@
//!
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
//!
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
//! [combining marks]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
//!
//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence
//! [Emoji modifier sequences]: https://www.unicode.org/reports/tr51/#def_emoji_modifier_sequence
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
//!
//! [General Punctuation]: https://www.unicode.org/charts/PDF/Unicode-16.0/U160-2000.pdf
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
//!
//! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
//! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
//! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
//! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
//! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
//! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480
//! [Buginese]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-17/#G26743
//! [Hebrew]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G6528
//! [Khmer]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-16/#G64642
//! [Kirat Rai]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-13/#G746409
//! [Lisu]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-18/#G44587
//! [Old Turkic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-14/#G41975
//! [Tifinagh]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-19/#G43184
//!
//!
//! ## Canonical equivalence
Expand Down
Loading
Loading