Skip to content

Commit

Permalink
Support Unicode 16 variation seqs for quotation mark width
Browse files Browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed Jan 15, 2025
1 parent 6ab41d7 commit 1dd88e4
Show file tree
Hide file tree
Showing 4 changed files with 198 additions and 33 deletions.
97 changes: 80 additions & 17 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
- 4th bit: whether to set top bit on emoji presentation.
If this is set but 3rd is not, the width mode is related to zwj sequences
- 5th from top: whether this is unaffected by ligature-transparent
(if set, should also set 3rd and 4th)
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
where no ZWJ has been encountered yet; encountering one flips this on"""
where no ZWJ has been encountered yet; encountering one flips this on
- 7th bit: whether this is VS1 (if CJK) or VS2 (not CJK)
"""

# BASIC WIDTHS

Expand Down Expand Up @@ -272,6 +275,9 @@ class WidthState(enum.IntEnum):

# VARIATION SELECTORS

VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
"\\uFE00 if CJK, or \\uFE01 otherwise"

# Text presentation sequences (not CJK)
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
"\\uFE0E (text presentation sequences)"
Expand Down Expand Up @@ -367,6 +373,7 @@ def width_alone(self) -> int:
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
| WidthState.VARIATION_SELECTOR_15
| WidthState.VARIATION_SELECTOR_16
| WidthState.VARIATION_SELECTOR_1_OR_2
):
return 0
case (
Expand Down Expand Up @@ -656,9 +663,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
ea[cp] = width

# East-Asian only
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY

# Not East Asian only
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15

return (not_ea, ea)
Expand Down Expand Up @@ -724,7 +733,7 @@ def load_solidus_transparent(
cjk_width_map: list[WidthState],
) -> list[tuple[Codepoint, Codepoint]]:
"""Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
"""

ccc_above_1 = set()
Expand Down Expand Up @@ -756,7 +765,7 @@ def load_solidus_transparent(
num_chars = len(ccc_above_1)

for cp in ccc_above_1:
if cp != 0xFE0F:
if cp not in [0xFE00, 0xFE0F]:
assert (
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
), f"U+{cp:X}"
Expand Down Expand Up @@ -1312,8 +1321,17 @@ def lookup_fns(
return (0, next_info.set_emoji_presentation());
}"""

if not is_cjk:
if is_cjk:
s += """
if c == '\\u{FE00}' {
return (0, next_info.set_vs1_2());
}
"""
else:
s += """
if c == '\\u{FE01}' {
return (0, next_info.set_vs1_2());
}
if c == '\\u{FE0E}' {
return (0, next_info.set_text_presentation());
}
Expand All @@ -1323,9 +1341,19 @@ def lookup_fns(
} else {
next_info = next_info.unset_text_presentation();
}
}"""
} else """

s += """
s += """if next_info.is_vs1_2() {
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
return ("""

s += str(2 - is_cjk)

s += """, WidthInfo::DEFAULT);
} else {
next_info = next_info.unset_vs1_2();
}
}
if next_info.is_ligature_transparent() {
if c == '\\u{200D}' {
return (0, next_info.set_zwj_bit());
Expand Down Expand Up @@ -1586,6 +1614,8 @@ def emit_module(
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct WidthInfo(u16);
const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;
impl WidthInfo {
/// No special handling necessary
const DEFAULT: Self = Self(0);
Expand Down Expand Up @@ -1615,51 +1645,84 @@ def emit_module(
/// Has top bit set
fn is_emoji_presentation(self) -> bool {{
(self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
(self.0 & WidthInfo::VARIATION_SELECTOR_16.0) == WidthInfo::VARIATION_SELECTOR_16.0
}}
/// Whether the top bit and the 4th bit are both set while the 3rd bit is clear.
///
/// The mask keeps only the 1st, 3rd and 4th bits (counting from the top);
/// the comparison then requires the 1st and 4th to be 1 and the 3rd to be 0.
fn is_zwj_emoji_presentation(self) -> bool {{
(self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
}}
/// Set top bit
fn set_emoji_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK
|| (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
{{
Self(self.0 | 0b1000_0000_0000_0000)
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_16.0
& !WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
)
}} else {{
Self::VARIATION_SELECTOR_16
}}
}}
/// Clear top bit
fn unset_emoji_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
Self(self.0 & 0b0111_1111_1111_1111)
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0)
}} else {{
Self::DEFAULT
}}
}}
/// Has 2nd bit set
fn is_text_presentation(self) -> bool {{
(self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
(self.0 & WidthInfo::VARIATION_SELECTOR_15.0) == WidthInfo::VARIATION_SELECTOR_15.0
}}
/// Set 2nd bit
fn set_text_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
Self(self.0 | 0b0100_0000_0000_0000)
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_16.0
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
)
}} else {{
Self(0b0100_0000_0000_0000)
Self(WidthInfo::VARIATION_SELECTOR_15.0)
}}
}}
/// Clear 2nd bit
fn unset_text_presentation(self) -> Self {{
Self(self.0 & 0b1011_1111_1111_1111)
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
}}
/// Has 7th bit set
///
/// Returns `true` only when every bit of `VARIATION_SELECTOR_1_OR_2` is
/// present in this value; all other bits are ignored by the check.
fn is_vs1_2(self) -> bool {{
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
}}
/// Set 7th bit
///
/// When all `LIGATURE_TRANSPARENT_MASK` bits are set, the current value is
/// kept and OR-ed with the expression below; otherwise the previous state is
/// discarded and the result is exactly `VARIATION_SELECTOR_1_OR_2`.
fn set_vs1_2(self) -> Self {{
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(
// NOTE(review): in Rust `&` binds tighter than `|`, so this parses as
// `self.0 | (VS1_OR_2 & !VS15 & !VS16)`. The two `!` masks therefore only
// filter the constant being OR-ed in; any VS15 / VS16 bits already present
// in `self.0` are NOT cleared. If clearing them was the intent, the
// expression needs to be `(self.0 | VS1_OR_2) & !VS15 & !VS16` - confirm.
self.0
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
& !WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_16.0,
)
}} else {{
// Not ligature-transparent: nothing from the old state is carried over.
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
}}
}}
/// Clear 7th bit
///
/// Returns a copy of this value with every `VARIATION_SELECTOR_1_OR_2` bit
/// cleared; all other bits are passed through unchanged.
fn unset_vs1_2(self) -> Self {{
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
}}
}}
Expand Down
5 changes: 4 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
//! - Has the [`Emoji_Presentation`] property, and
//! - Is not in the [Enclosed Ideographic Supplement] block.
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by `'\u{FE00}'`,
//! and width 2 when followed by `'\u{FE01}'`.
//! - Script-specific ligatures:
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
Expand All @@ -75,7 +77,7 @@
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
//! have width 0.
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `\u{16D68}`, `\u{16D69}`, or `\u{16D6A}` has total width 1.
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
Expand Down Expand Up @@ -158,6 +160,7 @@
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
//!
//! [General Punctuation]: https://www.unicode.org/charts/PDF/Unicode-16.0/U160-2000.pdf
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
//!
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480
Expand Down
Loading

0 comments on commit 1dd88e4

Please sign in to comment.