From 98cb59e9f0bedd4e076df4a53687d391f55ecfa8 Mon Sep 17 00:00:00 2001 From: Timothy Gu Date: Mon, 26 Jun 2017 15:19:03 +0800 Subject: [PATCH] src: revise character width calculation - Categorize all nonspacing marks (Mn) and enclosing marks (Me) as 0-width - Categorize all spacing marks (Mc) as non-0-width. - Treat soft hyphens (a format character Cf) as non-0-width. - Do not treat all unassigned code points as 0-width; instead, let ICU select the default for that character per UAX #11. - Avoid getting the General_Category of a character multiple times as it is an intensive operation. Refs: http://unicode.org/reports/tr11/ PR-URL: https://github.com/nodejs/node/pull/13918 Reviewed-By: James M Snell --- src/node_i18n.cc | 27 ++++++++++++++++++---- test/parallel/test-icu-stringwidth.js | 32 ++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 44d94d625585e6..3b337449495f4c 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -601,14 +601,33 @@ static void ToASCII(const FunctionCallbackInfo& args) { // newer wide characters. wcwidth, on the other hand, uses a fixed // algorithm that does not take things like emoji into proper // consideration. +// +// TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by +// GNOME Terminal) and Konsole don't consider them to be zero-width (see refs +// below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't +// allow it to be input. Linux's PTY terminal prints control characters as +// Narrow rhombi. +// +// TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final +// consonants are 0-width when combined with initial consonants; otherwise they +// are technically Wide. But many terminals (including Konsole and +// VTE/GLib-based) implement all medials and finals as 0-width. 
+// +// Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width +// Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420 +// Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223 static int GetColumnWidth(UChar32 codepoint, bool ambiguous_as_full_width = false) { - if (!u_isdefined(codepoint) || - u_iscntrl(codepoint) || - u_getCombiningClass(codepoint) > 0 || - u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER)) { + const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code + U_GC_CF_MASK | // Format control character + U_GC_ME_MASK | // Enclosing mark + U_GC_MN_MASK; // Nonspacing mark + if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width + ((U_MASK(u_charType(codepoint)) & zero_width_mask) || + u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) { return 0; } + // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a // codepoint as being full width, wide, ambiguous, neutral, narrow, // or halfwidth. 
diff --git a/test/parallel/test-icu-stringwidth.js b/test/parallel/test-icu-stringwidth.js index 80e798b13af154..7c8c2e948e0eba 100644 --- a/test/parallel/test-icu-stringwidth.js +++ b/test/parallel/test-icu-stringwidth.js @@ -11,13 +11,43 @@ const assert = require('assert'); const readline = require('internal/readline'); // Test column width + +// Ll (Lowercase Letter): LATIN SMALL LETTER A assert.strictEqual(readline.getStringWidth('a'), 1); +assert.strictEqual(readline.getStringWidth(0x0061), 1); +// Lo (Other Letter) assert.strictEqual(readline.getStringWidth('丁'), 2); +assert.strictEqual(readline.getStringWidth(0x4E01), 2); +// Surrogate pairs assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2); assert.strictEqual(readline.getStringWidth('👅'), 2); +// Cs (Surrogate): High Surrogate +assert.strictEqual(readline.getStringWidth('\ud83d'), 1); +// Cs (Surrogate): Low Surrogate +assert.strictEqual(readline.getStringWidth('\udc78'), 1); +// Cc (Control): NULL +assert.strictEqual(readline.getStringWidth(0), 0); +// Cc (Control): BELL +assert.strictEqual(readline.getStringWidth(0x0007), 0); +// Cc (Control): LINE FEED assert.strictEqual(readline.getStringWidth('\n'), 0); +// Cf (Format): SOFT HYPHEN +assert.strictEqual(readline.getStringWidth(0x00AD), 1); +// Cf (Format): LEFT-TO-RIGHT MARK +// Cf (Format): RIGHT-TO-LEFT MARK assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1); -assert.strictEqual(readline.getStringWidth(97), 1); +// Cn (Unassigned): Not a character +assert.strictEqual(readline.getStringWidth(0x10FFEF), 1); +// Cn (Unassigned): Not a character (but in a CJK range) +assert.strictEqual(readline.getStringWidth(0x3FFEF), 2); +// Mn (Nonspacing Mark): COMBINING ACUTE ACCENT +assert.strictEqual(readline.getStringWidth(0x0301), 0); +// Mc (Spacing Mark): BALINESE ADEG ADEG +// Chosen as its Canonical_Combining_Class is not 0, but is not a 0-width +// character. 
+assert.strictEqual(readline.getStringWidth(0x1B44), 1); +// Me (Enclosing Mark): COMBINING ENCLOSING CIRCLE +assert.strictEqual(readline.getStringWidth(0x20DD), 0); // The following is an emoji sequence. In some implementations, it is // represented as a single glyph, in other implementations as a sequence