-
Notifications
You must be signed in to change notification settings - Fork 29.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
readline: use icu based string width calculation #9040
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,103 +1,117 @@ | ||
'use strict'; | ||
|
||
// Regexes used for ansi escape code splitting | ||
// Regex used for ansi escape code splitting | ||
// eslint-disable-next-line no-control-regex | ||
const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/; | ||
const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [ | ||
'(\\d+)(?:;(\\d+))?([~^$])', | ||
'(?:M([@ #!a`])(.)(.))', // mouse | ||
'(?:1;)?(\\d+)?([a-zA-Z])' | ||
].join('|') + ')'); | ||
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js | ||
// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta | ||
// Matches all ansi escape code sequences in a string | ||
const ansi = | ||
/[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g; | ||
|
||
|
||
module.exports = { | ||
emitKeys, | ||
getStringWidth, | ||
isFullWidthCodePoint, | ||
stripVTControlCharacters | ||
}; | ||
|
||
if (process.binding('config').hasIntl) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not detectable from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
const icu = process.binding('icu'); | ||
/**
 * Returns the number of columns required to display the given value,
 * as computed by the native ICU binding.
 *
 * An integer `str` is forwarded untouched; the native side measures a
 * numeric argument as a single code point. Anything else is converted to
 * a string and stripped of VT control sequences before being measured.
 *
 * Recognized `options` (both coerced to booleans):
 *   - ambiguousAsFullWidth
 *   - expandEmojiSequence
 */
module.exports.getStringWidth = function getStringWidth(str, options) {
  const opts = options || {};
  const subject = Number.isInteger(str) ?
    str :
    stripVTControlCharacters(String(str));
  const ambiguousAsFullWidth = Boolean(opts.ambiguousAsFullWidth);
  const expandEmojiSequence = Boolean(opts.expandEmojiSequence);
  return icu.getStringWidth(subject, ambiguousAsFullWidth, expandEmojiSequence);
};
/**
 * Returns true if the character represented by the given Unicode code
 * point is reported by the native ICU binding as two columns wide.
 * Non-number input always yields false.
 *
 * `options.ambiguousAsFullWidth` (optional) makes ambiguous-width
 * characters count as full width.
 */
module.exports.isFullWidthCodePoint =
  function isFullWidthCodePoint(code, options) {
    if (typeof code !== 'number')
      return false;
    // The native binding coerces its second argument to a boolean, so
    // handing it the options object itself would enable
    // ambiguous-as-full-width for every caller that passes options at all
    // (even `{}`). Forward the actual flag instead.
    const ambiguousAsFullWidth =
      Boolean(options && options.ambiguousAsFullWidth);
    return icu.getStringWidth(code, ambiguousAsFullWidth) === 2;
  };
} else { | ||
/** | ||
* Returns the number of columns required to display the given string. | ||
*/ | ||
module.exports.getStringWidth = function getStringWidth(str) { | ||
if (Number.isInteger(str)) | ||
return module.exports.isFullWidthCodePoint(str) ? 2 : 1; | ||
|
||
/** | ||
* Returns the number of columns required to display the given string. | ||
*/ | ||
function getStringWidth(str) { | ||
let width = 0; | ||
let width = 0; | ||
|
||
str = stripVTControlCharacters(str); | ||
str = stripVTControlCharacters(String(str)); | ||
|
||
for (var i = 0; i < str.length; i++) { | ||
const code = str.codePointAt(i); | ||
for (var i = 0; i < str.length; i++) { | ||
const code = str.codePointAt(i); | ||
|
||
if (code >= 0x10000) { // surrogates | ||
i++; | ||
} | ||
if (code >= 0x10000) { // surrogates | ||
i++; | ||
} | ||
|
||
if (isFullWidthCodePoint(code)) { | ||
width += 2; | ||
} else { | ||
width++; | ||
if (module.exports.isFullWidthCodePoint(code)) { | ||
width += 2; | ||
} else { | ||
width++; | ||
} | ||
} | ||
} | ||
|
||
return width; | ||
} | ||
|
||
return width; | ||
}; | ||
|
||
/** | ||
* Returns true if the character represented by a given | ||
* Unicode code point is full-width. Otherwise returns false. | ||
*/ | ||
function isFullWidthCodePoint(code) { | ||
if (isNaN(code)) { | ||
return false; | ||
} | ||
/** | ||
* Returns true if the character represented by a given | ||
* Unicode code point is full-width. Otherwise returns false. | ||
*/ | ||
module.exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) { | ||
if (!Number.isInteger(code)) { | ||
return false; | ||
} | ||
|
||
// Code points are derived from: | ||
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt | ||
if (code >= 0x1100 && ( | ||
code <= 0x115f || // Hangul Jamo | ||
0x2329 === code || // LEFT-POINTING ANGLE BRACKET | ||
0x232a === code || // RIGHT-POINTING ANGLE BRACKET | ||
// CJK Radicals Supplement .. Enclosed CJK Letters and Months | ||
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) || | ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A | ||
0x3250 <= code && code <= 0x4dbf || | ||
// CJK Unified Ideographs .. Yi Radicals | ||
0x4e00 <= code && code <= 0xa4c6 || | ||
// Hangul Jamo Extended-A | ||
0xa960 <= code && code <= 0xa97c || | ||
// Hangul Syllables | ||
0xac00 <= code && code <= 0xd7a3 || | ||
// CJK Compatibility Ideographs | ||
0xf900 <= code && code <= 0xfaff || | ||
// Vertical Forms | ||
0xfe10 <= code && code <= 0xfe19 || | ||
// CJK Compatibility Forms .. Small Form Variants | ||
0xfe30 <= code && code <= 0xfe6b || | ||
// Halfwidth and Fullwidth Forms | ||
0xff01 <= code && code <= 0xff60 || | ||
0xffe0 <= code && code <= 0xffe6 || | ||
// Kana Supplement | ||
0x1b000 <= code && code <= 0x1b001 || | ||
// Enclosed Ideographic Supplement | ||
0x1f200 <= code && code <= 0x1f251 || | ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane | ||
0x20000 <= code && code <= 0x3fffd)) { | ||
return true; | ||
} | ||
// Code points are derived from: | ||
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt | ||
if (code >= 0x1100 && ( | ||
code <= 0x115f || // Hangul Jamo | ||
0x2329 === code || // LEFT-POINTING ANGLE BRACKET | ||
0x232a === code || // RIGHT-POINTING ANGLE BRACKET | ||
// CJK Radicals Supplement .. Enclosed CJK Letters and Months | ||
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) || | ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A | ||
0x3250 <= code && code <= 0x4dbf || | ||
// CJK Unified Ideographs .. Yi Radicals | ||
0x4e00 <= code && code <= 0xa4c6 || | ||
// Hangul Jamo Extended-A | ||
0xa960 <= code && code <= 0xa97c || | ||
// Hangul Syllables | ||
0xac00 <= code && code <= 0xd7a3 || | ||
// CJK Compatibility Ideographs | ||
0xf900 <= code && code <= 0xfaff || | ||
// Vertical Forms | ||
0xfe10 <= code && code <= 0xfe19 || | ||
// CJK Compatibility Forms .. Small Form Variants | ||
0xfe30 <= code && code <= 0xfe6b || | ||
// Halfwidth and Fullwidth Forms | ||
0xff01 <= code && code <= 0xff60 || | ||
0xffe0 <= code && code <= 0xffe6 || | ||
// Kana Supplement | ||
0x1b000 <= code && code <= 0x1b001 || | ||
// Enclosed Ideographic Supplement | ||
0x1f200 <= code && code <= 0x1f251 || | ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane | ||
0x20000 <= code && code <= 0x3fffd)) { | ||
return true; | ||
} | ||
|
||
return false; | ||
return false; | ||
}; | ||
} | ||
|
||
|
||
/** | ||
* Tries to remove all VT control characters. Use to estimate displayed | ||
* string width. May be buggy due to not running a real state machine | ||
*/ | ||
function stripVTControlCharacters(str) { | ||
str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), ''); | ||
return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), ''); | ||
return str.replace(ansi, ''); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it uses |
||
} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
#include "v8.h" | ||
|
||
#include <unicode/putil.h> | ||
#include <unicode/uchar.h> | ||
#include <unicode/udata.h> | ||
#include <unicode/uidna.h> | ||
|
||
|
@@ -185,13 +186,102 @@ static void ToASCII(const FunctionCallbackInfo<Value>& args) { | |
len).ToLocalChecked()); | ||
} | ||
|
||
// Similar in purpose to wcwidth(), except that the answer is derived from
// the Unicode character properties database exposed by ICU. That lets it
// report sensible column widths for emoji and for newer wide characters,
// which wcwidth's fixed algorithm does not take into proper consideration.
//
// Returns 0, 1 or 2 columns. When ambiguous_as_full_width is true,
// codepoints whose East Asian Width is "ambiguous" are counted as 2.
static int GetColumnWidth(UChar32 codepoint,
                          bool ambiguous_as_full_width = false) {
  // Unassigned codepoints, control characters, combining marks and emoji
  // modifiers do not occupy a column of their own.
  const bool occupies_no_column =
      !u_isdefined(codepoint) ||
      u_iscntrl(codepoint) ||
      u_getCombiningClass(codepoint) > 0 ||
      u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER);
  if (occupies_no_column)
    return 0;

  // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
  // codepoint as being full width, wide, ambiguous, neutral, narrow,
  // or halfwidth.
  const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);

  if (eaw == U_EA_FULLWIDTH || eaw == U_EA_WIDE)
    return 2;

  // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
  if (eaw == U_EA_AMBIGUOUS && ambiguous_as_full_width)
    return 2;

  // Ambiguous (when not forced to full width) and neutral codepoints are
  // wide only if they have emoji presentation.
  if (eaw == U_EA_AMBIGUOUS || eaw == U_EA_NEUTRAL)
    return u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION) ? 2 : 1;

  // Halfwidth, narrow and any other value.
  return 1;
}
|
||
// Returns the column width for the given String. | ||
static void GetStringWidth(const FunctionCallbackInfo<Value>& args) { | ||
Environment* env = Environment::GetCurrent(args); | ||
if (args.Length() < 1) | ||
return; | ||
|
||
bool ambiguous_as_full_width = args[1]->BooleanValue(); | ||
bool expand_emoji_sequence = args[2]->BooleanValue(); | ||
|
||
if (args[0]->IsNumber()) { | ||
args.GetReturnValue().Set( | ||
GetColumnWidth(args[0]->Uint32Value(), | ||
ambiguous_as_full_width)); | ||
return; | ||
} | ||
|
||
TwoByteValue value(env->isolate(), args[0]); | ||
// reinterpret_cast is required by windows to compile | ||
UChar* str = reinterpret_cast<UChar*>(*value); | ||
UChar32 c; | ||
UChar32 p; | ||
size_t n = 0; | ||
uint32_t width = 0; | ||
|
||
while (n < value.length()) { | ||
p = c; | ||
U16_NEXT(str, n, value.length(), c); | ||
// Don't count individual emoji codepoints that occur within an | ||
// emoji sequence. This is not necessarily foolproof. Some | ||
// environments display emoji sequences in the appropriate | ||
// condensed form (as a single emoji glyph), other environments | ||
// may not understand an emoji sequence and will display each | ||
// individual emoji separately. When this happens, the width | ||
// calculated will be off, and there's no reliable way of knowing | ||
// in advance if a particular sequence is going to be supported. | ||
// The expand_emoji_sequence option allows the caller to skip this | ||
// check and count each code within an emoji sequence separately. | ||
if (!expand_emoji_sequence && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a reasoable way of doing this calculation |
||
n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner) | ||
(u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || | ||
u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) { | ||
continue; | ||
} | ||
width += GetColumnWidth(c, ambiguous_as_full_width); | ||
} | ||
args.GetReturnValue().Set(width); | ||
} | ||
|
||
// Module initializer: attaches the native helpers defined in this file to
// `target` under the names toUnicode, toASCII and getStringWidth.
// `unused` and `priv` are not read.
void Init(Local<Object> target,
          Local<Value> unused,
          Local<Context> context,
          void* priv) {
  Environment* env = Environment::GetCurrent(context);

  // IDNA conversions.
  env->SetMethod(target, "toUnicode", ToUnicode);
  env->SetMethod(target, "toASCII", ToASCII);

  // Column width calculation.
  env->SetMethod(target, "getStringWidth", GetStringWidth);
}
|
||
} // namespace i18n | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
// Flags: --expose_internals | ||
'use strict'; | ||
|
||
const common = require('../common'); | ||
const assert = require('assert'); | ||
const readline = require('internal/readline'); | ||
|
||
if (!process.binding('config').hasIntl) { | ||
common.skip('missing intl... skipping test'); | ||
return; | ||
} | ||
|
||
// Test column width | ||
assert.strictEqual(readline.getStringWidth('a'), 1); | ||
assert.strictEqual(readline.getStringWidth('丁'), 2); | ||
assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2); | ||
assert.strictEqual(readline.getStringWidth('👅'), 2); | ||
assert.strictEqual(readline.getStringWidth('\n'), 0); | ||
assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1); | ||
assert.strictEqual(readline.getStringWidth(97), 1); | ||
|
||
// The following is an emoji sequence. In some implementations, it is | ||
// represented as a single glyph, in other implementations as a sequence | ||
// of individual glyphs. By default, the algorithm will assume the single | ||
// glyph interpretation and return a value of 2. By passing the | ||
// expandEmojiSequence: true option, each component will be counted | ||
// individually. | ||
assert.strictEqual(readline.getStringWidth('👩👩👧👧'), 2); | ||
assert.strictEqual( | ||
readline.getStringWidth('👩👩👧👧', {expandEmojiSequence: true}), 8); | ||
|
||
// By default, Unicode characters whose width is considered ambiguous are | ||
// treated as half-width, so getStringWidth returns 1 for them. In some | ||
// contexts, however, it is more appropriate to treat them as full-width. | ||
// Passing the ambiguousAsFullWidth: true option makes ambiguous characters | ||
// count as 2 columns. | ||
assert.strictEqual(readline.getStringWidth('\u01d4'), 1); | ||
assert.strictEqual( | ||
readline.getStringWidth('\u01d4', {ambiguousAsFullWidth: true}), 2); | ||
|
||
// Control chars and combining chars are zero | ||
assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe we should initially assign no-ops with comments?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how come? that does not seem very practical.