From 9a3fc10dd4eab551fc0c5176698b5ddd591a4ae3 Mon Sep 17 00:00:00 2001 From: James M Snell Date: Mon, 12 Jun 2017 08:25:53 -0700 Subject: [PATCH] util: implement WHATWG Encoding Standard API Provide an (initially experimental) implementation of the WHATWG Encoding Standard API (`TextDecoder` and `TextEncoder`). This is the same API implemented on the browser side. By default, with small-icu, only the UTF-8, UTF-16le and UTF-16be decoders are supported. With full-icu enabled, every encoding other than iso-8859-16 is supported. This provides a basic test, but does not include the full web platform tests. Note: many of the web platform tests for this would fail by default because we ship with small-icu by default. A process warning will be emitted on first use to indicate that the API is still experimental. No runtime flag is required to use the feature. Backport-PR-URL: https://github.com/nodejs/node/pull/14585 Backport-Reviewed-By: Anna Henningsen Refs: https://encoding.spec.whatwg.org/ PR-URL: https://github.com/nodejs/node/pull/13644 Reviewed-By: Timothy Gu Reviewed-By: Matteo Collina --- doc/api/buffer.md | 14 +- doc/api/util.md | 151 +++++++++ lib/internal/encoding.js | 458 ++++++++++++++++++++++++++ lib/internal/errors.js | 10 + lib/util.js | 3 + node.gyp | 1 + src/node_buffer.cc | 23 ++ src/node_i18n.cc | 155 +++++++++ src/node_i18n.h | 1 + src/node_util.cc | 1 + test/parallel/test-whatwg-encoding.js | 385 ++++++++++++++++++++++ tools/icu/icu-generic.gyp | 9 - 12 files changed, 1195 insertions(+), 16 deletions(-) create mode 100644 lib/internal/encoding.js create mode 100644 test/parallel/test-whatwg-encoding.js diff --git a/doc/api/buffer.md b/doc/api/buffer.md index d73af5fd162dca..f8681c6be8e28e 100644 --- a/doc/api/buffer.md +++ b/doc/api/buffer.md @@ -193,11 +193,12 @@ The character encodings currently supported by Node.js include: * `'hex'` - Encode each byte as two hexadecimal characters. 
-*Note*: Today's browsers follow the [WHATWG spec] which aliases both 'latin1' -and ISO-8859-1 to win-1252. This means that while doing something like -`http.get()`, if the returned charset is one of those listed in the WHATWG spec -it's possible that the server actually returned win-1252-encoded data, and -using `'latin1'` encoding may incorrectly decode the characters. +*Note*: Today's browsers follow the [WHATWG Encoding Standard][] which aliases +both 'latin1' and ISO-8859-1 to win-1252. This means that while doing something +like `http.get()`, if the returned charset is one of those listed in the WHATWG +specification it is possible that the server actually returned +win-1252-encoded data, and using `'latin1'` encoding may incorrectly decode the +characters. ## Buffers and TypedArray + +> Stability: 1 - Experimental + +An implementation of the [WHATWG Encoding Standard][] `TextDecoder` API. + +```js +const decoder = new TextDecoder('shift_jis'); +let string = ''; +let buffer; +while (buffer = getNextChunkSomehow()) { + string += decoder.decode(buffer, { stream: true }); +} +string += decoder.decode(); // end-of-stream +``` + +#### WHATWG Supported Encodings + +Per the [WHATWG Encoding Standard][], the encodings supported by the +`TextDecoder` API are outlined in the tables below. For each encoding, +one or more aliases may be used. Support for some encodings is enabled +only when Node.js is using the full ICU data. 
+ +##### Encodings Supported By Default + +| Encoding | Aliases | +| ----------- | --------------------------------- | +| `'utf-8'` | `'unicode-1-1-utf-8'`, `'utf8'` | +| `'utf-16be'`| | +| `'utf-16le'`| `'utf-16'` | + +##### Encodings Requiring Full-ICU + +| Encoding | Aliases | +| ----------------- | -------------------------------- | +| `'ibm866'` | `'866'`, `'cp866'`, `'csibm866'` | +| `'iso-8859-2'` | `'csisolatin2'`, `'iso-ir-101'`, `'iso8859-2'`, `'iso88592'`, `'iso_8859-2'`, `'iso_8859-2:1987'`, `'l2'`, `'latin2'` | +| `'iso-8859-3'` | `'csisolatin3'`, `'iso-ir-109'`, `'iso8859-3'`, `'iso88593'`, `'iso_8859-3'`, `'iso_8859-3:1988'`, `'l3'`, `'latin3'` | +| `'iso-8859-4'` | `'csisolatin4'`, `'iso-ir-110'`, `'iso8859-4'`, `'iso88594'`, `'iso_8859-4'`, `'iso_8859-4:1988'`, `'l4'`, `'latin4'` | +| `'iso-8859-5'` | `'csisolatincyrillic'`, `'cyrillic'`, `'iso-ir-144'`, `'iso8859-5'`, `'iso88595'`, `'iso_8859-5'`, `'iso_8859-5:1988'`| +| `'iso-8859-6'` | `'arabic'`, `'asmo-708'`, `'csiso88596e'`, `'csiso88596i'`, `'csisolatinarabic'`, `'ecma-114'`, `'iso-8859-6-e'`, `'iso-8859-6-i'`, `'iso-ir-127'`, `'iso8859-6'`, `'iso88596'`, `'iso_8859-6'`, `'iso_8859-6:1987'` | +| `'iso-8859-7'` | `'csisolatingreek'`, `'ecma-118'`, `'elot_928'`, `'greek'`, `'greek8'`, `'iso-ir-126'`, `'iso8859-7'`, `'iso88597'`, `'iso_8859-7'`, `'iso_8859-7:1987'`, `'sun_eu_greek'` | +| `'iso-8859-8'` | `'csiso88598e'`, `'csisolatinhebrew'`, `'hebrew'`, `'iso-8859-8-e'`, `'iso-ir-138'`, `'iso8859-8'`, `'iso88598'`, `'iso_8859-8'`, `'iso_8859-8:1988'`, `'visual'` | +| `'iso-8859-8-i'` | `'csiso88598i'`, `'logical'` | +| `'iso-8859-10'` | `'csisolatin6'`, `'iso-ir-157'`, `'iso8859-10'`, `'iso885910'`, `'l6'`, `'latin6'` | +| `'iso-8859-13'` | `'iso8859-13'`, `'iso885913'` | +| `'iso-8859-14'` | `'iso8859-14'`, `'iso885914'` | +| `'iso-8859-15'` | `'csisolatin9'`, `'iso8859-15'`, `'iso885915'`, `'iso_8859-15'`, `'l9'` | +| `'koi8-r'` | `'cskoi8r'`, `'koi'`, `'koi8'`, `'koi8_r'` | +| `'koi8-u'` | 
`'koi8-ru'` | +| `'macintosh'` | `'csmacintosh'`, `'mac'`, `'x-mac-roman'` | +| `'windows-874'` | `'dos-874'`, `'iso-8859-11'`, `'iso8859-11'`, `'iso885911'`, `'tis-620'` | +| `'windows-1250'` | `'cp1250'`, `'x-cp1250'` | +| `'windows-1251'` | `'cp1251'`, `'x-cp1251'` | +| `'windows-1252'` | `'ansi_x3.4-1968'`, `'ascii'`, `'cp1252'`, `'cp819'`, `'csisolatin1'`, `'ibm819'`, `'iso-8859-1'`, `'iso-ir-100'`, `'iso8859-1'`, `'iso88591'`, `'iso_8859-1'`, `'iso_8859-1:1987'`, `'l1'`, `'latin1'`, `'us-ascii'`, `'x-cp1252'` | +| `'windows-1253'` | `'cp1253'`, `'x-cp1253'` | +| `'windows-1254'` | `'cp1254'`, `'csisolatin5'`, `'iso-8859-9'`, `'iso-ir-148'`, `'iso8859-9'`, `'iso88599'`, `'iso_8859-9'`, `'iso_8859-9:1989'`, `'l5'`, `'latin5'`, `'x-cp1254'` | +| `'windows-1255'` | `'cp1255'`, `'x-cp1255'` | +| `'windows-1256'` | `'cp1256'`, `'x-cp1256'` | +| `'windows-1257'` | `'cp1257'`, `'x-cp1257'` | +| `'windows-1258'` | `'cp1258'`, `'x-cp1258'` | +| `'x-mac-cyrillic'`| `'x-mac-ukrainian'` | +| `'gbk'` | `'chinese'`, `'csgb2312'`, `'csiso58gb231280'`, `'gb2312'`, `'gb_2312'`, `'gb_2312-80'`, `'iso-ir-58'`, `'x-gbk'` | +| `'gb18030'` | | +| `'big5'` | `'big5-hkscs'`, `'cn-big5'`, `'csbig5'`, `'x-x-big5'` | +| `'euc-jp'` | `'cseucpkdfmtjapanese'`, `'x-euc-jp'` | +| `'iso-2022-jp'` | `'csiso2022jp'` | +| `'shift_jis'` | `'csshiftjis'`, `'ms932'`, `'ms_kanji'`, `'shift-jis'`, `'sjis'`, `'windows-31j'`, `'x-sjis'` | +| `'euc-kr'` | `'cseuckr'`, `'csksc56011987'`, `'iso-ir-149'`, `'korean'`, `'ks_c_5601-1987'`, `'ks_c_5601-1989'`, `'ksc5601'`, `'ksc_5601'`, `'windows-949'` | + +*Note*: The `'iso-8859-16'` encoding listed in the [WHATWG Encoding Standard][] +is not supported. + +#### new TextDecoder([encoding[, options]]) + +* `encoding` {string} Identifies the `encoding` that this `TextDecoder` instance + supports. Defaults to `'utf-8'`. +* `options` {Object} + * `fatal` {boolean} `true` if decoding failures are fatal. Defaults to + `false`. 
+ * `ignoreBOM` {boolean} When `true`, the `TextDecoder` will include the byte + order mark in the decoded result. When `false`, the byte order mark will + be removed from the output. This option is only used when `encoding` is + `'utf-8'`, `'utf-16be'` or `'utf-16le'`. Defaults to `false`. + +Creates a new `TextDecoder` instance. The `encoding` may specify one of the +supported encodings or an alias. + +#### textDecoder.decode([input[, options]]) + +* `input` {ArrayBuffer|DataView|TypedArray} An `ArrayBuffer`, `DataView` or + Typed Array instance containing the encoded data. +* `options` {Object} + * `stream` {boolean} `true` if additional chunks of data are expected. + Defaults to `false`. +* Returns: {string} + +Decodes the `input` and returns a string. If `options.stream` is `true`, any +incomplete byte sequences occurring at the end of the `input` are buffered +internally and emitted after the next call to `textDecoder.decode()`. + +If `textDecoder.fatal` is `true`, decoding errors that occur will result in a +`TypeError` being thrown. + +#### textDecoder.encoding + +* Value: {string} + +The encoding supported by the `TextDecoder` instance. + +#### textDecoder.fatal + +* Value: {boolean} + +The value will be `true` if decoding errors result in a `TypeError` being +thrown. + +#### textDecoder.ignoreBOM + +* Value: {boolean} + +The value will be `true` if the decoding result will include the byte order +mark. + +### Class: util.TextEncoder + + +> Stability: 1 - Experimental + +An implementation of the [WHATWG Encoding Standard][] `TextEncoder` API. All +instances of `TextEncoder` only support `UTF-8` encoding. + +```js +const encoder = new TextEncoder(); +const uint8array = encoder.encode('this is some data'); +``` + +#### textEncoder.encode([input]) + +* `input` {string} The text to encode. Defaults to an empty string. +* Returns: {Uint8Array} + +UTF-8 encodes the `input` string and returns a `Uint8Array` containing the +encoded bytes. 
+ ## Deprecated APIs The following APIs have been deprecated and should no longer be used. Existing @@ -1022,3 +1172,4 @@ Deprecated predecessor of `console.log`. [Custom promisified functions]: #util_custom_promisified_functions [constructor]: https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/Object/constructor [semantically incompatible]: https://github.com/nodejs/node/issues/4179 +[WHATWG Encoding Standard]: https://encoding.spec.whatwg.org/ diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js new file mode 100644 index 00000000000000..22ae5c6c0db1ab --- /dev/null +++ b/lib/internal/encoding.js @@ -0,0 +1,458 @@ +'use strict'; + +// An implementation of the WHATWG Encoding Standard +// https://encoding.spec.whatwg.org + +const errors = require('internal/errors'); +const kHandle = Symbol('handle'); +const kFlags = Symbol('flags'); +const kEncoding = Symbol('encoding'); +const kDecoder = Symbol('decoder'); +const kEncoder = Symbol('encoder'); + +let warned = false; +const experimental = + 'The WHATWG Encoding Standard implementation is an experimental API. 
It ' + + 'should not yet be used in production applications.'; + +const { + getConstructorOf, + customInspectSymbol: inspect +} = require('internal/util'); + +const { + isArrayBuffer +} = process.binding('util'); + +const { + encodeUtf8String +} = process.binding('buffer'); + +const { + decode: _decode, + getConverter, + hasConverter +} = process.binding('icu'); + +const CONVERTER_FLAGS_FLUSH = 0x1; +const CONVERTER_FLAGS_FATAL = 0x2; +const CONVERTER_FLAGS_IGNORE_BOM = 0x4; + +const empty = new Uint8Array(0); + +const encodings = new Map([ + ['unicode-1-1-utf-8', 'utf-8'], + ['utf8', 'utf-8'], + ['utf-8', 'utf-8'], + ['866', 'ibm866'], + ['cp866', 'ibm866'], + ['csibm866', 'ibm866'], + ['ibm866', 'ibm866'], + ['csisolatin2', 'iso-8859-2'], + ['iso-8859-2', 'iso-8859-2'], + ['iso-ir-101', 'iso-8859-2'], + ['iso8859-2', 'iso-8859-2'], + ['iso88592', 'iso-8859-2'], + ['iso_8859-2', 'iso-8859-2'], + ['iso_8859-2:1987', 'iso-8859-2'], + ['l2', 'iso-8859-2'], + ['latin2', 'iso-8859-2'], + ['csisolatin3', 'iso-8859-3'], + ['iso-8859-3', 'iso-8859-3'], + ['iso-ir-109', 'iso-8859-3'], + ['iso8859-3', 'iso-8859-3'], + ['iso88593', 'iso-8859-3'], + ['iso_8859-3', 'iso-8859-3'], + ['iso_8859-3:1988', 'iso-8859-3'], + ['l3', 'iso-8859-3'], + ['latin3', 'iso-8859-3'], + ['csisolatin4', 'iso-8859-4'], + ['iso-8859-4', 'iso-8859-4'], + ['iso-ir-110', 'iso-8859-4'], + ['iso8859-4', 'iso-8859-4'], + ['iso88594', 'iso-8859-4'], + ['iso_8859-4', 'iso-8859-4'], + ['iso_8859-4:1988', 'iso-8859-4'], + ['l4', 'iso-8859-4'], + ['latin4', 'iso-8859-4'], + ['csisolatincyrillic', 'iso-8859-5'], + ['cyrillic', 'iso-8859-5'], + ['iso-8859-5', 'iso-8859-5'], + ['iso-ir-144', 'iso-8859-5'], + ['iso8859-5', 'iso-8859-5'], + ['iso88595', 'iso-8859-5'], + ['iso_8859-5', 'iso-8859-5'], + ['iso_8859-5:1988', 'iso-8859-5'], + ['arabic', 'iso-8859-6'], + ['asmo-708', 'iso-8859-6'], + ['csiso88596e', 'iso-8859-6'], + ['csiso88596i', 'iso-8859-6'], + ['csisolatinarabic', 'iso-8859-6'], + ['ecma-114', 
'iso-8859-6'], + ['iso-8859-6', 'iso-8859-6'], + ['iso-8859-6-e', 'iso-8859-6'], + ['iso-8859-6-i', 'iso-8859-6'], + ['iso-ir-127', 'iso-8859-6'], + ['iso8859-6', 'iso-8859-6'], + ['iso88596', 'iso-8859-6'], + ['iso_8859-6', 'iso-8859-6'], + ['iso_8859-6:1987', 'iso-8859-6'], + ['csisolatingreek', 'iso-8859-7'], + ['ecma-118', 'iso-8859-7'], + ['elot_928', 'iso-8859-7'], + ['greek', 'iso-8859-7'], + ['greek8', 'iso-8859-7'], + ['iso-8859-7', 'iso-8859-7'], + ['iso-ir-126', 'iso-8859-7'], + ['iso8859-7', 'iso-8859-7'], + ['iso88597', 'iso-8859-7'], + ['iso_8859-7', 'iso-8859-7'], + ['iso_8859-7:1987', 'iso-8859-7'], + ['sun_eu_greek', 'iso-8859-7'], + ['csiso88598e', 'iso-8859-8'], + ['csisolatinhebrew', 'iso-8859-8'], + ['hebrew', 'iso-8859-8'], + ['iso-8859-8', 'iso-8859-8'], + ['iso-8859-8-e', 'iso-8859-8'], + ['iso-ir-138', 'iso-8859-8'], + ['iso8859-8', 'iso-8859-8'], + ['iso88598', 'iso-8859-8'], + ['iso_8859-8', 'iso-8859-8'], + ['iso_8859-8:1988', 'iso-8859-8'], + ['visual', 'iso-8859-8'], + ['csiso88598i', 'iso-8859-8-i'], + ['iso-8859-8-i', 'iso-8859-8-i'], + ['logical', 'iso-8859-8-i'], + ['csisolatin6', 'iso-8859-10'], + ['iso-8859-10', 'iso-8859-10'], + ['iso-ir-157', 'iso-8859-10'], + ['iso8859-10', 'iso-8859-10'], + ['iso885910', 'iso-8859-10'], + ['l6', 'iso-8859-10'], + ['latin6', 'iso-8859-10'], + ['iso-8859-13', 'iso-8859-13'], + ['iso8859-13', 'iso-8859-13'], + ['iso885913', 'iso-8859-13'], + ['iso-8859-14', 'iso-8859-14'], + ['iso8859-14', 'iso-8859-14'], + ['iso885914', 'iso-8859-14'], + ['csisolatin9', 'iso-8859-15'], + ['iso-8859-15', 'iso-8859-15'], + ['iso8859-15', 'iso-8859-15'], + ['iso885915', 'iso-8859-15'], + ['iso_8859-15', 'iso-8859-15'], + ['l9', 'iso-8859-15'], + ['cskoi8r', 'koi8-r'], + ['koi', 'koi8-r'], + ['koi8', 'koi8-r'], + ['koi8-r', 'koi8-r'], + ['koi8_r', 'koi8-r'], + ['koi8-ru', 'koi8-u'], + ['koi8-u', 'koi8-u'], + ['csmacintosh', 'macintosh'], + ['mac', 'macintosh'], + ['macintosh', 'macintosh'], + ['x-mac-roman', 
'macintosh'], + ['dos-874', 'windows-874'], + ['iso-8859-11', 'windows-874'], + ['iso8859-11', 'windows-874'], + ['iso885911', 'windows-874'], + ['tis-620', 'windows-874'], + ['windows-874', 'windows-874'], + ['cp1250', 'windows-1250'], + ['windows-1250', 'windows-1250'], + ['x-cp1250', 'windows-1250'], + ['cp1251', 'windows-1251'], + ['windows-1251', 'windows-1251'], + ['x-cp1251', 'windows-1251'], + ['ansi_x3.4-1968', 'windows-1252'], + ['ascii', 'windows-1252'], + ['cp1252', 'windows-1252'], + ['cp819', 'windows-1252'], + ['csisolatin1', 'windows-1252'], + ['ibm819', 'windows-1252'], + ['iso-8859-1', 'windows-1252'], + ['iso-ir-100', 'windows-1252'], + ['iso8859-1', 'windows-1252'], + ['iso88591', 'windows-1252'], + ['iso_8859-1', 'windows-1252'], + ['iso_8859-1:1987', 'windows-1252'], + ['l1', 'windows-1252'], + ['latin1', 'windows-1252'], + ['us-ascii', 'windows-1252'], + ['windows-1252', 'windows-1252'], + ['x-cp1252', 'windows-1252'], + ['cp1253', 'windows-1253'], + ['windows-1253', 'windows-1253'], + ['x-cp1253', 'windows-1253'], + ['cp1254', 'windows-1254'], + ['csisolatin5', 'windows-1254'], + ['iso-8859-9', 'windows-1254'], + ['iso-ir-148', 'windows-1254'], + ['iso8859-9', 'windows-1254'], + ['iso88599', 'windows-1254'], + ['iso_8859-9', 'windows-1254'], + ['iso_8859-9:1989', 'windows-1254'], + ['l5', 'windows-1254'], + ['latin5', 'windows-1254'], + ['windows-1254', 'windows-1254'], + ['x-cp1254', 'windows-1254'], + ['cp1255', 'windows-1255'], + ['windows-1255', 'windows-1255'], + ['x-cp1255', 'windows-1255'], + ['cp1256', 'windows-1256'], + ['windows-1256', 'windows-1256'], + ['x-cp1256', 'windows-1256'], + ['cp1257', 'windows-1257'], + ['windows-1257', 'windows-1257'], + ['x-cp1257', 'windows-1257'], + ['cp1258', 'windows-1258'], + ['windows-1258', 'windows-1258'], + ['x-cp1258', 'windows-1258'], + ['x-mac-cyrillic', 'x-mac-cyrillic'], + ['x-mac-ukrainian', 'x-mac-cyrillic'], + ['chinese', 'gbk'], + ['csgb2312', 'gbk'], + ['csiso58gb231280', 'gbk'], + 
['gb2312', 'gbk'], + ['gb_2312', 'gbk'], + ['gb_2312-80', 'gbk'], + ['gbk', 'gbk'], + ['iso-ir-58', 'gbk'], + ['x-gbk', 'gbk'], + ['gb18030', 'gb18030'], + ['big5', 'big5'], + ['big5-hkscs', 'big5'], + ['cn-big5', 'big5'], + ['csbig5', 'big5'], + ['x-x-big5', 'big5'], + ['cseucpkdfmtjapanese', 'euc-jp'], + ['euc-jp', 'euc-jp'], + ['x-euc-jp', 'euc-jp'], + ['csiso2022jp', 'iso-2022-jp'], + ['iso-2022-jp', 'iso-2022-jp'], + ['csshiftjis', 'shift_jis'], + ['ms932', 'shift_jis'], + ['ms_kanji', 'shift_jis'], + ['shift-jis', 'shift_jis'], + ['shift_jis', 'shift_jis'], + ['sjis', 'shift_jis'], + ['windows-31j', 'shift_jis'], + ['x-sjis', 'shift_jis'], + ['cseuckr', 'euc-kr'], + ['csksc56011987', 'euc-kr'], + ['euc-kr', 'euc-kr'], + ['iso-ir-149', 'euc-kr'], + ['korean', 'euc-kr'], + ['ks_c_5601-1987', 'euc-kr'], + ['ks_c_5601-1989', 'euc-kr'], + ['ksc5601', 'euc-kr'], + ['ksc_5601', 'euc-kr'], + ['windows-949', 'euc-kr'], + ['utf-16be', 'utf-16be'], + ['utf-16le', 'utf-16le'], + ['utf-16', 'utf-16le'] +]); + +// Unfortunately, String.prototype.trim also removes non-ascii whitespace, +// so we have to do this manually +function trimAsciiWhitespace(label) { + var s = 0; + var e = label.length; + while (s < e && ( + label[s] === '\u0009' || + label[s] === '\u000a' || + label[s] === '\u000c' || + label[s] === '\u000d' || + label[s] === '\u0020')) { + s++; + } + while (e > s && ( + label[e - 1] === '\u0009' || + label[e - 1] === '\u000a' || + label[e - 1] === '\u000c' || + label[e - 1] === '\u000d' || + label[e - 1] === '\u0020')) { + e--; + } + return label.slice(s, e); +} + +function getEncodingFromLabel(label) { + const enc = encodings.get(label); + if (enc !== undefined) return enc; + return encodings.get(trimAsciiWhitespace(label.toLowerCase())); +} + +function hasTextDecoder(encoding = 'utf-8') { + if (typeof encoding !== 'string') + throw new errors.Error('ERR_INVALID_ARG_TYPE', 'encoding', 'string'); + return hasConverter(getEncodingFromLabel(encoding)); +} + +var 
Buffer; +function lazyBuffer() { + if (Buffer === undefined) + Buffer = require('buffer').Buffer; + return Buffer; +} + +class TextDecoder { + constructor(encoding = 'utf-8', options = {}) { + if (!warned) { + warned = true; + process.emitWarning(experimental, 'ExperimentalWarning'); + } + + encoding = `${encoding}`; + if (typeof options !== 'object') + throw new errors.Error('ERR_INVALID_ARG_TYPE', 'options', 'object'); + + const enc = getEncodingFromLabel(encoding); + if (enc === undefined) + throw new errors.RangeError('ERR_ENCODING_NOT_SUPPORTED', encoding); + + var flags = 0; + if (options !== null) { + flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0; + flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0; + } + + const handle = getConverter(enc, flags); + if (handle === undefined) + throw new errors.Error('ERR_ENCODING_NOT_SUPPORTED', encoding); + + this[kHandle] = handle; + this[kFlags] = flags; + this[kEncoding] = enc; + } + + get encoding() { + if (this == null || this[kDecoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder'); + return this[kEncoding]; + } + + get fatal() { + if (this == null || this[kDecoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder'); + return (this[kFlags] & CONVERTER_FLAGS_FATAL) === CONVERTER_FLAGS_FATAL; + } + + get ignoreBOM() { + if (this == null || this[kDecoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder'); + return (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) === + CONVERTER_FLAGS_IGNORE_BOM; + } + + decode(input = empty, options = {}) { + if (this == null || this[kDecoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder'); + if (isArrayBuffer(input)) { + input = lazyBuffer().from(input); + } else if (!ArrayBuffer.isView(input)) { + throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'input', + ['ArrayBuffer', 'ArrayBufferView']); + } + if (typeof options !== 'object') { + throw new 
errors.TypeError('ERR_INVALID_ARG_TYPE', 'options', 'object'); + } + + var flags = 0; + if (options !== null) + flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH; + + const ret = _decode(this[kHandle], input, flags); + if (typeof ret === 'number') { + const err = new errors.TypeError('ERR_ENCODING_INVALID_ENCODED_DATA', + this.encoding); + err.errno = ret; + throw err; + } + return ret.toString('ucs2'); + } + + [inspect](depth, opts) { + if (this == null || this[kDecoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder'); + if (typeof depth === 'number' && depth < 0) + return opts.stylize('[Object]', 'special'); + var ctor = getConstructorOf(this); + var obj = Object.create({ + constructor: ctor === null ? TextDecoder : ctor + }); + obj.encoding = this.encoding; + obj.fatal = this.fatal; + obj.ignoreBOM = this.ignoreBOM; + if (opts.showHidden) { + obj[kFlags] = this[kFlags]; + obj[kHandle] = this[kHandle]; + } + // Lazy to avoid circular dependency + return require('util').inspect(obj, opts); + } +} + +class TextEncoder { + constructor() { + if (!warned) { + warned = true; + process.emitWarning(experimental, 'ExperimentalWarning'); + } + } + + get encoding() { + if (this == null || this[kEncoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextEncoder'); + return 'utf-8'; + } + + encode(input = '') { + if (this == null || this[kEncoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextEncoder'); + return encodeUtf8String(`${input}`); + } + + [inspect](depth, opts) { + if (this == null || this[kEncoder] !== true) + throw new errors.TypeError('ERR_INVALID_THIS', 'TextEncoder'); + if (typeof depth === 'number' && depth < 0) + return opts.stylize('[Object]', 'special'); + var ctor = getConstructorOf(this); + var obj = Object.create({ + constructor: ctor === null ? 
TextEncoder : ctor + }); + obj.encoding = this.encoding; + // Lazy to avoid circular dependency + return require('util').inspect(obj, opts); + } +} + +Object.defineProperties( + TextDecoder.prototype, { + [kDecoder]: { enumerable: false, value: true, configurable: false }, + 'decode': { enumerable: true }, + 'encoding': { enumerable: true }, + 'fatal': { enumerable: true }, + 'ignoreBOM': { enumerable: true }, + [Symbol.toStringTag]: { + configurable: true, + value: 'TextDecoder' + } }); +Object.defineProperties( + TextEncoder.prototype, { + [kEncoder]: { enumerable: false, value: true, configurable: false }, + 'encode': { enumerable: true }, + 'encoding': { enumerable: true }, + [Symbol.toStringTag]: { + configurable: true, + value: 'TextEncoder' + } }); + +module.exports = { + getEncodingFromLabel, + hasTextDecoder, + TextDecoder, + TextEncoder +}; diff --git a/lib/internal/errors.js b/lib/internal/errors.js index f519397be252b1..3ee34cf428939a 100644 --- a/lib/internal/errors.js +++ b/lib/internal/errors.js @@ -109,7 +109,17 @@ module.exports = exports = { // Note: Please try to keep these in alphabetical order E('ERR_ARG_NOT_ITERABLE', '%s must be iterable'); E('ERR_ASSERTION', (msg) => msg); +E('ERR_ENCODING_INVALID_ENCODED_DATA', + (enc) => `The encoded data was not valid for encoding ${enc}`); +E('ERR_ENCODING_NOT_SUPPORTED', + (enc) => `The "${enc}" encoding is not supported`); E('ERR_FALSY_VALUE_REJECTION', 'Promise was rejected with falsy value'); +E('ERR_HTTP_HEADERS_SENT', + 'Cannot render headers after they are sent to the client'); +E('ERR_HTTP_INVALID_STATUS_CODE', 'Invalid status code: %s'); +E('ERR_HTTP_TRAILER_INVALID', + 'Trailers are invalid with this transfer encoding'); +E('ERR_INDEX_OUT_OF_RANGE', 'Index out of range'); E('ERR_INVALID_ARG_TYPE', invalidArgType); E('ERR_INVALID_CALLBACK', 'callback must be a function'); E('ERR_INVALID_FD', (fd) => `"fd" must be a positive integer: ${fd}`); diff --git a/lib/util.js b/lib/util.js index 
897970d39b1829..9db58e5458d8db 100644 --- a/lib/util.js +++ b/lib/util.js @@ -22,6 +22,7 @@ 'use strict'; const errors = require('internal/errors'); +const { TextDecoder, TextEncoder } = require('internal/encoding'); const { errname } = process.binding('uv'); @@ -1128,6 +1129,8 @@ module.exports = exports = { isPrimitive, log, promisify, + TextDecoder, + TextEncoder, // Deprecated Old Stuff debug: deprecate(debug, diff --git a/node.gyp b/node.gyp index 7879ef2524d01f..1650f1598bf02a 100644 --- a/node.gyp +++ b/node.gyp @@ -82,6 +82,7 @@ 'lib/internal/cluster/shared_handle.js', 'lib/internal/cluster/utils.js', 'lib/internal/cluster/worker.js', + 'lib/internal/encoding.js', 'lib/internal/errors.js', 'lib/internal/freelist.js', 'lib/internal/fs.js', diff --git a/src/node_buffer.cc b/src/node_buffer.cc index a88d0e86732252..d6e4aa9da072db 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -1200,6 +1200,27 @@ void Swap64(const FunctionCallbackInfo& args) { } +// Encode a single string to a UTF-8 Uint8Array (not Buffer). +// Used in TextEncoder.prototype.encode. 
+static void EncodeUtf8String(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + CHECK_GE(args.Length(), 1); + CHECK(args[0]->IsString()); + + Local str = args[0].As(); + size_t length = str->Utf8Length(); + char* data = node::UncheckedMalloc(length); + str->WriteUtf8(data, + -1, // We are certain that `data` is sufficiently large + NULL, + String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8); + auto array_buf = ArrayBuffer::New(env->isolate(), data, length, + ArrayBufferCreationMode::kInternalized); + auto array = Uint8Array::New(array_buf, 0, length); + args.GetReturnValue().Set(array); +} + + // pass Buffer object to load prototype methods void SetupBufferJS(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); @@ -1266,6 +1287,8 @@ void Initialize(Local target, env->SetMethod(target, "swap32", Swap32); env->SetMethod(target, "swap64", Swap64); + env->SetMethod(target, "encodeUtf8String", EncodeUtf8String); + target->Set(env->context(), FIXED_ONE_BYTE_STRING(env->isolate(), "kMaxLength"), Integer::NewFromUnsigned(env->isolate(), kMaxLength)).FromJust(); diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 3b337449495f4c..2e1aeaa4cb07c5 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -50,6 +50,8 @@ #include "env-inl.h" #include "util.h" #include "util-inl.h" +#include "base-object.h" +#include "base-object-inl.h" #include "v8.h" #include @@ -86,10 +88,12 @@ namespace node { using v8::Context; using v8::FunctionCallbackInfo; +using v8::HandleScope; using v8::Isolate; using v8::Local; using v8::MaybeLocal; using v8::Object; +using v8::ObjectTemplate; using v8::String; using v8::Value; @@ -123,6 +127,15 @@ struct Converter { } } + explicit Converter(UConverter* converter, + const char* sub = NULL) : conv(converter) { + CHECK_NE(conv, nullptr); + UErrorCode status = U_ZERO_ERROR; + if (sub != NULL) { + ucnv_setSubstChars(conv, sub, strlen(sub), &status); + } + } + ~Converter() { 
ucnv_close(conv); } @@ -130,6 +143,143 @@ struct Converter { UConverter* conv; }; +class ConverterObject : public BaseObject, Converter { + public: + enum ConverterFlags { + CONVERTER_FLAGS_FLUSH = 0x1, + CONVERTER_FLAGS_FATAL = 0x2, + CONVERTER_FLAGS_IGNORE_BOM = 0x4 + }; + + ~ConverterObject() override {} + + static void Has(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + HandleScope scope(env->isolate()); + + CHECK_GE(args.Length(), 1); + Utf8Value label(env->isolate(), args[0]); + + UErrorCode status = U_ZERO_ERROR; + UConverter* conv = ucnv_open(*label, &status); + args.GetReturnValue().Set(!!U_SUCCESS(status)); + ucnv_close(conv); + } + + static void Create(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + HandleScope scope(env->isolate()); + + CHECK_GE(args.Length(), 2); + Utf8Value label(env->isolate(), args[0]); + int flags = args[1]->Uint32Value(env->context()).ToChecked(); + bool fatal = + (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL; + bool ignoreBOM = + (flags & CONVERTER_FLAGS_IGNORE_BOM) == CONVERTER_FLAGS_IGNORE_BOM; + + UErrorCode status = U_ZERO_ERROR; + UConverter* conv = ucnv_open(*label, &status); + if (U_FAILURE(status)) + return; + + if (fatal) { + status = U_ZERO_ERROR; + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, + nullptr, nullptr, nullptr, &status); + } + + Local t = ObjectTemplate::New(env->isolate()); + t->SetInternalFieldCount(1); + Local obj = t->NewInstance(env->context()).ToLocalChecked(); + new ConverterObject(env, obj, conv, ignoreBOM); + args.GetReturnValue().Set(obj); + } + + static void Decode(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + + CHECK_GE(args.Length(), 3); // Converter, Buffer, Flags + + Converter utf8("utf8"); + ConverterObject* converter; + ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As()); + SPREAD_BUFFER_ARG(args[1], input_obj); + int flags = 
args[2]->Uint32Value(env->context()).ToChecked(); + + UErrorCode status = U_ZERO_ERROR; + MaybeStackBuffer result; + MaybeLocal ret; + size_t limit = ucnv_getMinCharSize(converter->conv) * + input_obj_length; + if (limit > 0) + result.AllocateSufficientStorage(limit); + + UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH; + + const char* source = input_obj_data; + size_t source_length = input_obj_length; + + if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) { + int32_t bomOffset = 0; + ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status); + source += bomOffset; + source_length -= bomOffset; + converter->bomSeen_ = true; + } + + UChar* target = *result; + ucnv_toUnicode(converter->conv, + &target, target + (limit * sizeof(UChar)), + &source, source + source_length, + NULL, flush, &status); + + if (U_SUCCESS(status)) { + if (limit > 0) + result.SetLength(target - &result[0]); + ret = ToBufferEndian(env, &result); + args.GetReturnValue().Set(ret.ToLocalChecked()); + goto reset; + } + + args.GetReturnValue().Set(status); + + reset: + if (flush) { + // Reset the converter state + converter->bomSeen_ = false; + ucnv_reset(converter->conv); + } + } + + protected: + ConverterObject(Environment* env, + v8::Local wrap, + UConverter* converter, + bool ignoreBOM, + const char* sub = NULL) : + BaseObject(env, wrap), + Converter(converter, sub), + ignoreBOM_(ignoreBOM) { + MakeWeak(this); + + switch (ucnv_getType(converter)) { + case UCNV_UTF8: + case UCNV_UTF16_BigEndian: + case UCNV_UTF16_LittleEndian: + unicode_ = true; + break; + default: + unicode_ = false; + } + } + + private: + bool unicode_ = false; // True if this is a Unicode converter + bool ignoreBOM_ = false; // True if the BOM should be ignored on Unicode + bool bomSeen_ = false; // True if the BOM has been seen +}; + // One-Shot Converters void CopySourceBuffer(MaybeStackBuffer* dest, @@ -717,6 +867,11 @@ void Init(Local target, // One-shot 
converters env->SetMethod(target, "icuErrName", ICUErrorName); env->SetMethod(target, "transcode", Transcode); + + // ConverterObject + env->SetMethod(target, "getConverter", ConverterObject::Create); + env->SetMethod(target, "decode", ConverterObject::Decode); + env->SetMethod(target, "hasConverter", ConverterObject::Has); } } // namespace i18n diff --git a/src/node_i18n.h b/src/node_i18n.h index adf9feb414df5c..f7801ce6668468 100644 --- a/src/node_i18n.h +++ b/src/node_i18n.h @@ -25,6 +25,7 @@ #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS #include "node.h" +#include #include #if defined(NODE_HAVE_I18N_SUPPORT) diff --git a/src/node_util.cc b/src/node_util.cc index 50de94bfb2bf3a..c1dff77386d927 100644 --- a/src/node_util.cc +++ b/src/node_util.cc @@ -21,6 +21,7 @@ using v8::Value; #define VALUE_METHOD_MAP(V) \ + V(isArrayBuffer, IsArrayBuffer) \ V(isAsyncFunction, IsAsyncFunction) \ V(isDataView, IsDataView) \ V(isDate, IsDate) \ diff --git a/test/parallel/test-whatwg-encoding.js b/test/parallel/test-whatwg-encoding.js new file mode 100644 index 00000000000000..c181df860ca149 --- /dev/null +++ b/test/parallel/test-whatwg-encoding.js @@ -0,0 +1,385 @@ +// Flags: --expose-internals +'use strict'; + +const common = require('../common'); +const assert = require('assert'); +const { TextEncoder, TextDecoder } = require('util'); +const { customInspectSymbol: inspect } = require('internal/util'); +const { getEncodingFromLabel } = require('internal/encoding'); + +const encoded = Buffer.from([0xef, 0xbb, 0xbf, 0x74, 0x65, + 0x73, 0x74, 0xe2, 0x82, 0xac]); + +if (!common.hasIntl) { + common.skip('WHATWG Encoding tests because ICU is not present.'); +} + +// Make Sure TextDecoder and TextEncoder exist +assert(TextDecoder); +assert(TextEncoder); + +// Test TextEncoder +const enc = new TextEncoder(); +assert(enc); +const buf = enc.encode('\ufefftest€'); + +assert.strictEqual(Buffer.compare(buf, encoded), 0); + + +// Test TextDecoder, UTF-8, fatal: false, ignoreBOM: 
false +{ + ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { + const dec = new TextDecoder(i); + const res = dec.decode(buf); + assert.strictEqual(res, 'test€'); + }); + + ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { + const dec = new TextDecoder(i); + let res = ''; + res += dec.decode(buf.slice(0, 8), { stream: true }); + res += dec.decode(buf.slice(8)); + assert.strictEqual(res, 'test€'); + }); +} + +// Test TextDecoder, UTF-8, fatal: false, ignoreBOM: true +{ + ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { + const dec = new TextDecoder(i, { ignoreBOM: true }); + const res = dec.decode(buf); + assert.strictEqual(res, '\ufefftest€'); + }); + + ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { + const dec = new TextDecoder(i, { ignoreBOM: true }); + let res = ''; + res += dec.decode(buf.slice(0, 8), { stream: true }); + res += dec.decode(buf.slice(8)); + assert.strictEqual(res, '\ufefftest€'); + }); +} + +// Test TextDecoder, UTF-8, fatal: true, ignoreBOM: false +{ + ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { + const dec = new TextDecoder(i, { fatal: true }); + assert.throws(() => dec.decode(buf.slice(0, 8)), + common.expectsError({ + code: 'ERR_ENCODING_INVALID_ENCODED_DATA', + type: TypeError, + message: + /^The encoded data was not valid for encoding utf-8$/ + })); + }); + + ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { + const dec = new TextDecoder(i, { fatal: true }); + assert.doesNotThrow(() => dec.decode(buf.slice(0, 8), { stream: true })); + assert.doesNotThrow(() => dec.decode(buf.slice(8))); + }); +} + +// Test TextDecoder, UTF-16le +{ + const dec = new TextDecoder('utf-16le'); + const res = dec.decode(Buffer.from('test€', 'utf-16le')); + assert.strictEqual(res, 'test€'); +} + +// Test TextDecoder, UTF-16be +{ + const dec = new TextDecoder('utf-16be'); + const res = dec.decode(Buffer.from([0x00, 0x74, 0x00, 0x65, 0x00, + 0x73, 0x00, 0x74, 0x20, 0xac])); + assert.strictEqual(res, 'test€'); +} + +{ + 
const fn = TextDecoder.prototype[inspect]; + fn.call(new TextDecoder(), Infinity, {}); + + [{}, [], true, 1, '', new TextEncoder()].forEach((i) => { + assert.throws(() => fn.call(i, Infinity, {}), + common.expectsError({ + code: 'ERR_INVALID_THIS', + message: 'Value of "this" must be of type TextDecoder' + })); + }); +} + +{ + const fn = TextEncoder.prototype[inspect]; + fn.call(new TextEncoder(), Infinity, {}); + + [{}, [], true, 1, '', new TextDecoder()].forEach((i) => { + assert.throws(() => fn.call(i, Infinity, {}), + common.expectsError({ + code: 'ERR_INVALID_THIS', + message: 'Value of "this" must be of type TextEncoder' + })); + }); +} + +// Test Encoding Mappings +{ + + const mappings = { + 'utf-8': [ + 'unicode-1-1-utf-8', + 'utf8' + ], + 'utf-16be': [], + 'utf-16le': [ + 'utf-16' + ], + 'ibm866': [ + '866', + 'cp866', + 'csibm866' + ], + 'iso-8859-2': [ + 'csisolatin2', + 'iso-ir-101', + 'iso8859-2', + 'iso88592', + 'iso_8859-2', + 'iso_8859-2:1987', + 'l2', + 'latin2' + ], + 'iso-8859-3': [ + 'csisolatin3', + 'iso-ir-109', + 'iso8859-3', + 'iso88593', + 'iso_8859-3', + 'iso_8859-3:1988', + 'l3', + 'latin3' + ], + 'iso-8859-4': [ + 'csisolatin4', + 'iso-ir-110', + 'iso8859-4', + 'iso88594', + 'iso_8859-4', + 'iso_8859-4:1988', + 'l4', + 'latin4' + ], + 'iso-8859-5': [ + 'csisolatincyrillic', + 'cyrillic', + 'iso-ir-144', + 'iso8859-5', + 'iso88595', + 'iso_8859-5', + 'iso_8859-5:1988' + ], + 'iso-8859-6': [ + 'arabic', + 'asmo-708', + 'csiso88596e', + 'csiso88596i', + 'csisolatinarabic', + 'ecma-114', + 'iso-8859-6-e', + 'iso-8859-6-i', + 'iso-ir-127', + 'iso8859-6', + 'iso88596', + 'iso_8859-6', + 'iso_8859-6:1987' + ], + 'iso-8859-7': [ + 'csisolatingreek', + 'ecma-118', + 'elot_928', + 'greek', + 'greek8', + 'iso-ir-126', + 'iso8859-7', + 'iso88597', + 'iso_8859-7', + 'iso_8859-7:1987', + 'sun_eu_greek' + ], + 'iso-8859-8': [ + 'csiso88598e', + 'csisolatinhebrew', + 'hebrew', + 'iso-8859-8-e', + 'iso-ir-138', + 'iso8859-8', + 'iso88598', + 
'iso_8859-8', + 'iso_8859-8:1988', + 'visual' + ], + 'iso-8859-8-i': [ + 'csiso88598i', + 'logical' + ], + 'iso-8859-10': [ + 'csisolatin6', + 'iso-ir-157', + 'iso8859-10', + 'iso885910', + 'l6', + 'latin6' + ], + 'iso-8859-13': [ + 'iso8859-13', + 'iso885913' + ], + 'iso-8859-14': [ + 'iso8859-14', + 'iso885914' + ], + 'iso-8859-15': [ + 'csisolatin9', + 'iso8859-15', + 'iso885915', + 'iso_8859-15', + 'l9' + ], + 'koi8-r': [ + 'cskoi8r', + 'koi', + 'koi8', + 'koi8_r' + ], + 'koi8-u': [ + 'koi8-ru' + ], + 'macintosh': [ + 'csmacintosh', + 'mac', + 'x-mac-roman' + ], + 'windows-874': [ + 'dos-874', + 'iso-8859-11', + 'iso8859-11', + 'iso885911', + 'tis-620' + ], + 'windows-1250': [ + 'cp1250', + 'x-cp1250' + ], + 'windows-1251': [ + 'cp1251', + 'x-cp1251' + ], + 'windows-1252': [ + 'ansi_x3.4-1968', + 'ascii', + 'cp1252', + 'cp819', + 'csisolatin1', + 'ibm819', + 'iso-8859-1', + 'iso-ir-100', + 'iso8859-1', + 'iso88591', + 'iso_8859-1', + 'iso_8859-1:1987', + 'l1', + 'latin1', + 'us-ascii', + 'x-cp1252' + ], + 'windows-1253': [ + 'cp1253', + 'x-cp1253' + ], + 'windows-1254': [ + 'cp1254', + 'csisolatin5', + 'iso-8859-9', + 'iso-ir-148', + 'iso8859-9', + 'iso88599', + 'iso_8859-9', + 'iso_8859-9:1989', + 'l5', + 'latin5', + 'x-cp1254' + ], + 'windows-1255': [ + 'cp1255', + 'x-cp1255' + ], + 'windows-1256': [ + 'cp1256', + 'x-cp1256' + ], + 'windows-1257': [ + 'cp1257', + 'x-cp1257' + ], + 'windows-1258': [ + 'cp1258', + 'x-cp1258' + ], + 'x-mac-cyrillic': [ + 'x-mac-ukrainian' + ], + 'gbk': [ + 'chinese', + 'csgb2312', + 'csiso58gb231280', + 'gb2312', + 'gb_2312', + 'gb_2312-80', + 'iso-ir-58', + 'x-gbk' + ], + 'gb18030': [ ], + 'big5': [ + 'big5-hkscs', + 'cn-big5', + 'csbig5', + 'x-x-big5' + ], + 'euc-jp': [ + 'cseucpkdfmtjapanese', + 'x-euc-jp' + ], + 'iso-2022-jp': [ + 'csiso2022jp' + ], + 'shift_jis': [ + 'csshiftjis', + 'ms932', + 'ms_kanji', + 'shift-jis', + 'sjis', + 'windows-31j', + 'x-sjis' + ], + 'euc-kr': [ + ' euc-kr \t', + 'EUC-kr \n', + 'cseuckr', + 
'csksc56011987', + 'iso-ir-149', + 'korean', + 'ks_c_5601-1987', + 'ks_c_5601-1989', + 'ksc5601', + 'ksc_5601', + 'windows-949' + ] + }; + Object.entries(mappings).forEach((i) => { + const enc = i[0]; + const labels = i[1]; + assert.strictEqual(getEncodingFromLabel(enc), enc); + labels.forEach((l) => assert.strictEqual(getEncodingFromLabel(l), enc)); + }); + + assert.strictEqual(getEncodingFromLabel('made-up'), undefined); +} diff --git a/tools/icu/icu-generic.gyp b/tools/icu/icu-generic.gyp index 4c2125a0435b02..93d7cd5f6d9f39 100644 --- a/tools/icu/icu-generic.gyp +++ b/tools/icu/icu-generic.gyp @@ -30,15 +30,6 @@ 'type': 'none', 'toolsets': [ 'host', 'target' ], 'direct_dependent_settings': { - 'conditions': [ - [ 'icu_endianness == "l"', { - 'defines': [ - # ICU cannot swap the initial data without this. - # http://bugs.icu-project.org/trac/ticket/11046 - 'UCONFIG_NO_LEGACY_CONVERSION=1' - ], - }], - ], 'defines': [ 'UCONFIG_NO_SERVICE=1', 'UCONFIG_NO_REGULAR_EXPRESSIONS=1',