Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 43 additions & 17 deletions encoding-browser.browser.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
import {
fromSource,
getBOMEncoding,
normalizeEncoding,
E_ENCODING,
} from './fallback/encoding.api.js'
import labels from './fallback/encoding.labels.js'
import { getBOMEncoding } from './fallback/encoding.api.js'

// Lightweight version which re-exports existing implementations on browsers,
// while still being aliased to the full impl in RN and Node.js
Expand All @@ -13,17 +7,49 @@ import labels from './fallback/encoding.labels.js'

const { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream } = globalThis

export { normalizeEncoding, getBOMEncoding, labelToName } from './fallback/encoding.api.js'
export { getBOMEncoding } from './fallback/encoding.api.js'
export { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream }

// https://encoding.spec.whatwg.org/#decode
/**
 * Resolve an encoding label to its lowercased encoding name, using the
 * platform TextDecoder for the label lookup.
 *
 * @param {*} label - Encoding label; non-string values are stringified.
 * @returns {string|null} Lowercased encoding name, or null when the label is not recognized.
 */
export function normalizeEncoding(label) {
  // Fast path: the most common labels, compared exactly (no trimming or case folding)
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }

  // Anything outside ASCII word characters, ASCII whitespace and ` .:-` is rejected
  const hasForbiddenChar = /[^\w\t\n\f\r .:-]/i.test(label)
  if (hasForbiddenChar) return null

  const lowered = `${label}`.trim().toLowerCase()
  try {
    return new TextDecoder(lowered).encoding
  } catch {
    // TextDecoder rejected the label: fall through to the names handled below
  }

  if (lowered === 'x-user-defined') return lowered

  // Labels that all resolve to the "replacement" encoding
  const replacementLabels = [
    'replacement',
    'csiso2022kr',
    'hz-gb-2312',
    'iso-2022-cn',
    'iso-2022-cn-ext',
    'iso-2022-kr',
  ]
  return replacementLabels.includes(lowered) ? 'replacement' : null
}

// Decodes `input` to a string. A BOM detected by getBOMEncoding takes priority over
// `fallbackEncoding` (default 'utf-8'), which is otherwise resolved through normalizeEncoding.
// The 'replacement' encoding yields a single U+FFFD for non-empty input and '' for empty input.
// NOTE(review): this span appears to interleave two revisions of the body (`enc` is declared
// twice and statements follow an unconditional `return`) — confirm which lines are current.
export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
let u8 = fromSource(input)
const bomEncoding = getBOMEncoding(u8)
if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
if (enc === 'utf-8') return new TextDecoder('utf-8', { ignoreBOM: true }).decode(u8) // fast path
if (enc === 'replacement') return u8.byteLength > 0 ? '\uFFFD' : ''
if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
return new TextDecoder(enc, { ignoreBOM: true }).decode(u8)
const enc = getBOMEncoding(input) ?? normalizeEncoding(fallbackEncoding)
if (enc === 'replacement') return input.byteLength > 0 ? '\uFFFD' : ''
return new TextDecoder(enc).decode(input)
}

/**
 * Map an encoding label to its case-sensitive encoding name.
 *
 * @param {*} label - Encoding label, resolved through normalizeEncoding.
 * @returns {string|null} The encoding name, or the falsy result of normalizeEncoding when the label is unknown.
 */
export function labelToName(label) {
  const name = normalizeEncoding(label)
  if (!name) return name

  // Names with irregular capitalization
  switch (name) {
    case 'utf-8':
      return 'UTF-8'
    case 'big5':
      return 'Big5'
    case 'shift_jis':
      return 'Shift_JIS'
  }

  // Families whose names are fully uppercased; everything else stays lowercase
  const upperFamilies = ['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk']
  return upperFamilies.includes(name.slice(0, 3)) ? name.toUpperCase() : name
}
43 changes: 0 additions & 43 deletions fallback/encoding.api.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,3 @@
import labels from './encoding.labels.js'

let labelsMap

export const E_ENCODING = 'Unknown encoding'

// Resolves an encoding label to its encoding name, or null when the label is unknown.
// Warning: unlike whatwg-encoding, the result is lowercased. Labels are case-insensitive
// and this is how the TextDecoder `encoding` getter normalizes them
// https://encoding.spec.whatwg.org/#names-and-labels
export function normalizeEncoding(label) {
  // Fast path: the most common labels, compared exactly
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }

  // Full lookup: the label must be ASCII (optionally with ASCII whitespace)
  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null
  const key = `${label}`.trim().toLowerCase()
  if (Object.hasOwn(labels, key)) return key

  // The alias -> name index is built lazily, the first time a non-canonical label is seen
  if (!labelsMap) {
    labelsMap = new Map()
    for (const name of Object.keys(labels)) {
      for (const alias of labels[name]) labelsMap.set(alias, name)
    }
  }

  const found = labelsMap.get(key)
  return found ? found : null
}

// TODO: make this more strict against Symbol.toStringTag
// Is not very significant though, anything faking Symbol.toStringTag could as well override
// prototypes, which is not something we protect against
Expand Down Expand Up @@ -65,17 +36,3 @@ export function getBOMEncoding(input) {
if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be'
return null
}

// Three-letter prefixes of the encoding families whose names are written in uppercase
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])

// Resolves a label to the encoding name in its case-sensitive form.
// Unlike normalizeEncoding, the result is not lowercased
// https://encoding.spec.whatwg.org/#names-and-labels
export function labelToName(label) {
  const name = normalizeEncoding(label)
  if (!name) return name // unknown label: pass the falsy result through

  // Names with irregular capitalization ('utf-8' is here as a fast path)
  switch (name) {
    case 'utf-8':
      return 'UTF-8'
    case 'big5':
      return 'Big5'
    case 'shift_jis':
      return 'Shift_JIS'
  }

  return uppercasePrefixes.has(name.slice(0, 3)) ? name.toUpperCase() : name
}
43 changes: 41 additions & 2 deletions fallback/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,56 @@ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
import labels from './encoding.labels.js'
import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
import { fromSource, getBOMEncoding } from './encoding.api.js'
import { unfinishedBytes, mergePrefix } from './encoding.util.js'

export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
export { getBOMEncoding } from './encoding.api.js'

export const E_ENCODING = 'Unknown encoding'
const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
const E_OPTIONS = 'The "options" argument must be of type object'
const replacementChar = '\uFFFD'
const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
let createMultibyteDecoder, multibyteEncoder

// Lazily built index of alias -> encoding name, filled on first use by normalizeEncoding
let labelsMap

// Resolves an encoding label to its encoding name, or null when the label is unknown.
// Warning: unlike whatwg-encoding, the result is lowercased. Labels are case-insensitive
// and this is how the TextDecoder `encoding` getter normalizes them
// https://encoding.spec.whatwg.org/#names-and-labels
export function normalizeEncoding(label) {
  // Fast path: the most common labels, compared exactly
  const isUtf8 = label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8'
  if (isUtf8) return 'utf-8'
  const isLatin = label === 'windows-1252' || label === 'ascii' || label === 'latin1'
  if (isLatin) return 'windows-1252'

  // Full lookup: the label must be ASCII (optionally with ASCII whitespace)
  const hasForbiddenChar = /[^\w\t\n\f\r .:-]/i.test(label)
  if (hasForbiddenChar) return null

  const key = `${label}`.trim().toLowerCase()
  if (Object.hasOwn(labels, key)) return key // already an encoding name

  if (!labelsMap) {
    labelsMap = new Map()
    for (const encodingName of Object.keys(labels)) {
      for (const alias of labels[encodingName]) labelsMap.set(alias, encodingName)
    }
  }

  const found = labelsMap.get(key)
  return found ? found : null
}

// Three-letter prefixes of the encoding families whose names are written in uppercase
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])

// Resolves a label to the encoding name in its case-sensitive form.
// Unlike normalizeEncoding, the result is not lowercased
// https://encoding.spec.whatwg.org/#names-and-labels
export function labelToName(label) {
  const name = normalizeEncoding(label)
  if (!name) return name // unknown label: pass the falsy result through
  if (name === 'utf-8') return 'UTF-8' // fast path
  if (name === 'big5') return 'Big5'
  if (name === 'shift_jis') return 'Shift_JIS'

  const family = name.slice(0, 3)
  return uppercasePrefixes.has(family) ? name.toUpperCase() : name
}

// True when `enc` is one of the encoding names listed in `multibyteSet`
export const isMultibyte = (enc) => multibyteSet.has(enc)
export function setMultibyte(createDecoder, createEncoder) {
createMultibyteDecoder = createDecoder
Expand Down
154 changes: 154 additions & 0 deletions tests/encoding/browser.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import {
TextDecoder,
TextEncoder,
getBOMEncoding,
legacyHookDecode,
} from '@exodus/bytes/encoding-browser.js'
import { fromHex } from '@exodus/bytes/hex.js'
import { test, describe } from 'node:test'
import unfinishedBytesFixtures from './fixtures/unfinishedBytes.js'

// Streaming decode followed by a flush must split the output exactly where the
// last `trail` bytes of each fixture begin, and the decoder must be reusable afterwards
test('Unfinished bytes', (t) => {
  for (const [encoding, trail, u8] of unfinishedBytesFixtures) {
    const decodeOnce = (bytes) => new TextDecoder(encoding).decode(bytes)
    const splitAt = u8.length - trail

    const streaming = new TextDecoder(encoding)
    const head = streaming.decode(u8, { stream: true })
    const tail = streaming.decode() // flush whatever was held back
    const whole = decodeOnce(u8)
    const expectedHead = decodeOnce(u8.subarray(0, splitAt))
    const expectedTail = decodeOnce(u8.subarray(splitAt))

    t.assert.strictEqual(head, expectedHead)
    t.assert.strictEqual(tail, expectedTail)
    t.assert.strictEqual(head + tail, whole)
    t.assert.strictEqual(streaming.decode(u8), whole) // reuse

    // Nothing held back: everything comes out of the streaming call
    if (trail === 0) {
      t.assert.strictEqual(head, whole)
      t.assert.strictEqual(tail, '')
    }

    // Everything held back: everything comes out of the flush
    if (trail === u8.length) {
      t.assert.strictEqual(head, '')
      t.assert.strictEqual(tail, whole)
    }
  }
})

// Non-string arguments are stringified by encode() / encodeInto();
// encode(undefined) is the exception and produces empty output
test('String coercion', (t) => {
  const encoder = new TextEncoder()
  const cases = [
    [{}, '[object Object]'],
    [null, 'null'],
    [undefined, 'undefined'],
  ]

  for (const [arg, string] of cases) {
    const { length } = string
    const fromString = encoder.encode(string)
    t.assert.strictEqual(fromString.length, length)

    const fromArg = encoder.encode(arg)
    if (arg === undefined) {
      // undefined is special
      t.assert.strictEqual(fromArg.length, 0)
      t.assert.deepStrictEqual(fromArg, Uint8Array.of())
    } else {
      t.assert.strictEqual(fromArg.length, length)
      t.assert.deepStrictEqual(fromArg, fromString)
    }

    const target = new Uint8Array(20)
    const progress = encoder.encodeInto(arg, target)
    t.assert.deepStrictEqual(progress, { read: length, written: length })
    t.assert.deepStrictEqual(target.subarray(0, length), fromString)
  }
})

// Bytes below 0x80 decode to themselves, bytes 0x80..0xFF decode to U+F780..U+F7FF
// https://encoding.spec.whatwg.org/#x-user-defined-decoder
test('x-user-defined encoding', (t) => {
  const decoder = new TextDecoder('x-user-defined')
  for (let byte = 0; byte < 256; byte++) {
    const expected = String.fromCodePoint(byte < 0x80 ? byte : 0xf7_00 + byte)
    const decoded = decoder.decode(Uint8Array.of(byte))
    t.assert.strictEqual(decoded, expected)
  }
})

// iso-8859-1, iso-8859-9, iso-8859-11 differ in WHATWG Encoding spec from https://unicode.org/Public/MAPPINGS/ISO8859
// and map to windows-1252, windows-1254, windows-874 instead
test('not all ISO-8859 encodings are present in TextDecoder', (t) => {
  // [label, expected `encoding` value]; null means the constructor must throw
  const cases = [
    ['iso-8859-1', 'windows-1252'],
    ['iso-8859-2', 'iso-8859-2'], // present
    ['iso-8859-9', 'windows-1254'],
    ['iso-8859-11', 'windows-874'],
    ['iso-8859-12', null],
    ['iso-8859-13', 'iso-8859-13'], // present
  ]

  for (const [label, expected] of cases) {
    if (expected === null) {
      t.assert.throws(() => new TextDecoder(label))
    } else {
      t.assert.strictEqual(new TextDecoder(label).encoding, expected)
    }
  }
})

describe('legacyHookDecode', () => {
  // Fallback encoding label -> list of [input bytes as hex, expected decoded string]
  const fixtures = {
    replacement: [
      ['', ''],
      ['00', '\uFFFD'],
      ['ff', '\uFFFD'],
      ['20', '\uFFFD'],
      ['2020', '\uFFFD'],
      // a BOM wins over the fallback encoding
      ['efbbbf', ''],
      ['efbbbf2a', '*'],
      ['efbbbf202a', ' *'],
      ['fffe', ''],
      ['fffe2a20', '\u202A'],
      ['fffe2a', '\uFFFD'],
      ['fffe00d72a', '\uD700\uFFFD'],
      ['fffe00d82a', '\uFFFD'],
      ['fffe00dc2a', '\uFFFD\uFFFD'],
      ['feff', ''],
      ['feff202a', '\u202A'],
      ['feff20', '\uFFFD'],
      ['feffd70020', '\uD700\uFFFD'],
      ['feffd80020', '\uFFFD'],
      ['feffdc0020', '\uFFFD\uFFFD'],
    ],
    // non-normalized names
    Utf8: [['c280', '\x80']],
    unicodefeff: [['c280', '\u80C2']],
    UnicodeFFFE: [['c280', '\uC280']],
  }

  test('null encoding', (t) => {
    t.assert.throws(() => legacyHookDecode(Uint8Array.of(), null), RangeError)
  })

  // One test per fallback encoding label
  Object.keys(fixtures).forEach((encoding) => {
    test(encoding, (t) => {
      fixtures[encoding].forEach(([hex, expected]) => {
        const decoded = legacyHookDecode(fromHex(hex), encoding)
        t.assert.strictEqual(decoded, expected, `${hex}`)
      })
    })
  })
})

// getBOMEncoding reports the encoding of a leading BOM, or null when there is none
test('getBOMEncoding', (t) => {
  // [expected encoding, input bytes as hex]
  const fixtures = [
    [null, ''],
    [null, 'ff'],
    [null, 'fe'],
    [null, 'ef'],
    [null, 'efbb'],
    [null, 'efbb00'],
    [null, 'efbfbb'],
    [null, 'ffbbbf'],
    ['utf-8', 'efbbbf'],
    ['utf-8', 'efbbbf00'],
    ['utf-16le', 'fffe'],
    ['utf-16le', 'fffefffe'],
    ['utf-16le', 'fffefffefffe'],
    ['utf-16le', 'fffebb'],
    ['utf-16le', 'fffebf'],
    ['utf-16be', 'feff'],
    ['utf-16be', 'fefffeff'],
    ['utf-16be', 'fefffefffeff'],
  ]

  fixtures.forEach(([enc, hex]) => {
    const detected = getBOMEncoding(fromHex(hex))
    t.assert.strictEqual(detected, enc, `${hex} -> ${enc}`)
  })
})
1 change: 0 additions & 1 deletion tests/encoding/generic.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ test('String coercion', (t) => {
t.assert.strictEqual(b.length, 0)
t.assert.deepStrictEqual(b, Uint8Array.of())
} else {
const b = encoder.encode(arg)
t.assert.strictEqual(b.length, length)
t.assert.deepStrictEqual(b, a)
}
Expand Down
2 changes: 1 addition & 1 deletion tests/vendor/whatwg-encoding/whatwg-encoding-mock.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import * as api from '@exodus/bytes/encoding.js'
import * as api from '@exodus/bytes/encoding-browser.js'

// prettier-ignore
const supported = new Set([
Expand Down
8 changes: 6 additions & 2 deletions whatwg.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { utf8fromStringLoose } from '@exodus/bytes/utf8.js'
import { createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
import { isMultibyte, getMultibyteEncoder } from './fallback/encoding.js'
import { normalizeEncoding, E_ENCODING } from './fallback/encoding.api.js'
import {
isMultibyte,
getMultibyteEncoder,
normalizeEncoding,
E_ENCODING,
} from './fallback/encoding.js'
import { percentEncoder } from './fallback/percent.js'
import { encodeMap } from './fallback/single-byte.js'
import { E_STRING } from './fallback/_utils.js'
Expand Down
Loading