Skip to content

Commit

Permalink
string_decoder: align UTF-8 handling with V8
Browse files Browse the repository at this point in the history
V8 5.5 changed how invalid characters are handled and it now appears
to follow the WHATWG Encoding standard, where all of an invalid
character's bytes are replaced by a single replacement character
(\ufffd) instead of replacing each invalid byte with separate
replacement characters.

Example: the byte sequence 0xF0,0xB8,0x41 is decoded as '\ufffdA' in
V8 5.5, but is decoded as '\ufffd\ufffdA' in previous versions of V8.

PR-URL: nodejs#9618
Reviewed-By: Ali Ijaz Sheikh <ofrobots@google.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
  • Loading branch information
mscdex authored and targos committed Jan 26, 2017
1 parent 007386e commit 24ef1e6
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 28 deletions.
22 changes: 11 additions & 11 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ StringDecoder.prototype.fillLast = function(buf) {
};

// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
// continuation byte.
// continuation byte. If an invalid byte is detected, -2 is returned.
function utf8CheckByte(byte) {
if (byte <= 0x7F)
return 0;
Expand All @@ -91,7 +91,7 @@ function utf8CheckByte(byte) {
return 3;
else if (byte >> 3 === 0x1E)
return 4;
return -1;
return (byte >> 6 === 0x02 ? -1 : -2);
}

// Checks at most 3 bytes at the end of a Buffer in order to detect an
Expand All @@ -107,15 +107,15 @@ function utf8CheckIncomplete(self, buf, i) {
self.lastNeed = nb - 1;
return nb;
}
if (--j < i)
if (--j < i || nb === -2)
return 0;
nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb - 2;
return nb;
}
if (--j < i)
if (--j < i || nb === -2)
return 0;
nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
Expand All @@ -133,25 +133,25 @@ function utf8CheckIncomplete(self, buf, i) {
// Validates as many continuation bytes for a multi-byte UTF-8 character as
// needed or are available. If we see a non-continuation byte where we expect
// one, we "replace" the validated continuation bytes we've seen so far with
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
// behavior. The continuation byte check is included three times in the case
// where all of the continuation bytes for a character exist in the same buffer.
// It is also done this way as a slight performance increase instead of using a
// loop.
function utf8CheckExtraBytes(self, buf, p) {
if ((buf[0] & 0xC0) !== 0x80) {
self.lastNeed = 0;
return '\ufffd'.repeat(p);
return '\ufffd';
}
if (self.lastNeed > 1 && buf.length > 1) {
if ((buf[1] & 0xC0) !== 0x80) {
self.lastNeed = 1;
return '\ufffd'.repeat(p + 1);
return '\ufffd';
}
if (self.lastNeed > 2 && buf.length > 2) {
if ((buf[2] & 0xC0) !== 0x80) {
self.lastNeed = 2;
return '\ufffd'.repeat(p + 2);
return '\ufffd';
}
}
}
Expand Down Expand Up @@ -184,12 +184,12 @@ function utf8Text(buf, i) {
return buf.toString('utf8', i, end);
}

// For UTF-8, a replacement character for each buffered byte of a (partial)
// character needs to be added to the output.
// For UTF-8, a replacement character is added when ending on a partial
// character.
function utf8End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
return r + '\ufffd';
return r;
}

Expand Down
7 changes: 0 additions & 7 deletions test/parallel/test-string-decoder-end.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,13 @@ for (let i = 1; i <= 16; i++) {

encodings.forEach(testEncoding);

console.log('ok');

function testEncoding(encoding) {
bufs.forEach((buf) => {
testBuf(encoding, buf);
});
}

function testBuf(encoding, buf) {
console.error('# %s', encoding, buf);

// write one byte at a time.
let s = new SD(encoding);
let res1 = '';
Expand All @@ -46,9 +42,6 @@ function testBuf(encoding, buf) {
// .toString() on the buffer
const res3 = buf.toString(encoding);

console.log('expect=%j', res3);
console.log('res1=%j', res1);
console.log('res2=%j', res2);
assert.strictEqual(res1, res3, 'one byte at a time should match toString');
assert.strictEqual(res2, res3, 'all bytes at once should match toString');
}
15 changes: 5 additions & 10 deletions test/parallel/test-string-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ const StringDecoder = require('string_decoder').StringDecoder;
let decoder = new StringDecoder();
assert.strictEqual(decoder.encoding, 'utf8');

process.stdout.write('scanning ');

// UTF-8
test('utf-8', Buffer.from('$', 'utf-8'), '$');
test('utf-8', Buffer.from('¢', 'utf-8'), '¢');
Expand Down Expand Up @@ -42,32 +40,30 @@ test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA');
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
// CESU-8 of U+1D40D
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), '\ufffd\ufffd');

// UCS-2
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');

// UTF-16LE
test('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up

console.log(' crayon!');

// Additional UTF-8 tests
decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('E1', 'hex')), '');
assert.strictEqual(decoder.end(), '\ufffd');

decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('E18B', 'hex')), '');
assert.strictEqual(decoder.end(), '\ufffd\ufffd');
assert.strictEqual(decoder.end(), '\ufffd');

decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('\ufffd')), '\ufffd');
Expand Down Expand Up @@ -131,7 +127,6 @@ function test(encoding, input, expected, singleSequence) {
output += decoder.write(input.slice(write[0], write[1]));
});
output += decoder.end();
process.stdout.write('.');
if (output !== expected) {
const message =
'Expected "' + unicodeEscape(expected) + '", ' +
Expand Down

0 comments on commit 24ef1e6

Please sign in to comment.