-
-
Notifications
You must be signed in to change notification settings - Fork 33.5k
string_decoder: fix bad utf8 character handling #7310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,8 +25,10 @@ function StringDecoder(encoding) { | |
case 'utf16le': | ||
this.text = utf16Text; | ||
this.end = utf16End; | ||
// fall through | ||
nb = 4; | ||
break; | ||
case 'utf8': | ||
this.fillLast = utf8FillLast; | ||
nb = 4; | ||
break; | ||
case 'base64': | ||
|
@@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End; | |
// Returns only complete characters in a Buffer | ||
StringDecoder.prototype.text = utf8Text; | ||
|
||
// Attempts to complete a partial character using bytes from a Buffer | ||
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer | ||
StringDecoder.prototype.fillLast = function(buf) { | ||
if (this.lastNeed <= buf.length) { | ||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); | ||
|
@@ -92,38 +94,83 @@ function utf8CheckByte(byte) { | |
return -1; | ||
} | ||
|
||
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8 | ||
// character, returning the total number of bytes needed to complete the partial | ||
// character (if applicable). | ||
// Checks at most 3 bytes at the end of a Buffer in order to detect an | ||
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) | ||
// needed to complete the UTF-8 character (if applicable) are returned. | ||
function utf8CheckIncomplete(self, buf, i) { | ||
var j = buf.length - 1; | ||
if (j < i) | ||
return 0; | ||
var nb = utf8CheckByte(buf[j--]); | ||
var nb = utf8CheckByte(buf[j]); | ||
if (nb >= 0) { | ||
if (nb > 0) | ||
self.lastNeed = nb + 1 - (buf.length - j); | ||
self.lastNeed = nb - 1; | ||
return nb; | ||
} | ||
if (j < i) | ||
if (--j < i) | ||
return 0; | ||
nb = utf8CheckByte(buf[j--]); | ||
nb = utf8CheckByte(buf[j]); | ||
if (nb >= 0) { | ||
if (nb > 0) | ||
self.lastNeed = nb + 1 - (buf.length - j); | ||
self.lastNeed = nb - 2; | ||
return nb; | ||
} | ||
if (j < i) | ||
if (--j < i) | ||
return 0; | ||
nb = utf8CheckByte(buf[j--]); | ||
nb = utf8CheckByte(buf[j]); | ||
if (nb >= 0) { | ||
if (nb > 0) | ||
self.lastNeed = nb + 1 - (buf.length - j); | ||
if (nb > 0) { | ||
if (nb === 2) | ||
nb = 0; | ||
else | ||
self.lastNeed = nb - 3; | ||
} | ||
return nb; | ||
} | ||
return 0; | ||
} | ||
|
||
// Validates as many continuation bytes for a multi-byte UTF-8 character as | ||
// needed or are available. If we see a non-continuation byte where we expect | ||
// one, we "replace" the validated continuation bytes we've seen so far with | ||
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding | ||
// behavior. The continuation byte check is included three times in the case | ||
// where all of the continuation bytes for a character exist in the same buffer. | ||
// It is also done this way as a slight performance increase instead of using a | ||
// loop. | ||
function utf8CheckExtraBytes(self, buf, p) { | ||
if ((buf[0] & 0xC0) !== 0x80) { | ||
self.lastNeed = 0; | ||
return '\ufffd'.repeat(p); | ||
} | ||
if (self.lastNeed > 1 && buf.length > 1) { | ||
if ((buf[1] & 0xC0) !== 0x80) { | ||
self.lastNeed = 1; | ||
return '\ufffd'.repeat(p + 1); | ||
} | ||
if (self.lastNeed > 2 && buf.length > 2) { | ||
if ((buf[2] & 0xC0) !== 0x80) { | ||
self.lastNeed = 2; | ||
return '\ufffd'.repeat(p + 2); | ||
|
||
} | ||
} | ||
} | ||
} | ||
|
||
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. | ||
function utf8FillLast(buf) { | ||
const p = this.lastTotal - this.lastNeed; | ||
var r = utf8CheckExtraBytes(this, buf, p); | ||
if (r !== undefined) | ||
return r; | ||
if (this.lastNeed <= buf.length) { | ||
buf.copy(this.lastChar, p, 0, this.lastNeed); | ||
return this.lastChar.toString(this.encoding, 0, this.lastTotal); | ||
} | ||
buf.copy(this.lastChar, p, 0, buf.length); | ||
this.lastNeed -= buf.length; | ||
} | ||
|
||
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a | ||
// partial character, the character's bytes are buffered until the required | ||
// number of bytes are available. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,30 @@ test( | |
'\u02e4\u0064\u12e4\u0030\u3045' | ||
); | ||
|
||
// Some invalid input, known to have caused trouble with chunking | ||
// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923 | ||
// 00: |00000000 ASCII | ||
// 41: |01000001 ASCII | ||
// B8: 10|111000 continuation | ||
// CC: 110|01100 two-byte head | ||
// E2: 1110|0010 three-byte head | ||
// F0: 11110|000 four-byte head | ||
// F1: 11110|001'another four-byte head | ||
// FB: 111110|11 "five-byte head", not UTF-8 | ||
test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA'); | ||
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd'); | ||
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA'); | ||
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338'); | ||
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA'); | ||
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338'); | ||
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0'); | ||
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38'); | ||
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338'); | ||
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001'); | ||
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D | ||
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd'); | ||
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379'); | ||
|
||
// UCS-2 | ||
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); | ||
|
||
|
@@ -55,7 +79,12 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')), | |
assert.strictEqual(decoder.end(), ''); | ||
|
||
decoder = new StringDecoder('utf8'); | ||
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd'); | ||
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd'); | ||
|
||
assert.strictEqual(decoder.end(), '\ufffd'); | ||
|
||
decoder = new StringDecoder('utf8'); | ||
assert.strictEqual(decoder.write(Buffer.from('F1', 'hex')), ''); | ||
assert.strictEqual(decoder.write(Buffer.from('41F2', 'hex')), '\ufffdA'); | ||
assert.strictEqual(decoder.end(), '\ufffd'); | ||
|
||
|
||
|
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) { | |
sequence.forEach(function(write) { | ||
output += decoder.write(input.slice(write[0], write[1])); | ||
}); | ||
output += decoder.end(); | ||
process.stdout.write('.'); | ||
if (output !== expected) { | ||
var message = | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couldn't it still go negative here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, because if execution has reached here,
nb
would have to be a 2, 3, or 4-byte character indicator. Soself.lastNeed
would range from 0 to 2.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh right,
nb
can't be 1.