Skip to content

Commit

Permalink
string_decoder: fix bad utf8 character handling
Browse files Browse the repository at this point in the history
This commit fixes an issue when extra utf8 continuation bytes appear
at the end of a chunk of data, causing miscalculations to be made
when checking how many bytes are needed to decode a complete
character.

Fixes: #7308
PR-URL: #7310
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
  • Loading branch information
mscdex committed Jun 24, 2016
1 parent 5267f29 commit 79ef3b6
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 15 deletions.
75 changes: 61 additions & 14 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ function StringDecoder(encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
// fall through
nb = 4;
break;
case 'utf8':
this.fillLast = utf8FillLast;
nb = 4;
break;
case 'base64':
Expand Down Expand Up @@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End;
// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;

// Attempts to complete a partial character using bytes from a Buffer
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
Expand All @@ -92,38 +94,83 @@ function utf8CheckByte(byte) {
return -1;
}

// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
// character, returning the total number of bytes needed to complete the partial
// character (if applicable).
// Checks at most 3 bytes at the end of a Buffer in order to detect an
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
// needed to complete the UTF-8 character (if applicable) are returned.
function utf8CheckIncomplete(self, buf, i) {
var j = buf.length - 1;
if (j < i)
return 0;
var nb = utf8CheckByte(buf[j--]);
var nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
self.lastNeed = nb - 1;
return nb;
}
if (j < i)
if (--j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
self.lastNeed = nb - 2;
return nb;
}
if (j < i)
if (--j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
if (nb > 0) {
if (nb === 2)
nb = 0;
else
self.lastNeed = nb - 3;
}
return nb;
}
return 0;
}

// Validates as many continuation bytes for a multi-byte UTF-8 character as
// needed or are available. If we see a non-continuation byte where we expect
// one, we "replace" the validated continuation bytes we've seen so far with
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
// behavior. The continuation byte check is included three times in the case
// where all of the continuation bytes for a character exist in the same buffer.
// It is also done this way as a slight performance increase instead of using a
// loop.
function utf8CheckExtraBytes(self, buf, p) {
if ((buf[0] & 0xC0) !== 0x80) {
self.lastNeed = 0;
return '\ufffd'.repeat(p);
}
if (self.lastNeed > 1 && buf.length > 1) {
if ((buf[1] & 0xC0) !== 0x80) {
self.lastNeed = 1;
return '\ufffd'.repeat(p + 1);
}
if (self.lastNeed > 2 && buf.length > 2) {
if ((buf[2] & 0xC0) !== 0x80) {
self.lastNeed = 2;
return '\ufffd'.repeat(p + 2);
}
}
}
}

// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
function utf8FillLast(buf) {
const p = this.lastTotal - this.lastNeed;
var r = utf8CheckExtraBytes(this, buf, p);
if (r !== undefined)
return r;
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, p, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
}
buf.copy(this.lastChar, p, 0, buf.length);
this.lastNeed -= buf.length;
}

// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.
Expand Down
2 changes: 1 addition & 1 deletion test/parallel/test-string-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')),
assert.strictEqual(decoder.end(), '');

decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd');
assert.strictEqual(decoder.end(), '\ufffd');


Expand Down

0 comments on commit 79ef3b6

Please sign in to comment.