Skip to content

Commit 4e99bf6

Browse files
mscdex authored and Fishrock123 committed
string_decoder: fix bad utf8 character handling
This commit fixes an issue when extra utf8 continuation bytes appear at the end of a chunk of data, causing miscalculations to be made when checking how many bytes are needed to decode a complete character.

Fixes: #7308
PR-URL: #7310
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
1 parent 4000e0e commit 4e99bf6

File tree

2 files changed

+62
-15
lines changed

2 files changed

+62
-15
lines changed

lib/string_decoder.js

Lines changed: 61 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,10 @@ function StringDecoder(encoding) {
4545
case 'utf16le':
4646
this.text = utf16Text;
4747
this.end = utf16End;
48-
// fall through
48+
nb = 4;
49+
break;
4950
case 'utf8':
51+
this.fillLast = utf8FillLast;
5052
nb = 4;
5153
break;
5254
case 'base64':
@@ -88,7 +90,7 @@ StringDecoder.prototype.end = utf8End;
8890
// Returns only complete characters in a Buffer
8991
StringDecoder.prototype.text = utf8Text;
9092

91-
// Attempts to complete a partial character using bytes from a Buffer
93+
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
9294
StringDecoder.prototype.fillLast = function(buf) {
9395
if (this.lastNeed <= buf.length) {
9496
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
@@ -112,38 +114,83 @@ function utf8CheckByte(byte) {
112114
return -1;
113115
}
114116

115-
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
116-
// character, returning the total number of bytes needed to complete the partial
117-
// character (if applicable).
117+
// Checks at most 3 bytes at the end of a Buffer in order to detect an
118+
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
119+
// needed to complete the UTF-8 character (if applicable) is returned.
118120
function utf8CheckIncomplete(self, buf, i) {
119121
var j = buf.length - 1;
120122
if (j < i)
121123
return 0;
122-
var nb = utf8CheckByte(buf[j--]);
124+
var nb = utf8CheckByte(buf[j]);
123125
if (nb >= 0) {
124126
if (nb > 0)
125-
self.lastNeed = nb + 1 - (buf.length - j);
127+
self.lastNeed = nb - 1;
126128
return nb;
127129
}
128-
if (j < i)
130+
if (--j < i)
129131
return 0;
130-
nb = utf8CheckByte(buf[j--]);
132+
nb = utf8CheckByte(buf[j]);
131133
if (nb >= 0) {
132134
if (nb > 0)
133-
self.lastNeed = nb + 1 - (buf.length - j);
135+
self.lastNeed = nb - 2;
134136
return nb;
135137
}
136-
if (j < i)
138+
if (--j < i)
137139
return 0;
138-
nb = utf8CheckByte(buf[j--]);
140+
nb = utf8CheckByte(buf[j]);
139141
if (nb >= 0) {
140-
if (nb > 0)
141-
self.lastNeed = nb + 1 - (buf.length - j);
142+
if (nb > 0) {
143+
if (nb === 2)
144+
nb = 0;
145+
else
146+
self.lastNeed = nb - 3;
147+
}
142148
return nb;
143149
}
144150
return 0;
145151
}
146152

153+
// Validates as many continuation bytes for a multi-byte UTF-8 character as
154+
// needed or are available. If we see a non-continuation byte where we expect
155+
// one, we "replace" the validated continuation bytes we've seen so far with
156+
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
157+
// behavior. The continuation byte check is included three times in the case
158+
// where all of the continuation bytes for a character exist in the same buffer.
159+
// It is also done this way as a slight performance increase instead of using a
160+
// loop.
161+
function utf8CheckExtraBytes(self, buf, p) {
162+
if ((buf[0] & 0xC0) !== 0x80) {
163+
self.lastNeed = 0;
164+
return '\ufffd'.repeat(p);
165+
}
166+
if (self.lastNeed > 1 && buf.length > 1) {
167+
if ((buf[1] & 0xC0) !== 0x80) {
168+
self.lastNeed = 1;
169+
return '\ufffd'.repeat(p + 1);
170+
}
171+
if (self.lastNeed > 2 && buf.length > 2) {
172+
if ((buf[2] & 0xC0) !== 0x80) {
173+
self.lastNeed = 2;
174+
return '\ufffd'.repeat(p + 2);
175+
}
176+
}
177+
}
178+
}
179+
180+
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
181+
function utf8FillLast(buf) {
182+
const p = this.lastTotal - this.lastNeed;
183+
var r = utf8CheckExtraBytes(this, buf, p);
184+
if (r !== undefined)
185+
return r;
186+
if (this.lastNeed <= buf.length) {
187+
buf.copy(this.lastChar, p, 0, this.lastNeed);
188+
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
189+
}
190+
buf.copy(this.lastChar, p, 0, buf.length);
191+
this.lastNeed -= buf.length;
192+
}
193+
147194
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
148195
// partial character, the character's bytes are buffered until the required
149196
// number of bytes are available.

test/parallel/test-string-decoder.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')),
5555
assert.strictEqual(decoder.end(), '');
5656

5757
decoder = new StringDecoder('utf8');
58-
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
58+
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd');
5959
assert.strictEqual(decoder.end(), '\ufffd');
6060

6161

0 commit comments

Comments
 (0)