nodejs · mscdex · Jun 24, 2016 · Jun 15, 2016 · Jun 16, 2016 · indutny
diff --git a/lib/string_decoder.js b/lib/string_decoder.js
@@ -25,8 +25,10 @@ function StringDecoder(encoding) {
     case 'utf16le':
       this.text = utf16Text;
       this.end = utf16End;
-      // fall through
+      nb = 4;
+      break;
     case 'utf8':
+      this.fillLast = utf8FillLast;
       nb = 4;
       break;
     case 'base64':
@@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End;
 // Returns only complete characters in a Buffer
 StringDecoder.prototype.text = utf8Text;
 
-// Attempts to complete a partial character using bytes from a Buffer
+// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
 StringDecoder.prototype.fillLast = function(buf) {
   if (this.lastNeed <= buf.length) {
     buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
@@ -92,38 +94,83 @@ function utf8CheckByte(byte) {
   return -1;
 }
 
-// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
-// character, returning the total number of bytes needed to complete the partial
-// character (if applicable).
+// Checks at most 3 bytes at the end of a Buffer in order to detect an
+// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
+// needed to complete the UTF-8 character (if applicable) are returned.
 function utf8CheckIncomplete(self, buf, i) {
   var j = buf.length - 1;
   if (j < i)
     return 0;
-  var nb = utf8CheckByte(buf[j--]);
+  var nb = utf8CheckByte(buf[j]);
   if (nb >= 0) {
     if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
+      self.lastNeed = nb - 1;
     return nb;
   }
-  if (j < i)
+  if (--j < i)
     return 0;
-  nb = utf8CheckByte(buf[j--]);
+  nb = utf8CheckByte(buf[j]);
   if (nb >= 0) {
     if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
+      self.lastNeed = nb - 2;
     return nb;
   }
-  if (j < i)
+  if (--j < i)
     return 0;
-  nb = utf8CheckByte(buf[j--]);
+  nb = utf8CheckByte(buf[j]);
   if (nb >= 0) {
-    if (nb > 0)
-      self.lastNeed = nb + 1 - (buf.length - j);
+    if (nb > 0) {
+      if (nb === 2)
+        nb = 0;
+      else
+        self.lastNeed = nb - 3;
+    }
     return nb;
   }
   return 0;
 }
 
+// Validates as many continuation bytes for a multi-byte UTF-8 character as
+// needed or are available. If we see a non-continuation byte where we expect
+// one, we "replace" the validated continuation bytes we've seen so far with
+// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
+// behavior. The continuation byte check is included three times in the case
+// where all of the continuation bytes for a character exist in the same buffer.
+// It is also done this way as a slight performance increase instead of using a
+// loop.
+function utf8CheckExtraBytes(self, buf, p) {
+  if ((buf[0] & 0xC0) !== 0x80) {
+    self.lastNeed = 0;
+    return '\ufffd'.repeat(p);
+  }
+  if (self.lastNeed > 1 && buf.length > 1) {
+    if ((buf[1] & 0xC0) !== 0x80) {
+      self.lastNeed = 1;
+      return '\ufffd'.repeat(p + 1);
+    }
+    if (self.lastNeed > 2 && buf.length > 2) {
+      if ((buf[2] & 0xC0) !== 0x80) {
+        self.lastNeed = 2;
+        return '\ufffd'.repeat(p + 2);
+      }
+    }
+  }
+}
+
+// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
+function utf8FillLast(buf) {
+  const p = this.lastTotal - this.lastNeed;
+  var r = utf8CheckExtraBytes(this, buf, p);
+  if (r !== undefined)
+    return r;
+  if (this.lastNeed <= buf.length) {
+    buf.copy(this.lastChar, p, 0, this.lastNeed);
+    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
+  }
+  buf.copy(this.lastChar, p, 0, buf.length);
+  this.lastNeed -= buf.length;
+}
+
 // Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
 // partial character, the character's bytes are buffered until the required
 // number of bytes are available.

diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
@@ -28,6 +28,30 @@ test(
   '\u02e4\u0064\u12e4\u0030\u3045'
 );
 
+// Some invalid input, known to have caused trouble with chunking
+// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
+// 00: |00000000 ASCII
+// 41: |01000001 ASCII
+// B8: 10|111000 continuation
+// CC: 110|01100 two-byte head
+// E2: 1110|0010 three-byte head
+// F0: 11110|000 four-byte head
+// F1: 11110|001'another four-byte head
+// FB: 111110|11 "five-byte head", not UTF-8
+test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
+test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
+test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
+test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
+test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
+test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
+test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
+test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
+test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
+test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
+     '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
+test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+
 // UCS-2
 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
 
@@ -55,7 +79,12 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')),
 assert.strictEqual(decoder.end(), '');
 
 decoder = new StringDecoder('utf8');
-assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
+assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd');
+assert.strictEqual(decoder.end(), '\ufffd');
+
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('F1', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('41F2', 'hex')), '\ufffdA');
 assert.strictEqual(decoder.end(), '\ufffd');
 
 
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
     sequence.forEach(function(write) {
       output += decoder.write(input.slice(write[0], write[1]));
     });
+    output += decoder.end();
     process.stdout.write('.');
     if (output !== expected) {
       var message =