@@ -45,8 +45,10 @@ function StringDecoder(encoding) {
4545 case 'utf16le' :
4646 this . text = utf16Text ;
4747 this . end = utf16End ;
48- // fall through
48+ nb = 4 ;
49+ break ;
4950 case 'utf8' :
51+ this . fillLast = utf8FillLast ;
5052 nb = 4 ;
5153 break ;
5254 case 'base64' :
@@ -88,7 +90,7 @@ StringDecoder.prototype.end = utf8End;
8890// Returns only complete characters in a Buffer
8991StringDecoder . prototype . text = utf8Text ;
9092
91- // Attempts to complete a partial character using bytes from a Buffer
93+ // Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
9294StringDecoder . prototype . fillLast = function ( buf ) {
9395 if ( this . lastNeed <= buf . length ) {
9496 buf . copy ( this . lastChar , this . lastTotal - this . lastNeed , 0 , this . lastNeed ) ;
@@ -112,38 +114,83 @@ function utf8CheckByte(byte) {
112114 return - 1 ;
113115}
114116
// Checks at most 3 bytes at the end of a Buffer (never reading before index
// `i`) in order to detect an incomplete multi-byte UTF-8 character. If the
// first byte of such a character is found, `self.lastNeed` is set to how many
// of its bytes are still missing (which can be 0) and the total number of bytes
// (2, 3, or 4) in that character is returned. Otherwise 0 is returned.
//
// NOTE(review): utf8CheckByte is defined elsewhere in this file. This code
// relies on it returning a negative value for a byte that cannot start a
// character, 0 for a single-byte character, and the character's total length
// for a multi-byte lead byte -- confirm against its definition.
function utf8CheckIncomplete(self, buf, i) {
  var j = buf.length - 1;
  if (j < i)
    return 0;

  // Last byte: if it starts a character, only that 1 byte is present.
  var nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0)
      self.lastNeed = nb - 1;
    return nb;
  }

  // Second-to-last byte: 2 bytes of the character are already present.
  if (--j < i)
    return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0)
      self.lastNeed = nb - 2;
    return nb;
  }

  // Third-to-last byte: 3 bytes of the character are already present.
  if (--j < i)
    return 0;
  nb = utf8CheckByte(buf[j]);
  if (nb >= 0) {
    if (nb > 0) {
      // A 2-byte character that started here cannot extend to the end of the
      // buffer, so nothing is reported as pending and lastNeed is untouched.
      if (nb === 2)
        nb = 0;
      else
        self.lastNeed = nb - 3;
    }
    return nb;
  }
  return 0;
}
146152
// Validates as many continuation bytes for a multi-byte UTF-8 character as are
// needed (`self.lastNeed`) or available in `buf`, up to three. A continuation
// byte is one whose top two bits are `10` (`(byte & 0xC0) === 0x80`).
//
// `p` is the number of bytes of the character that were already buffered
// before `buf` arrived.
//
// Returns `undefined` when every inspected byte is a valid continuation byte
// (or when `buf` is empty, since there is nothing to judge yet). If a
// non-continuation byte is found where a continuation byte was expected, the
// bytes accepted so far are "replaced": one UTF-8 replacement character
// ('\ufffd') is returned per accepted byte, to match v8's UTF-8 decoding
// behavior, and `self.lastNeed` is set to the index in `buf` of the offending
// byte (presumably so the caller can resume decoding there -- confirm against
// the caller).
//
// The check is written out three times rather than as a loop as a slight
// performance optimization.
function utf8CheckExtraBytes(self, buf, p) {
  // An empty chunk carries no evidence either way. Without this guard buf[0]
  // is undefined, which fails the continuation test below and would wrongly
  // discard the partially buffered character.
  if (buf.length === 0)
    return undefined;

  if ((buf[0] & 0xC0) !== 0x80) {
    self.lastNeed = 0;
    return '\ufffd'.repeat(p);
  }
  if (self.lastNeed > 1 && buf.length > 1) {
    if ((buf[1] & 0xC0) !== 0x80) {
      self.lastNeed = 1;
      return '\ufffd'.repeat(p + 1);
    }
    if (self.lastNeed > 2 && buf.length > 2) {
      if ((buf[2] & 0xC0) !== 0x80) {
        self.lastNeed = 2;
        return '\ufffd'.repeat(p + 2);
      }
    }
  }
  return undefined;
}
179+
// Attempts to complete a buffered multi-byte UTF-8 character using bytes from
// `buf`. Called with the decoder as `this`.
//
// Returns:
//   - the string produced by utf8CheckExtraBytes() when it rejects the
//     incoming bytes;
//   - the decoded character when `buf` supplies all of the missing bytes;
//   - `undefined` when `buf` is too short, in which case all of `buf` is
//     appended to `this.lastChar` and `this.lastNeed` is reduced accordingly.
function utf8FillLast(buf) {
  // Offset in lastChar where the next incoming byte belongs, i.e. the number
  // of bytes of the character already buffered.
  const p = this.lastTotal - this.lastNeed;
  const r = utf8CheckExtraBytes(this, buf, p);
  if (r !== undefined)
    return r;
  if (this.lastNeed <= buf.length) {
    buf.copy(this.lastChar, p, 0, this.lastNeed);
    return this.lastChar.toString(this.encoding, 0, this.lastTotal);
  }
  // Not enough bytes yet: stash everything and keep waiting.
  buf.copy(this.lastChar, p, 0, buf.length);
  this.lastNeed -= buf.length;
  return undefined;
}
193+
147194// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
148195// partial character, the character's bytes are buffered until the required
149196// number of bytes are available.
0 commit comments