Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

encoding: make TextDecoder handle BOM correctly #30132

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
encoding: make TextDecoder handle BOM correctly
Do not accept the BOM if it comes from a different encoding, and
only discard the BOM after it has actually been read (including
when it is spread over multiple chunks in streaming mode).

Fixes: #25315
  • Loading branch information
addaleax committed Nov 1, 2019
commit a30272b33395317fe79d39c42dd2d1e313561080
27 changes: 12 additions & 15 deletions lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -484,25 +484,22 @@ function makeTextDecoderJS() {
this[kFlags] |= CONVERTER_FLAGS_FLUSH;
}

if (!this[kBOMSeen] && !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
if (this[kEncoding] === 'utf-8') {
if (input.length >= 3 &&
input[0] === 0xEF && input[1] === 0xBB && input[2] === 0xBF) {
input = input.slice(3);
}
} else if (this[kEncoding] === 'utf-16le') {
if (input.length >= 2 && input[0] === 0xFF && input[1] === 0xFE) {
input = input.slice(2);
}
let result = this[kFlags] & CONVERTER_FLAGS_FLUSH ?
this[kHandle].end(input) :
this[kHandle].write(input);

if (result.length > 0 &&
!this[kBOMSeen] &&
!(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
// If the very first result in the stream is a BOM, and we are not
// explicitly told to ignore it, then we discard it.
if (result[0] === '\ufeff') {
result = result.slice(1);
}
this[kBOMSeen] = true;
}

if (this[kFlags] & CONVERTER_FLAGS_FLUSH) {
return this[kHandle].end(input);
}

return this[kHandle].write(input);
return result;
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,10 @@ size_t Length(Local<Object> obj) {
}


inline MaybeLocal<Uint8Array> New(Environment* env,
Local<ArrayBuffer> ab,
size_t byte_offset,
size_t length) {
MaybeLocal<Uint8Array> New(Environment* env,
Local<ArrayBuffer> ab,
size_t byte_offset,
size_t length) {
CHECK(!env->buffer_prototype_object().IsEmpty());
Local<Uint8Array> ui = Uint8Array::New(ab, byte_offset, length);
Maybe<bool> mb =
Expand Down
37 changes: 27 additions & 10 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ using v8::NewStringType;
using v8::Object;
using v8::ObjectTemplate;
using v8::String;
using v8::Uint8Array;
using v8::Value;

namespace i18n {
Expand Down Expand Up @@ -227,25 +228,41 @@ class ConverterObject : public BaseObject, Converter {
const char* source = input.data();
size_t source_length = input.length();

if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) {
int32_t bomOffset = 0;
ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status);
source += bomOffset;
source_length -= bomOffset;
converter->bomSeen_ = true;
}

UChar* target = *result;
ucnv_toUnicode(converter->conv,
&target, target + (limit * sizeof(UChar)),
&source, source + source_length,
nullptr, flush, &status);

if (U_SUCCESS(status)) {
if (limit > 0)
bool omit_initial_bom = false;
if (limit > 0) {
result.SetLength(target - &result[0]);
if (result.length() > 0 &&
converter->unicode_ &&
!converter->ignoreBOM_ &&
!converter->bomSeen_) {
// If the very first result in the stream is a BOM, and we are not
// explicitly told to ignore it, then we mark it for discarding.
if (result[0] == 0xFEFF) {
omit_initial_bom = true;
}
converter->bomSeen_ = true;
}
}
ret = ToBufferEndian(env, &result);
args.GetReturnValue().Set(ret.ToLocalChecked());
if (omit_initial_bom && !ret.IsEmpty()) {
// Peform `ret = ret.slice(2)`.
CHECK(ret.ToLocalChecked()->IsUint8Array());
Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
ret = Buffer::New(env,
orig_ret->Buffer(),
orig_ret->ByteOffset() + 2,
orig_ret->ByteLength() - 2)
.FromMaybe(Local<Uint8Array>());
}
if (!ret.IsEmpty())
args.GetReturnValue().Set(ret.ToLocalChecked());
return;
}

Expand Down
6 changes: 5 additions & 1 deletion src/node_internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,11 @@ v8::MaybeLocal<v8::Object> New(Environment* env,
char* data,
size_t length,
bool uses_malloc);

// Creates a Buffer instance over an existing Uint8Array.
v8::MaybeLocal<v8::Uint8Array> New(Environment* env,
v8::Local<v8::ArrayBuffer> ab,
size_t byte_offset,
size_t length);
// Construct a Buffer from a MaybeStackBuffer (and also its subclasses like
// Utf8Value and TwoByteValue).
// If |buf| is invalidated, an empty MaybeLocal is returned, and nothing is
Expand Down
5 changes: 1 addition & 4 deletions test/wpt/status/encoding.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@
"fail": "iso-2022-jp decoder state handling bug: https://encoding.spec.whatwg.org/#iso-2022-jp-decoder"
},
"textdecoder-byte-order-marks.any.js": {
"fail": "Mismatching BOM should not be ignored"
},
"textdecoder-copy.any.js": {
"fail": "Should not have output BOM: https://encoding.spec.whatwg.org/#concept-td-serialize"
"requires": ["small-icu"]
},
"textdecoder-fatal-single-byte.any.js": {
"requires": ["full-icu"],
Expand Down