Skip to content

url: enforce valid UTF-8 in WHATWG parser #11436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions benchmark/url/usvstring.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
'use strict';
const common = require('../common.js');

const inputs = {
valid: 'adsfadsfadsf',
validsurr: '\uda23\ude23\uda1f\udfaa\ud800\udfff\uda23\ude23\uda1f\udfaa' +
'\ud800\udfff',
someinvalid: 'asasfdfasd\uda23',
allinvalid: '\udc45\uda23 \udf00\udc00 \udfaa\uda12 \udc00\udfaa',
nonstring: { toString() { return 'asdf'; } }
};
const bench = common.createBenchmark(main, {
input: Object.keys(inputs),
n: [5e7]
}, {
flags: ['--expose-internals']
});

function main(conf) {
const { toUSVString } = require('internal/url');
const str = inputs[conf.input];
const n = conf.n | 0;

bench.start();
for (var i = 0; i < n; i++)
toUSVString(str);
bench.end(n);
}
98 changes: 65 additions & 33 deletions lib/internal/url.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ const IteratorPrototype = Object.getPrototypeOf(
Object.getPrototypeOf([][Symbol.iterator]())
);

const unpairedSurrogateRe =
/([^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/;
function toUSVString(val) {
const str = '' + val;
// As of V8 5.5, `str.search()` (and `unpairedSurrogateRe[@@search]()`) are
// slower than `unpairedSurrogateRe.exec()`.
const match = unpairedSurrogateRe.exec(str);
if (!match)
return str;
return binding.toUSVString(str, match.index);
}

class OpaqueOrigin {
toString() {
return 'null';
Expand Down Expand Up @@ -104,7 +116,6 @@ function onParseComplete(flags, protocol, username, password,

// Reused by URL constructor and URL#href setter.
function parse(url, input, base) {
input = String(input);
const base_context = base ? base[context] : undefined;
url[context] = new StorageObject();
binding.parse(input.trim(), -1,
Expand Down Expand Up @@ -206,8 +217,10 @@ function onParseHashComplete(flags, protocol, username, password,

class URL {
constructor(input, base) {
// toUSVString is not needed.
input = '' + input;
if (base !== undefined && !(base instanceof URL))
base = new URL(String(base));
base = new URL(base);
parse(this, input, base);
}

Expand Down Expand Up @@ -315,6 +328,8 @@ Object.defineProperties(URL.prototype, {
return this[kFormat]({});
},
set(input) {
// toUSVString is not needed.
input = '' + input;
parse(this, input);
}
},
Expand All @@ -332,7 +347,8 @@ Object.defineProperties(URL.prototype, {
return this[context].scheme;
},
set(scheme) {
scheme = String(scheme);
// toUSVString is not needed.
scheme = '' + scheme;
if (scheme.length === 0)
return;
binding.parse(scheme, binding.kSchemeStart, null, this[context],
Expand All @@ -346,7 +362,8 @@ Object.defineProperties(URL.prototype, {
return this[context].username || '';
},
set(username) {
username = String(username);
// toUSVString is not needed.
username = '' + username;
if (!this.hostname)
return;
const ctx = this[context];
Expand All @@ -366,7 +383,8 @@ Object.defineProperties(URL.prototype, {
return this[context].password || '';
},
set(password) {
password = String(password);
// toUSVString is not needed.
password = '' + password;
if (!this.hostname)
return;
const ctx = this[context];
Expand All @@ -391,7 +409,8 @@ Object.defineProperties(URL.prototype, {
},
set(host) {
const ctx = this[context];
host = String(host);
// toUSVString is not needed.
host = '' + host;
if (this[cannotBeBase] ||
(this[special] && host.length === 0)) {
// Cannot set the host if cannot-be-base is set or
Expand All @@ -415,7 +434,8 @@ Object.defineProperties(URL.prototype, {
},
set(host) {
const ctx = this[context];
host = String(host);
// toUSVString is not needed.
host = '' + host;
if (this[cannotBeBase] ||
(this[special] && host.length === 0)) {
// Cannot set the host if cannot-be-base is set or
Expand All @@ -439,11 +459,12 @@ Object.defineProperties(URL.prototype, {
return port === undefined ? '' : String(port);
},
set(port) {
// toUSVString is not needed.
port = '' + port;
const ctx = this[context];
if (!ctx.host || this[cannotBeBase] ||
this.protocol === 'file:')
return;
port = String(port);
if (port === '') {
ctx.port = undefined;
return;
Expand All @@ -462,9 +483,11 @@ Object.defineProperties(URL.prototype, {
return ctx.path !== undefined ? `/${ctx.path.join('/')}` : '';
},
set(path) {
// toUSVString is not needed.
path = '' + path;
if (this[cannotBeBase])
return;
binding.parse(String(path), binding.kPathStart, null, this[context],
binding.parse(path, binding.kPathStart, null, this[context],
onParsePathComplete.bind(this));
}
},
Expand All @@ -477,7 +500,7 @@ Object.defineProperties(URL.prototype, {
},
set(search) {
const ctx = this[context];
search = String(search);
search = toUSVString(search);
if (!search) {
ctx.query = null;
ctx.flags &= ~binding.URL_FLAGS_HAS_QUERY;
Expand Down Expand Up @@ -509,7 +532,8 @@ Object.defineProperties(URL.prototype, {
},
set(hash) {
const ctx = this[context];
hash = String(hash);
// toUSVString is not needed.
hash = '' + hash;
if (this.protocol === 'javascript:')
return;
if (!hash) {
Expand Down Expand Up @@ -652,19 +676,22 @@ class URLSearchParams {
if (pair.length !== 2) {
throw new TypeError('Each query pair must be a name/value tuple');
}
this[searchParams].push(String(pair[0]), String(pair[1]));
const key = toUSVString(pair[0]);
const value = toUSVString(pair[1]);
this[searchParams].push(key, value);
}
} else {
// record<USVString, USVString>
this[searchParams] = [];
for (const key of Object.keys(init)) {
const value = String(init[key]);
for (var key of Object.keys(init)) {
key = toUSVString(key);
const value = toUSVString(init[key]);
this[searchParams].push(key, value);
}
}
} else {
// USVString
init = String(init);
init = toUSVString(init);
if (init[0] === '?') init = init.slice(1);
initSearchParams(this, init);
}
Expand Down Expand Up @@ -743,8 +770,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
throw new TypeError('"name" and "value" arguments must be specified');
}

name = String(name);
value = String(value);
name = toUSVString(name);
value = toUSVString(value);
this[searchParams].push(name, value);
update(this[context], this);
},
Expand All @@ -758,7 +785,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length;) {
const cur = list[i];
if (cur === name) {
Expand All @@ -779,7 +806,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
return list[i + 1];
Expand All @@ -798,7 +825,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {

const list = this[searchParams];
const values = [];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
values.push(list[i + 1]);
Expand All @@ -816,7 +843,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
return true;
Expand All @@ -834,8 +861,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
value = String(value);
name = toUSVString(name);
value = toUSVString(value);

// If there are any name-value pairs whose name is `name`, in `list`, set
// the value of the first such name-value pair to `value` and remove the
Expand Down Expand Up @@ -1098,11 +1125,13 @@ function originFor(url, base) {
}

function domainToASCII(domain) {
return binding.domainToASCII(String(domain));
// toUSVString is not needed.
return binding.domainToASCII('' + domain);
}

function domainToUnicode(domain) {
return binding.domainToUnicode(String(domain));
// toUSVString is not needed.
return binding.domainToUnicode('' + domain);
}

// Utility function that converts a URL object into an ordinary
Expand Down Expand Up @@ -1188,11 +1217,14 @@ function getPathFromURL(path) {
return isWindows ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
}

exports.getPathFromURL = getPathFromURL;
exports.URL = URL;
exports.URLSearchParams = URLSearchParams;
exports.domainToASCII = domainToASCII;
exports.domainToUnicode = domainToUnicode;
exports.urlToOptions = urlToOptions;
exports.formatSymbol = kFormat;
exports.searchParamsSymbol = searchParams;
module.exports = {
toUSVString,
getPathFromURL,
URL,
URLSearchParams,
domainToASCII,
domainToUnicode,
urlToOptions,
formatSymbol: kFormat,
searchParamsSymbol: searchParams
};
53 changes: 53 additions & 0 deletions src/node_url.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <unicode/utf.h>
#endif

#define UNICODE_REPLACEMENT_CHARACTER 0xFFFD

namespace node {

using v8::Array;
Expand Down Expand Up @@ -143,6 +145,21 @@ namespace url {
}
#endif

// If a UTF-16 character is a low/trailing surrogate.
static inline bool IsUnicodeTrail(uint16_t c) {
return (c & 0xFC00) == 0xDC00;
}

// If a UTF-16 character is a surrogate.
static inline bool IsUnicodeSurrogate(uint16_t c) {
return (c & 0xF800) == 0xD800;
}

// If a UTF-16 surrogate is a low/trailing one.
static inline bool IsUnicodeSurrogateTrail(uint16_t c) {
return (c & 0x400) != 0;
}

static url_host_type ParseIPv6Host(url_host* host,
const char* input,
size_t length) {
Expand Down Expand Up @@ -1351,6 +1368,41 @@ namespace url {
v8::NewStringType::kNormal).ToLocalChecked());
}

static void ToUSVString(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK_EQ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the existing functions use CHECK_GE, and I don't see a reason to be stricter than what this function actually uses.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah there are functions in other files doing EQ...not sure if we have a convention or not, just think GE is implying there could be more args, which doesn't seem to be the case for this one, though I don't feel very strongly about this.

CHECK(args[0]->IsString());
CHECK(args[1]->IsNumber());

TwoByteValue value(env->isolate(), args[0]);
const size_t n = value.length();

const int64_t start = args[1]->IntegerValue(env->context()).FromJust();
CHECK_GE(start, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe another CHECK_LT(start, n)? Doesn't do any harm even if start is larger though.

Copy link
Member Author

@TimothyGu TimothyGu Feb 25, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm only checking start >= 0 because I'm converting start to a size_t, which is an unsigned type. In C++, signed-to-unsigned conversion, though a defined operation, acts oddly when the signed value is negative. On the other hand, n >= start is a more benign case, and I don't think requires the full effects of a runtime assertion (i.e. crashing).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yep, not worth a full on abort ;)


for (size_t i = start; i < n; i++) {
uint16_t c = value[i];
if (!IsUnicodeSurrogate(c)) {
continue;
} else if (IsUnicodeSurrogateTrail(c) || i == n - 1) {
value[i] = UNICODE_REPLACEMENT_CHARACTER;
} else {
uint16_t d = value[i + 1];
if (IsUnicodeTrail(d)) {
i++;
} else {
value[i] = UNICODE_REPLACEMENT_CHARACTER;
}
}
}

args.GetReturnValue().Set(
String::NewFromTwoByte(env->isolate(),
*value,
v8::NewStringType::kNormal,
n).ToLocalChecked());
}

static void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
Expand Down Expand Up @@ -1398,6 +1450,7 @@ namespace url {
Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "parse", Parse);
env->SetMethod(target, "encodeAuth", EncodeAuthSet);
env->SetMethod(target, "toUSVString", ToUSVString);
env->SetMethod(target, "domainToASCII", DomainToASCII);
env->SetMethod(target, "domainToUnicode", DomainToUnicode);

Expand Down
3 changes: 1 addition & 2 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ TwoByteValue::TwoByteValue(Isolate* isolate, Local<Value> value) {
const size_t storage = string->Length() + 1;
AllocateSufficientStorage(storage);

const int flags =
String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8;
const int flags = String::NO_NULL_TERMINATION;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is right, since this would affect more than just the WHATWG URL implementation?

Copy link
Member

@bnoordhuis bnoordhuis Feb 17, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, this is not an acceptable change. You could turn the flags into an optional argument that defaults to what is now or make it a template trait.

EDIT: Objection withdrawn.

Copy link
Member Author

@TimothyGu TimothyGu Feb 17, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I've explained in 424722af4541cd0eec1bf299aa8afcdff6284f52, this flag is actually a no-op. TwoByteValue::TwoByteValue uses v8::String::Write as opposed to v8::String::WriteUtf8. The flag is only respected by WriteUtf8, and hence misleading being applied here in TwoByteValue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you're right, I missed that it's the UTF-16 version. Objection withdrawn.

const int length = string->Write(out(), 0, storage, flags);
SetLengthAndZeroTerminate(length);
}
Expand Down
3 changes: 1 addition & 2 deletions test/common.js
Original file line number Diff line number Diff line change
Expand Up @@ -604,8 +604,7 @@ exports.WPT = {
try {
fn();
} catch (err) {
if (err instanceof Error)
err.message = `In ${desc}:\n ${err.message}`;
console.error(`In ${desc}:`);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left-over debug code?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it was intentional, since it seems like after the Error is constructed changing the message doesn't change stack. (See 56f6b8dbad36a46f07668b2b35574dc367e6d146.)

Copy link
Member

@joyeecheung joyeecheung Feb 17, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, yes, to patch the error message we would have to extend the Error class, something like:

class WPTError extends Error {
  constructor(desc, err) {
    super(`In ${desc}, ${err.name}: ${err.message}`);
    Error.captureStackTrace(this, WPTError);
  }
}

throw new WPTError(desc, err);

But then console.error() works too, though we would see two error traces this way?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm.. it appears to:

> var m = new Error('test')
undefined
> m.message = 'foo'
'foo'
> m
Error: foo
    at repl:1:9
    at ContextifyScript.Script.runInThisContext (vm.js:23:33)
    at REPLServer.defaultEval (repl.js:340:29)
    at bound (domain.js:280:14)
    at REPLServer.runBound [as eval] (domain.js:293:12)
    at REPLServer.onLine (repl.js:537:10)
    at emitOne (events.js:101:20)
    at REPLServer.emit (events.js:189:7)
    at REPLServer.Interface._onLine (readline.js:238:10)
    at REPLServer.Interface._line (readline.js:582:8)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, looks like it's the Error.captureStackTrace call in the AssertionError constructor "freezes" the stack. To recapture the stack, we need to:

diff --git a/test/common.js b/test/common.js
index 5f7dc25..f74d5f0 100644
--- a/test/common.js
+++ b/test/common.js
@@ -598,8 +598,10 @@ exports.WPT = {
     try {
       fn();
     } catch (err) {
-      if (err instanceof Error)
+      if (err instanceof Error) {
         err.message = `In ${desc}:\n  ${err.message}`;
+        Error.captureStackTrace(err);
+      }
       throw err;
     }
   },

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@joyeecheung, calling Error.captureStackTrace() there will only capture the stack of WPT.test, not the actual stack from err.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, yes, this probably deserves another PR to sort out. I am fine with a console.error work around.

throw err;
}
},
Expand Down
Loading