nodejs · TimothyGu · May 11, 2021 · TimothyGu · May 11, 2021 · jasnell
diff --git a/doc/api/errors.md b/doc/api/errors.md
@@ -1677,10 +1677,10 @@ An invalid URI was passed.
 <a id="ERR_INVALID_URL"></a>
 ### `ERR_INVALID_URL`
 
-An invalid URL was passed to the [WHATWG][WHATWG URL API]
-[`URL` constructor][`new URL(input)`] to be parsed. The thrown error object
-typically has an additional property `'input'` that contains the URL that failed
-to parse.
+An invalid URL was passed to the [WHATWG][WHATWG URL API] [`URL`
+constructor][`new URL(input)`] or the legacy [`url.parse()`][] to be parsed.
+The thrown error object typically has an additional property `'input'` that
+contains the URL that failed to parse.
 
 <a id="ERR_INVALID_URL_SCHEME"></a>
 ### `ERR_INVALID_URL_SCHEME`
@@ -2824,6 +2824,7 @@ The native call from `process.cpuUsage` could not be processed.
 [`stream.write()`]: stream.md#stream_writable_write_chunk_encoding_callback
 [`subprocess.kill()`]: child_process.md#child_process_subprocess_kill_signal
 [`subprocess.send()`]: child_process.md#child_process_subprocess_send_message_sendhandle_options_callback
+[`url.parse()`]: url.md#url_url_parse_urlstring_parsequerystring_slashesdenotehost
 [`util.getSystemErrorName(error.errno)`]: util.md#util_util_getsystemerrorname_err
 [`zlib`]: zlib.md
 [crypto digest algorithm]: crypto.md#crypto_crypto_gethashes

diff --git a/doc/api/url.md b/doc/api/url.md
@@ -1232,6 +1232,11 @@ forward-slash characters (`/`) are required following the colon in the
 <!-- YAML
 added: v0.1.25
 changes:
+  - version: REPLACEME
+    pr-url: https://github.com/nodejs/node/pull/38631
+    description: Now throws an `ERR_INVALID_URL` exception when Punycode
+                 conversion of a hostname introduces changes that could cause
+                 the URL to be re-parsed differently.
   - version:
       - v15.13.0
       - v14.17.0

diff --git a/lib/url.js b/lib/url.js
@@ -34,7 +34,8 @@ const { toASCII } = require('internal/idna');
 const { encodeStr, hexTable } = require('internal/querystring');
 
 const {
-  ERR_INVALID_ARG_TYPE
+  ERR_INVALID_ARG_TYPE,
+  ERR_INVALID_URL,
 } = require('internal/errors').codes;
 const { validateString } = require('internal/validators');
 
@@ -167,6 +168,20 @@ function isIpv6Hostname(hostname) {
   );
 }
 
+// This prevents some common spoofing bugs due to our use of IDNA toASCII. For
+// compatibility, the set of characters we use here is the *intersection* of
+// "forbidden host code point" in the WHATWG URL Standard [1] and the
+// characters in the host parsing loop in Url.prototype.parse, with the
+// following additions:
+//
+// - ':' since this could cause a "protocol spoofing" bug
+// - '@' since this could cause parts of the hostname to be confused with auth
+// - '[' and ']' since this could cause a non-IPv6 hostname to be interpreted
+//   as IPv6 by isIpv6Hostname above
+//
+// [1]: https://url.spec.whatwg.org/#forbidden-host-code-point
+const forbiddenHostChars = /[\t\n\r #%/:<>?@[\\\]^|]/;
+
 Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
   validateString(url, 'url');
 
@@ -389,7 +404,7 @@ Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
       this.hostname = this.hostname.toLowerCase();
     }
 
-    if (!ipv6Hostname) {
+    if (!ipv6Hostname && this.hostname !== '') {
       // IDNA Support: Returns a punycoded representation of "domain".
       // It only converts parts of the domain name that
       // have non-ASCII characters, i.e. it doesn't matter if
@@ -398,6 +413,19 @@ Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
       // Use lenient mode (`true`) to try to support even non-compliant
       // URLs.
       this.hostname = toASCII(this.hostname, true);
+
+      // Prevent two potential routes of hostname spoofing.
+      // 1. If this.hostname is empty, it must have become empty due to toASCII
+      //    since we checked this.hostname above.
+      // 2. If any of forbiddenHostChars appears in this.hostname, it must have
+      //    also gotten in due to toASCII. This is since getHostname would have
+      //    filtered them out otherwise.
+      // Rather than trying to correct this by moving the non-host part into
+      // the pathname as we've done in getHostname, throw an exception to
+      // convey the severity of this issue.
+      if (this.hostname === '' || forbiddenHostChars.test(this.hostname)) {
+        throw new ERR_INVALID_URL(url);
+      }
     }
 
     const p = this.port ? ':' + this.port : '';

diff --git a/test/parallel/test-url-parse-invalid-input.js b/test/parallel/test-url-parse-invalid-input.js
@@ -36,3 +36,37 @@ assert.throws(() => { url.parse('http://%E0%A4%A@fail'); },
                 // JS engine errors do not have the `code` property.
                 return e.code === undefined;
               });
+
+if (common.hasIntl) {
+  // An array of Unicode code points whose Unicode NFKD contains a "bad
+  // character".
+  const badIDNA = (() => {
+    const BAD_CHARS = '#%/:?@[\\]^|';
+    const out = [];
+    for (let i = 0x80; i < 0x110000; i++) {
+      const cp = String.fromCodePoint(i);
+      for (const badChar of BAD_CHARS) {
+        if (cp.normalize('NFKD').includes(badChar)) {
+          out.push(cp);
+        }
+      }
+    }
+    return out;
+  })();
+
+  // The generation logic above should at a minimum produce these two
+  // characters.
+  assert(badIDNA.includes('℀'));
+  assert(badIDNA.includes('＠'));
+
+  for (const badCodePoint of badIDNA) {
+    const badURL = `http://fail${badCodePoint}fail.com/`;
+    assert.throws(() => { url.parse(badURL); },
+                  (e) => e.code === 'ERR_INVALID_URL',
+                  `parsing ${badURL}`);
+  }
+
+  assert.throws(() => { url.parse('http://\u00AD/bad.com/'); },
+                (e) => e.code === 'ERR_INVALID_URL',
+                'parsing http://\u00AD/bad.com/');
+}