Support URL fragment resolution against non-hierarchical schemes

Support URL fragment resolution against non-hierarchical schemes As a result, data: about: etc now have 'query' and 'ref' components parsed; as a result a new GURL::GetContent() convenience is added to retrieve the spec with the scheme stripped off. A complication in supporting this is that we now need to allow whitespace to trailing whitespace to be preserved when transferring url_parse::Parsed structs between KURL and GURL. Without this, the URL prior to the #fragment can change (i.e. whitespace stripped) when following an anchor link which breaks the page (causes reload from source). See http://crbug.com/291747 for more details on this. R=brettw@chromium.org TBR=cbentzel@chromium.org BUG=291747 Review URL: https://codereview.chromium.org/23835019 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@236917 0039d316-1c4b-4281-b951-d872f2087c98
dednal · Nov 23, 2013 · 369e84f · 369e84f
1 parent a7e3691
commit 369e84f
Show file tree

Hide file tree

Showing 14 changed files with 211 additions and 115 deletions.
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
@@ -3026,9 +3026,9 @@ TEST(NetUtilTest, SimplifyUrlForRequest) {
       "ftp://user:pass@google.com:80/sup?yo#X#X",
       "ftp://google.com:80/sup?yo",
     },
-    { // Try an nonstandard URL
-      "foobar://user:pass@google.com:80/sup?yo#X#X",
+    { // Try a nonstandard URL
       "foobar://user:pass@google.com:80/sup?yo#X#X",
+      "foobar://user:pass@google.com:80/sup?yo",
     },
   };
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) {

diff --git a/url/gurl.cc b/url/gurl.cc
@@ -19,25 +19,6 @@
 
 namespace {
 
-// External template that can handle initialization of either character type.
-// The input spec is given, and the canonical version will be placed in
-// |*canonical|, along with the parsing of the canonical spec in |*parsed|.
-template<typename STR>
-bool InitCanonical(const STR& input_spec,
-                   std::string* canonical,
-                   url_parse::Parsed* parsed) {
-  // Reserve enough room in the output for the input, plus some extra so that
-  // we have room if we have to escape a few things without reallocating.
-  canonical->reserve(input_spec.size() + 32);
-  url_canon::StdStringCanonOutput output(canonical);
-  bool success = url_util::Canonicalize(
-      input_spec.data(), static_cast<int>(input_spec.length()),
-      NULL, &output, parsed);
-
-  output.Complete();  // Must be done before using string.
-  return success;
-}
-
 static std::string* empty_string = NULL;
 static GURL* empty_gurl = NULL;
 
@@ -94,21 +75,15 @@ GURL::GURL(const GURL& other)
 }
 
 GURL::GURL(const std::string& url_string) {
-  is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
-  if (is_valid_ && SchemeIsFileSystem()) {
-    inner_url_.reset(
-        new GURL(spec_.data(), parsed_.Length(),
-                 *parsed_.inner_parsed(), true));
-  }
+  InitCanonical(url_string, true);
 }
 
 GURL::GURL(const base::string16& url_string) {
-  is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
-  if (is_valid_ && SchemeIsFileSystem()) {
-    inner_url_.reset(
-        new GURL(spec_.data(), parsed_.Length(),
-                 *parsed_.inner_parsed(), true));
-  }
+  InitCanonical(url_string, true);
+}
+
+GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
+  InitCanonical(url_string, false);
 }
 
 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
@@ -127,6 +102,23 @@ GURL::GURL(std::string canonical_spec,
   InitializeFromCanonicalSpec();
 }
 
+template<typename STR>
+void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  spec_.reserve(input_spec.size() + 32);
+  url_canon::StdStringCanonOutput output(&spec_);
+  is_valid_ = url_util::Canonicalize(
+      input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
+      NULL, &output, &parsed_);
+
+  output.Complete();  // Must be done before using string.
+  if (is_valid_ && SchemeIsFileSystem()) {
+    inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
+                              *parsed_.inner_parsed(), true));
+  }
+}
+
 void GURL::InitializeFromCanonicalSpec() {
   if (is_valid_ && SchemeIsFileSystem()) {
     inner_url_.reset(
@@ -140,13 +132,17 @@ void GURL::InitializeFromCanonicalSpec() {
   // and we can't always canonicalize then reproducabely.
   if (is_valid_) {
     url_parse::Component scheme;
+    // We can't do this check on the inner_url of a filesystem URL, as
+    // canonical_spec actually points to the start of the outer URL, so we'd
+    // end up with infinite recursion in this constructor.
     if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(),
                                         "filesystem", &scheme) ||
         scheme.begin == parsed_.scheme.begin) {
-      // We can't do this check on the inner_url of a filesystem URL, as
-      // canonical_spec actually points to the start of the outer URL, so we'd
-      // end up with infinite recursion in this constructor.
-      GURL test_url(spec_);
+      // We need to retain trailing whitespace on path URLs, as the |parsed_|
+      // spec we originally received may legitimately contain trailing white-
+      // space on the path or  components e.g. if the #ref has been
+      // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
+      GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
 
       DCHECK(test_url.is_valid_ == is_valid_);
       DCHECK(test_url.spec_ == spec_);

diff --git a/url/gurl.h b/url/gurl.h
@@ -358,6 +358,17 @@ class URL_EXPORT GURL {
   }
 
  private:
+  // Variant of the string parsing constructor that allows the caller to elect
+  // retain trailing whitespace, if any, on the passed URL spec but only  if the
+  // scheme is one that allows trailing whitespace. The primary use-case is
+  // for data: URLs. In most cases, you want to use the single parameter
+  // constructor above.
+  enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
+  GURL(const std::string& url_string, RetainWhiteSpaceSelector);
+
+  template<typename STR>
+  void InitCanonical(const STR& input_spec, bool trim_path_end);
+
   void InitializeFromCanonicalSpec();
 
   // Returns the substring of the input identified by the given component.

diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
@@ -350,6 +350,29 @@ TEST(GURLTest, Replacements) {
   }
 }
 
+TEST(GURLTest, ClearFragmentOnDataUrl) {
+  // http://crbug.com/291747 - a data URL may legitimately have trailing
+  // whitespace in the spec after the ref is cleared. Test this does not trigger
+  // the url_parse::Parsed importing validation DCHECK in GURL.
+  GURL url(" data: one ? two # three ");
+
+  // By default the trailing whitespace will have been stripped.
+  EXPECT_EQ("data: one ? two # three", url.spec());
+  GURL::Replacements repl;
+  repl.ClearRef();
+  GURL url_no_ref = url.ReplaceComponents(repl);
+
+  EXPECT_EQ("data: one ? two ", url_no_ref.spec());
+
+  // Importing a parsed url via this constructor overload will retain trailing
+  // whitespace.
+  GURL import_url(url_no_ref.spec(),
+                  url_no_ref.parsed_for_possibly_invalid_spec(),
+                  url_no_ref.is_valid());
+  EXPECT_EQ(url_no_ref, import_url);
+  EXPECT_EQ(import_url.query(), " two ");
+}
+
 TEST(GURLTest, PathForRequest) {
   struct TestCase {
     const char* input;

diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc
@@ -455,45 +455,53 @@ void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
 // Initializes a path URL which is merely a scheme followed by a path. Examples
 // include "about:foo" and "javascript:alert('bar');"
 template<typename CHAR>
-void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+void DoParsePathURL(const CHAR* spec, int spec_len,
+                    bool trim_path_end,
+                    Parsed* parsed) {
   // Get the non-path and non-scheme parts of the URL out of the way, we never
   // use them.
   parsed->username.reset();
   parsed->password.reset();
   parsed->host.reset();
   parsed->port.reset();
+  parsed->path.reset();
   parsed->query.reset();
   parsed->ref.reset();
 
   // Strip leading & trailing spaces and control characters.
-  int begin = 0;
-  TrimURL(spec, &begin, &spec_len);
+  int scheme_begin = 0;
+  TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);
 
   // Handle empty specs or ones that contain only whitespace or control chars.
-  if (begin == spec_len) {
+  if (scheme_begin == spec_len) {
     parsed->scheme.reset();
     parsed->path.reset();
     return;
   }
 
+  int path_begin;
   // Extract the scheme, with the path being everything following. We also
   // handle the case where there is no scheme.
-  if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+  if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
+                    &parsed->scheme)) {
     // Offset the results since we gave ExtractScheme a substring.
-    parsed->scheme.begin += begin;
-
-    // For compatability with the standard URL parser, we treat no path as
-    // -1, rather than having a length of 0 (we normally wouldn't care so
-    // much for these non-standard URLs).
-    if (parsed->scheme.end() == spec_len - 1)
-      parsed->path.reset();
-    else
-      parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
+    parsed->scheme.begin += scheme_begin;
+    path_begin = parsed->scheme.end() + 1;
   } else {
-    // No scheme found, just path.
+    // No scheme case.
     parsed->scheme.reset();
-    parsed->path = MakeRange(begin, spec_len);
+    path_begin = scheme_begin;
   }
+
+  if (path_begin == spec_len)
+    return;
+  DCHECK_LT(path_begin, spec_len);
+
+  ParsePath(spec,
+            MakeRange(path_begin, spec_len),
+            &parsed->path,
+            &parsed->query,
+            &parsed->ref);
 }
 
 template<typename CHAR>
@@ -875,12 +883,18 @@ void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) {
   DoParseStandardURL(url, url_len, parsed);
 }
 
-void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
-  DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const char* url,
+                  int url_len,
+                  bool trim_path_end,
+                  Parsed* parsed) {
+  DoParsePathURL(url, url_len, trim_path_end, parsed);
 }
 
-void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) {
-  DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const base::char16* url,
+                  int url_len,
+                  bool trim_path_end,
+                  Parsed* parsed) {
+  DoParsePathURL(url, url_len, trim_path_end, parsed);
 }
 
 void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {

diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h
@@ -238,9 +238,13 @@ URL_EXPORT void ParseStandardURL(const base::char16* url,
 // section but that aren't file URLs either. The scheme is parsed, and
 // everything after the scheme is considered as the path. This is used for
 // things like "about:" and "javascript:"
-URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParsePathURL(const char* url,
+                             int url_len,
+                             bool trim_path_end,
+                             Parsed* parsed);
 URL_EXPORT void ParsePathURL(const base::char16* url,
                              int url_len,
+                             bool trim_path_end,
                              Parsed* parsed);
 
 // FileURL is for file URLs. There are some special rules for interpreting

diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc
@@ -13,6 +13,39 @@ namespace url_canon {
 
 namespace {
 
+// Canonicalize the given |component| from |source| into |output| and
+// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
+// prior to the canonicalized component; i.e. for the '?' or '#' characters.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizePathComponent(const CHAR* source,
+                                 const url_parse::Component& component,
+                                 CHAR seperator,
+                                 CanonOutput* output,
+                                 url_parse::Component* new_component) {
+  bool success = true;
+  if (component.is_valid()) {
+    if (seperator)
+      output->push_back(seperator);
+    // Copy the path using path URL's more lax escaping rules (think for
+    // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
+    // ASCII characters alone. This helps readability of JavaStript.
+    new_component->begin = output->length();
+    int end = component.end();
+    for (int i = component.begin; i < end; i++) {
+      UCHAR uch = static_cast<UCHAR>(source[i]);
+      if (uch < 0x20 || uch >= 0x80)
+        success &= AppendUTF8EscapedChar(source, &i, end, output);
+      else
+        output->push_back(static_cast<char>(uch));
+    }
+    new_component->len = output->length() - new_component->begin;
+  } else {
+    // Empty part.
+    new_component->reset();
+  }
+  return success;
+}
+
 template<typename CHAR, typename UCHAR>
 bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
                            const url_parse::Parsed& parsed,
@@ -28,29 +61,14 @@ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
   new_parsed->password.reset();
   new_parsed->host.reset();
   new_parsed->port.reset();
-
-  if (parsed.path.is_valid()) {
-    // Copy the path using path URL's more lax escaping rules (think for
-    // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
-    // ASCII characters alone. This helps readability of JavaStript.
-    new_parsed->path.begin = output->length();
-    int end = parsed.path.end();
-    for (int i = parsed.path.begin; i < end; i++) {
-      UCHAR uch = static_cast<UCHAR>(source.path[i]);
-      if (uch < 0x20 || uch >= 0x80)
-        success &= AppendUTF8EscapedChar(source.path, &i, end, output);
-      else
-        output->push_back(static_cast<char>(uch));
-    }
-    new_parsed->path.len = output->length() - new_parsed->path.begin;
-  } else {
-    // Empty path.
-    new_parsed->path.reset();
-  }
-
-  // Assume there's no query or ref.
-  new_parsed->query.reset();
-  new_parsed->ref.reset();
+  // We allow path URLs to have the path, query and fragment components, but we
+  // will canonicalize each of the via the weaker path URL rules.
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.path, parsed.path, 0, output, &new_parsed->path);
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.query, parsed.query, '?', output, &new_parsed->query);
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.ref, parsed.ref, '#', output, &new_parsed->ref);
 
   return success;
 }

diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
@@ -101,10 +101,16 @@ bool DoIsRelativeURL(const char* base,
   // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
   // empty, we treat it as relative (":foo") like IE does.
   url_parse::Component scheme;
-  if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
-    // Don't allow relative URLs if the base scheme doesn't support it.
-    if (!is_base_hierarchical)
+  const bool scheme_is_empty =
+      !url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
+  if (scheme_is_empty) {
+    if (url[begin] == '#') {
+      // |url| is a bare fragement (e.g. "#foo"). This can be resolved against
+      // any base. Fall-through.
+    } else if (!is_base_hierarchical) {
+      // Don't allow relative URLs if the base scheme doesn't support it.
       return false;
+    }
 
     *relative_component = url_parse::MakeRange(begin, url_len);
     *is_relative = true;