Skip to content

Commit

Permalink
Support URL fragment resolution against non-hierarchical schemes
Browse files Browse the repository at this point in the history
Support URL fragment resolution against non-hierarchical schemes
As a result, data: about: etc now have 'query' and 'ref' components
parsed; as a result a new GURL::GetContent() convenience is added to
retrieve the spec with the scheme stripped off.

A complication in supporting this is that we now need to allow whitespace
to trailing whitespace to be preserved when transferring url_parse::Parsed
structs between KURL and GURL. Without this, the URL prior to the
#fragment can change (i.e. whitespace stripped) when following an anchor
link which breaks the page (causes reload from source). See
http://crbug.com/291747 for more details on this.

R=brettw@chromium.org
TBR=cbentzel@chromium.org
BUG=291747

Review URL: https://codereview.chromium.org/23835019

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@236917 0039d316-1c4b-4281-b951-d872f2087c98
  • Loading branch information
joth@chromium.org committed Nov 23, 2013
1 parent a7e3691 commit 369e84f
Show file tree
Hide file tree
Showing 14 changed files with 211 additions and 115 deletions.
4 changes: 2 additions & 2 deletions net/base/net_util_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3026,9 +3026,9 @@ TEST(NetUtilTest, SimplifyUrlForRequest) {
"ftp://user:pass@google.com:80/sup?yo#X#X",
"ftp://google.com:80/sup?yo",
},
{ // Try an nonstandard URL
"foobar://user:pass@google.com:80/sup?yo#X#X",
{ // Try a nonstandard URL
"foobar://user:pass@google.com:80/sup?yo#X#X",
"foobar://user:pass@google.com:80/sup?yo",
},
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) {
Expand Down
66 changes: 31 additions & 35 deletions url/gurl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,6 @@

namespace {

// External template that can handle initialization of either character type.
// The input spec is given, and the canonical version will be placed in
// |*canonical|, along with the parsing of the canonical spec in |*parsed|.
template<typename STR>
bool InitCanonical(const STR& input_spec,
std::string* canonical,
url_parse::Parsed* parsed) {
// Reserve enough room in the output for the input, plus some extra so that
// we have room if we have to escape a few things without reallocating.
canonical->reserve(input_spec.size() + 32);
url_canon::StdStringCanonOutput output(canonical);
bool success = url_util::Canonicalize(
input_spec.data(), static_cast<int>(input_spec.length()),
NULL, &output, parsed);

output.Complete(); // Must be done before using string.
return success;
}

static std::string* empty_string = NULL;
static GURL* empty_gurl = NULL;

Expand Down Expand Up @@ -94,21 +75,15 @@ GURL::GURL(const GURL& other)
}

GURL::GURL(const std::string& url_string) {
is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
if (is_valid_ && SchemeIsFileSystem()) {
inner_url_.reset(
new GURL(spec_.data(), parsed_.Length(),
*parsed_.inner_parsed(), true));
}
InitCanonical(url_string, true);
}

GURL::GURL(const base::string16& url_string) {
is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
if (is_valid_ && SchemeIsFileSystem()) {
inner_url_.reset(
new GURL(spec_.data(), parsed_.Length(),
*parsed_.inner_parsed(), true));
}
InitCanonical(url_string, true);
}

GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
InitCanonical(url_string, false);
}

GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
Expand All @@ -127,6 +102,23 @@ GURL::GURL(std::string canonical_spec,
InitializeFromCanonicalSpec();
}

template<typename STR>
void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
// Reserve enough room in the output for the input, plus some extra so that
// we have room if we have to escape a few things without reallocating.
spec_.reserve(input_spec.size() + 32);
url_canon::StdStringCanonOutput output(&spec_);
is_valid_ = url_util::Canonicalize(
input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
NULL, &output, &parsed_);

output.Complete(); // Must be done before using string.
if (is_valid_ && SchemeIsFileSystem()) {
inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
*parsed_.inner_parsed(), true));
}
}

void GURL::InitializeFromCanonicalSpec() {
if (is_valid_ && SchemeIsFileSystem()) {
inner_url_.reset(
Expand All @@ -140,13 +132,17 @@ void GURL::InitializeFromCanonicalSpec() {
// and we can't always canonicalize then reproducabely.
if (is_valid_) {
url_parse::Component scheme;
// We can't do this check on the inner_url of a filesystem URL, as
// canonical_spec actually points to the start of the outer URL, so we'd
// end up with infinite recursion in this constructor.
if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(),
"filesystem", &scheme) ||
scheme.begin == parsed_.scheme.begin) {
// We can't do this check on the inner_url of a filesystem URL, as
// canonical_spec actually points to the start of the outer URL, so we'd
// end up with infinite recursion in this constructor.
GURL test_url(spec_);
// We need to retain trailing whitespace on path URLs, as the |parsed_|
// spec we originally received may legitimately contain trailing white-
// space on the path or components e.g. if the #ref has been
// removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);

DCHECK(test_url.is_valid_ == is_valid_);
DCHECK(test_url.spec_ == spec_);
Expand Down
11 changes: 11 additions & 0 deletions url/gurl.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,17 @@ class URL_EXPORT GURL {
}

private:
// Variant of the string parsing constructor that allows the caller to elect
// retain trailing whitespace, if any, on the passed URL spec but only if the
// scheme is one that allows trailing whitespace. The primary use-case is
// for data: URLs. In most cases, you want to use the single parameter
// constructor above.
enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
GURL(const std::string& url_string, RetainWhiteSpaceSelector);

template<typename STR>
void InitCanonical(const STR& input_spec, bool trim_path_end);

void InitializeFromCanonicalSpec();

// Returns the substring of the input identified by the given component.
Expand Down
23 changes: 23 additions & 0 deletions url/gurl_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,29 @@ TEST(GURLTest, Replacements) {
}
}

TEST(GURLTest, ClearFragmentOnDataUrl) {
// http://crbug.com/291747 - a data URL may legitimately have trailing
// whitespace in the spec after the ref is cleared. Test this does not trigger
// the url_parse::Parsed importing validation DCHECK in GURL.
GURL url(" data: one ? two # three ");

// By default the trailing whitespace will have been stripped.
EXPECT_EQ("data: one ? two # three", url.spec());
GURL::Replacements repl;
repl.ClearRef();
GURL url_no_ref = url.ReplaceComponents(repl);

EXPECT_EQ("data: one ? two ", url_no_ref.spec());

// Importing a parsed url via this constructor overload will retain trailing
// whitespace.
GURL import_url(url_no_ref.spec(),
url_no_ref.parsed_for_possibly_invalid_spec(),
url_no_ref.is_valid());
EXPECT_EQ(url_no_ref, import_url);
EXPECT_EQ(import_url.query(), " two ");
}

TEST(GURLTest, PathForRequest) {
struct TestCase {
const char* input;
Expand Down
54 changes: 34 additions & 20 deletions url/third_party/mozilla/url_parse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -455,45 +455,53 @@ void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
// Initializes a path URL which is merely a scheme followed by a path. Examples
// include "about:foo" and "javascript:alert('bar');"
template<typename CHAR>
void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
void DoParsePathURL(const CHAR* spec, int spec_len,
bool trim_path_end,
Parsed* parsed) {
// Get the non-path and non-scheme parts of the URL out of the way, we never
// use them.
parsed->username.reset();
parsed->password.reset();
parsed->host.reset();
parsed->port.reset();
parsed->path.reset();
parsed->query.reset();
parsed->ref.reset();

// Strip leading & trailing spaces and control characters.
int begin = 0;
TrimURL(spec, &begin, &spec_len);
int scheme_begin = 0;
TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);

// Handle empty specs or ones that contain only whitespace or control chars.
if (begin == spec_len) {
if (scheme_begin == spec_len) {
parsed->scheme.reset();
parsed->path.reset();
return;
}

int path_begin;
// Extract the scheme, with the path being everything following. We also
// handle the case where there is no scheme.
if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
&parsed->scheme)) {
// Offset the results since we gave ExtractScheme a substring.
parsed->scheme.begin += begin;

// For compatability with the standard URL parser, we treat no path as
// -1, rather than having a length of 0 (we normally wouldn't care so
// much for these non-standard URLs).
if (parsed->scheme.end() == spec_len - 1)
parsed->path.reset();
else
parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
parsed->scheme.begin += scheme_begin;
path_begin = parsed->scheme.end() + 1;
} else {
// No scheme found, just path.
// No scheme case.
parsed->scheme.reset();
parsed->path = MakeRange(begin, spec_len);
path_begin = scheme_begin;
}

if (path_begin == spec_len)
return;
DCHECK_LT(path_begin, spec_len);

ParsePath(spec,
MakeRange(path_begin, spec_len),
&parsed->path,
&parsed->query,
&parsed->ref);
}

template<typename CHAR>
Expand Down Expand Up @@ -875,12 +883,18 @@ void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) {
DoParseStandardURL(url, url_len, parsed);
}

void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
DoParsePathURL(url, url_len, parsed);
void ParsePathURL(const char* url,
int url_len,
bool trim_path_end,
Parsed* parsed) {
DoParsePathURL(url, url_len, trim_path_end, parsed);
}

void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) {
DoParsePathURL(url, url_len, parsed);
void ParsePathURL(const base::char16* url,
int url_len,
bool trim_path_end,
Parsed* parsed) {
DoParsePathURL(url, url_len, trim_path_end, parsed);
}

void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
Expand Down
6 changes: 5 additions & 1 deletion url/third_party/mozilla/url_parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,13 @@ URL_EXPORT void ParseStandardURL(const base::char16* url,
// section but that aren't file URLs either. The scheme is parsed, and
// everything after the scheme is considered as the path. This is used for
// things like "about:" and "javascript:"
URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed);
URL_EXPORT void ParsePathURL(const char* url,
int url_len,
bool trim_path_end,
Parsed* parsed);
URL_EXPORT void ParsePathURL(const base::char16* url,
int url_len,
bool trim_path_end,
Parsed* parsed);

// FileURL is for file URLs. There are some special rules for interpreting
Expand Down
64 changes: 41 additions & 23 deletions url/url_canon_pathurl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,39 @@ namespace url_canon {

namespace {

// Canonicalize the given |component| from |source| into |output| and
// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
// prior to the canonicalized component; i.e. for the '?' or '#' characters.
template<typename CHAR, typename UCHAR>
bool DoCanonicalizePathComponent(const CHAR* source,
const url_parse::Component& component,
CHAR seperator,
CanonOutput* output,
url_parse::Component* new_component) {
bool success = true;
if (component.is_valid()) {
if (seperator)
output->push_back(seperator);
// Copy the path using path URL's more lax escaping rules (think for
// javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
// ASCII characters alone. This helps readability of JavaStript.
new_component->begin = output->length();
int end = component.end();
for (int i = component.begin; i < end; i++) {
UCHAR uch = static_cast<UCHAR>(source[i]);
if (uch < 0x20 || uch >= 0x80)
success &= AppendUTF8EscapedChar(source, &i, end, output);
else
output->push_back(static_cast<char>(uch));
}
new_component->len = output->length() - new_component->begin;
} else {
// Empty part.
new_component->reset();
}
return success;
}

template<typename CHAR, typename UCHAR>
bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
const url_parse::Parsed& parsed,
Expand All @@ -28,29 +61,14 @@ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
new_parsed->password.reset();
new_parsed->host.reset();
new_parsed->port.reset();

if (parsed.path.is_valid()) {
// Copy the path using path URL's more lax escaping rules (think for
// javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
// ASCII characters alone. This helps readability of JavaStript.
new_parsed->path.begin = output->length();
int end = parsed.path.end();
for (int i = parsed.path.begin; i < end; i++) {
UCHAR uch = static_cast<UCHAR>(source.path[i]);
if (uch < 0x20 || uch >= 0x80)
success &= AppendUTF8EscapedChar(source.path, &i, end, output);
else
output->push_back(static_cast<char>(uch));
}
new_parsed->path.len = output->length() - new_parsed->path.begin;
} else {
// Empty path.
new_parsed->path.reset();
}

// Assume there's no query or ref.
new_parsed->query.reset();
new_parsed->ref.reset();
// We allow path URLs to have the path, query and fragment components, but we
// will canonicalize each of the via the weaker path URL rules.
success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
source.path, parsed.path, 0, output, &new_parsed->path);
success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
source.query, parsed.query, '?', output, &new_parsed->query);
success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
source.ref, parsed.ref, '#', output, &new_parsed->ref);

return success;
}
Expand Down
12 changes: 9 additions & 3 deletions url/url_canon_relative.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,16 @@ bool DoIsRelativeURL(const char* base,
// "http:foo.html" is a relative URL with path "foo.html". If the scheme is
// empty, we treat it as relative (":foo") like IE does.
url_parse::Component scheme;
if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
// Don't allow relative URLs if the base scheme doesn't support it.
if (!is_base_hierarchical)
const bool scheme_is_empty =
!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
if (scheme_is_empty) {
if (url[begin] == '#') {
// |url| is a bare fragement (e.g. "#foo"). This can be resolved against
// any base. Fall-through.
} else if (!is_base_hierarchical) {
// Don't allow relative URLs if the base scheme doesn't support it.
return false;
}

*relative_component = url_parse::MakeRange(begin, url_len);
*is_relative = true;
Expand Down
Loading

0 comments on commit 369e84f

Please sign in to comment.