Skip to content

Commit

Permalink
Move http_util ParseContentType to mime_util and generalize step 2 of 2
Browse files Browse the repository at this point in the history
Step 1: Move ParseContentType to mime_util.
Step 2: Generalize API for any mime or parameter names.

End state will be that this function can still be used for
HttpUtil::ParseContentType() but can also be called by exo to parse a
mime string such as:
 application/octet-stream; name="image.jpg"

There has been a minor change to the logic of HttpUtil::ParseContentType().
Previously it set |boundary| even when it would later reject a
mime type such as "*/*". I believe this was unintentional (a bug?).
The new code returns without setting boundary.

Bug: 1202034
Change-Id: Ia5d90e369b87ed0d12f23c0f86bfe7c20dbacce6
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2895987
Reviewed-by: Matt Menke <mmenke@chromium.org>
Commit-Queue: Joel Hockey <joelhockey@chromium.org>
Cr-Commit-Position: refs/heads/master@{#883246}
  • Loading branch information
Joel Hockey authored and Chromium LUCI CQ committed May 15, 2021
1 parent e89bebf commit 1864da8
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 106 deletions.
128 changes: 44 additions & 84 deletions net/base/mime_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -453,56 +453,59 @@ bool MimeUtil::MatchesMimeType(const std::string& mime_type_pattern,
return MatchesMimeTypeParameters(mime_type_pattern, mime_type);
}

void ParseContentType(const std::string& content_type_str,
std::string* mime_type,
std::string* charset,
bool* had_charset,
std::string* boundary) {
const std::string::const_iterator begin = content_type_str.begin();

bool ParseMimeType(const std::string& type_str,
std::string* mime_type,
base::StringPairs* params) {
// Trim leading and trailing whitespace from type. We include '(' in
// the trailing trim set to catch media-type comments, which are not at all
// standard, but may occur in rare cases.
size_t type_val = content_type_str.find_first_not_of(HTTP_LWS);
type_val = std::min(type_val, content_type_str.length());
size_t type_end = content_type_str.find_first_of(HTTP_LWS ";(", type_val);
size_t type_val = type_str.find_first_not_of(HTTP_LWS);
type_val = std::min(type_val, type_str.length());
size_t type_end = type_str.find_first_of(HTTP_LWS ";(", type_val);
if (type_end == std::string::npos)
type_end = content_type_str.length();

std::string charset_value;
bool type_has_charset = false;
bool type_has_boundary = false;
type_end = type_str.length();

// Reject a mime-type if it does not include a slash.
// TODO(crbug.com/1202034): This is currently matching old code to search
// anywhere in the string for a slash. Update to require the slash in the
// mime which was the intention.
size_t slash_pos = type_str.find_first_of('/');
if (slash_pos == std::string::npos)
return false;
if (mime_type)
*mime_type = type_str.substr(type_val, type_end - type_val);

// Iterate over parameters. Can't split the string around semicolons
// preemptively because quoted strings may include semicolons. Mostly matches
// logic in https://mimesniff.spec.whatwg.org/. Main differences: Does not
// validate characters are HTTP token code points / HTTP quoted-string token
// code points, and ignores spaces after "=" in parameters.
std::string::size_type offset = content_type_str.find_first_of(';', type_end);
while (offset < content_type_str.size()) {
DCHECK_EQ(';', content_type_str[offset]);
if (params)
params->clear();
std::string::size_type offset = type_str.find_first_of(';', type_end);
while (offset < type_str.size()) {
DCHECK_EQ(';', type_str[offset]);
// Trim off the semicolon.
++offset;

// Trim off any following spaces.
offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
offset = type_str.find_first_not_of(HTTP_LWS, offset);
std::string::size_type param_name_start = offset;

// Extend parameter name until run into a semicolon or equals sign. Per
// spec, trailing spaces are not removed.
offset = content_type_str.find_first_of(";=", offset);
offset = type_str.find_first_of(";=", offset);

// Nothing more to do if at end of string, or if there's no parameter
// value, since names without values aren't allowed.
if (offset == std::string::npos || content_type_str[offset] == ';')
if (offset == std::string::npos || type_str[offset] == ';')
continue;

auto param_name =
base::MakeStringPiece(content_type_str.begin() + param_name_start,
content_type_str.begin() + offset);
auto param_name = base::MakeStringPiece(type_str.begin() + param_name_start,
type_str.begin() + offset);

// Now parse the value.
DCHECK_EQ('=', content_type_str[offset]);
DCHECK_EQ('=', type_str[offset]);
// Trim off the '='.
offset++;

Expand All @@ -514,97 +517,54 @@ void ParseContentType(const std::string& content_type_str,
// GET spec's way of getting an encoding, and the spec for handling
// boundary values as well.
// See https://encoding.spec.whatwg.org/#names-and-labels.
offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
offset = type_str.find_first_not_of(HTTP_LWS, offset);

std::string param_value;
if (offset == std::string::npos || content_type_str[offset] == ';') {
if (offset == std::string::npos || type_str[offset] == ';') {
// Nothing to do here - an unquoted string of only whitespace should be
// skipped.
continue;
} else if (content_type_str[offset] != '"') {
} else if (type_str[offset] != '"') {
// If the first character is not a quotation mark, copy data directly.
std::string::size_type value_start = offset;
offset = content_type_str.find_first_of(';', offset);
offset = type_str.find_first_of(';', offset);
std::string::size_type value_end = offset;

// Remove terminal whitespace. If ran off the end of the string, have to
// update |value_end| first.
if (value_end == std::string::npos)
value_end = content_type_str.size();
value_end = type_str.size();
while (value_end > value_start &&
HttpUtil::IsLWS(content_type_str[value_end - 1])) {
HttpUtil::IsLWS(type_str[value_end - 1])) {
--value_end;
}

param_value =
content_type_str.substr(value_start, value_end - value_start);
param_value = type_str.substr(value_start, value_end - value_start);
} else {
// Otherwise, append data, with special handling for backslashes, until
// a close quote.
// a close quote. Do not trim whitespace for quoted-string.

// Skip open quote.
DCHECK_EQ('"', content_type_str[offset]);
DCHECK_EQ('"', type_str[offset]);
++offset;

while (offset < content_type_str.size() &&
content_type_str[offset] != '"') {
while (offset < type_str.size() && type_str[offset] != '"') {
// Skip over backslash and append the next character, when not at
// the end of the string. Otherwise, copy the next character (Which may
// be a backslash).
if (content_type_str[offset] == '\\' &&
offset + 1 < content_type_str.size()) {
if (type_str[offset] == '\\' && offset + 1 < type_str.size()) {
++offset;
}
param_value += content_type_str[offset];
param_value += type_str[offset];
++offset;
}

param_value = std::string(HttpUtil::TrimLWS(param_value));

offset = content_type_str.find_first_of(';', offset);
offset = type_str.find_first_of(';', offset);
}

// TODO(mmenke): Check that name has only valid characters.
if (!type_has_charset &&
base::LowerCaseEqualsASCII(param_name, "charset")) {
type_has_charset = true;
charset_value = param_value;
continue;
}

if (boundary && !type_has_boundary &&
base::LowerCaseEqualsASCII(param_name, "boundary")) {
type_has_boundary = true;
boundary->assign(std::move(param_value));
continue;
}
}

// If the server sent "*/*", it is meaningless, so do not store it.
// Also, reject a mime-type if it does not include a slash.
// Some servers give junk after the charset parameter, which may
// include a comma, so this check makes us a bit more tolerant.
if (content_type_str.length() == 0 || content_type_str == "*/*" ||
content_type_str.find_first_of('/') == std::string::npos) {
return;
}

// If type_val is the same as mime_type, then just update the charset.
// However, if charset is empty and mime_type hasn't changed, then don't
// wipe-out an existing charset.
// It is common that mime_type is empty.
bool eq = !mime_type->empty() &&
base::LowerCaseEqualsASCII(
base::MakeStringPiece(begin + type_val, begin + type_end),
mime_type->data());
if (!eq) {
*mime_type = base::ToLowerASCII(
base::MakeStringPiece(begin + type_val, begin + type_end));
}
if ((!eq && *had_charset) || type_has_charset) {
*had_charset = true;
*charset = base::ToLowerASCII(charset_value);
if (params)
params->emplace_back(param_name, param_value);
}
return true;
}

bool MimeUtil::ParseMimeTypeWithoutParameter(
Expand Down
31 changes: 11 additions & 20 deletions net/base/mime_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <vector>

#include "base/files/file_path.h"
#include "base/strings/string_split.h"
#include "net/base/net_export.h"

namespace net {
Expand Down Expand Up @@ -57,27 +58,17 @@ NET_EXPORT bool GetPreferredExtensionForMimeType(
NET_EXPORT bool MatchesMimeType(const std::string& mime_type_pattern,
const std::string& mime_type);

// Parses the value of a Content-Type header. |mime_type|, |charset|, and
// |had_charset| output parameters must be valid pointers. |boundary| may be
// nullptr. |*mime_type| and |*charset| should be empty and |*had_charset|
// false when called with the first Content-Type header value in a given
// header list.
// Parses |type_str| for |mime_type| and any |params|. Returns false if mime
// cannot be parsed, and does not modify |mime_type| or |params|.
//
// ParseContentType() supports parsing multiple Content-Type headers in the
// same header list. For this operation, subsequent calls should pass in the
// same |mime_type|, |charset|, and |had_charset| arguments without clearing
// them.
//
// The resulting mime_type and charset values are normalized to lowercase.
// The mime_type and charset output values are only modified if the
// content_type_str contains a mime type and charset value, respectively. If
// |boundary| is not null, then |*boundary| will be assigned the (unquoted)
// value of the boundary parameter, if any.
NET_EXPORT void ParseContentType(const std::string& content_type_str,
std::string* mime_type,
std::string* charset,
bool* had_charset,
std::string* boundary);
// Returns true when mime can be parsed and:
// If |mime_type| is non-NULL, sets it to parsed mime string.
// If |params| is non-NULL, clears it and sets it with name-value pairs of
// parsed parameters. Parsing of parameters is lenient, and invalid params are
// ignored.
NET_EXPORT bool ParseMimeType(const std::string& type_str,
std::string* mime_type,
base::StringPairs* params);

// Returns true if the |type_string| is a correctly-formed mime type specifier
// with no parameter, i.e. string that matches the following ABNF (see the
Expand Down
71 changes: 71 additions & 0 deletions net/base/mime_util_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,77 @@ TEST(MimeUtilTest, MatchesMimeType) {
EXPECT_TRUE(MatchesMimeType("ab/*cd", "ab/xxxcd"));
}

// Table-driven coverage for ParseMimeType(): each row gives an input string,
// the mime type expected to be extracted from it, and the expected parameter
// name/value pairs in order. A second list holds inputs that must be rejected.
TEST(MimeUtilTest, TestParseMimeType) {
  const struct {
    std::string type_str;
    std::string mime_type;
    base::StringPairs params;
  } tests[] = {
      // Simple tests.
      {"image/jpeg", "image/jpeg"},
      {"application/octet-stream;foo=bar;name=\"test.jpg\"",
       "application/octet-stream",
       {{"foo", "bar"}, {"name", "test.jpg"}}},
      // Quoted string parsing.
      {"t/s;name=\"t\\\\est\\\".jpg\"", "t/s", {{"name", "t\\est\".jpg"}}},
      {"t/s;name=\"test.jpg\"", "t/s", {{"name", "test.jpg"}}},
      {"t/s;name=\"test;jpg\"", "t/s", {{"name", "test;jpg"}}},
      // Lenient for no closing quote.
      {"t/s;name=\"test.jpg", "t/s", {{"name", "test.jpg"}}},
      {"t/s;name=\"ab\\\"", "t/s", {{"name", "ab\""}}},
      // Strip whitespace from start/end of mime_type.
      {" t/s", "t/s"},
      {"t/s ", "t/s"},
      {" t/s ", "t/s"},
      {"t/=", "t/="},
      // Generally ignore whitespace.
      {"t/s;a=1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
      {"t/s ;a=1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
      {"t/s; a=1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
      // Special case, include whitespace after param name until equals.
      {"t/s;a =1;b=2", "t/s", {{"a ", "1"}, {"b", "2"}}},
      {"t/s;a= 1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
      {"t/s;a=1 ;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
      {"t/s;a=1; b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
      {"t/s; a = 1;b=2", "t/s", {{"a ", "1"}, {"b", "2"}}},
      // Do not trim whitespace from quoted-string param values.
      {"t/s;a=\" 1\";b=2", "t/s", {{"a", " 1"}, {"b", "2"}}},
      {"t/s;a=\"1 \";b=2", "t/s", {{"a", "1 "}, {"b", "2"}}},
      {"t/s;a=\" 1 \";b=2", "t/s", {{"a", " 1 "}, {"b", "2"}}},
      // Ignore incomplete params.
      {"t/s;a", "t/s", {}},
      {"t/s;a=", "t/s", {}},
      {"t/s;a=1;", "t/s", {{"a", "1"}}},
      {"t/s;a=1;b", "t/s", {{"a", "1"}}},
      {"t/s;a=1;b=", "t/s", {{"a", "1"}}},
      // Allow empty subtype.
      {"t/", "t/", {}},
      {"ts/", "ts/", {}},
      {"t/;", "t/", {}},
      {"t/ s", "t/", {}},
      // Questionable: allow anything as long as there is a slash somewhere.
      {"/ts", "/ts", {}},
      {"/s", "/s", {}},
      {"/", "/", {}},
      // TODO(crbug.com/1202034): This is a bug and should fail.
      {"t / s", "t", {}},
  };
  for (const auto& test : tests) {
    // Name the failing row; otherwise every failure reports the same line.
    SCOPED_TRACE(test.type_str);

    // Start from non-empty params: on success they must be cleared and then
    // repopulated, so a stale entry must never survive.
    std::string mime_type;
    base::StringPairs params = {{"stale", "entry"}};
    EXPECT_TRUE(ParseMimeType(test.type_str, &mime_type, &params));
    EXPECT_EQ(test.mime_type, mime_type);
    EXPECT_EQ(test.params, params);

    // Both output arguments are optional.
    EXPECT_TRUE(ParseMimeType(test.type_str, nullptr, nullptr));
  }
  for (const char* type_str : {
           // Must have slash in mime type.
           "",
           "ts",
       }) {
    SCOPED_TRACE(type_str);
    EXPECT_FALSE(ParseMimeType(type_str, nullptr, nullptr));

    // On failure the outputs must be left exactly as the caller passed them.
    std::string mime_type = "unchanged/value";
    base::StringPairs params = {{"unchanged", "value"}};
    const base::StringPairs original_params = params;
    EXPECT_FALSE(ParseMimeType(type_str, &mime_type, &params));
    EXPECT_EQ("unchanged/value", mime_type);
    EXPECT_EQ(original_params, params);
  }
}

TEST(MimeUtilTest, TestParseMimeTypeWithoutParameter) {
std::string nonAscii("application/nonutf8");
EXPECT_TRUE(ParseMimeTypeWithoutParameter(nonAscii, nullptr, nullptr));
Expand Down
44 changes: 42 additions & 2 deletions net/http/http_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,48 @@ void HttpUtil::ParseContentType(const std::string& content_type_str,
std::string* charset,
bool* had_charset,
std::string* boundary) {
net::ParseContentType(content_type_str, mime_type, charset, had_charset,
boundary);
// Parse into locals first; the caller's outputs are only written further down,
// once the header value has been accepted.
std::string mime_type_value;
base::StringPairs params;
bool result = ParseMimeType(content_type_str, &mime_type_value, &params);
// If the server sent "*/*", it is meaningless, so do not store it.
// Also, reject a mime-type if it does not include a slash.
// Some servers give junk after the charset parameter, which may
// include a comma, so this check makes us a bit more tolerant.
if (!result || content_type_str == "*/*")
  return;

// Only the first "charset" and the first "boundary" parameter are honoured;
// later duplicates and all other parameters are ignored.
std::string charset_value;
bool type_has_charset = false;
bool type_has_boundary = false;
for (const auto& param : params) {
  // Trim LWS from param value, ParseMimeType() leaves WS for quoted-string.
  // TODO(mmenke): Check that name has only valid characters.
  if (!type_has_charset &&
      base::LowerCaseEqualsASCII(param.first, "charset")) {
    type_has_charset = true;
    charset_value = std::string(HttpUtil::TrimLWS(param.second));
    continue;
  }

  // |boundary| is optional; skip the lookup entirely when it is null.
  if (boundary && !type_has_boundary &&
      base::LowerCaseEqualsASCII(param.first, "boundary")) {
    type_has_boundary = true;
    *boundary = std::string(HttpUtil::TrimLWS(param.second));
    continue;
  }
}

// If `mime_type_value` is the same as `mime_type`, then just update
// `charset`. However, if `charset` is empty and `mime_type` hasn't changed,
// then don't wipe-out an existing `charset`.
//
// LowerCaseEqualsASCII() lower-cases only its first argument and compares it
// against a second argument that must already be lower-case. The freshly
// parsed value (which may be mixed-case, e.g. "Text/HTML") therefore goes
// first and `*mime_type`, which this function only ever stores lower-cased,
// goes second. With the arguments reversed, a repeated mixed-case header is
// treated as a different type and a previously parsed charset is discarded.
bool eq = base::LowerCaseEqualsASCII(mime_type_value, *mime_type);
if (!eq) {
  *mime_type = base::ToLowerASCII(mime_type_value);
}
if ((!eq && *had_charset) || type_has_charset) {
  *had_charset = true;
  *charset = base::ToLowerASCII(charset_value);
}
}

// static
Expand Down

0 comments on commit 1864da8

Please sign in to comment.