Move http_util ParseContentType to mime_util and generalize step 2 of 2

Step 1: Move ParseContentType to mime_util. Step 2: Generalize API for any mime or parameter names. End state will be that this function can still be used for HttpUtil::ParseContentType() but can also be called by exo to parse a mime string such as: application/octet-stream; name="image.jpg" There has been a minor change to the logic for HttpUtil::ParseContentType(). Previously it was setting |boundary| even when it would later reject a mime type such as "*/*". I believe this was unintentional (a bug?). The new code returns without setting |boundary|. Bug: 1202034 Change-Id: Ia5d90e369b87ed0d12f23c0f86bfe7c20dbacce6 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2895987 Reviewed-by: Matt Menke <mmenke@chromium.org> Commit-Queue: Joel Hockey <joelhockey@chromium.org> Cr-Commit-Position: refs/heads/master@{#883246}
FairyWorld · May 15, 2021 · 1864da8 · 1864da8
1 parent e89bebf
commit 1864da8
Show file tree

Hide file tree

Showing 4 changed files with 168 additions and 106 deletions.
diff --git a/net/base/mime_util.cc b/net/base/mime_util.cc
@@ -453,56 +453,59 @@ bool MimeUtil::MatchesMimeType(const std::string& mime_type_pattern,
   return MatchesMimeTypeParameters(mime_type_pattern, mime_type);
 }
 
-void ParseContentType(const std::string& content_type_str,
-                      std::string* mime_type,
-                      std::string* charset,
-                      bool* had_charset,
-                      std::string* boundary) {
-  const std::string::const_iterator begin = content_type_str.begin();
-
+bool ParseMimeType(const std::string& type_str,
+                   std::string* mime_type,
+                   base::StringPairs* params) {
   // Trim leading and trailing whitespace from type.  We include '(' in
   // the trailing trim set to catch media-type comments, which are not at all
   // standard, but may occur in rare cases.
-  size_t type_val = content_type_str.find_first_not_of(HTTP_LWS);
-  type_val = std::min(type_val, content_type_str.length());
-  size_t type_end = content_type_str.find_first_of(HTTP_LWS ";(", type_val);
+  size_t type_val = type_str.find_first_not_of(HTTP_LWS);
+  type_val = std::min(type_val, type_str.length());
+  size_t type_end = type_str.find_first_of(HTTP_LWS ";(", type_val);
   if (type_end == std::string::npos)
-    type_end = content_type_str.length();
-
-  std::string charset_value;
-  bool type_has_charset = false;
-  bool type_has_boundary = false;
+    type_end = type_str.length();
+
+  // Reject a mime-type if it does not include a slash.
+  // TODO(crbug.com/1202034): This is currently matching old code to search
+  // anywhere in the string for a slash. Update to require the slash in the
+  // mime which was the intention.
+  size_t slash_pos = type_str.find_first_of('/');
+  if (slash_pos == std::string::npos)
+    return false;
+  if (mime_type)
+    *mime_type = type_str.substr(type_val, type_end - type_val);
 
   // Iterate over parameters. Can't split the string around semicolons
   // preemptively because quoted strings may include semicolons. Mostly matches
   // logic in https://mimesniff.spec.whatwg.org/. Main differences: Does not
   // validate characters are HTTP token code points / HTTP quoted-string token
   // code points, and ignores spaces after "=" in parameters.
-  std::string::size_type offset = content_type_str.find_first_of(';', type_end);
-  while (offset < content_type_str.size()) {
-    DCHECK_EQ(';', content_type_str[offset]);
+  if (params)
+    params->clear();
+  std::string::size_type offset = type_str.find_first_of(';', type_end);
+  while (offset < type_str.size()) {
+    DCHECK_EQ(';', type_str[offset]);
     // Trim off the semicolon.
     ++offset;
 
     // Trim off any following spaces.
-    offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
+    offset = type_str.find_first_not_of(HTTP_LWS, offset);
     std::string::size_type param_name_start = offset;
 
     // Extend parameter name until run into a semicolon or equals sign.  Per
     // spec, trailing spaces are not removed.
-    offset = content_type_str.find_first_of(";=", offset);
+    offset = type_str.find_first_of(";=", offset);
 
     // Nothing more to do if at end of string, or if there's no parameter
     // value, since names without values aren't allowed.
-    if (offset == std::string::npos || content_type_str[offset] == ';')
+    if (offset == std::string::npos || type_str[offset] == ';')
       continue;
 
-    auto param_name =
-        base::MakeStringPiece(content_type_str.begin() + param_name_start,
-                              content_type_str.begin() + offset);
+    auto param_name = base::MakeStringPiece(type_str.begin() + param_name_start,
+                                            type_str.begin() + offset);
 
     // Now parse the value.
-    DCHECK_EQ('=', content_type_str[offset]);
+    DCHECK_EQ('=', type_str[offset]);
     // Trim off the '='.
     offset++;
 
@@ -514,97 +517,54 @@ void ParseContentType(const std::string& content_type_str,
     // GET spec's way of getting an encoding, and the spec for handling
     // boundary values as well.
     // See https://encoding.spec.whatwg.org/#names-and-labels.
-    offset = content_type_str.find_first_not_of(HTTP_LWS, offset);
+    offset = type_str.find_first_not_of(HTTP_LWS, offset);
 
     std::string param_value;
-    if (offset == std::string::npos || content_type_str[offset] == ';') {
+    if (offset == std::string::npos || type_str[offset] == ';') {
       // Nothing to do here - an unquoted string of only whitespace should be
       // skipped.
       continue;
-    } else if (content_type_str[offset] != '"') {
+    } else if (type_str[offset] != '"') {
       // If the first character is not a quotation mark, copy data directly.
       std::string::size_type value_start = offset;
-      offset = content_type_str.find_first_of(';', offset);
+      offset = type_str.find_first_of(';', offset);
       std::string::size_type value_end = offset;
 
       // Remove terminal whitespace. If ran off the end of the string, have to
       // update |value_end| first.
       if (value_end == std::string::npos)
-        value_end = content_type_str.size();
+        value_end = type_str.size();
       while (value_end > value_start &&
-             HttpUtil::IsLWS(content_type_str[value_end - 1])) {
+             HttpUtil::IsLWS(type_str[value_end - 1])) {
         --value_end;
       }
 
-      param_value =
-          content_type_str.substr(value_start, value_end - value_start);
+      param_value = type_str.substr(value_start, value_end - value_start);
     } else {
       // Otherwise, append data, with special handling for backslashes, until
-      // a close quote.
+      // a close quote.  Do not trim whitespace for quoted-string.
 
       // Skip open quote.
-      DCHECK_EQ('"', content_type_str[offset]);
+      DCHECK_EQ('"', type_str[offset]);
       ++offset;
 
-      while (offset < content_type_str.size() &&
-             content_type_str[offset] != '"') {
+      while (offset < type_str.size() && type_str[offset] != '"') {
         // Skip over backslash and append the next character, when not at
         // the end of the string. Otherwise, copy the next character (Which may
         // be a backslash).
-        if (content_type_str[offset] == '\\' &&
-            offset + 1 < content_type_str.size()) {
+        if (type_str[offset] == '\\' && offset + 1 < type_str.size()) {
           ++offset;
         }
-        param_value += content_type_str[offset];
+        param_value += type_str[offset];
         ++offset;
       }
 
-      param_value = std::string(HttpUtil::TrimLWS(param_value));
-
-      offset = content_type_str.find_first_of(';', offset);
+      offset = type_str.find_first_of(';', offset);
     }
-
-    // TODO(mmenke): Check that name has only valid characters.
-    if (!type_has_charset &&
-        base::LowerCaseEqualsASCII(param_name, "charset")) {
-      type_has_charset = true;
-      charset_value = param_value;
-      continue;
-    }
-
-    if (boundary && !type_has_boundary &&
-        base::LowerCaseEqualsASCII(param_name, "boundary")) {
-      type_has_boundary = true;
-      boundary->assign(std::move(param_value));
-      continue;
-    }
-  }
-
-  // If the server sent "*/*", it is meaningless, so do not store it.
-  // Also, reject a mime-type if it does not include a slash.
-  // Some servers give junk after the charset parameter, which may
-  // include a comma, so this check makes us a bit more tolerant.
-  if (content_type_str.length() == 0 || content_type_str == "*/*" ||
-      content_type_str.find_first_of('/') == std::string::npos) {
-    return;
-  }
-
-  // If type_val is the same as mime_type, then just update the charset.
-  // However, if charset is empty and mime_type hasn't changed, then don't
-  // wipe-out an existing charset.
-  // It is common that mime_type is empty.
-  bool eq = !mime_type->empty() &&
-            base::LowerCaseEqualsASCII(
-                base::MakeStringPiece(begin + type_val, begin + type_end),
-                mime_type->data());
-  if (!eq) {
-    *mime_type = base::ToLowerASCII(
-        base::MakeStringPiece(begin + type_val, begin + type_end));
-  }
-  if ((!eq && *had_charset) || type_has_charset) {
-    *had_charset = true;
-    *charset = base::ToLowerASCII(charset_value);
+    if (params)
+      params->emplace_back(param_name, param_value);
   }
+  return true;
 }
 
 bool MimeUtil::ParseMimeTypeWithoutParameter(

diff --git a/net/base/mime_util.h b/net/base/mime_util.h
@@ -23,6 +23,7 @@
 #include <vector>
 
 #include "base/files/file_path.h"
+#include "base/strings/string_split.h"
 #include "net/base/net_export.h"
 
 namespace net {
@@ -57,27 +58,17 @@ NET_EXPORT bool GetPreferredExtensionForMimeType(
 NET_EXPORT bool MatchesMimeType(const std::string& mime_type_pattern,
                                 const std::string& mime_type);
 
-// Parses the value of a Content-Type header.  |mime_type|, |charset|, and
-// |had_charset| output parameters must be valid pointers.  |boundary| may be
-// nullptr.  |*mime_type| and |*charset| should be empty and |*had_charset|
-// false when called with the first Content-Type header value in a given
-// header list.
+// Parses |type_str| for |mime_type| and any |params|. Returns false, leaving
+// |mime_type| and |params| unmodified, if the mime type cannot be parsed.
 //
-// ParseContentType() supports parsing multiple Content-Type headers in the
-// same header list.  For this operation, subsequent calls should pass in the
-// same |mime_type|, |charset|, and |had_charset| arguments without clearing
-// them.
-//
-// The resulting mime_type and charset values are normalized to lowercase.
-// The mime_type and charset output values are only modified if the
-// content_type_str contains a mime type and charset value, respectively.  If
-// |boundary| is not null, then |*boundary| will be assigned the (unquoted)
-// value of the boundary parameter, if any.
-NET_EXPORT void ParseContentType(const std::string& content_type_str,
-                                 std::string* mime_type,
-                                 std::string* charset,
-                                 bool* had_charset,
-                                 std::string* boundary);
+// Returns true when mime can be parsed and:
+// If |mime_type| is non-NULL, sets it to parsed mime string.
+// If |params| is non-NULL, clears it and sets it with name-value pairs of
+// parsed parameters. Parsing of parameters is lenient, and invalid params are
+// ignored.
+NET_EXPORT bool ParseMimeType(const std::string& type_str,
+                              std::string* mime_type,
+                              base::StringPairs* params);
 
 // Returns true if the |type_string| is a correctly-formed mime type specifier
 // with no parameter, i.e. string that matches the following ABNF (see the

diff --git a/net/base/mime_util_unittest.cc b/net/base/mime_util_unittest.cc
@@ -192,6 +192,77 @@ TEST(MimeUtilTest, MatchesMimeType) {
   EXPECT_TRUE(MatchesMimeType("ab/*cd", "ab/xxxcd"));
 }
 
+TEST(MimeUtilTest, TestParseMimeType) {
+  const struct {
+    std::string type_str;
+    std::string mime_type;
+    base::StringPairs params;
+  } tests[] = {
+      // Simple tests.
+      {"image/jpeg", "image/jpeg"},
+      {"application/octet-stream;foo=bar;name=\"test.jpg\"",
+       "application/octet-stream",
+       {{"foo", "bar"}, {"name", "test.jpg"}}},
+      // Quoted string parsing.
+      {"t/s;name=\"t\\\\est\\\".jpg\"", "t/s", {{"name", "t\\est\".jpg"}}},
+      {"t/s;name=\"test.jpg\"", "t/s", {{"name", "test.jpg"}}},
+      {"t/s;name=\"test;jpg\"", "t/s", {{"name", "test;jpg"}}},
+      // Lenient for no closing quote.
+      {"t/s;name=\"test.jpg", "t/s", {{"name", "test.jpg"}}},
+      {"t/s;name=\"ab\\\"", "t/s", {{"name", "ab\""}}},
+      // Strip whitespace from start/end of mime_type.
+      {" t/s", "t/s"},
+      {"t/s ", "t/s"},
+      {" t/s ", "t/s"},
+      {"t/=", "t/="},
+      // Generally ignore whitespace.
+      {"t/s;a=1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
+      {"t/s ;a=1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
+      {"t/s; a=1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
+      // Special case, include whitespace after param name until equals.
+      {"t/s;a =1;b=2", "t/s", {{"a ", "1"}, {"b", "2"}}},
+      {"t/s;a= 1;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
+      {"t/s;a=1 ;b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
+      {"t/s;a=1; b=2", "t/s", {{"a", "1"}, {"b", "2"}}},
+      {"t/s; a = 1;b=2", "t/s", {{"a ", "1"}, {"b", "2"}}},
+      // Do not trim whitespace from quoted-string param values.
+      {"t/s;a=\" 1\";b=2", "t/s", {{"a", " 1"}, {"b", "2"}}},
+      {"t/s;a=\"1 \";b=2", "t/s", {{"a", "1 "}, {"b", "2"}}},
+      {"t/s;a=\" 1 \";b=2", "t/s", {{"a", " 1 "}, {"b", "2"}}},
+      // Ignore incomplete params.
+      {"t/s;a", "t/s", {}},
+      {"t/s;a=", "t/s", {}},
+      {"t/s;a=1;", "t/s", {{"a", "1"}}},
+      {"t/s;a=1;b", "t/s", {{"a", "1"}}},
+      {"t/s;a=1;b=", "t/s", {{"a", "1"}}},
+      // Allow empty subtype.
+      {"t/", "t/", {}},
+      {"ts/", "ts/", {}},
+      {"t/;", "t/", {}},
+      {"t/ s", "t/", {}},
+      // Questionable: allow anything as long as there is a slash somewhere.
+      {"/ts", "/ts", {}},
+      {"/s", "/s", {}},
+      {"/", "/", {}},
+      // TODO(crbug.com/1202034): This is a bug and should fail.
+      {"t / s", "t", {}},
+  };
+  for (const auto& test : tests) {
+    std::string mime_type;
+    base::StringPairs params;
+    EXPECT_TRUE(ParseMimeType(test.type_str, &mime_type, &params));
+    EXPECT_EQ(test.mime_type, mime_type);
+    EXPECT_EQ(test.params, params);
+  }
+  for (auto* type_str : {
+           // Must have slash in mime type.
+           "",
+           "ts",
+       }) {
+    EXPECT_FALSE(ParseMimeType(type_str, nullptr, nullptr));
+  }
+}
+
 TEST(MimeUtilTest, TestParseMimeTypeWithoutParameter) {
   std::string nonAscii("application/nonutf8");
   EXPECT_TRUE(ParseMimeTypeWithoutParameter(nonAscii, nullptr, nullptr));

diff --git a/net/http/http_util.cc b/net/http/http_util.cc
@@ -96,8 +96,48 @@ void HttpUtil::ParseContentType(const std::string& content_type_str,
                                 std::string* charset,
                                 bool* had_charset,
                                 std::string* boundary) {
-  net::ParseContentType(content_type_str, mime_type, charset, had_charset,
-                        boundary);
+  std::string mime_type_value;
+  base::StringPairs params;
+  bool result = ParseMimeType(content_type_str, &mime_type_value, &params);
+  // If the server sent "*/*", it is meaningless, so do not store it.
+  // Also, reject a mime-type if it does not include a slash.
+  // Some servers give junk after the charset parameter, which may
+  // include a comma, so this check makes us a bit more tolerant.
+  if (!result || content_type_str == "*/*")
+    return;
+
+  std::string charset_value;
+  bool type_has_charset = false;
+  bool type_has_boundary = false;
+  for (const auto& param : params) {
+    // Trim LWS from param value, ParseMimeType() leaves WS for quoted-string.
+    // TODO(mmenke): Check that name has only valid characters.
+    if (!type_has_charset &&
+        base::LowerCaseEqualsASCII(param.first, "charset")) {
+      type_has_charset = true;
+      charset_value = std::string(HttpUtil::TrimLWS(param.second));
+      continue;
+    }
+
+    if (boundary && !type_has_boundary &&
+        base::LowerCaseEqualsASCII(param.first, "boundary")) {
+      type_has_boundary = true;
+      *boundary = std::string(HttpUtil::TrimLWS(param.second));
+      continue;
+    }
+  }
+
+  // If `mime_type_value` is the same as `mime_type`, then just update
+  // `charset`. However, if `charset` is empty and `mime_type` hasn't changed,
+  // then don't wipe-out an existing `charset`.
+  bool eq = base::LowerCaseEqualsASCII(mime_type->data(), mime_type_value);
+  if (!eq) {
+    *mime_type = base::ToLowerASCII(mime_type_value);
+  }
+  if ((!eq && *had_charset) || type_has_charset) {
+    *had_charset = true;
+    *charset = base::ToLowerASCII(charset_value);
+  }
 }
 
 // static