Refactor base::FuzzedDataProvider and fix the calling sites. The main…

… goals: 1) Avoid using std::string as a container for non-string data. The problem is that the underlying std::string buffer is bigger than the data we put inside (at least by 1 byte (null terminator), and might be even bigger). This may hide buffer overflow errors from ASan. 2) Make FuzzedDataProvider portable (remove //base dependency). 3) Make the types it returns more explicit (e.g. `int32_t` instead of `int`). Bug: 907103, 906080 Change-Id: Ibe1cd5ef6cb72140459a8ba3ac301f8c2bef48b9 Reviewed-on: https://chromium-review.googlesource.com/c/1344993 Commit-Queue: Max Moroz <mmoroz@chromium.org> Reviewed-by: Cait Phillips <caitkp@chromium.org> Reviewed-by: Nico Weber <thakis@chromium.org> Reviewed-by: Matt Menke <mmenke@chromium.org> Reviewed-by: Oliver Chang <ochang@chromium.org> Cr-Commit-Position: refs/heads/master@{#610236}
marcosholgado · Nov 21, 2018 · c416f80 · c416f80
1 parent 9cc9723
commit c416f80
Show file tree

Hide file tree

Showing 21 changed files with 256 additions and 175 deletions.
diff --git a/base/test/BUILD.gn b/base/test/BUILD.gn
@@ -48,7 +48,6 @@ static_library("test_support") {
     "android/url_utils.h",
     "bind_test_util.h",
     "copy_only_int.h",
-    "fuzzed_data_provider.cc",
     "fuzzed_data_provider.h",
     "gtest_util.cc",
     "gtest_util.h",

diff --git a/base/test/fuzzed_data_provider.cc b/base/test/fuzzed_data_provider.cc
diff --git a/base/test/fuzzed_data_provider.h b/base/test/fuzzed_data_provider.h
@@ -5,74 +5,207 @@
 #ifndef BASE_TEST_FUZZED_DATA_PROVIDER_H_
 #define BASE_TEST_FUZZED_DATA_PROVIDER_H_
 
+#include <stddef.h>
 #include <stdint.h>
 
+#include <algorithm>
+#include <limits>
 #include <string>
+#include <utility>
+#include <vector>
 
-#include "base/base_export.h"
-#include "base/macros.h"
-#include "base/strings/string_piece.h"
+// A single-header library providing an utility class to break up an array of
+// bytes (supposedly provided by a fuzzing engine) for multiple consumers.
+// Whenever run on the same input, provides the same output, as long as its
+// methods are called in the same order, with the same arguments.
 
+// TODO(mmoroz): move this out of //base as it should be an independent library.
 namespace base {
 
-// Utility class to break up fuzzer input for multiple consumers. Whenever run
-// on the same input, provides the same output, as long as its methods are
-// called in the same order, with the same arguments.
 class FuzzedDataProvider {
  public:
+  typedef uint8_t data_type;
+
   // |data| is an array of length |size| that the FuzzedDataProvider wraps to
   // provide more granular access. |data| must outlive the FuzzedDataProvider.
-  FuzzedDataProvider(const uint8_t* data, size_t size);
-  ~FuzzedDataProvider();
+  FuzzedDataProvider(const uint8_t* data, size_t size)
+      : data_ptr_(data), remaining_bytes_(size) {}
+  ~FuzzedDataProvider() = default;
+
+  // Returns a std::vector containing |num_bytes| of input data. If fewer than
+  // |num_bytes| of data remain, returns a shorter std::vector containing all
+  // of the data that's left.
+  template <typename T = data_type>
+  std::vector<T> ConsumeBytes(size_t num_bytes) {
+    static_assert(sizeof(T) == sizeof(data_type), "Incompatible data type.");
+
+    num_bytes = std::min(num_bytes, remaining_bytes_);
+
+    // The point of using the size-based constructor below is to increase the
+    // odds of having a vector object with capacity being equal to the length.
+    // That part is always implementation specific, but at least both libc++ and
+    // libstdc++ allocate the requested number of bytes in that constructor,
+    // which seems to be a natual choice for other implementations as well.
+    // To increase the odds even more, we also call |shrink_to_fit| below.
+    std::vector<T> result(num_bytes);
+    std::memcpy(result.data(), data_ptr_, num_bytes);
+    Advance(num_bytes);
+
+    // Even though |shrink_to_fit| is also implementation specific, we expect it
+    // to provide an additional assurance in case vector's constructor allocated
+    // a buffer which is larger than the actual amount of data we put inside it.
+    result.shrink_to_fit();
+    return result;
+  }
 
+  // Prefer using |ConsumeBytes| unless you actually need a std::string object.
   // Returns a std::string containing |num_bytes| of input data. If fewer than
   // |num_bytes| of data remain, returns a shorter std::string containing all
   // of the data that's left.
-  std::string ConsumeBytes(size_t num_bytes);
+  std::string ConsumeBytesAsString(size_t num_bytes) {
+    static_assert(sizeof(std::string::value_type) == sizeof(data_type),
+                  "ConsumeBytesAsString cannot convert the data to a string.");
+
+    num_bytes = std::min(num_bytes, remaining_bytes_);
+    std::string result(
+        reinterpret_cast<const std::string::value_type*>(data_ptr_), num_bytes);
+    Advance(num_bytes);
+    return result;
+  }
 
-  // Returns a std::string containing all remaining bytes of the input data.
-  std::string ConsumeRemainingBytes();
+  // Returns a number in the range [min, max] by consuming bytes from the input
+  // data. The value might not be uniformly distributed in the given range. If
+  // there's no input data left, always returns |min|. |min| must be less than
+  // or equal to |max|.
+  template <typename T>
+  T ConsumeIntegralInRange(T min, T max) {
+    static_assert(std::is_integral<T>::value, "An integral type is required.");
+    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
+
+    if (min > max)
+      abort();
+
+    // Use the biggest type possible to hold the range and the result.
+    uint64_t range = max - min;
+    uint64_t result = 0;
+    size_t offset = 0;
+
+    while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
+           remaining_bytes_ != 0) {
+      // Pull bytes off the end of the seed data. Experimentally, this seems to
+      // allow the fuzzer to more easily explore the input space. This makes
+      // sense, since it works by modifying inputs that caused new code to run,
+      // and this data is often used to encode length of data read by
+      // |ConsumeBytes|. Separating out read lengths makes it easier modify the
+      // contents of the data that is actually read.
+      --remaining_bytes_;
+      result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
+      offset += CHAR_BIT;
+    }
+
+    // Avoid division by 0, in the case |range + 1| results in overflow.
+    if (range != std::numeric_limits<decltype(range)>::max())
+      result = result % (range + 1);
+
+    return min + static_cast<T>(result);
+  }
 
   // Returns a std::string of length from 0 to |max_length|. When it runs out of
   // input data, returns what remains of the input. Designed to be more stable
   // with respect to a fuzzer inserting characters than just picking a random
-  // length and then consuming that many bytes with ConsumeBytes().
-  std::string ConsumeRandomLengthString(size_t max_length);
+  // length and then consuming that many bytes with |ConsumeBytes|.
+  std::string ConsumeRandomLengthString(size_t max_length) {
+    // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
+    // followed by anything else to the end of the string. As a result of this
+    // logic, a fuzzer can insert characters into the string, and the string
+    // will be lengthened to include those new characters, resulting in a more
+    // stable fuzzer than picking the length of a string independently from
+    // picking its contents.
+    std::string result;
+    for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
+      char next = static_cast<char>(data_ptr_[0]);
+      Advance(1);
+      if (next == '\\' && remaining_bytes_ != 0) {
+        next = static_cast<char>(data_ptr_[0]);
+        Advance(1);
+        if (next != '\\')
+          return result;
+      }
+      result += next;
+    }
+
+    result.shrink_to_fit();
+    return result;
+  }
 
-  // Returns a number in the range [min, max] by consuming bytes from the input
-  // data. The value might not be uniformly distributed in the given range. If
-  // there's no input data left, always returns |min|. |min| must be less than
-  // or equal to |max|.
-  uint32_t ConsumeUint32InRange(uint32_t min, uint32_t max);
-  int ConsumeInt32InRange(int min, int max);
+  // Returns a std::vector containing all remaining bytes of the input data.
+  template <typename T = data_type>
+  std::vector<T> ConsumeRemainingBytes() {
+    return ConsumeBytes<T>(remaining_bytes_);
+  }
+
+  // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
+  // object.
+  // Returns a std::vector containing all remaining bytes of the input data.
+  std::string ConsumeRemainingBytesAsString() {
+    return ConsumeBytesAsString(remaining_bytes_);
+  }
+
+  // TODO(mmoroz): consider deprecating these methods.
+  uint32_t ConsumeUint32InRange(uint32_t min, uint32_t max) {
+    return ConsumeIntegralInRange(min, max);
+  }
+
+  int32_t ConsumeInt32InRange(int32_t min, int32_t max) {
+    return ConsumeIntegralInRange(min, max);
+  }
 
-  // Returns a bool, or false when no data remains.
-  bool ConsumeBool();
+  int ConsumeIntInRange(int min, int max) {
+    return ConsumeIntegralInRange(min, max);
+  }
+
+  // Reads one byte and returns a bool, or false when no data remains.
+  bool ConsumeBool() { return 1 & ConsumeUint8(); }
 
   // Returns a uint8_t from the input or 0 if nothing remains. This is
   // equivalent to ConsumeUint32InRange(0, 0xFF).
-  uint8_t ConsumeUint8();
+  uint8_t ConsumeUint8() {
+    return ConsumeIntegralInRange(std::numeric_limits<uint8_t>::min(),
+                                  std::numeric_limits<uint8_t>::max());
+  }
 
   // Returns a uint16_t from the input. If fewer than 2 bytes of data remain
   // will fill the most significant bytes with 0. This is equivalent to
   // ConsumeUint32InRange(0, 0xFFFF).
-  uint16_t ConsumeUint16();
+  uint16_t ConsumeUint16() {
+    return ConsumeIntegralInRange(std::numeric_limits<uint16_t>::min(),
+                                  std::numeric_limits<uint16_t>::max());
+  }
 
   // Returns a value from |array|, consuming as many bytes as needed to do so.
-  // |array| must be a fixed-size array. Equivalent to
-  // array[ConsumeUint32InRange(sizeof(array)-1)];
+  // |array| must be a fixed-size array.
   template <typename Type, size_t size>
   Type PickValueInArray(Type (&array)[size]) {
-    return array[ConsumeUint32InRange(0, size - 1)];
+    return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
   }
 
   // Reports the remaining bytes available for fuzzed input.
-  size_t remaining_bytes() { return remaining_data_.length(); }
+  size_t remaining_bytes() { return remaining_bytes_; }
 
  private:
-  StringPiece remaining_data_;
+  FuzzedDataProvider(const FuzzedDataProvider&) = delete;
+  FuzzedDataProvider& operator=(const FuzzedDataProvider&) = delete;
+
+  void Advance(size_t num_bytes) {
+    if (num_bytes > remaining_bytes_)
+      abort();
+
+    data_ptr_ += num_bytes;
+    remaining_bytes_ -= num_bytes;
+  }
 
-  DISALLOW_COPY_AND_ASSIGN(FuzzedDataProvider);
+  const data_type* data_ptr_;
+  size_t remaining_bytes_;
 };
 
 }  // namespace base

diff --git a/components/subresource_filter/core/common/indexed_ruleset_fuzzer.cc b/components/subresource_filter/core/common/indexed_ruleset_fuzzer.cc
@@ -4,7 +4,11 @@
 
 #include "components/subresource_filter/core/common/indexed_ruleset.h"
 
+#include <stddef.h>
+#include <stdint.h>
+
 #include <string>
+#include <vector>
 
 #include "base/at_exit.h"
 #include "base/i18n/icu_util.h"
@@ -37,7 +41,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
   if (!url_to_check.is_valid())
     return 0;
 
-  const std::string remaining_bytes = fuzzed_data.ConsumeRemainingBytes();
+  std::vector<uint8_t> remaining_bytes = fuzzed_data.ConsumeRemainingBytes();
 
   // First, interpret the remaining fuzzed data as an unindexed ruleset.
   google::protobuf::io::ArrayInputStream input_stream(remaining_bytes.data(),

diff --git a/components/viz/host/hit_test/hit_test_query_fuzzer.cc b/components/viz/host/hit_test/hit_test_query_fuzzer.cc
@@ -6,6 +6,8 @@
 #include <stdint.h>
 #include <string.h>
 
+#include <vector>
+
 #include "base/command_line.h"
 #include "base/test/fuzzed_data_provider.h"
 #include "components/viz/host/hit_test/hit_test_query.h"
@@ -37,8 +39,9 @@ void AddHitTestRegion(base::FuzzedDataProvider* fuzz,
       depth < kMaxDepthAllowed ? fuzz->ConsumeUint32InRange(0, 10) : 0;
   gfx::Transform transform;
   if (fuzz->ConsumeBool() && fuzz->remaining_bytes() >= sizeof(transform)) {
-    std::string matrix_bytes = fuzz->ConsumeBytes(sizeof(gfx::Transform));
-    memcpy(&transform, matrix_bytes.data(), sizeof(gfx::Transform));
+    std::vector<uint8_t> matrix_bytes =
+        fuzz->ConsumeBytes(sizeof(gfx::Transform));
+    memcpy(&transform, matrix_bytes.data(), matrix_bytes.size());
   }
   regions->emplace_back(frame_sink_id, flags, rect, transform, child_count,
                         reasons);