cpp/src/parquet/arrow/generate_fuzz_corpus.cc (69 changes: 55 additions & 14 deletions)
@@ -81,25 +81,68 @@ std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
 }
 
 std::vector<WriteConfig> GetWriteConfigurations() {
+  auto default_properties_builder = [] {
+    auto builder = WriterProperties::Builder();
+    // Override current default of 1MB
+    builder.data_pagesize(10'000);
+    // Reduce max dictionary page size so that less pages are dict-encoded.
+    builder.dictionary_pagesize_limit(1'000);
+    // Emit various physical types for decimal columns
+    builder.enable_store_decimal_as_integer();
+    // DataPageV2 has more interesting features such as selective compression
+    builder.data_page_version(parquet::ParquetDataPageVersion::V2);
+    return builder;
+  };
+
+  auto default_arrow_properties_builder = [] {
+    auto builder = ArrowWriterProperties::Builder();
+    // Store the Arrow schema so as to exercise more data types when reading
+    builder.store_schema();
+    return builder;
+  };
+
   // clang-format off
-  auto w_brotli = WriterProperties::Builder()
-      .disable_dictionary("no_dict")
-      ->compression("compressed", Compression::BROTLI)
-      // Override current default of 1MB
-      ->data_pagesize(20'000)
-      // Reduce max dictionary page size so that less pages are dict-encoded.
-      ->dictionary_pagesize_limit(1'000)
-      // Emit various physical types for decimal columns
-      ->enable_store_decimal_as_integer()
+  auto w_uncompressed = default_properties_builder()
+      .build();
+  // compressed columns with dictionary disabled
+  auto w_brotli = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::BROTLI)
+      ->build();
+  auto w_gzip = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::GZIP)
       ->build();
-  // Store the Arrow schema so as to exercise more data types when reading
-  auto a_default = ArrowWriterProperties::Builder{}
-      .store_schema()
+  auto w_lz4 = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::LZ4)
       ->build();
+  auto w_snappy = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::SNAPPY)
+      ->build();
+  auto w_zstd = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::ZSTD)
+      ->build();
+  // v1 data pages
+  auto w_pages_v1 = default_properties_builder()
+      .disable_dictionary()
+      ->compression(Compression::LZ4)
+      ->data_page_version(parquet::ParquetDataPageVersion::V1)
+      ->build();
+
+  auto a_default = default_arrow_properties_builder().build();
   // clang-format on
 
   std::vector<WriteConfig> configs;
+  configs.push_back({w_uncompressed, a_default});
   configs.push_back({w_brotli, a_default});
+  configs.push_back({w_gzip, a_default});
+  configs.push_back({w_lz4, a_default});
+  configs.push_back({w_snappy, a_default});
+  configs.push_back({w_zstd, a_default});
+  configs.push_back({w_pages_v1, a_default});
   return configs;
 }
 
@@ -255,8 +298,6 @@ Result<std::vector<Column>> ExampleColumns(int32_t length,
 
   // TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
 
-  // A non-dict-encoded column (see GetWriteConfigurations)
-  columns.push_back({"no_dict", gen.String(length, 0, 30, null_probability)});
   // A column that should be quite compressible (see GetWriteConfigurations)
   columns.push_back({"compressed", gen.Int64(length, -10, 10, null_probability)});
 
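Note: each WriteConfig above pairs a WriterProperties with an ArrowWriterProperties instance, and the corpus generator writes the same example table once per configuration. A rough sketch of how such a pair is consumed by the Arrow Parquet writer follows; the helper name, the chunk size, and passing the two properties objects individually are illustrative assumptions, not the generator's actual code:

    #include <memory>
    #include <string>
    #include <utility>

    #include "arrow/io/file.h"
    #include "arrow/memory_pool.h"
    #include "arrow/result.h"
    #include "arrow/table.h"
    #include "parquet/arrow/writer.h"

    // Hypothetical helper: write `table` to `path` under one write configuration.
    ::arrow::Status WriteCorpusFile(
        const std::shared_ptr<::arrow::Table>& table, const std::string& path,
        std::shared_ptr<parquet::WriterProperties> writer_properties,
        std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties) {
      ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::FileOutputStream::Open(path));
      // Small row groups keep the corpus files tiny, which speeds up fuzzing.
      return ::parquet::arrow::WriteTable(*table, ::arrow::default_memory_pool(),
                                          sink, /*chunk_size=*/512,
                                          std::move(writer_properties),
                                          std::move(arrow_properties));
    }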
cpp/src/parquet/arrow/reader.cc (25 changes: 18 additions & 7 deletions)
@@ -1414,21 +1414,32 @@ Status FuzzReader(std::unique_ptr<FileReader> reader) {
 }  // namespace
 
 Status FuzzReader(const uint8_t* data, int64_t size) {
-  auto buffer = std::make_shared<::arrow::Buffer>(data, size);
   Status st;
-  for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 1, 13, 300}) {
-    auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
-    FileReaderBuilder builder;
+
+  auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+  auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+  auto pool = ::arrow::default_memory_pool();
+
+  // Read Parquet file metadata only once, which will reduce iteration time slightly
+  std::shared_ptr<FileMetaData> pq_md;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS
+  pq_md = ParquetFileReader::Open(file)->metadata();
+  END_PARQUET_CATCH_EXCEPTIONS
+
+  // Note that very small batch sizes probably make fuzzing slower
+  for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
     ArrowReaderProperties properties;
     if (batch_size) {
       properties.set_batch_size(batch_size.value());
     }
-    builder.properties(properties);
 
-    RETURN_NOT_OK(builder.Open(std::move(file)));
+    std::unique_ptr<ParquetFileReader> pq_file_reader;
+    BEGIN_PARQUET_CATCH_EXCEPTIONS
+    pq_file_reader = ParquetFileReader::Open(file, default_reader_properties(), pq_md);
+    END_PARQUET_CATCH_EXCEPTIONS
 
     std::unique_ptr<FileReader> reader;
-    RETURN_NOT_OK(builder.Build(&reader));
+    RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), properties, &reader));
     st &= FuzzReader(std::move(reader));
   }
   return st;
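For reference, this FuzzReader overload is the function the parquet-arrow fuzz target feeds raw bytes into; hoisting the metadata parse out of the batch-size loop means each iteration only re-does the actual column decoding. The entry-point wiring looks roughly like this (a sketch; the exact namespace and file in the Arrow tree are assumed here):

    #include <cstdint>

    #include "parquet/arrow/reader.h"

    // Sketch of a libFuzzer entry point driving FuzzReader.
    extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
      auto status =
          ::parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
      // Error Statuses are expected on malformed input; the fuzzer is hunting
      // for crashes and sanitizer failures, not clean returns.
      (void)status;
      return 0;
    }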
cpp/src/parquet/properties.h (3 changes: 0 additions & 3 deletions)
@@ -321,8 +321,6 @@ class PARQUET_EXPORT WriterProperties {
           content_defined_chunking_options_(
               properties.content_defined_chunking_options()) {}
 
-    virtual ~Builder() {}
-
     /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
     ///
     /// Optimize parquet files for content addressable storage (CAS) systems by writing
@@ -1183,7 +1181,6 @@ class PARQUET_EXPORT ArrowWriterProperties {
           use_threads_(kArrowDefaultUseThreads),
           executor_(NULLPTR),
           write_time_adjusted_to_utc_(false) {}
-    virtual ~Builder() = default;
 
     /// \brief Disable writing legacy int96 timestamps (default disabled).
     Builder* disable_deprecated_int96_timestamps() {
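The destructor removals tie into the generate_fuzz_corpus.cc change above: a user-declared destructor suppresses a class's implicitly generated move operations, so returning a Builder by value from the new default_properties_builder lambdas would have fallen back to copying. Since nothing in the tree derives from these Builder classes, the virtual destructors were presumably dead weight. A sketch of the factory pattern this enables (MakeZstdProperties is a hypothetical name, not part of the PR):

    #include <memory>

    #include "parquet/properties.h"

    // Hypothetical factory: shared defaults set up in a lambda, with the
    // configuration-specific calls chained on afterwards.
    std::shared_ptr<parquet::WriterProperties> MakeZstdProperties() {
      auto make_builder = [] {
        parquet::WriterProperties::Builder builder;
        builder.data_pagesize(10'000);
        return builder;  // returned by value, relying on the implicit move
      };
      return make_builder()
          .disable_dictionary()
          ->compression(parquet::Compression::ZSTD)
          ->build();
    }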