Skip to content

Commit efa842a

Browse files
committed
GH-47978: [C++][Parquet][CI] Add more compression codecs to fuzzing seed corpus
1 parent 9d556a9 commit efa842a

File tree

3 files changed

+57
-18
lines changed

3 files changed

+57
-18
lines changed

cpp/src/parquet/arrow/generate_fuzz_corpus.cc

Lines changed: 55 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -81,25 +81,68 @@ std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
8181
}
8282

8383
std::vector<WriteConfig> GetWriteConfigurations() {
84+
auto default_properties_builder = [] {
85+
auto builder = WriterProperties::Builder();
86+
// Override current default of 1MB
87+
builder.data_pagesize(10'000);
88+
// Reduce max dictionary page size so that fewer pages are dict-encoded.
89+
builder.dictionary_pagesize_limit(1'000);
90+
// Emit various physical types for decimal columns
91+
builder.enable_store_decimal_as_integer();
92+
// DataPageV2 has more interesting features such as selective compression
93+
builder.data_page_version(parquet::ParquetDataPageVersion::V2);
94+
return builder;
95+
};
96+
97+
auto default_arrow_properties_builder = [] {
98+
auto builder = ArrowWriterProperties::Builder();
99+
// Store the Arrow schema so as to exercise more data types when reading
100+
builder.store_schema();
101+
return builder;
102+
};
103+
84104
// clang-format off
85-
auto w_brotli = WriterProperties::Builder()
86-
.disable_dictionary("no_dict")
87-
->compression("compressed", Compression::BROTLI)
88-
// Override current default of 1MB
89-
->data_pagesize(20'000)
90-
// Reduce max dictionary page size so that less pages are dict-encoded.
91-
->dictionary_pagesize_limit(1'000)
92-
// Emit various physical types for decimal columns
93-
->enable_store_decimal_as_integer()
105+
auto w_uncompressed = default_properties_builder()
106+
.build();
107+
// compressed columns with dictionary disabled
108+
auto w_brotli = default_properties_builder()
109+
.disable_dictionary()
110+
->compression(Compression::BROTLI)
111+
->build();
112+
auto w_gzip = default_properties_builder()
113+
.disable_dictionary()
114+
->compression(Compression::GZIP)
94115
->build();
95-
// Store the Arrow schema so as to exercise more data types when reading
96-
auto a_default = ArrowWriterProperties::Builder{}
97-
.store_schema()
116+
auto w_lz4 = default_properties_builder()
117+
.disable_dictionary()
118+
->compression(Compression::LZ4)
98119
->build();
120+
auto w_snappy = default_properties_builder()
121+
.disable_dictionary()
122+
->compression(Compression::SNAPPY)
123+
->build();
124+
auto w_zstd = default_properties_builder()
125+
.disable_dictionary()
126+
->compression(Compression::ZSTD)
127+
->build();
128+
// v1 data pages
129+
auto w_pages_v1 = default_properties_builder()
130+
.disable_dictionary()
131+
->compression(Compression::LZ4)
132+
->data_page_version(parquet::ParquetDataPageVersion::V1)
133+
->build();
134+
135+
auto a_default = default_arrow_properties_builder().build();
99136
// clang-format on
100137

101138
std::vector<WriteConfig> configs;
139+
configs.push_back({w_uncompressed, a_default});
102140
configs.push_back({w_brotli, a_default});
141+
configs.push_back({w_gzip, a_default});
142+
configs.push_back({w_lz4, a_default});
143+
configs.push_back({w_snappy, a_default});
144+
configs.push_back({w_zstd, a_default});
145+
configs.push_back({w_pages_v1, a_default});
103146
return configs;
104147
}
105148

@@ -255,8 +298,6 @@ Result<std::vector<Column>> ExampleColumns(int32_t length,
255298

256299
// TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
257300

258-
// A non-dict-encoded column (see GetWriteConfigurations)
259-
columns.push_back({"no_dict", gen.String(length, 0, 30, null_probability)});
260301
// A column that should be quite compressible (see GetWriteConfigurations)
261302
columns.push_back({"compressed", gen.Int64(length, -10, 10, null_probability)});
262303

cpp/src/parquet/arrow/reader.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1416,7 +1416,8 @@ Status FuzzReader(std::unique_ptr<FileReader> reader) {
14161416
Status FuzzReader(const uint8_t* data, int64_t size) {
14171417
auto buffer = std::make_shared<::arrow::Buffer>(data, size);
14181418
Status st;
1419-
for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 1, 13, 300}) {
1419+
// Note that very small batch sizes probably make fuzzing slower
1420+
for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
14201421
auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
14211422
FileReaderBuilder builder;
14221423
ArrowReaderProperties properties;

cpp/src/parquet/properties.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -321,8 +321,6 @@ class PARQUET_EXPORT WriterProperties {
321321
content_defined_chunking_options_(
322322
properties.content_defined_chunking_options()) {}
323323

324-
virtual ~Builder() {}
325-
326324
/// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
327325
///
328326
/// Optimize parquet files for content addressable storage (CAS) systems by writing
@@ -1183,7 +1181,6 @@ class PARQUET_EXPORT ArrowWriterProperties {
11831181
use_threads_(kArrowDefaultUseThreads),
11841182
executor_(NULLPTR),
11851183
write_time_adjusted_to_utc_(false) {}
1186-
virtual ~Builder() = default;
11871184

11881185
/// \brief Disable writing legacy int96 timestamps (default disabled).
11891186
Builder* disable_deprecated_int96_timestamps() {

0 commit comments

Comments
 (0)