Skip to content

Commit 95a77b4

Browse files
committed
add options to control page encoding stats reading
1 parent a0db198 commit 95a77b4

File tree

8 files changed

+395
-23
lines changed

8 files changed

+395
-23
lines changed

parquet/benches/metadata.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,22 @@ fn criterion_benchmark(c: &mut Criterion) {
173173
})
174174
});
175175

176+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true);
177+
c.bench_function("decode metadata with stats mask", |b| {
178+
b.iter(|| {
179+
ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options))
180+
.unwrap();
181+
})
182+
});
183+
184+
let options = ParquetMetaDataOptions::new().with_skip_encoding_stats(true);
185+
c.bench_function("decode metadata with skip PES", |b| {
186+
b.iter(|| {
187+
ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options))
188+
.unwrap();
189+
})
190+
});
191+
176192
let buf: Bytes = black_box(encoded_meta()).into();
177193
c.bench_function("decode parquet metadata (wide)", |b| {
178194
b.iter(|| {
@@ -187,6 +203,20 @@ fn criterion_benchmark(c: &mut Criterion) {
187203
ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap();
188204
})
189205
});
206+
207+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true);
208+
c.bench_function("decode metadata (wide) with stats mask", |b| {
209+
b.iter(|| {
210+
ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap();
211+
})
212+
});
213+
214+
let options = ParquetMetaDataOptions::new().with_skip_encoding_stats(true);
215+
c.bench_function("decode metadata (wide) with skip PES", |b| {
216+
b.iter(|| {
217+
ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap();
218+
})
219+
});
190220
}
191221

192222
criterion_group!(benches, criterion_benchmark);

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,30 @@ impl ArrowReaderOptions {
537537
self
538538
}
539539

540+
/// Set whether to convert `encoding_stats` to a bitmask.
541+
///
542+
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
543+
/// might be desirable.
544+
///
545+
/// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
546+
/// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
547+
pub fn with_encoding_stats_as_mask(mut self, val: bool) -> Self {
548+
self.metadata_options.set_encoding_stats_as_mask(val);
549+
self
550+
}
551+
552+
/// Set whether to skip decoding `encoding_stats`.
553+
pub fn with_skip_encoding_stats(mut self, val: bool) -> Self {
554+
self.metadata_options.set_skip_encoding_stats(val);
555+
self
556+
}
557+
558+
/// Provide a list of column indices for which to decode `encoding_stats`.
559+
pub fn with_keep_encoding_stats(mut self, keep: &[usize]) -> Self {
560+
self.metadata_options.set_keep_encoding_stats(keep);
561+
self
562+
}
563+
540564
/// Provide the file decryption properties to use when reading encrypted parquet files.
541565
///
542566
/// If encryption is enabled and the file is encrypted, the `file_decryption_properties` must be provided.
@@ -1282,6 +1306,72 @@ mod tests {
12821306
assert_eq!(expected.as_ref(), builder.metadata.as_ref());
12831307
}
12841308

1309+
#[test]
1310+
fn test_page_encoding_stats_mask() {
1311+
let testdata = arrow::util::test_util::parquet_test_data();
1312+
let path = format!("{testdata}/alltypes_tiny_pages.parquet");
1313+
let file = File::open(path).unwrap();
1314+
1315+
let arrow_options = ArrowReaderOptions::new().with_encoding_stats_as_mask(true);
1316+
let builder =
1317+
ParquetRecordBatchReaderBuilder::try_new_with_options(file, arrow_options).unwrap();
1318+
1319+
let row_group_metadata = builder.metadata.row_group(0);
1320+
1321+
// test page encoding stats
1322+
let page_encoding_stats = row_group_metadata
1323+
.column(0)
1324+
.page_encoding_stats_mask()
1325+
.unwrap();
1326+
assert!(page_encoding_stats.is_set(Encoding::PLAIN));
1327+
// PLAIN = 0, so 1 << 0 or 1
1328+
assert_eq!(page_encoding_stats.as_i32() ^ 1, 0);
1329+
let page_encoding_stats = row_group_metadata
1330+
.column(2)
1331+
.page_encoding_stats_mask()
1332+
.unwrap();
1333+
assert!(page_encoding_stats.is_set(Encoding::PLAIN_DICTIONARY));
1334+
// PLAIN_DICTIONARY = 2, so 1 << 2
1335+
assert_eq!(page_encoding_stats.as_i32() ^ (1 << 2), 0);
1336+
}
1337+
1338+
#[test]
1339+
fn test_page_encoding_stats_skipped() {
1340+
let testdata = arrow::util::test_util::parquet_test_data();
1341+
let path = format!("{testdata}/alltypes_tiny_pages.parquet");
1342+
let file = File::open(path).unwrap();
1343+
1344+
// test skipping all
1345+
let arrow_options = ArrowReaderOptions::new().with_skip_encoding_stats(true);
1346+
let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
1347+
file.try_clone().unwrap(),
1348+
arrow_options,
1349+
)
1350+
.unwrap();
1351+
1352+
let row_group_metadata = builder.metadata.row_group(0);
1353+
for column in row_group_metadata.columns() {
1354+
assert!(column.page_encoding_stats().is_none());
1355+
assert!(column.page_encoding_stats_mask().is_none());
1356+
}
1357+
1358+
// test skipping all but one column and converting to mask
1359+
let arrow_options = ArrowReaderOptions::new()
1360+
.with_encoding_stats_as_mask(true)
1361+
.with_keep_encoding_stats(&[0]);
1362+
let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
1363+
file.try_clone().unwrap(),
1364+
arrow_options,
1365+
)
1366+
.unwrap();
1367+
1368+
let row_group_metadata = builder.metadata.row_group(0);
1369+
for (idx, column) in row_group_metadata.columns().iter().enumerate() {
1370+
assert!(column.page_encoding_stats().is_none());
1371+
assert_eq!(column.page_encoding_stats_mask().is_some(), idx == 0);
1372+
}
1373+
}
1374+
12851375
#[test]
12861376
fn test_arrow_reader_single_column() {
12871377
let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet");

parquet/src/basic.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,11 @@ impl EncodingMask {
737737
self.0 & (1 << (val as i32)) != 0
738738
}
739739

740+
/// Test if this mask has only the bit for the given [`Encoding`] set.
741+
pub fn is_only(&self, val: Encoding) -> bool {
742+
self.0 == (1 << (val as i32))
743+
}
744+
740745
/// Test if all [`Encoding`]s in a given set are present in this mask.
741746
pub fn all_set<'a>(&self, mut encodings: impl Iterator<Item = &'a Encoding>) -> bool {
742747
encodings.all(|&e| self.is_set(e))
@@ -2498,4 +2503,14 @@ mod tests {
24982503
"Parquet error: Attempt to create invalid mask: 0x2"
24992504
);
25002505
}
2506+
2507+
#[test]
2508+
fn test_encoding_mask_is_only() {
2509+
let mask = EncodingMask::new_from_encodings([Encoding::PLAIN].iter());
2510+
assert!(mask.is_only(Encoding::PLAIN));
2511+
2512+
let mask =
2513+
EncodingMask::new_from_encodings([Encoding::PLAIN, Encoding::PLAIN_DICTIONARY].iter());
2514+
assert!(!mask.is_only(Encoding::PLAIN));
2515+
}
25012516
}

parquet/src/file/metadata/mod.rs

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,7 @@ pub struct ColumnChunkMetaData {
813813
statistics: Option<Statistics>,
814814
geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
815815
encoding_stats: Option<Vec<PageEncodingStats>>,
816+
encoding_stats_mask: Option<EncodingMask>,
816817
bloom_filter_offset: Option<i64>,
817818
bloom_filter_length: Option<i32>,
818819
offset_index_offset: Option<i64>,
@@ -1050,12 +1051,43 @@ impl ColumnChunkMetaData {
10501051
self.geo_statistics.as_deref()
10511052
}
10521053

1053-
/// Returns the offset for the page encoding stats,
1054-
/// or `None` if no page encoding stats are available.
1054+
/// Returns the page encoding statistics, or `None` if no page encoding statistics
1055+
/// are available.
10551056
pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
10561057
self.encoding_stats.as_ref()
10571058
}
10581059

1060+
/// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
1061+
/// not available.
1062+
///
1063+
/// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
1064+
/// enable fast determination of whether all pages in a column chunk are dictionary encoded
1065+
/// (see <https://github.com/apache/parquet-format/pull/16>).
1066+
/// Decoding the full page encoding statistics, however, can be very costly, and is not
1067+
/// necessary to support the aforementioned use case. As an alternative, this crate can
1068+
/// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
1069+
/// used for data pages
1070+
/// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
1071+
/// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way:
1072+
///
1073+
/// ```rust
1074+
/// use parquet::basic::Encoding;
1075+
/// use parquet::file::metadata::ColumnChunkMetaData;
1076+
/// // test if all data pages in the column chunk are dictionary encoded
1077+
/// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
1078+
/// // check that dictionary encoding was used
1079+
/// col_meta.dictionary_page_offset().is_some()
1080+
/// && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
1081+
/// // mask should only have one bit set, either for PLAIN_DICTIONARY or
1082+
/// // RLE_DICTIONARY
1083+
/// mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
1084+
/// })
1085+
/// }
1086+
/// ```
1087+
pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
1088+
self.encoding_stats_mask.as_ref()
1089+
}
1090+
10591091
/// Returns the offset for the bloom filter.
10601092
pub fn bloom_filter_offset(&self) -> Option<i64> {
10611093
self.bloom_filter_offset
@@ -1178,6 +1210,7 @@ impl ColumnChunkMetaDataBuilder {
11781210
statistics: None,
11791211
geo_statistics: None,
11801212
encoding_stats: None,
1213+
encoding_stats_mask: None,
11811214
bloom_filter_offset: None,
11821215
bloom_filter_length: None,
11831216
offset_index_offset: None,
@@ -1278,6 +1311,12 @@ impl ColumnChunkMetaDataBuilder {
12781311
self
12791312
}
12801313

1314+
/// Sets page encoding stats mask for this column chunk.
1315+
pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
1316+
self.0.encoding_stats_mask = Some(value);
1317+
self
1318+
}
1319+
12811320
/// Clears the page encoding stats for this column chunk.
12821321
pub fn clear_page_encoding_stats(mut self) -> Self {
12831322
self.0.encoding_stats = None;
@@ -1882,9 +1921,9 @@ mod tests {
18821921
.build();
18831922

18841923
#[cfg(not(feature = "encryption"))]
1885-
let base_expected_size = 2766;
1924+
let base_expected_size = 2798;
18861925
#[cfg(feature = "encryption")]
1887-
let base_expected_size = 2934;
1926+
let base_expected_size = 2966;
18881927

18891928
assert_eq!(parquet_meta.memory_size(), base_expected_size);
18901929

@@ -1913,9 +1952,9 @@ mod tests {
19131952
.build();
19141953

19151954
#[cfg(not(feature = "encryption"))]
1916-
let bigger_expected_size = 3192;
1955+
let bigger_expected_size = 3224;
19171956
#[cfg(feature = "encryption")]
1918-
let bigger_expected_size = 3360;
1957+
let bigger_expected_size = 3392;
19191958

19201959
// more set fields means more memory usage
19211960
assert!(bigger_expected_size > base_expected_size);
@@ -1962,7 +2001,7 @@ mod tests {
19622001
.set_row_groups(row_group_meta.clone())
19632002
.build();
19642003

1965-
let base_expected_size = 2058;
2004+
let base_expected_size = 2074;
19662005
assert_eq!(parquet_meta_data.memory_size(), base_expected_size);
19672006

19682007
let footer_key = "0123456789012345".as_bytes();
@@ -1988,7 +2027,7 @@ mod tests {
19882027
.set_file_decryptor(Some(decryptor))
19892028
.build();
19902029

1991-
let expected_size_with_decryptor = 3072;
2030+
let expected_size_with_decryptor = 3088;
19922031
assert!(expected_size_with_decryptor > base_expected_size);
19932032

19942033
assert_eq!(

0 commit comments

Comments
 (0)