Skip to content

Commit d5df352

Browse files
etseidl, mbrobbel, alamb
authored
Use custom thrift parser for parquet metadata (phase 1 of Thrift remodel) (#8530)
# Which issue does this PR close? - Closes #5854. - Closes #5775. - Related to #5853. # Rationale for this change See issue, but this change is needed to allow greater control over Parquet metadata handling. These changes speed up Thrift decoding by about 2x and enable further optimizations down the line, such as allowing selective decoding of only the metadata that is needed. # What changes are included in this PR? In short, the `format` crate is no longer necessary. This crate will decode from Thrift-encoded bytes straight to the metadata objects defined by this crate (`ParquetMetaData` and children). # Are these changes tested? Yes, but more should be added by follow-on PRs. # Are there any user-facing changes? Yes, many. --------- Co-authored-by: Matthijs Brobbel <m1brobbel@gmail.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 521f219 commit d5df352

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+7454
-3846
lines changed

parquet/THRIFT.md

Lines changed: 447 additions & 0 deletions
Large diffs are not rendered by default.

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ pub use crate::arrow::array_reader::RowGroups;
3030
use crate::arrow::array_reader::{ArrayReader, ArrayReaderBuilder};
3131
use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
3232
use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
33+
use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
3334
use crate::bloom_filter::{
3435
chunk_read_bloom_filter_header_and_offset, Sbbf, SBBF_HEADER_SIZE_ESTIMATE,
3536
};
@@ -39,7 +40,6 @@ use crate::encryption::decrypt::FileDecryptionProperties;
3940
use crate::errors::{ParquetError, Result};
4041
use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader};
4142
use crate::file::reader::{ChunkReader, SerializedPageReader};
42-
use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
4343
use crate::schema::types::SchemaDescriptor;
4444

4545
use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics;
@@ -261,7 +261,7 @@ impl<T> ArrowReaderBuilder<T> {
261261
/// Skip 1100 (skip the remaining 900 rows in row group 2 and the first 200 rows in row group 3)
262262
/// ```
263263
///
264-
/// [`Index`]: crate::file::page_index::index::Index
264+
/// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
265265
pub fn with_row_selection(self, selection: RowSelection) -> Self {
266266
Self {
267267
selection: Some(selection),
@@ -819,17 +819,17 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
819819
chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;
820820

821821
match header.algorithm {
822-
BloomFilterAlgorithm::BLOCK(_) => {
822+
BloomFilterAlgorithm::BLOCK => {
823823
// this match exists to future proof the singleton algorithm enum
824824
}
825825
}
826826
match header.compression {
827-
BloomFilterCompression::UNCOMPRESSED(_) => {
827+
BloomFilterCompression::UNCOMPRESSED => {
828828
// this match exists to future proof the singleton compression enum
829829
}
830830
}
831831
match header.hash {
832-
BloomFilterHash::XXHASH(_) => {
832+
BloomFilterHash::XXHASH => {
833833
// this match exists to future proof the singleton hash enum
834834
}
835835
}
@@ -1185,6 +1185,7 @@ mod tests {
11851185
FloatType, Int32Type, Int64Type, Int96, Int96Type,
11861186
};
11871187
use crate::errors::Result;
1188+
use crate::file::metadata::ParquetMetaData;
11881189
use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion};
11891190
use crate::file::writer::SerializedFileWriter;
11901191
use crate::schema::parser::parse_message_type;
@@ -2913,7 +2914,7 @@ mod tests {
29132914
schema: TypePtr,
29142915
field: Option<Field>,
29152916
opts: &TestOptions,
2916-
) -> Result<crate::format::FileMetaData> {
2917+
) -> Result<ParquetMetaData> {
29172918
let mut writer_props = opts.writer_props();
29182919
if let Some(field) = field {
29192920
let arrow_schema = Schema::new(vec![field]);

parquet/src/arrow/arrow_reader/selection.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ use std::cmp::Ordering;
2121
use std::collections::VecDeque;
2222
use std::ops::Range;
2323

24+
use crate::file::page_index::offset_index::PageLocation;
25+
2426
/// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when
2527
/// scanning a parquet file
2628
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
@@ -95,7 +97,7 @@ impl RowSelector {
9597
/// * It contains no [`RowSelector`] of 0 rows
9698
/// * Consecutive [`RowSelector`]s alternate skipping or selecting rows
9799
///
98-
/// [`PageIndex`]: crate::file::page_index::index::PageIndex
100+
/// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
99101
#[derive(Debug, Clone, Default, Eq, PartialEq)]
100102
pub struct RowSelection {
101103
selectors: Vec<RowSelector>,
@@ -162,7 +164,7 @@ impl RowSelection {
162164
/// Note: this method does not make any effort to combine consecutive ranges, nor coalesce
163165
/// ranges that are close together. This is instead delegated to the IO subsystem to optimise,
164166
/// e.g. [`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges)
165-
pub fn scan_ranges(&self, page_locations: &[crate::format::PageLocation]) -> Vec<Range<u64>> {
167+
pub fn scan_ranges(&self, page_locations: &[PageLocation]) -> Vec<Range<u64>> {
166168
let mut ranges: Vec<Range<u64>> = vec![];
167169
let mut row_offset = 0;
168170

@@ -693,7 +695,6 @@ fn union_row_selections(left: &[RowSelector], right: &[RowSelector]) -> RowSelec
693695
#[cfg(test)]
694696
mod tests {
695697
use super::*;
696-
use crate::format::PageLocation;
697698
use rand::{rng, Rng};
698699

699700
#[test]

0 commit comments

Comments
 (0)