Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
6d6f4e3
[thrift-remodel] Reduce use of `parquet::format` in the public API (#…
etseidl Aug 6, 2025
a69c578
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 6, 2025
0d2f33e
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 11, 2025
50f3323
[thrift-remodel] Redo thrift enums and unions (#8072)
etseidl Aug 11, 2025
2fcbca9
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 14, 2025
f315dbe
[thrift-remodel] Complete decoding of `FileMetaData` and `RowGroupMet…
etseidl Aug 15, 2025
569411b
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 15, 2025
3c353e2
[thrift-remodel] Decoding of page indexes (#8160)
etseidl Aug 20, 2025
8e5ea14
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 20, 2025
ddcb89c
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 26, 2025
f777584
[thrift-remodel] PoC new form for column index (#8191)
etseidl Aug 27, 2025
db16cb4
[thrift-remodel] Add custom `PageLocation` decoder to speed up decodi…
etseidl Aug 27, 2025
d07708c
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Aug 30, 2025
9fbe80a
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 5, 2025
64a59c3
finish merge
etseidl Sep 8, 2025
9596775
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 8, 2025
1404608
[thrift-remodel] Add thrift write support (#8237)
etseidl Sep 10, 2025
50e78e4
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 10, 2025
8f5be54
[thrift-remodel] Begin replacing file metadata reader and convert foo…
etseidl Sep 12, 2025
bbed01d
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 12, 2025
c327d7f
[thrift-remodel] Rework thrift reader API (#8341)
etseidl Sep 17, 2025
c4967d0
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 17, 2025
3dbd42e
[thrift-remodel] Use new Thrift encoder/decoder for Parquet page head…
etseidl Sep 23, 2025
8dbfe4c
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 23, 2025
67576bf
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 23, 2025
b0cc254
[thrift-remodel] Write Parquet page indexes (#8427)
etseidl Sep 25, 2025
bd6d1da
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 25, 2025
d5005e2
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 25, 2025
94306a5
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 25, 2025
50b8d35
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 25, 2025
3ec0cc9
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 26, 2025
aa26c0c
[thrift-remodel] Use new writer to write Parquet file metadata (#8445)
etseidl Sep 26, 2025
300aa2a
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 26, 2025
1365ec2
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 26, 2025
b4b4d26
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 29, 2025
2a9b061
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Sep 30, 2025
77a6cf7
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Oct 1, 2025
10ea3b2
[thrift-remodel] Remove most usage of `parquet::format` structures (#…
etseidl Oct 1, 2025
3dafbe0
Merge remote-tracking branch 'origin/main' into gh5854_thrift_remodel
etseidl Oct 1, 2025
a6d1d8e
[thrift-remodel] Incorporate changes made to geospatial statistics (#…
etseidl Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
447 changes: 447 additions & 0 deletions parquet/THRIFT.md

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub use crate::arrow::array_reader::RowGroups;
use crate::arrow::array_reader::{ArrayReader, ArrayReaderBuilder};
use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
use crate::bloom_filter::{
chunk_read_bloom_filter_header_and_offset, Sbbf, SBBF_HEADER_SIZE_ESTIMATE,
};
Expand All @@ -39,7 +40,6 @@ use crate::encryption::decrypt::FileDecryptionProperties;
use crate::errors::{ParquetError, Result};
use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader};
use crate::file::reader::{ChunkReader, SerializedPageReader};
use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
use crate::schema::types::SchemaDescriptor;

use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics;
Expand Down Expand Up @@ -261,7 +261,7 @@ impl<T> ArrowReaderBuilder<T> {
/// Skip 1100 (skip the remaining 900 rows in row group 2 and the first 200 rows in row group 3)
/// ```
///
/// [`Index`]: crate::file::page_index::index::Index
/// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub fn with_row_selection(self, selection: RowSelection) -> Self {
Self {
selection: Some(selection),
Expand Down Expand Up @@ -819,17 +819,17 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;

match header.algorithm {
BloomFilterAlgorithm::BLOCK(_) => {
BloomFilterAlgorithm::BLOCK => {
// this match exists to future proof the singleton algorithm enum
}
}
match header.compression {
BloomFilterCompression::UNCOMPRESSED(_) => {
BloomFilterCompression::UNCOMPRESSED => {
// this match exists to future proof the singleton compression enum
}
}
match header.hash {
BloomFilterHash::XXHASH(_) => {
BloomFilterHash::XXHASH => {
// this match exists to future proof the singleton hash enum
}
}
Expand Down Expand Up @@ -1185,6 +1185,7 @@ mod tests {
FloatType, Int32Type, Int64Type, Int96, Int96Type,
};
use crate::errors::Result;
use crate::file::metadata::ParquetMetaData;
use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion};
use crate::file::writer::SerializedFileWriter;
use crate::schema::parser::parse_message_type;
Expand Down Expand Up @@ -2913,7 +2914,7 @@ mod tests {
schema: TypePtr,
field: Option<Field>,
opts: &TestOptions,
) -> Result<crate::format::FileMetaData> {
) -> Result<ParquetMetaData> {
let mut writer_props = opts.writer_props();
if let Some(field) = field {
let arrow_schema = Schema::new(vec![field]);
Expand Down
7 changes: 4 additions & 3 deletions parquet/src/arrow/arrow_reader/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ use std::cmp::Ordering;
use std::collections::VecDeque;
use std::ops::Range;

use crate::file::page_index::offset_index::PageLocation;

/// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when
/// scanning a parquet file
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
Expand Down Expand Up @@ -95,7 +97,7 @@ impl RowSelector {
/// * It contains no [`RowSelector`] of 0 rows
/// * Consecutive [`RowSelector`]s alternate skipping or selecting rows
///
/// [`PageIndex`]: crate::file::page_index::index::PageIndex
/// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
#[derive(Debug, Clone, Default, Eq, PartialEq)]
pub struct RowSelection {
selectors: Vec<RowSelector>,
Expand Down Expand Up @@ -162,7 +164,7 @@ impl RowSelection {
/// Note: this method does not make any effort to combine consecutive ranges, nor coalesce
/// ranges that are close together. This is instead delegated to the IO subsystem to optimise,
/// e.g. [`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges)
pub fn scan_ranges(&self, page_locations: &[crate::format::PageLocation]) -> Vec<Range<u64>> {
pub fn scan_ranges(&self, page_locations: &[PageLocation]) -> Vec<Range<u64>> {
let mut ranges: Vec<Range<u64>> = vec![];
let mut row_offset = 0;

Expand Down Expand Up @@ -693,7 +695,6 @@ fn union_row_selections(left: &[RowSelector], right: &[RowSelector]) -> RowSelec
#[cfg(test)]
mod tests {
use super::*;
use crate::format::PageLocation;
use rand::{rng, Rng};

#[test]
Expand Down
Loading
Loading