Skip to content

Commit b0cc254

Browse files
authored
[thrift-remodel] Write Parquet page indexes (#8427)
# Which issue does this PR close? **Note: this targets a feature branch, not main** - Closes #5854. # Rationale for this change Continues the remodel by implementing writing of the page index structures. # What changes are included in this PR? This PR removes the old `parquet::file::page_index::Index` enum and replaces with the new `ColumnIndexMetaData` struct. # Are these changes tested? Covered by existing tests # Are there any user-facing changes? Yes.
1 parent 67576bf commit b0cc254

File tree

14 files changed

+415
-838
lines changed

14 files changed

+415
-838
lines changed

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ impl<T> ArrowReaderBuilder<T> {
261261
/// Skip 1100 (skip the remaining 900 rows in row group 2 and the first 200 rows in row group 3)
262262
/// ```
263263
///
264-
/// [`Index`]: crate::file::page_index::index::Index
264+
/// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData
265265
pub fn with_row_selection(self, selection: RowSelection) -> Self {
266266
Self {
267267
selection: Some(selection),

parquet/src/arrow/arrow_reader/selection.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ impl RowSelector {
9797
/// * It contains no [`RowSelector`] of 0 rows
9898
/// * Consecutive [`RowSelector`]s alternate skipping or selecting rows
9999
///
100-
/// [`PageIndex`]: crate::file::page_index::index::PageIndex
100+
/// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
101101
#[derive(Debug, Clone, Default, Eq, PartialEq)]
102102
pub struct RowSelection {
103103
selectors: Vec<RowSelector>,

parquet/src/column/writer/mod.rs

Lines changed: 35 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use bytes::Bytes;
2121
use half::f16;
2222

2323
use crate::bloom_filter::Sbbf;
24-
use crate::file::page_index::index::Index;
24+
use crate::file::page_index::column_index::ColumnIndexMetaData;
2525
use crate::file::page_index::offset_index::OffsetIndexMetaData;
2626
use std::collections::{BTreeSet, VecDeque};
2727
use std::str;
@@ -192,7 +192,7 @@ pub struct ColumnCloseResult {
192192
/// Optional bloom filter for this column
193193
pub bloom_filter: Option<Sbbf>,
194194
/// Optional column index, for filtering
195-
pub column_index: Option<Index>,
195+
pub column_index: Option<ColumnIndexMetaData>,
196196
/// Optional offset index, identifying page locations
197197
pub offset_index: Option<OffsetIndexMetaData>,
198198
}
@@ -2959,28 +2959,22 @@ mod tests {
29592959
assert!(r.column_index.is_some());
29602960
let col_idx = r.column_index.unwrap();
29612961
let col_idx = match col_idx {
2962-
Index::INT32(col_idx) => col_idx,
2962+
ColumnIndexMetaData::INT32(col_idx) => col_idx,
29632963
_ => panic!("wrong stats type"),
29642964
};
29652965
// null_pages should be true for page 0
2966-
assert!(col_idx.indexes[0].is_null_page());
2966+
assert!(col_idx.is_null_page(0));
29672967
// min and max should be empty byte arrays
2968-
assert!(col_idx.indexes[0].min().is_none());
2969-
assert!(col_idx.indexes[0].max().is_none());
2968+
assert!(col_idx.min_value(0).is_none());
2969+
assert!(col_idx.max_value(0).is_none());
29702970
// null_counts should be defined and be 4 for page 0
2971-
assert!(col_idx.indexes[0].null_count().is_some());
2972-
assert_eq!(col_idx.indexes[0].null_count().unwrap(), 4);
2971+
assert!(col_idx.null_count(0).is_some());
2972+
assert_eq!(col_idx.null_count(0), Some(4));
29732973
// there is no repetition so rep histogram should be absent
2974-
assert!(col_idx.indexes[0].repetition_level_histogram().is_none());
2974+
assert!(col_idx.repetition_level_histogram(0).is_none());
29752975
// definition_level_histogram should be present and should be 0:4, 1:0
2976-
assert!(col_idx.indexes[0].definition_level_histogram().is_some());
2977-
assert_eq!(
2978-
col_idx.indexes[0]
2979-
.definition_level_histogram()
2980-
.unwrap()
2981-
.values(),
2982-
&[4, 0]
2983-
);
2976+
assert!(col_idx.definition_level_histogram(0).is_some());
2977+
assert_eq!(col_idx.definition_level_histogram(0).unwrap(), &[4, 0]);
29842978
}
29852979

29862980
#[test]
@@ -3004,15 +2998,15 @@ mod tests {
30042998

30052999
// column index
30063000
let column_index = match column_index {
3007-
Index::INT32(column_index) => column_index,
3001+
ColumnIndexMetaData::INT32(column_index) => column_index,
30083002
_ => panic!("wrong stats type"),
30093003
};
3010-
assert_eq!(2, column_index.indexes.len());
3004+
assert_eq!(2, column_index.num_pages());
30113005
assert_eq!(2, offset_index.page_locations.len());
30123006
assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order);
30133007
for idx in 0..2 {
3014-
assert!(!column_index.indexes[idx].is_null_page());
3015-
assert_eq!(0, *column_index.indexes[idx].null_count.as_ref().unwrap());
3008+
assert!(!column_index.is_null_page(idx));
3009+
assert_eq!(0, column_index.null_count(0).unwrap());
30163010
}
30173011

30183012
if let Some(stats) = r.metadata.statistics() {
@@ -3022,8 +3016,8 @@ mod tests {
30223016
// first page is [1,2,3,4]
30233017
// second page is [-5,2,4,8]
30243018
// note that we don't increment here, as this is a non BinaryArray type.
3025-
assert_eq!(stats.min_opt(), column_index.indexes[1].min());
3026-
assert_eq!(stats.max_opt(), column_index.indexes[1].max());
3019+
assert_eq!(stats.min_opt(), column_index.min_value(1));
3020+
assert_eq!(stats.max_opt(), column_index.max_value(1));
30273021
} else {
30283022
panic!("expecting Statistics::Int32");
30293023
}
@@ -3064,25 +3058,25 @@ mod tests {
30643058
let offset_index = r.offset_index.unwrap();
30653059

30663060
let column_index = match column_index {
3067-
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
3061+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
30683062
_ => panic!("wrong stats type"),
30693063
};
30703064

30713065
assert_eq!(3, r.rows_written);
30723066

30733067
// column index
3074-
assert_eq!(1, column_index.indexes.len());
3068+
assert_eq!(1, column_index.num_pages());
30753069
assert_eq!(1, offset_index.page_locations.len());
30763070
assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order);
3077-
assert!(!column_index.indexes[0].is_null_page());
3078-
assert_eq!(Some(0), column_index.indexes[0].null_count());
3071+
assert!(!column_index.is_null_page(0));
3072+
assert_eq!(Some(0), column_index.null_count(0));
30793073

30803074
if let Some(stats) = r.metadata.statistics() {
30813075
assert_eq!(stats.null_count_opt(), Some(0));
30823076
assert_eq!(stats.distinct_count_opt(), None);
30833077
if let Statistics::FixedLenByteArray(stats) = stats {
3084-
let column_index_min_value = column_index.indexes[0].min_bytes().unwrap();
3085-
let column_index_max_value = column_index.indexes[0].max_bytes().unwrap();
3078+
let column_index_min_value = column_index.min_value(0).unwrap();
3079+
let column_index_max_value = column_index.max_value(0).unwrap();
30863080

30873081
// Column index stats are truncated, while the column chunk's aren't.
30883082
assert_ne!(stats.min_bytes_opt().unwrap(), column_index_min_value);
@@ -3135,25 +3129,25 @@ mod tests {
31353129
let offset_index = r.offset_index.unwrap();
31363130

31373131
let column_index = match column_index {
3138-
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
3132+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
31393133
_ => panic!("wrong stats type"),
31403134
};
31413135

31423136
assert_eq!(1, r.rows_written);
31433137

31443138
// column index
3145-
assert_eq!(1, column_index.indexes.len());
3139+
assert_eq!(1, column_index.num_pages());
31463140
assert_eq!(1, offset_index.page_locations.len());
31473141
assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order);
3148-
assert!(!column_index.indexes[0].is_null_page());
3149-
assert_eq!(Some(0), column_index.indexes[0].null_count());
3142+
assert!(!column_index.is_null_page(0));
3143+
assert_eq!(Some(0), column_index.null_count(0));
31503144

31513145
if let Some(stats) = r.metadata.statistics() {
31523146
assert_eq!(stats.null_count_opt(), Some(0));
31533147
assert_eq!(stats.distinct_count_opt(), None);
31543148
if let Statistics::FixedLenByteArray(_stats) = stats {
3155-
let column_index_min_value = column_index.indexes[0].min_bytes().unwrap();
3156-
let column_index_max_value = column_index.indexes[0].max_bytes().unwrap();
3149+
let column_index_min_value = column_index.min_value(0).unwrap();
3150+
let column_index_max_value = column_index.max_value(0).unwrap();
31573151

31583152
assert_eq!(column_index_min_value.len(), 1);
31593153
assert_eq!(column_index_max_value.len(), 1);
@@ -3190,11 +3184,11 @@ mod tests {
31903184
// ensure bytes weren't truncated for column index
31913185
let column_index = r.column_index.unwrap();
31923186
let column_index = match column_index {
3193-
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
3187+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
31943188
_ => panic!("wrong stats type"),
31953189
};
3196-
let column_index_min_bytes = column_index.indexes[0].min_bytes().unwrap();
3197-
let column_index_max_bytes = column_index.indexes[0].min_bytes().unwrap();
3190+
let column_index_min_bytes = column_index.min_value(0).unwrap();
3191+
let column_index_max_bytes = column_index.max_value(0).unwrap();
31983192
assert_eq!(expected_value, column_index_min_bytes);
31993193
assert_eq!(expected_value, column_index_max_bytes);
32003194

@@ -3233,11 +3227,11 @@ mod tests {
32333227
// ensure bytes weren't truncated for column index
32343228
let column_index = r.column_index.unwrap();
32353229
let column_index = match column_index {
3236-
Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
3230+
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index,
32373231
_ => panic!("wrong stats type"),
32383232
};
3239-
let column_index_min_bytes = column_index.indexes[0].min_bytes().unwrap();
3240-
let column_index_max_bytes = column_index.indexes[0].min_bytes().unwrap();
3233+
let column_index_min_bytes = column_index.min_value(0).unwrap();
3234+
let column_index_max_bytes = column_index.max_value(0).unwrap();
32413235
assert_eq!(expected_value, column_index_min_bytes);
32423236
assert_eq!(expected_value, column_index_max_bytes);
32433237

parquet/src/file/metadata/memory.rs

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ use crate::file::page_encoding_stats::PageEncodingStats;
2727
use crate::file::page_index::column_index::{
2828
ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex,
2929
};
30-
use crate::file::page_index::index::{Index, NativeIndex, PageIndex};
3130
use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
3231
use crate::file::statistics::{Statistics, ValueStatistics};
3332
use std::sync::Arc;
@@ -199,34 +198,6 @@ impl HeapSize for ByteArrayColumnIndex {
199198
}
200199
}
201200

202-
impl HeapSize for Index {
203-
fn heap_size(&self) -> usize {
204-
match self {
205-
Index::NONE => 0,
206-
Index::BOOLEAN(native_index) => native_index.heap_size(),
207-
Index::INT32(native_index) => native_index.heap_size(),
208-
Index::INT64(native_index) => native_index.heap_size(),
209-
Index::INT96(native_index) => native_index.heap_size(),
210-
Index::FLOAT(native_index) => native_index.heap_size(),
211-
Index::DOUBLE(native_index) => native_index.heap_size(),
212-
Index::BYTE_ARRAY(native_index) => native_index.heap_size(),
213-
Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(),
214-
}
215-
}
216-
}
217-
218-
impl<T: ParquetValueType> HeapSize for NativeIndex<T> {
219-
fn heap_size(&self) -> usize {
220-
self.indexes.heap_size() + self.boundary_order.heap_size()
221-
}
222-
}
223-
224-
impl<T: ParquetValueType> HeapSize for PageIndex<T> {
225-
fn heap_size(&self) -> usize {
226-
self.min.heap_size() + self.max.heap_size() + self.null_count.heap_size()
227-
}
228-
}
229-
230201
impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
231202
fn heap_size(&self) -> usize {
232203
self.min_opt().map(T::heap_size).unwrap_or(0)

0 commit comments

Comments
 (0)