Skip to content

Commit 827cc3e

Browse files
authored
Expose column index and offset index (#1318)
# Which issue does this PR close? Closes #1317. Exposes the offsets and lengths of the column index and offset index so that Parquet engines can optimize their reads.
1 parent c26a0a1 commit 827cc3e

File tree

3 files changed

+113
-4
lines changed

3 files changed

+113
-4
lines changed

parquet/src/file/metadata.rs

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,10 @@ pub struct ColumnChunkMetaData {
350350
dictionary_page_offset: Option<i64>,
351351
statistics: Option<Statistics>,
352352
bloom_filter_offset: Option<i64>,
353+
offset_index_offset: Option<i64>,
354+
offset_index_length: Option<i32>,
355+
column_index_offset: Option<i64>,
356+
column_index_length: Option<i32>,
353357
}
354358

355359
/// Represents common operations for a column chunk.
@@ -473,6 +477,34 @@ impl ColumnChunkMetaData {
473477
self.bloom_filter_offset
474478
}
475479

480+
/// Returns `true` if this column chunk has both a column index and an offset index (i.e. all four offset/length fields are set), `false` otherwise.
481+
pub fn has_column_index(&self) -> bool {
482+
self.column_index_offset.is_some()
483+
&& self.column_index_length.is_some()
484+
&& self.offset_index_offset.is_some()
485+
&& self.offset_index_length.is_some()
486+
}
487+
488+
/// Returns the offset for the column index.
489+
pub fn column_index_offset(&self) -> Option<i64> {
490+
self.column_index_offset
491+
}
492+
493+
/// Returns the length of the column index in bytes.
494+
pub fn column_index_length(&self) -> Option<i32> {
495+
self.column_index_length
496+
}
497+
498+
/// Returns the offset for the offset index.
499+
pub fn offset_index_offset(&self) -> Option<i64> {
500+
self.offset_index_offset
501+
}
502+
503+
/// Returns the length of the offset index in bytes.
504+
pub fn offset_index_length(&self) -> Option<i32> {
505+
self.offset_index_length
506+
}
507+
476508
/// Method to convert from Thrift.
477509
pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
478510
if cc.meta_data.is_none() {
@@ -497,6 +529,10 @@ impl ColumnChunkMetaData {
497529
let dictionary_page_offset = col_metadata.dictionary_page_offset;
498530
let statistics = statistics::from_thrift(column_type, col_metadata.statistics);
499531
let bloom_filter_offset = col_metadata.bloom_filter_offset;
532+
let offset_index_offset = cc.offset_index_offset;
533+
let offset_index_length = cc.offset_index_length;
534+
let column_index_offset = cc.column_index_offset;
535+
let column_index_length = cc.column_index_length;
500536
let result = ColumnChunkMetaData {
501537
column_type,
502538
column_path,
@@ -513,6 +549,10 @@ impl ColumnChunkMetaData {
513549
dictionary_page_offset,
514550
statistics,
515551
bloom_filter_offset,
552+
offset_index_offset,
553+
offset_index_length,
554+
column_index_offset,
555+
column_index_length,
516556
};
517557
Ok(result)
518558
}
@@ -540,10 +580,10 @@ impl ColumnChunkMetaData {
540580
file_path: self.file_path().map(|s| s.to_owned()),
541581
file_offset: self.file_offset,
542582
meta_data: Some(column_metadata),
543-
offset_index_offset: None,
544-
offset_index_length: None,
545-
column_index_offset: None,
546-
column_index_length: None,
583+
offset_index_offset: self.offset_index_offset,
584+
offset_index_length: self.offset_index_length,
585+
column_index_offset: self.column_index_offset,
586+
column_index_length: self.column_index_length,
547587
crypto_metadata: None,
548588
encrypted_column_metadata: None,
549589
}
@@ -565,6 +605,10 @@ pub struct ColumnChunkMetaDataBuilder {
565605
dictionary_page_offset: Option<i64>,
566606
statistics: Option<Statistics>,
567607
bloom_filter_offset: Option<i64>,
608+
offset_index_offset: Option<i64>,
609+
offset_index_length: Option<i32>,
610+
column_index_offset: Option<i64>,
611+
column_index_length: Option<i32>,
568612
}
569613

570614
impl ColumnChunkMetaDataBuilder {
@@ -584,6 +628,10 @@ impl ColumnChunkMetaDataBuilder {
584628
dictionary_page_offset: None,
585629
statistics: None,
586630
bloom_filter_offset: None,
631+
offset_index_offset: None,
632+
offset_index_length: None,
633+
column_index_offset: None,
634+
column_index_length: None,
587635
}
588636
}
589637

@@ -659,6 +707,30 @@ impl ColumnChunkMetaDataBuilder {
659707
self
660708
}
661709

710+
/// Sets optional offset index offset in bytes.
711+
pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
712+
self.offset_index_offset = value;
713+
self
714+
}
715+
716+
/// Sets optional offset index length in bytes.
717+
pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
718+
self.offset_index_length = value;
719+
self
720+
}
721+
722+
/// Sets optional column index offset in bytes.
723+
pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
724+
self.column_index_offset = value;
725+
self
726+
}
727+
728+
/// Sets optional column index length in bytes.
729+
pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
730+
self.column_index_length = value;
731+
self
732+
}
733+
662734
/// Builds column chunk metadata.
663735
pub fn build(self) -> Result<ColumnChunkMetaData> {
664736
Ok(ColumnChunkMetaData {
@@ -677,6 +749,10 @@ impl ColumnChunkMetaDataBuilder {
677749
dictionary_page_offset: self.dictionary_page_offset,
678750
statistics: self.statistics,
679751
bloom_filter_offset: self.bloom_filter_offset,
752+
offset_index_offset: self.offset_index_offset,
753+
offset_index_length: self.offset_index_length,
754+
column_index_offset: self.column_index_offset,
755+
column_index_length: self.column_index_length,
680756
})
681757
}
682758
}
@@ -740,6 +816,10 @@ mod tests {
740816
.set_data_page_offset(4000)
741817
.set_dictionary_page_offset(Some(5000))
742818
.set_bloom_filter_offset(Some(6000))
819+
.set_offset_index_offset(Some(7000))
820+
.set_offset_index_length(Some(25))
821+
.set_column_index_offset(Some(8000))
822+
.set_column_index_length(Some(25))
743823
.build()
744824
.unwrap();
745825

parquet/src/file/serialized_reader.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,15 @@ mod tests {
771771

772772
// test optional bloom filter offset
773773
assert_eq!(col0_metadata.bloom_filter_offset().unwrap(), 192);
774+
775+
// test optional column index offset and length
776+
assert!(col0_metadata.has_column_index());
777+
assert_eq!(col0_metadata.column_index_offset().unwrap(), 156);
778+
assert_eq!(col0_metadata.column_index_length().unwrap(), 25);
779+
780+
// test optional offset index offset and length
781+
assert_eq!(col0_metadata.offset_index_offset().unwrap(), 181);
782+
assert_eq!(col0_metadata.offset_index_length().unwrap(), 11);
774783
}
775784

776785
#[test]

parquet/src/schema/printer.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,26 @@ fn print_column_chunk_metadata(
168168
Some(bfo) => bfo.to_string(),
169169
};
170170
writeln!(out, "bloom filter offset: {}", bloom_filter_offset_str);
171+
let offset_index_offset_str = match cc_metadata.offset_index_offset() {
172+
None => "N/A".to_owned(),
173+
Some(oio) => oio.to_string(),
174+
};
175+
writeln!(out, "offset index offset: {}", offset_index_offset_str);
176+
let offset_index_length_str = match cc_metadata.offset_index_length() {
177+
None => "N/A".to_owned(),
178+
Some(oil) => oil.to_string(),
179+
};
180+
writeln!(out, "offset index length: {}", offset_index_length_str);
181+
let column_index_offset_str = match cc_metadata.column_index_offset() {
182+
None => "N/A".to_owned(),
183+
Some(cio) => cio.to_string(),
184+
};
185+
writeln!(out, "column index offset: {}", column_index_offset_str);
186+
let column_index_length_str = match cc_metadata.column_index_length() {
187+
None => "N/A".to_owned(),
188+
Some(cil) => cil.to_string(),
189+
};
190+
writeln!(out, "column index length: {}", column_index_length_str);
171191
writeln!(out);
172192
}
173193

0 commit comments

Comments
 (0)