@@ -33,7 +33,7 @@ use crate::data_type::private::ParquetValueType;
3333use crate :: data_type:: * ;
3434use crate :: encodings:: levels:: LevelEncoder ;
3535use crate :: errors:: { ParquetError , Result } ;
36- use crate :: file:: metadata:: { ColumnIndexBuilder , OffsetIndexBuilder } ;
36+ use crate :: file:: metadata:: { ColumnIndexBuilder , LevelHistogram , OffsetIndexBuilder } ;
3737use crate :: file:: properties:: EnabledStatistics ;
3838use crate :: file:: statistics:: { Statistics , ValueStatistics } ;
3939use crate :: file:: {
@@ -189,6 +189,54 @@ struct PageMetrics {
189189 num_buffered_values : u32 ,
190190 num_buffered_rows : u32 ,
191191 num_page_nulls : u64 ,
192+ repetition_level_histogram : Option < LevelHistogram > ,
193+ definition_level_histogram : Option < LevelHistogram > ,
194+ }
195+
196+ impl PageMetrics {
197+ fn new ( ) -> Self {
198+ Default :: default ( )
199+ }
200+
201+ /// Initialize the repetition level histogram
202+ fn with_repetition_level_histogram ( mut self , max_level : i16 ) -> Self {
203+ self . repetition_level_histogram = LevelHistogram :: try_new ( max_level) ;
204+ self
205+ }
206+
207+ /// Initialize the definition level histogram
208+ fn with_definition_level_histogram ( mut self , max_level : i16 ) -> Self {
209+ self . definition_level_histogram = LevelHistogram :: try_new ( max_level) ;
210+ self
211+ }
212+
213+ /// Resets the state of this `PageMetrics` to the initial state.
214+ /// If histograms have been initialized their contents will be reset to zero.
215+ fn new_page ( & mut self ) {
216+ self . num_buffered_values = 0 ;
217+ self . num_buffered_rows = 0 ;
218+ self . num_page_nulls = 0 ;
219+ self . repetition_level_histogram
220+ . as_mut ( )
221+ . map ( LevelHistogram :: reset) ;
222+ self . definition_level_histogram
223+ . as_mut ( )
224+ . map ( LevelHistogram :: reset) ;
225+ }
226+
227+ /// Updates histogram values using provided repetition levels
228+ fn update_repetition_level_histogram ( & mut self , levels : & [ i16 ] ) {
229+ if let Some ( ref mut rep_hist) = self . repetition_level_histogram {
230+ rep_hist. update_from_levels ( levels) ;
231+ }
232+ }
233+
234+ /// Updates histogram values using provided definition levels
235+ fn update_definition_level_histogram ( & mut self , levels : & [ i16 ] ) {
236+ if let Some ( ref mut def_hist) = self . definition_level_histogram {
237+ def_hist. update_from_levels ( levels) ;
238+ }
239+ }
192240}
193241
194242// Metrics per column writer
@@ -206,13 +254,50 @@ struct ColumnMetrics<T: Default> {
206254 num_column_nulls : u64 ,
207255 column_distinct_count : Option < u64 > ,
208256 variable_length_bytes : Option < i64 > ,
257+ repetition_level_histogram : Option < LevelHistogram > ,
258+ definition_level_histogram : Option < LevelHistogram > ,
209259}
210260
211261impl < T : Default > ColumnMetrics < T > {
212262 fn new ( ) -> Self {
213263 Default :: default ( )
214264 }
215265
266+ /// Initialize the repetition level histogram
267+ fn with_repetition_level_histogram ( mut self , max_level : i16 ) -> Self {
268+ self . repetition_level_histogram = LevelHistogram :: try_new ( max_level) ;
269+ self
270+ }
271+
272+ /// Initialize the definition level histogram
273+ fn with_definition_level_histogram ( mut self , max_level : i16 ) -> Self {
274+ self . definition_level_histogram = LevelHistogram :: try_new ( max_level) ;
275+ self
276+ }
277+
278+ /// Sum `page_histogram` into `chunk_histogram`
279+ fn update_histogram (
280+ chunk_histogram : & mut Option < LevelHistogram > ,
281+ page_histogram : & Option < LevelHistogram > ,
282+ ) {
283+ if let ( Some ( page_hist) , Some ( chunk_hist) ) = ( page_histogram, chunk_histogram) {
284+ chunk_hist. add ( page_hist) ;
285+ }
286+ }
287+
288+ /// Sum the provided PageMetrics histograms into the chunk histograms. Does nothing if
289+ /// page histograms are not initialized.
290+ fn update_from_page_metrics ( & mut self , page_metrics : & PageMetrics ) {
291+ ColumnMetrics :: < T > :: update_histogram (
292+ & mut self . definition_level_histogram ,
293+ & page_metrics. definition_level_histogram ,
294+ ) ;
295+ ColumnMetrics :: < T > :: update_histogram (
296+ & mut self . repetition_level_histogram ,
297+ & page_metrics. repetition_level_histogram ,
298+ ) ;
299+ }
300+
216301 /// Sum the provided page variable_length_bytes into the chunk variable_length_bytes
217302 fn update_variable_length_bytes ( & mut self , variable_length_bytes : Option < i64 > ) {
218303 if let Some ( var_bytes) = variable_length_bytes {
@@ -275,6 +360,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
275360 // Used for level information
276361 encodings. insert ( Encoding :: RLE ) ;
277362
363+ let mut page_metrics = PageMetrics :: new ( ) ;
364+ let mut column_metrics = ColumnMetrics :: < E :: T > :: new ( ) ;
365+
366+ // Initialize level histograms if collecting page or chunk statistics
367+ if statistics_enabled != EnabledStatistics :: None {
368+ page_metrics = page_metrics
369+ . with_repetition_level_histogram ( descr. max_rep_level ( ) )
370+ . with_definition_level_histogram ( descr. max_def_level ( ) ) ;
371+ column_metrics = column_metrics
372+ . with_repetition_level_histogram ( descr. max_rep_level ( ) )
373+ . with_definition_level_histogram ( descr. max_def_level ( ) )
374+ }
375+
278376 // Disable column_index_builder if not collecting page statistics.
279377 let mut column_index_builder = ColumnIndexBuilder :: new ( ) ;
280378 if statistics_enabled != EnabledStatistics :: Page {
@@ -292,12 +390,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
292390 def_levels_sink : vec ! [ ] ,
293391 rep_levels_sink : vec ! [ ] ,
294392 data_pages : VecDeque :: new ( ) ,
295- page_metrics : PageMetrics {
296- num_buffered_values : 0 ,
297- num_buffered_rows : 0 ,
298- num_page_nulls : 0 ,
299- } ,
300- column_metrics : ColumnMetrics :: < E :: T > :: new ( ) ,
393+ page_metrics,
394+ column_metrics,
301395 column_index_builder,
302396 offset_index_builder : OffsetIndexBuilder :: new ( ) ,
303397 encodings,
@@ -547,6 +641,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
547641 }
548642 }
549643
644+ // Update histogram
645+ self . page_metrics . update_definition_level_histogram ( levels) ;
646+
550647 self . def_levels_sink . extend_from_slice ( levels) ;
551648 values_to_write
552649 } else {
@@ -575,6 +672,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
575672 self . page_metrics . num_buffered_rows += ( level == 0 ) as u32
576673 }
577674
675+ // Update histogram
676+ self . page_metrics . update_repetition_level_histogram ( levels) ;
677+
578678 self . rep_levels_sink . extend_from_slice ( levels) ;
579679 } else {
580680 // Each value is exactly one row.
@@ -718,7 +818,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
718818 }
719819 }
720820 }
721- // update the offset index
821+
822+ // Append page histograms to the `ColumnIndex` histograms
823+ self . column_index_builder . append_histograms (
824+ & self . page_metrics . repetition_level_histogram ,
825+ & self . page_metrics . definition_level_histogram ,
826+ ) ;
827+
828+ // Update the offset index
722829 self . offset_index_builder
723830 . append_row_count ( self . page_metrics . num_buffered_rows as i64 ) ;
724831
@@ -804,7 +911,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
804911 values_data. variable_length_bytes ,
805912 ) ;
806913
807- // Update variable_length_bytes in column_metrics
914+ // Update histograms and variable_length_bytes in column_metrics
915+ self . column_metrics
916+ . update_from_page_metrics ( & self . page_metrics ) ;
808917 self . column_metrics
809918 . update_variable_length_bytes ( values_data. variable_length_bytes ) ;
810919
@@ -911,7 +1020,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
9111020 // Reset state.
9121021 self . rep_levels_sink . clear ( ) ;
9131022 self . def_levels_sink . clear ( ) ;
914- self . page_metrics = PageMetrics :: default ( ) ;
1023+ self . page_metrics . new_page ( ) ;
9151024
9161025 Ok ( ( ) )
9171026 }
@@ -1019,7 +1128,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
10191128
10201129 builder = builder
10211130 . set_statistics ( statistics)
1022- . set_unencoded_byte_array_data_bytes ( self . column_metrics . variable_length_bytes ) ;
1131+ . set_unencoded_byte_array_data_bytes ( self . column_metrics . variable_length_bytes )
1132+ . set_repetition_level_histogram (
1133+ self . column_metrics . repetition_level_histogram . take ( ) ,
1134+ )
1135+ . set_definition_level_histogram (
1136+ self . column_metrics . definition_level_histogram . take ( ) ,
1137+ ) ;
10231138 }
10241139
10251140 let metadata = builder. build ( ) ?;
0 commit comments