1515// specific language governing permissions and limitations
1616// under the License.
1717
18- use arrow:: {
19- array:: ArrayRef ,
20- datatypes:: { DataType , Schema } ,
21- } ;
18+ use arrow:: { array:: ArrayRef , datatypes:: Schema } ;
2219use datafusion_common:: tree_node:: { TreeNode , VisitRecursion } ;
2320use datafusion_common:: { Column , DataFusionError , Result , ScalarValue } ;
2421use parquet:: {
2522 arrow:: { async_reader:: AsyncFileReader , ParquetRecordBatchStreamBuilder } ,
2623 bloom_filter:: Sbbf ,
27- file:: { metadata:: RowGroupMetaData , statistics :: Statistics as ParquetStatistics } ,
24+ file:: metadata:: RowGroupMetaData ,
2825} ;
2926use std:: {
3027 collections:: { HashMap , HashSet } ,
3128 sync:: Arc ,
3229} ;
3330
34- use crate :: datasource:: {
35- listing:: FileRange ,
36- physical_plan:: parquet:: { from_bytes_to_i128, parquet_to_arrow_decimal_type} ,
37- } ;
31+ use crate :: datasource:: listing:: FileRange ;
3832use crate :: logical_expr:: Operator ;
3933use crate :: physical_expr:: expressions as phys_expr;
4034use crate :: physical_optimizer:: pruning:: { PruningPredicate , PruningStatistics } ;
4135use crate :: physical_plan:: PhysicalExpr ;
4236
37+ use super :: statistics:: RowGoupStatisticsConverter ;
4338use super :: ParquetFileMetrics ;
4439
4540/// Prune row groups based on statistics
@@ -303,112 +298,6 @@ struct RowGroupPruningStatistics<'a> {
303298 parquet_schema : & ' a Schema ,
304299}
305300
306- /// Extract the min/max statistics from a `ParquetStatistics` object
307- macro_rules! get_statistic {
308- ( $column_statistics: expr, $func: ident, $bytes_func: ident, $target_arrow_type: expr) => { {
309- if !$column_statistics. has_min_max_set( ) {
310- return None ;
311- }
312- match $column_statistics {
313- ParquetStatistics :: Boolean ( s) => Some ( ScalarValue :: Boolean ( Some ( * s. $func( ) ) ) ) ,
314- ParquetStatistics :: Int32 ( s) => {
315- match $target_arrow_type {
316- // int32 to decimal with the precision and scale
317- Some ( DataType :: Decimal128 ( precision, scale) ) => {
318- Some ( ScalarValue :: Decimal128 (
319- Some ( * s. $func( ) as i128 ) ,
320- precision,
321- scale,
322- ) )
323- }
324- _ => Some ( ScalarValue :: Int32 ( Some ( * s. $func( ) ) ) ) ,
325- }
326- }
327- ParquetStatistics :: Int64 ( s) => {
328- match $target_arrow_type {
329- // int64 to decimal with the precision and scale
330- Some ( DataType :: Decimal128 ( precision, scale) ) => {
331- Some ( ScalarValue :: Decimal128 (
332- Some ( * s. $func( ) as i128 ) ,
333- precision,
334- scale,
335- ) )
336- }
337- _ => Some ( ScalarValue :: Int64 ( Some ( * s. $func( ) ) ) ) ,
338- }
339- }
340- // 96 bit ints not supported
341- ParquetStatistics :: Int96 ( _) => None ,
342- ParquetStatistics :: Float ( s) => Some ( ScalarValue :: Float32 ( Some ( * s. $func( ) ) ) ) ,
343- ParquetStatistics :: Double ( s) => Some ( ScalarValue :: Float64 ( Some ( * s. $func( ) ) ) ) ,
344- ParquetStatistics :: ByteArray ( s) => {
345- match $target_arrow_type {
346- // decimal data type
347- Some ( DataType :: Decimal128 ( precision, scale) ) => {
348- Some ( ScalarValue :: Decimal128 (
349- Some ( from_bytes_to_i128( s. $bytes_func( ) ) ) ,
350- precision,
351- scale,
352- ) )
353- }
354- _ => {
355- let s = std:: str :: from_utf8( s. $bytes_func( ) )
356- . map( |s| s. to_string( ) )
357- . ok( ) ;
358- Some ( ScalarValue :: Utf8 ( s) )
359- }
360- }
361- }
362- // type not supported yet
363- ParquetStatistics :: FixedLenByteArray ( s) => {
364- match $target_arrow_type {
365- // just support the decimal data type
366- Some ( DataType :: Decimal128 ( precision, scale) ) => {
367- Some ( ScalarValue :: Decimal128 (
368- Some ( from_bytes_to_i128( s. $bytes_func( ) ) ) ,
369- precision,
370- scale,
371- ) )
372- }
373- _ => None ,
374- }
375- }
376- }
377- } } ;
378- }
379-
380- // Extract the min or max value calling `func` or `bytes_func` on the ParquetStatistics as appropriate
381- macro_rules! get_min_max_values {
382- ( $self: expr, $column: expr, $func: ident, $bytes_func: ident) => { {
383- let ( _column_index, field) =
384- if let Some ( ( v, f) ) = $self. parquet_schema. column_with_name( & $column. name) {
385- ( v, f)
386- } else {
387- // Named column was not present
388- return None ;
389- } ;
390-
391- let data_type = field. data_type( ) ;
392- // The result may be None, because DataFusion doesn't have support for ScalarValues of the column type
393- let null_scalar: ScalarValue = data_type. try_into( ) . ok( ) ?;
394-
395- $self. row_group_metadata
396- . columns( )
397- . iter( )
398- . find( |c| c. column_descr( ) . name( ) == & $column. name)
399- . and_then( |c| if c. statistics( ) . is_some( ) { Some ( ( c. statistics( ) . unwrap( ) , c. column_descr( ) ) ) } else { None } )
400- . map( |( stats, column_descr) |
401- {
402- let target_data_type = parquet_to_arrow_decimal_type( column_descr) ;
403- get_statistic!( stats, $func, $bytes_func, target_data_type)
404- } )
405- . flatten( )
406- // column either didn't have statistics at all or didn't have min/max values
407- . or_else( || Some ( null_scalar. clone( ) ) )
408- . and_then( |s| s. to_array( ) . ok( ) )
409- } }
410- }
411-
412301// Extract the null count value on the ParquetStatistics
413302macro_rules! get_null_count_values {
414303 ( $self: expr, $column: expr) => { {
@@ -431,11 +320,29 @@ macro_rules! get_null_count_values {
431320
432321impl < ' a > PruningStatistics for RowGroupPruningStatistics < ' a > {
433322 fn min_values ( & self , column : & Column ) -> Option < ArrayRef > {
434- get_min_max_values ! ( self , column, min, min_bytes)
323+ let field = self
324+ . parquet_schema
325+ . fields ( )
326+ . find ( & column. name )
327+ . map ( |( _idx, field) | field) ?;
328+
329+ RowGoupStatisticsConverter :: new ( & field)
330+ . min ( [ self . row_group_metadata ] )
331+ // ignore errors during conversion, and just use no statistics
332+ . ok ( )
435333 }
436334
437335 fn max_values ( & self , column : & Column ) -> Option < ArrayRef > {
438- get_min_max_values ! ( self , column, max, max_bytes)
336+ let field = self
337+ . parquet_schema
338+ . fields ( )
339+ . find ( & column. name )
340+ . map ( |( _idx, field) | field) ?;
341+
342+ RowGoupStatisticsConverter :: new ( & field)
343+ . max ( [ self . row_group_metadata ] )
344+ // ignore errors during conversion, and just use no statistics
345+ . ok ( )
439346 }
440347
441348 fn num_containers ( & self ) -> usize {
0 commit comments