Skip to content

Commit 62f91b6

Browse files
Committed: Extract parquet statistics to its own module, add tests
1 parent: 58483fb · commit: 62f91b6

File tree

4 files changed

+832
-142
lines changed

4 files changed

+832
-142
lines changed

datafusion/core/src/datasource/physical_plan/parquet.rs

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ mod metrics;
6666
pub mod page_filter;
6767
mod row_filter;
6868
mod row_groups;
69+
mod statistics;
6970

7071
pub use metrics::ParquetFileMetrics;
7172

@@ -718,28 +719,6 @@ pub async fn plan_to_parquet(
718719
Ok(())
719720
}
720721

721-
// Copy from the arrow-rs
722-
// https://github.com/apache/arrow-rs/blob/733b7e7fd1e8c43a404c3ce40ecf741d493c21b4/parquet/src/arrow/buffer/bit_util.rs#L55
723-
// Convert the byte slice to fixed length byte array with the length of 16
724-
fn sign_extend_be(b: &[u8]) -> [u8; 16] {
725-
assert!(b.len() <= 16, "Array too large, expected less than 16");
726-
let is_negative = (b[0] & 128u8) == 128u8;
727-
let mut result = if is_negative { [255u8; 16] } else { [0u8; 16] };
728-
for (d, s) in result.iter_mut().skip(16 - b.len()).zip(b) {
729-
*d = *s;
730-
}
731-
result
732-
}
733-
734-
// Convert the bytes array to i128.
735-
// The endian of the input bytes array must be big-endian.
736-
pub(crate) fn from_bytes_to_i128(b: &[u8]) -> i128 {
737-
// The bytes array are from parquet file and must be the big-endian.
738-
// The endian is defined by parquet format, and the reference document
739-
// https://github.com/apache/parquet-format/blob/54e53e5d7794d383529dd30746378f19a12afd58/src/main/thrift/parquet.thrift#L66
740-
i128::from_be_bytes(sign_extend_be(b))
741-
}
742-
743722
// Convert parquet column schema to arrow data type, and just consider the
744723
// decimal data type.
745724
pub(crate) fn parquet_to_arrow_decimal_type(

datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,8 @@ use parquet::{
3939
};
4040
use std::sync::Arc;
4141

42-
use crate::datasource::physical_plan::parquet::{
43-
from_bytes_to_i128, parquet_to_arrow_decimal_type,
44-
};
42+
use crate::datasource::physical_plan::parquet::parquet_to_arrow_decimal_type;
43+
use crate::datasource::physical_plan::parquet::statistics::from_bytes_to_i128;
4544
use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
4645

4746
use super::metrics::ParquetFileMetrics;

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 24 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -15,31 +15,26 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::{
19-
array::ArrayRef,
20-
datatypes::{DataType, Schema},
21-
};
18+
use arrow::{array::ArrayRef, datatypes::Schema};
2219
use datafusion_common::tree_node::{TreeNode, VisitRecursion};
2320
use datafusion_common::{Column, DataFusionError, Result, ScalarValue};
2421
use parquet::{
2522
arrow::{async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder},
2623
bloom_filter::Sbbf,
27-
file::{metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics},
24+
file::metadata::RowGroupMetaData,
2825
};
2926
use std::{
3027
collections::{HashMap, HashSet},
3128
sync::Arc,
3229
};
3330

34-
use crate::datasource::{
35-
listing::FileRange,
36-
physical_plan::parquet::{from_bytes_to_i128, parquet_to_arrow_decimal_type},
37-
};
31+
use crate::datasource::listing::FileRange;
3832
use crate::logical_expr::Operator;
3933
use crate::physical_expr::expressions as phys_expr;
4034
use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
4135
use crate::physical_plan::PhysicalExpr;
4236

37+
use super::statistics::RowGoupStatisticsConverter;
4338
use super::ParquetFileMetrics;
4439

4540
/// Prune row groups based on statistics
@@ -303,112 +298,6 @@ struct RowGroupPruningStatistics<'a> {
303298
parquet_schema: &'a Schema,
304299
}
305300

306-
/// Extract the min/max statistics from a `ParquetStatistics` object
307-
macro_rules! get_statistic {
308-
($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
309-
if !$column_statistics.has_min_max_set() {
310-
return None;
311-
}
312-
match $column_statistics {
313-
ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))),
314-
ParquetStatistics::Int32(s) => {
315-
match $target_arrow_type {
316-
// int32 to decimal with the precision and scale
317-
Some(DataType::Decimal128(precision, scale)) => {
318-
Some(ScalarValue::Decimal128(
319-
Some(*s.$func() as i128),
320-
precision,
321-
scale,
322-
))
323-
}
324-
_ => Some(ScalarValue::Int32(Some(*s.$func()))),
325-
}
326-
}
327-
ParquetStatistics::Int64(s) => {
328-
match $target_arrow_type {
329-
// int64 to decimal with the precision and scale
330-
Some(DataType::Decimal128(precision, scale)) => {
331-
Some(ScalarValue::Decimal128(
332-
Some(*s.$func() as i128),
333-
precision,
334-
scale,
335-
))
336-
}
337-
_ => Some(ScalarValue::Int64(Some(*s.$func()))),
338-
}
339-
}
340-
// 96 bit ints not supported
341-
ParquetStatistics::Int96(_) => None,
342-
ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))),
343-
ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))),
344-
ParquetStatistics::ByteArray(s) => {
345-
match $target_arrow_type {
346-
// decimal data type
347-
Some(DataType::Decimal128(precision, scale)) => {
348-
Some(ScalarValue::Decimal128(
349-
Some(from_bytes_to_i128(s.$bytes_func())),
350-
precision,
351-
scale,
352-
))
353-
}
354-
_ => {
355-
let s = std::str::from_utf8(s.$bytes_func())
356-
.map(|s| s.to_string())
357-
.ok();
358-
Some(ScalarValue::Utf8(s))
359-
}
360-
}
361-
}
362-
// type not supported yet
363-
ParquetStatistics::FixedLenByteArray(s) => {
364-
match $target_arrow_type {
365-
// just support the decimal data type
366-
Some(DataType::Decimal128(precision, scale)) => {
367-
Some(ScalarValue::Decimal128(
368-
Some(from_bytes_to_i128(s.$bytes_func())),
369-
precision,
370-
scale,
371-
))
372-
}
373-
_ => None,
374-
}
375-
}
376-
}
377-
}};
378-
}
379-
380-
// Extract the min or max value calling `func` or `bytes_func` on the ParquetStatistics as appropriate
381-
macro_rules! get_min_max_values {
382-
($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{
383-
let (_column_index, field) =
384-
if let Some((v, f)) = $self.parquet_schema.column_with_name(&$column.name) {
385-
(v, f)
386-
} else {
387-
// Named column was not present
388-
return None;
389-
};
390-
391-
let data_type = field.data_type();
392-
// The result may be None, because DataFusion doesn't have support for ScalarValues of the column type
393-
let null_scalar: ScalarValue = data_type.try_into().ok()?;
394-
395-
$self.row_group_metadata
396-
.columns()
397-
.iter()
398-
.find(|c| c.column_descr().name() == &$column.name)
399-
.and_then(|c| if c.statistics().is_some() {Some((c.statistics().unwrap(), c.column_descr()))} else {None})
400-
.map(|(stats, column_descr)|
401-
{
402-
let target_data_type = parquet_to_arrow_decimal_type(column_descr);
403-
get_statistic!(stats, $func, $bytes_func, target_data_type)
404-
})
405-
.flatten()
406-
// column either didn't have statistics at all or didn't have min/max values
407-
.or_else(|| Some(null_scalar.clone()))
408-
.and_then(|s| s.to_array().ok())
409-
}}
410-
}
411-
412301
// Extract the null count value on the ParquetStatistics
413302
macro_rules! get_null_count_values {
414303
($self:expr, $column:expr) => {{
@@ -431,11 +320,29 @@ macro_rules! get_null_count_values {
431320

432321
impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
433322
fn min_values(&self, column: &Column) -> Option<ArrayRef> {
434-
get_min_max_values!(self, column, min, min_bytes)
323+
let field = self
324+
.parquet_schema
325+
.fields()
326+
.find(&column.name)
327+
.map(|(_idx, field)| field)?;
328+
329+
RowGoupStatisticsConverter::new(&field)
330+
.min([self.row_group_metadata])
331+
// ignore errors during conversion, and just use no statistics
332+
.ok()
435333
}
436334

437335
fn max_values(&self, column: &Column) -> Option<ArrayRef> {
438-
get_min_max_values!(self, column, max, max_bytes)
336+
let field = self
337+
.parquet_schema
338+
.fields()
339+
.find(&column.name)
340+
.map(|(_idx, field)| field)?;
341+
342+
RowGoupStatisticsConverter::new(&field)
343+
.max([self.row_group_metadata])
344+
// ignore errors during conversion, and just use no statistics
345+
.ok()
439346
}
440347

441348
fn num_containers(&self) -> usize {

0 commit comments

Comments (0)