Skip to content

Commit

Permalink
Extract Date32 parquet statistics as Date32Array rather than `Int…
Browse files Browse the repository at this point in the history
…32Array` (#10593)

* Fixes bug expect `Date32Array` but returns Int32Array

* Add round trip ut

* Update arrow_statistics.rs

* remove unreachable code

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
xinlifoobar and alamb authored May 23, 2024
1 parent 656da83 commit 529d2c0
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 18 deletions.
96 changes: 93 additions & 3 deletions datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ macro_rules! get_statistic {
*scale,
))
}
Some(DataType::Date32) => {
Some(ScalarValue::Date32(Some(*s.$func())))
}
Some(DataType::Date64) => {
Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
}
_ => Some(ScalarValue::Int32(Some(*s.$func()))),
}
}
Expand Down Expand Up @@ -363,10 +369,12 @@ impl<'a> StatisticsConverter<'a> {
#[cfg(test)]
mod test {
use super::*;
use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{Date32Type, Date64Type};
use arrow_array::{
new_null_array, Array, BinaryArray, BooleanArray, Decimal128Array, Float32Array,
Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, StructArray,
TimestampNanosecondArray,
new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
StringArray, StructArray, TimestampNanosecondArray,
};
use arrow_schema::{Field, SchemaRef};
use bytes::Bytes;
Expand Down Expand Up @@ -664,6 +672,68 @@ mod test {
.run()
}

#[test]
fn roundtrip_date32() {
Test {
input: date32_array(vec![
// row group 1
Some("2021-01-01"),
None,
Some("2021-01-03"),
// row group 2
Some("2021-01-01"),
Some("2021-01-05"),
None,
// row group 3
None,
None,
None,
]),
expected_min: date32_array(vec![
Some("2021-01-01"),
Some("2021-01-01"),
None,
]),
expected_max: date32_array(vec![
Some("2021-01-03"),
Some("2021-01-05"),
None,
]),
}
.run()
}

#[test]
fn roundtrip_date64() {
Test {
input: date64_array(vec![
// row group 1
Some("2021-01-01"),
None,
Some("2021-01-03"),
// row group 2
Some("2021-01-01"),
Some("2021-01-05"),
None,
// row group 3
None,
None,
None,
]),
expected_min: date64_array(vec![
Some("2021-01-01"),
Some("2021-01-01"),
None,
]),
expected_max: date64_array(vec![
Some("2021-01-03"),
Some("2021-01-05"),
None,
]),
}
.run()
}

#[test]
fn struct_and_non_struct() {
// Ensures that statistics for an array that appears *after* a struct
Expand Down Expand Up @@ -1069,4 +1139,24 @@ mod test {
]);
Arc::new(struct_array)
}

fn date32_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
let array = Date32Array::from(
input
.into_iter()
.map(|s| Date32Type::parse(s.unwrap_or_default()))
.collect::<Vec<_>>(),
);
Arc::new(array)
}

fn date64_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
let array = Date64Array::from(
input
.into_iter()
.map(|s| Date64Type::parse(s.unwrap_or_default()))
.collect::<Vec<_>>(),
);
Arc::new(array)
}
}
38 changes: 23 additions & 15 deletions datafusion/core/tests/parquet/arrow_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@
use std::fs::File;
use std::sync::Arc;

use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{Date32Type, Date64Type};
use arrow_array::{
make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch,
StringArray, UInt64Array,
make_array, Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array,
FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
RecordBatch, StringArray, UInt64Array,
};
use arrow_schema::{DataType, Field, Schema};
use datafusion::datasource::physical_plan::parquet::{
Expand Down Expand Up @@ -638,8 +640,6 @@ async fn test_timestamp_diff_rg_sizes() {
}

// date with different row group sizes
// Bug expect `Date32Array` but returns Int32Array
// https://github.com/apache/datafusion/issues/10587
#[tokio::test]
async fn test_dates_32_diff_rg_sizes() {
// This creates a parquet files of 3 columns named "date32", "date64", "names"
Expand All @@ -654,10 +654,16 @@ async fn test_dates_32_diff_rg_sizes() {
};
Test {
reader: reader.build().await,
// mins are [18262, 18565,]
expected_min: Arc::new(Int32Array::from(vec![18262, 18565])),
// maxes are [18564, 21865,]
expected_max: Arc::new(Int32Array::from(vec![18564, 21865])),
// mins are [2020-01-01, 2020-10-30]
expected_min: Arc::new(Date32Array::from(vec![
Date32Type::parse("2020-01-01"),
Date32Type::parse("2020-10-30"),
])),
// maxes are [2020-10-29, 2029-11-12]
expected_max: Arc::new(Date32Array::from(vec![
Date32Type::parse("2020-10-29"),
Date32Type::parse("2029-11-12"),
])),
// nulls are [2, 2]
expected_null_counts: UInt64Array::from(vec![2, 2]),
// row counts are [13, 7]
Expand All @@ -667,10 +673,6 @@ async fn test_dates_32_diff_rg_sizes() {
.run();
}

// BUG: same as above. Expect to return Date64Array but returns Int32Array
// test date with different row group sizes
// https://github.com/apache/datafusion/issues/10587
#[ignore]
#[tokio::test]
async fn test_dates_64_diff_rg_sizes() {
// The file is created by 4 record batches (each has a null row), each has 5 rows but then will be split into 2 row groups with size 13, 7
Expand All @@ -680,8 +682,14 @@ async fn test_dates_64_diff_rg_sizes() {
};
Test {
reader: reader.build().await,
expected_min: Arc::new(Int64Array::from(vec![18262, 18565])), // panic here because the actual data is Int32Array
expected_max: Arc::new(Int64Array::from(vec![18564, 21865])),
expected_min: Arc::new(Date64Array::from(vec![
Date64Type::parse("2020-01-01"),
Date64Type::parse("2020-10-30"),
])),
expected_max: Arc::new(Date64Array::from(vec![
Date64Type::parse("2020-10-29"),
Date64Type::parse("2029-11-12"),
])),
expected_null_counts: UInt64Array::from(vec![2, 2]),
expected_row_counts: UInt64Array::from(vec![13, 7]),
column_name: "date64",
Expand Down

0 comments on commit 529d2c0

Please sign in to comment.