Skip to content

Commit 1e03a1b

Browse files
authored
[Variant] Add variant to arrow primitive support for boolean/timestamp/time (#8516)
# Which issue does this PR close? - Closes #8515. # What changes are included in this PR? - Add a `macro_rules!` macro, `define_variant_to_primitive_builder`, used to construct the variant-to-X Arrow row builders - Implement `VariantToBooleanArrowRowBuilder`/`VariantToPrimitiveArrowRowBuilder`/`VariantToTimestampArrowRowBuilder` using the macro `define_variant_to_primitive_builder` - Add type access for `Variant::Timestamp/Time` (timestamps are automatically widened from micros to nanos) - Add tests to cover `Variant::{int8/float32/float64}` for existing code # Are these changes tested? Added tests. # Are there any user-facing changes? If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out.
1 parent d5df352 commit 1e03a1b

File tree

4 files changed

+475
-87
lines changed

4 files changed

+475
-87
lines changed

parquet-variant-compute/src/type_conversion.rs

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! Module for transforming a typed arrow `Array` to `VariantArray`.
1919
20-
use arrow::datatypes::{self, ArrowPrimitiveType};
20+
use arrow::datatypes::{self, ArrowPrimitiveType, ArrowTimestampType, Date32Type};
2121
use parquet_variant::Variant;
2222

2323
/// Options for controlling the behavior of `cast_to_variant_with_options`.
@@ -38,12 +38,31 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType {
3838
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
3939
}
4040

41+
/// Extension trait for Arrow timestamp types that can extract their native value from a Variant.
42+
/// We can't use [`PrimitiveFromVariant`] directly because we need _two_ implementations for each
43+
/// timestamp type -- the `NTZ` param here.
44+
pub(crate) trait TimestampFromVariant<const NTZ: bool>: ArrowTimestampType {
45+
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
46+
}
47+
4148
/// Macro to generate PrimitiveFromVariant implementations for Arrow primitive types
4249
macro_rules! impl_primitive_from_variant {
43-
($arrow_type:ty, $variant_method:ident) => {
50+
($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => {
4451
impl PrimitiveFromVariant for $arrow_type {
4552
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> {
46-
variant.$variant_method()
53+
let value = variant.$variant_method();
54+
$( let value = value.map($cast_fn); )?
55+
value
56+
}
57+
}
58+
};
59+
}
60+
61+
macro_rules! impl_timestamp_from_variant {
62+
($timestamp_type:ty, $variant_method:ident, ntz=$ntz:ident, $cast_fn:expr $(,)?) => {
63+
impl TimestampFromVariant<{ $ntz }> for $timestamp_type {
64+
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> {
65+
variant.$variant_method().and_then($cast_fn)
4766
}
4867
}
4968
};
@@ -60,6 +79,35 @@ impl_primitive_from_variant!(datatypes::UInt64Type, as_u64);
6079
impl_primitive_from_variant!(datatypes::Float16Type, as_f16);
6180
impl_primitive_from_variant!(datatypes::Float32Type, as_f32);
6281
impl_primitive_from_variant!(datatypes::Float64Type, as_f64);
82+
impl_primitive_from_variant!(
83+
datatypes::Date32Type,
84+
as_naive_date,
85+
Date32Type::from_naive_date
86+
);
87+
impl_timestamp_from_variant!(
88+
datatypes::TimestampMicrosecondType,
89+
as_timestamp_ntz_micros,
90+
ntz = true,
91+
Self::make_value,
92+
);
93+
impl_timestamp_from_variant!(
94+
datatypes::TimestampMicrosecondType,
95+
as_timestamp_micros,
96+
ntz = false,
97+
|timestamp| Self::make_value(timestamp.naive_utc())
98+
);
99+
impl_timestamp_from_variant!(
100+
datatypes::TimestampNanosecondType,
101+
as_timestamp_ntz_nanos,
102+
ntz = true,
103+
Self::make_value
104+
);
105+
impl_timestamp_from_variant!(
106+
datatypes::TimestampNanosecondType,
107+
as_timestamp_nanos,
108+
ntz = false,
109+
|timestamp| Self::make_value(timestamp.naive_utc())
110+
);
63111

64112
/// Convert the value at a specific index in the given array into a `Variant`.
65113
macro_rules! non_generic_conversion_single_value {

parquet-variant-compute/src/variant_get.rs

Lines changed: 185 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -300,13 +300,14 @@ mod test {
300300
use crate::json_to_variant;
301301
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
302302
use arrow::array::{
303-
Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array,
304-
Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray,
303+
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array,
304+
Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray,
305305
};
306306
use arrow::buffer::NullBuffer;
307307
use arrow::compute::CastOptions;
308308
use arrow::datatypes::DataType::{Int16, Int32, Int64};
309-
use arrow_schema::{DataType, Field, FieldRef, Fields};
309+
use arrow_schema::DataType::{Boolean, Float32, Float64, Int8};
310+
use arrow_schema::{DataType, Field, FieldRef, Fields, TimeUnit};
310311
use chrono::DateTime;
311312
use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantPath};
312313

@@ -700,7 +701,7 @@ mod test {
700701
}
701702

702703
macro_rules! perfectly_shredded_to_arrow_primitive_test {
703-
($name:ident, $primitive_type:ident, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
704+
($name:ident, $primitive_type:expr, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
704705
#[test]
705706
fn $name() {
706707
let array = $perfectly_shredded_array_gen_fun();
@@ -713,6 +714,13 @@ mod test {
713714
};
714715
}
715716

717+
perfectly_shredded_to_arrow_primitive_test!(
718+
get_variant_perfectly_shredded_int18_as_int8,
719+
Int8,
720+
perfectly_shredded_int8_variant_array,
721+
Int8Array::from(vec![Some(1), Some(2), Some(3)])
722+
);
723+
716724
perfectly_shredded_to_arrow_primitive_test!(
717725
get_variant_perfectly_shredded_int16_as_int16,
718726
Int16,
@@ -734,31 +742,37 @@ mod test {
734742
Int64Array::from(vec![Some(1), Some(2), Some(3)])
735743
);
736744

737-
/// Return a VariantArray that represents a perfectly "shredded" variant
738-
/// for the given typed value.
739-
///
740-
/// The schema of the corresponding `StructArray` would look like this:
741-
///
742-
/// ```text
743-
/// StructArray {
744-
/// metadata: BinaryViewArray,
745-
/// typed_value: Int32Array,
746-
/// }
747-
/// ```
748-
macro_rules! numeric_perfectly_shredded_variant_array_fn {
749-
($func:ident, $array_type:ident, $primitive_type:ty) => {
745+
perfectly_shredded_to_arrow_primitive_test!(
746+
get_variant_perfectly_shredded_float32_as_float32,
747+
Float32,
748+
perfectly_shredded_float32_variant_array,
749+
Float32Array::from(vec![Some(1.0), Some(2.0), Some(3.0)])
750+
);
751+
752+
perfectly_shredded_to_arrow_primitive_test!(
753+
get_variant_perfectly_shredded_float64_as_float64,
754+
Float64,
755+
perfectly_shredded_float64_variant_array,
756+
Float64Array::from(vec![Some(1.0), Some(2.0), Some(3.0)])
757+
);
758+
759+
perfectly_shredded_to_arrow_primitive_test!(
760+
get_variant_perfectly_shredded_boolean_as_boolean,
761+
Boolean,
762+
perfectly_shredded_bool_variant_array,
763+
BooleanArray::from(vec![Some(true), Some(false), Some(true)])
764+
);
765+
766+
macro_rules! perfectly_shredded_variant_array_fn {
767+
($func:ident, $typed_value_gen:expr) => {
750768
fn $func() -> ArrayRef {
751769
// At the time of writing, the `VariantArrayBuilder` does not support shredding,
752770
// so we must construct the array manually. See https://github.com/apache/arrow-rs/issues/7895
753771
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
754772
EMPTY_VARIANT_METADATA_BYTES,
755773
3,
756774
));
757-
let typed_value = $array_type::from(vec![
758-
Some(<$primitive_type>::try_from(1u8).unwrap()),
759-
Some(<$primitive_type>::try_from(2u8).unwrap()),
760-
Some(<$primitive_type>::try_from(3u8).unwrap()),
761-
]);
775+
let typed_value = $typed_value_gen();
762776

763777
let struct_array = StructArrayBuilder::new()
764778
.with_field("metadata", Arc::new(metadata), false)
@@ -772,6 +786,33 @@ mod test {
772786
};
773787
}
774788

789+
perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || {
790+
BooleanArray::from(vec![Some(true), Some(false), Some(true)])
791+
});
792+
793+
/// Return a VariantArray that represents a perfectly "shredded" variant
794+
/// for the given typed value.
795+
///
796+
/// The schema of the corresponding `StructArray` would look like this:
797+
///
798+
/// ```text
799+
/// StructArray {
800+
/// metadata: BinaryViewArray,
801+
/// typed_value: Int32Array,
802+
/// }
803+
/// ```
804+
macro_rules! numeric_perfectly_shredded_variant_array_fn {
805+
($func:ident, $array_type:ident, $primitive_type:ty) => {
806+
perfectly_shredded_variant_array_fn!($func, || {
807+
$array_type::from(vec![
808+
Some(<$primitive_type>::try_from(1u8).unwrap()),
809+
Some(<$primitive_type>::try_from(2u8).unwrap()),
810+
Some(<$primitive_type>::try_from(3u8).unwrap()),
811+
])
812+
});
813+
};
814+
}
815+
775816
numeric_perfectly_shredded_variant_array_fn!(
776817
perfectly_shredded_int8_variant_array,
777818
Int8Array,
@@ -803,6 +844,128 @@ mod test {
803844
f64
804845
);
805846

847+
perfectly_shredded_variant_array_fn!(
848+
perfectly_shredded_timestamp_micro_ntz_variant_array,
849+
|| {
850+
arrow::array::TimestampMicrosecondArray::from(vec![
851+
Some(-456000),
852+
Some(1758602096000001),
853+
Some(1758602096000002),
854+
])
855+
}
856+
);
857+
858+
perfectly_shredded_to_arrow_primitive_test!(
859+
get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_micro_ntz,
860+
DataType::Timestamp(TimeUnit::Microsecond, None),
861+
perfectly_shredded_timestamp_micro_ntz_variant_array,
862+
arrow::array::TimestampMicrosecondArray::from(vec![
863+
Some(-456000),
864+
Some(1758602096000001),
865+
Some(1758602096000002),
866+
])
867+
);
868+
869+
// test converting micro to nano
870+
perfectly_shredded_to_arrow_primitive_test!(
871+
get_variant_perfectly_shredded_timestamp_micro_ntz_as_nano_ntz,
872+
DataType::Timestamp(TimeUnit::Nanosecond, None),
873+
perfectly_shredded_timestamp_micro_ntz_variant_array,
874+
arrow::array::TimestampNanosecondArray::from(vec![
875+
Some(-456000000),
876+
Some(1758602096000001000),
877+
Some(1758602096000002000)
878+
])
879+
);
880+
881+
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || {
882+
arrow::array::TimestampMicrosecondArray::from(vec![
883+
Some(-456000),
884+
Some(1758602096000001),
885+
Some(1758602096000002),
886+
])
887+
.with_timezone("+00:00")
888+
});
889+
890+
perfectly_shredded_to_arrow_primitive_test!(
891+
get_variant_perfectly_shredded_timestamp_micro_as_timestamp_micro,
892+
DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00"))),
893+
perfectly_shredded_timestamp_micro_variant_array,
894+
arrow::array::TimestampMicrosecondArray::from(vec![
895+
Some(-456000),
896+
Some(1758602096000001),
897+
Some(1758602096000002),
898+
])
899+
.with_timezone("+00:00")
900+
);
901+
902+
// test converting micro to nano
903+
perfectly_shredded_to_arrow_primitive_test!(
904+
get_variant_perfectly_shredded_timestamp_micro_as_nano,
905+
DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))),
906+
perfectly_shredded_timestamp_micro_variant_array,
907+
arrow::array::TimestampNanosecondArray::from(vec![
908+
Some(-456000000),
909+
Some(1758602096000001000),
910+
Some(1758602096000002000)
911+
])
912+
.with_timezone("+00:00")
913+
);
914+
915+
perfectly_shredded_variant_array_fn!(
916+
perfectly_shredded_timestamp_nano_ntz_variant_array,
917+
|| {
918+
arrow::array::TimestampNanosecondArray::from(vec![
919+
Some(-4999999561),
920+
Some(1758602096000000001),
921+
Some(1758602096000000002),
922+
])
923+
}
924+
);
925+
926+
perfectly_shredded_to_arrow_primitive_test!(
927+
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_nano_ntz,
928+
DataType::Timestamp(TimeUnit::Nanosecond, None),
929+
perfectly_shredded_timestamp_nano_ntz_variant_array,
930+
arrow::array::TimestampNanosecondArray::from(vec![
931+
Some(-4999999561),
932+
Some(1758602096000000001),
933+
Some(1758602096000000002),
934+
])
935+
);
936+
937+
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_nano_variant_array, || {
938+
arrow::array::TimestampNanosecondArray::from(vec![
939+
Some(-4999999561),
940+
Some(1758602096000000001),
941+
Some(1758602096000000002),
942+
])
943+
.with_timezone("+00:00")
944+
});
945+
946+
perfectly_shredded_to_arrow_primitive_test!(
947+
get_variant_perfectly_shredded_timestamp_nano_as_timestamp_nano,
948+
DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))),
949+
perfectly_shredded_timestamp_nano_variant_array,
950+
arrow::array::TimestampNanosecondArray::from(vec![
951+
Some(-4999999561),
952+
Some(1758602096000000001),
953+
Some(1758602096000000002),
954+
])
955+
.with_timezone("+00:00")
956+
);
957+
958+
perfectly_shredded_variant_array_fn!(perfectly_shredded_date_variant_array, || {
959+
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
960+
});
961+
962+
perfectly_shredded_to_arrow_primitive_test!(
963+
get_variant_perfectly_shredded_date_as_date,
964+
DataType::Date32,
965+
perfectly_shredded_date_variant_array,
966+
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
967+
);
968+
806969
macro_rules! assert_variant_get_as_variant_array_with_default_option {
807970
($variant_array: expr, $array_expected: expr) => {{
808971
let options = GetOptions::new();

0 commit comments

Comments
 (0)