Skip to content

Commit 2fc17be

Browse files
committed
[Variant] Support primitive variant to arrow row for timestamp(micro&nano) and time
1 parent 56649bf commit 2fc17be

File tree

3 files changed

+190
-9
lines changed

3 files changed

+190
-9
lines changed

parquet-variant-compute/src/type_conversion.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
//! Module for transforming a typed arrow `Array` to `VariantArray`.
1919
2020
use arrow::datatypes::{self, ArrowPrimitiveType};
21+
use chrono::Datelike;
2122
use parquet_variant::Variant;
2223

2324
/// Options for controlling the behavior of `cast_to_variant_with_options`.
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for Variant<'_, '_> {
9899
}
99100
}
100101

102+
impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_, '_> {
103+
fn as_primitive(&self) -> Option<i64> {
104+
match self {
105+
Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
106+
Variant::TimestampNtzMicros(ndt) => Some(ndt.and_utc().timestamp_micros()),
107+
_ => None,
108+
}
109+
}
110+
}
111+
112+
impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_, '_> {
113+
fn as_primitive(&self) -> Option<i64> {
114+
match self {
115+
Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
116+
Variant::TimestampNtzNanos(ndt) => ndt.and_utc().timestamp_nanos_opt(),
117+
_ => None,
118+
}
119+
}
120+
}
121+
122+
impl VariantAsPrimitive<datatypes::Date32Type> for Variant<'_, '_> {
123+
fn as_primitive(&self) -> Option<i32> {
124+
// The number of days from 0001-01-01 to 1970-01-01.
125+
const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163;
126+
self.as_naive_date()
127+
.map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH)
128+
}
129+
}
101130
/// Convert the value at a specific index in the given array into a `Variant`.
102131
macro_rules! non_generic_conversion_single_value {
103132
($array:expr, $cast_fn:expr, $index:expr) => {{

parquet-variant-compute/src/variant_get.rs

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -297,18 +297,19 @@ mod test {
297297
use std::sync::Arc;
298298

299299
use super::{variant_get, GetOptions};
300+
use crate::arrow_to_variant::ArrowToVariantRowBuilder::Date32;
300301
use crate::json_to_variant;
301302
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
302303
use crate::VariantArray;
303304
use arrow::array::{
304-
Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array,
305-
Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray,
305+
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array,
306+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray,
306307
};
307308
use arrow::buffer::NullBuffer;
308309
use arrow::compute::CastOptions;
309310
use arrow::datatypes::DataType::{Int16, Int32, Int64};
310311
use arrow_schema::DataType::{Boolean, Float32, Float64, Int8};
311-
use arrow_schema::{DataType, Field, FieldRef, Fields};
312+
use arrow_schema::{DataType, Field, FieldRef, Fields, TimeUnit};
312313
use chrono::DateTime;
313314
use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES};
314315

@@ -699,7 +700,7 @@ mod test {
699700
}
700701

701702
macro_rules! perfectly_shredded_to_arrow_primitive_test {
702-
($name:ident, $primitive_type:ident, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
703+
($name:ident, $primitive_type:expr, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
703704
#[test]
704705
fn $name() {
705706
let array = $perfectly_shredded_array_gen_fun();
@@ -842,6 +843,103 @@ mod test {
842843
f64
843844
);
844845

846+
perfectly_shredded_variant_array_fn!(
847+
perfectly_shredded_timestamp_micro_ntz_variant_array,
848+
|| {
849+
arrow::array::TimestampMicrosecondArray::from(vec![
850+
Some(-456000),
851+
Some(1758602096000001),
852+
Some(1758602096000002),
853+
])
854+
}
855+
);
856+
857+
perfectly_shredded_to_arrow_primitive_test!(
858+
get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_micro_ntz,
859+
DataType::Timestamp(TimeUnit::Microsecond, None),
860+
perfectly_shredded_timestamp_micro_ntz_variant_array,
861+
arrow::array::TimestampMicrosecondArray::from(vec![
862+
Some(-456000),
863+
Some(1758602096000001),
864+
Some(1758602096000002),
865+
])
866+
);
867+
868+
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || {
869+
arrow::array::TimestampMicrosecondArray::from(vec![
870+
Some(-456000),
871+
Some(1758602096000001),
872+
Some(1758602096000002),
873+
])
874+
.with_timezone("+00:00")
875+
});
876+
877+
perfectly_shredded_to_arrow_primitive_test!(
878+
get_variant_perfectly_shredded_timestamp_micro_as_timestamp_micro,
879+
DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00"))),
880+
perfectly_shredded_timestamp_micro_variant_array,
881+
arrow::array::TimestampMicrosecondArray::from(vec![
882+
Some(-456000),
883+
Some(1758602096000001),
884+
Some(1758602096000002),
885+
])
886+
.with_timezone("+00:00")
887+
);
888+
889+
perfectly_shredded_variant_array_fn!(
890+
perfectly_shredded_timestamp_nano_ntz_variant_array,
891+
|| {
892+
arrow::array::TimestampNanosecondArray::from(vec![
893+
Some(-4999999561),
894+
Some(1758602096000000001),
895+
Some(1758602096000000002),
896+
])
897+
}
898+
);
899+
900+
perfectly_shredded_to_arrow_primitive_test!(
901+
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_nano_ntz,
902+
DataType::Timestamp(TimeUnit::Nanosecond, None),
903+
perfectly_shredded_timestamp_nano_ntz_variant_array,
904+
arrow::array::TimestampNanosecondArray::from(vec![
905+
Some(-4999999561),
906+
Some(1758602096000000001),
907+
Some(1758602096000000002),
908+
])
909+
);
910+
911+
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_nano_variant_array, || {
912+
arrow::array::TimestampNanosecondArray::from(vec![
913+
Some(-4999999561),
914+
Some(1758602096000000001),
915+
Some(1758602096000000002),
916+
])
917+
.with_timezone("+00:00")
918+
});
919+
920+
perfectly_shredded_to_arrow_primitive_test!(
921+
get_variant_perfectly_shredded_timestamp_nano_as_timestamp_nano,
922+
DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))),
923+
perfectly_shredded_timestamp_nano_variant_array,
924+
arrow::array::TimestampNanosecondArray::from(vec![
925+
Some(-4999999561),
926+
Some(1758602096000000001),
927+
Some(1758602096000000002),
928+
])
929+
.with_timezone("+00:00")
930+
);
931+
932+
perfectly_shredded_variant_array_fn!(perfectly_shredded_date_variant_array, || {
933+
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
934+
});
935+
936+
perfectly_shredded_to_arrow_primitive_test!(
937+
get_variant_perfectly_shredded_date_as_date,
938+
DataType::Date32,
939+
perfectly_shredded_date_variant_array,
940+
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
941+
);
942+
845943
macro_rules! assert_variant_get_as_variant_array_with_default_option {
846944
($variant_array: expr, $array_expected: expr) => {{
847945
let options = GetOptions::new();

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ use arrow::array::{
1919
Array, ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveArray, PrimitiveBuilder,
2020
};
2121
use arrow::compute::CastOptions;
22-
use arrow::datatypes::{self, ArrowPrimitiveType, DataType};
22+
use arrow::datatypes::{self, ArrowPrimitiveType, DataType, Date32Type};
2323
use arrow::error::{ArrowError, Result};
2424
use parquet_variant::{Variant, VariantPath};
2525

2626
use crate::type_conversion::VariantAsPrimitive;
2727
use crate::{VariantArray, VariantValueArrayBuilder};
2828

29+
use arrow_schema::DataType::Date32;
30+
use arrow_schema::TimeUnit;
2931
use std::sync::Arc;
3032

3133
/// Builder for converting variant values to primitive Arrow arrays. It is used by both
@@ -44,7 +46,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
4446
Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>),
4547
Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>),
4648
Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>),
47-
Boolean(VariantToBooleanArrowRowBuilder<'a>),
49+
TimestampMicro(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>),
50+
TimestampNano(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
51+
Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
4852
}
4953

5054
/// Builder for converting variant values into strongly typed Arrow arrays.
@@ -77,6 +81,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
7781
Float64(b) => b.append_null(),
7882
TimestampMicro(b) => b.append_null(),
7983
TimestampNano(b) => b.append_null(),
84+
Date(b) => b.append_null(),
8085
}
8186
}
8287

@@ -95,6 +100,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
95100
Float32(b) => b.append_value(value),
96101
Float64(b) => b.append_value(value),
97102
Boolean(b) => b.append_value(value),
103+
TimestampMicro(b) => b.append_value(value),
104+
TimestampNano(b) => b.append_value(value),
105+
Date(b) => b.append_value(value),
98106
}
99107
}
100108

@@ -113,7 +121,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
113121
Float16(b) => b.finish(),
114122
Float32(b) => b.finish(),
115123
Float64(b) => b.finish(),
116-
Boolean(b) => b.finish(),
124+
TimestampMicro(b) => b.finish(),
125+
TimestampNano(b) => b.finish(),
126+
Date(b) => b.finish(),
117127
}
118128
}
119129
}
@@ -201,7 +211,29 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
201211
cast_options,
202212
capacity,
203213
)),
204-
DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
214+
DataType::Timestamp(TimeUnit::Microsecond, tz) => {
215+
let target_type = DataType::Timestamp(TimeUnit::Microsecond, tz.clone());
216+
217+
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
218+
cast_options,
219+
capacity,
220+
Some(target_type),
221+
))
222+
}
223+
DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
224+
let target_type = DataType::Timestamp(TimeUnit::Nanosecond, tz.clone());
225+
226+
TimestampNano(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
227+
cast_options,
228+
capacity,
229+
Some(target_type),
230+
))
231+
}
232+
DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
233+
cast_options,
234+
capacity,
235+
Some(Date32),
236+
)),
205237
_ if data_type.is_primitive() => {
206238
return Err(ArrowError::NotYetImplemented(format!(
207239
"Primitive data_type {data_type:?} not yet implemented"
@@ -305,6 +337,8 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str {
305337
"arrow_array::types::Float32Type" => "Float32",
306338
"arrow_array::types::Float64Type" => "Float64",
307339
"arrow_array::types::Float16Type" => "Float16",
340+
"arrow_array::types::TimestampMicrosecondType" => "Timestamp(Microsecond)",
341+
"arrow_array::types::TimestampNanosecondType" => "Timestamp(Nanosecond)",
308342
_ => "Unknown",
309343
}
310344
}
@@ -356,13 +390,24 @@ impl<'a> VariantToBooleanArrowRowBuilder<'a> {
356390
pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType> {
357391
builder: arrow::array::PrimitiveBuilder<T>,
358392
cast_options: &'a CastOptions<'a>,
393+
// this used to change the data type of the resulting array, e.g. to add timezone info
394+
target_data_type: Option<DataType>,
359395
}
360396

361397
impl<'a, T: ArrowPrimitiveType> VariantToPrimitiveArrowRowBuilder<'a, T> {
362398
fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
399+
Self::new_with_target_type(cast_options, capacity, None)
400+
}
401+
402+
fn new_with_target_type(
403+
cast_options: &'a CastOptions<'a>,
404+
capacity: usize,
405+
target_data_type: Option<DataType>,
406+
) -> Self {
363407
Self {
364408
builder: PrimitiveBuilder::<T>::with_capacity(capacity),
365409
cast_options,
410+
target_data_type,
366411
}
367412
}
368413
}
@@ -397,7 +442,16 @@ where
397442
}
398443

399444
fn finish(mut self) -> Result<ArrayRef> {
400-
Ok(Arc::new(self.builder.finish()))
445+
let array: PrimitiveArray<T> = self.builder.finish();
446+
447+
if let Some(target_type) = self.target_data_type {
448+
let data = array.into_data();
449+
let new_data = data.into_builder().data_type(target_type).build()?;
450+
let array_with_new_type = PrimitiveArray::<T>::from(new_data);
451+
return Ok(Arc::new(array_with_new_type));
452+
}
453+
454+
Ok(Arc::new(array))
401455
}
402456
}
403457

0 commit comments

Comments
 (0)