Skip to content

Commit 60ce764

Browse files
authored
[Variant] Add list support to unshred_variant (#8514)
* NOTE: Stacked on #8481, ignore the first commit when reviewing. # Which issue does this PR close? - Closes #8337 # Rationale for this change Add a missing feature. # What changes are included in this PR? Leveraging the recently added `ListLikeArray` trait, support all five list types when unshredding variant data. # Are these changes tested? Yes -- all the list-related variant shredding integration tests pass now. # Are there any user-facing changes? No.
1 parent d454fdd commit 60ce764

File tree

4 files changed

+122
-37
lines changed

4 files changed

+122
-37
lines changed

parquet-variant-compute/src/unshred_variant.rs

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@
1717

1818
//! Module for unshredding VariantArray by folding typed_value columns back into the value column.
1919
20+
use crate::arrow_to_variant::ListLikeArray;
2021
use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
2122
use arrow::array::{
22-
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray,
23-
StringArray, StructArray,
23+
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, FixedSizeListArray,
24+
GenericListArray, GenericListViewArray, PrimitiveArray, StringArray, StructArray,
2425
};
2526
use arrow::buffer::NullBuffer;
2627
use arrow::datatypes::{
@@ -99,6 +100,11 @@ enum UnshredVariantRowBuilder<'a> {
99100
PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>),
100101
PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>),
101102
PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>),
103+
List(ListUnshredVariantBuilder<'a, GenericListArray<i32>>),
104+
LargeList(ListUnshredVariantBuilder<'a, GenericListArray<i64>>),
105+
ListView(ListUnshredVariantBuilder<'a, GenericListViewArray<i32>>),
106+
LargeListView(ListUnshredVariantBuilder<'a, GenericListViewArray<i64>>),
107+
FixedSizeList(ListUnshredVariantBuilder<'a, FixedSizeListArray>),
102108
Struct(StructUnshredVariantBuilder<'a>),
103109
ValueOnly(ValueOnlyUnshredVariantBuilder<'a>),
104110
Null(NullUnshredVariantBuilder<'a>),
@@ -132,6 +138,11 @@ impl<'a> UnshredVariantRowBuilder<'a> {
132138
Self::PrimitiveString(b) => b.append_row(builder, metadata, index),
133139
Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, index),
134140
Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index),
141+
Self::List(b) => b.append_row(builder, metadata, index),
142+
Self::LargeList(b) => b.append_row(builder, metadata, index),
143+
Self::ListView(b) => b.append_row(builder, metadata, index),
144+
Self::LargeListView(b) => b.append_row(builder, metadata, index),
145+
Self::FixedSizeList(b) => b.append_row(builder, metadata, index),
135146
Self::Struct(b) => b.append_row(builder, metadata, index),
136147
Self::ValueOnly(b) => b.append_row(builder, metadata, index),
137148
Self::Null(b) => b.append_row(builder, metadata, index),
@@ -208,6 +219,25 @@ impl<'a> UnshredVariantRowBuilder<'a> {
208219
value,
209220
typed_value.as_struct(),
210221
)?),
222+
DataType::List(_) => Self::List(ListUnshredVariantBuilder::try_new(
223+
value,
224+
typed_value.as_list(),
225+
)?),
226+
DataType::LargeList(_) => Self::LargeList(ListUnshredVariantBuilder::try_new(
227+
value,
228+
typed_value.as_list(),
229+
)?),
230+
DataType::ListView(_) => Self::ListView(ListUnshredVariantBuilder::try_new(
231+
value,
232+
typed_value.as_list_view(),
233+
)?),
234+
DataType::LargeListView(_) => Self::LargeListView(ListUnshredVariantBuilder::try_new(
235+
value,
236+
typed_value.as_list_view(),
237+
)?),
238+
DataType::FixedSizeList(_, _) => Self::FixedSizeList(
239+
ListUnshredVariantBuilder::try_new(value, typed_value.as_fixed_size_list())?,
240+
),
211241
_ => {
212242
return Err(ArrowError::NotYetImplemented(format!(
213243
"Unshredding not yet supported for type: {}",
@@ -517,5 +547,61 @@ impl<'a> StructUnshredVariantBuilder<'a> {
517547
}
518548
}
519549

550+
/// Builder for unshredding list/array types with recursive element processing
551+
struct ListUnshredVariantBuilder<'a, L: ListLikeArray> {
552+
value: Option<&'a BinaryViewArray>,
553+
typed_value: &'a L,
554+
element_unshredder: Box<UnshredVariantRowBuilder<'a>>,
555+
}
556+
557+
impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> {
558+
fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a L) -> Result<Self> {
559+
// Create a recursive unshredder for the list elements
560+
// The element type comes from the values array of the list
561+
let element_values = typed_value.values();
562+
563+
// For shredded lists, each element would be a ShreddedVariantFieldArray (struct)
564+
// Extract value/typed_value from the element struct
565+
let Some(element_values) = element_values.as_struct_opt() else {
566+
return Err(ArrowError::InvalidArgumentError(format!(
567+
"Invalid shredded variant array element: expected Struct, got {}",
568+
element_values.data_type()
569+
)));
570+
};
571+
572+
// Create recursive unshredder for elements
573+
//
574+
// NOTE: A None/None array element is technically invalid, but the shredding spec
575+
// requires us to emit `Variant::Null` when a required value is missing.
576+
let element_unshredder = UnshredVariantRowBuilder::try_new_opt(element_values.try_into()?)?
577+
.unwrap_or_else(|| UnshredVariantRowBuilder::null(None));
578+
579+
Ok(Self {
580+
value,
581+
typed_value,
582+
element_unshredder: Box::new(element_unshredder),
583+
})
584+
}
585+
586+
fn append_row(
587+
&mut self,
588+
builder: &mut impl VariantBuilderExt,
589+
metadata: &VariantMetadata,
590+
index: usize,
591+
) -> Result<()> {
592+
handle_unshredded_case!(self, builder, metadata, index, false);
593+
594+
// If we get here, typed_value is valid and value is NULL -- process the list elements
595+
let mut list_builder = builder.try_new_list()?;
596+
for element_index in self.typed_value.element_range(index) {
597+
self.element_unshredder
598+
.append_row(&mut list_builder, metadata, element_index)?;
599+
}
600+
601+
list_builder.finish();
602+
Ok(())
603+
}
604+
}
605+
520606
// TODO: This code is covered by tests in `parquet/tests/variant_integration.rs`. Does that suffice?
521607
// Or do we also need targeted stand-alone unit tests for full coverage?

parquet-variant/src/variant/list.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ impl VariantListHeader {
117117
///
118118
/// [valid]: VariantMetadata#Validation
119119
/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-array-basic_type3
120-
#[derive(Debug, Clone, PartialEq)]
120+
#[derive(Debug, Clone)]
121121
pub struct VariantList<'m, 'v> {
122122
pub metadata: VariantMetadata<'m>,
123123
pub value: &'v [u8],
@@ -302,6 +302,20 @@ impl<'m, 'v> VariantList<'m, 'v> {
302302
}
303303
}
304304

305+
// Custom implementation of PartialEq for variant arrays
306+
//
307+
// Instead of comparing the raw bytes of 2 variant lists, this implementation recursively
308+
// checks whether their elements are equal.
309+
impl<'m, 'v> PartialEq for VariantList<'m, 'v> {
310+
fn eq(&self, other: &Self) -> bool {
311+
if self.num_elements != other.num_elements {
312+
return false;
313+
}
314+
315+
self.iter().zip(other.iter()).all(|(a, b)| a == b)
316+
}
317+
}
318+
305319
#[cfg(test)]
306320
mod tests {
307321
use super::*;

parquet-variant/src/variant/object.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -419,13 +419,9 @@ impl<'m, 'v> PartialEq for VariantObject<'m, 'v> {
419419
// IFF two objects are valid and logically equal, they will have the same
420420
// field names in the same order, because the spec requires the object
421421
// fields to be sorted lexicographically.
422-
for ((name_a, value_a), (name_b, value_b)) in self.iter().zip(other.iter()) {
423-
if name_a != name_b || value_a != value_b {
424-
return false;
425-
}
426-
}
427-
428-
true
422+
self.iter()
423+
.zip(other.iter())
424+
.all(|((name_a, value_a), (name_b, value_b))| name_a == name_b && value_a == value_b)
429425
}
430426
}
431427

parquet/tests/variant_integration.rs

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -34,24 +34,16 @@ use std::{fs, path::PathBuf};
3434

3535
type Result<T> = std::result::Result<T, String>;
3636

37-
/// Creates a test function for a given case number
37+
/// Creates a test function for a given case number.
38+
///
39+
/// If an error message is provided, generate an error test case that expects it.
3840
///
3941
/// Note the index is zero-based, while the case number is one-based
4042
macro_rules! variant_test_case {
41-
($case_num:literal) => {
42-
paste::paste! {
43-
#[test]
44-
fn [<test_variant_integration_case_ $case_num>]() {
45-
all_cases()[$case_num - 1].run()
46-
}
47-
}
48-
};
49-
50-
// Generates an error test case, where the expected result is an error message
51-
($case_num:literal, $expected_error:literal) => {
43+
($case_num:literal $(, $expected_error:literal )? ) => {
5244
paste::paste! {
5345
#[test]
54-
#[should_panic(expected = $expected_error)]
46+
$( #[should_panic(expected = $expected_error)] )?
5547
fn [<test_variant_integration_case_ $case_num>]() {
5648
all_cases()[$case_num - 1].run()
5749
}
@@ -65,8 +57,8 @@ macro_rules! variant_test_case {
6557
// - cases 40, 42, 87, 127 and 128 are expected to fail always (they include invalid variants)
6658
// - the remaining cases are expected to (eventually) pass
6759

68-
variant_test_case!(1, "Unshredding not yet supported for type: List(");
69-
variant_test_case!(2, "Unshredding not yet supported for type: List(");
60+
variant_test_case!(1);
61+
variant_test_case!(2);
7062
// case 3 is empty in cases.json 🤷
7163
// ```json
7264
// {
@@ -130,16 +122,14 @@ variant_test_case!(37);
130122
variant_test_case!(38);
131123
variant_test_case!(39);
132124
// Is an error case (should be failing as the expected error message indicates)
133-
// TODO: Once we support lists: "both value and typed_value are non-null"
134-
variant_test_case!(40, "Unshredding not yet supported for type: List(");
135-
variant_test_case!(41, "Unshredding not yet supported for type: List(");
125+
variant_test_case!(40, "both value and typed_value are non-null");
126+
variant_test_case!(41);
136127
// Is an error case (should be failing as the expected error message indicates)
137128
variant_test_case!(42, "both value and typed_value are non-null");
138129
// Is an error case (should be failing as the expected error message indicates)
139130
variant_test_case!(43, "Field 'b' appears in both typed_value and value");
140131
variant_test_case!(44);
141-
// https://github.com/apache/arrow-rs/issues/8337
142-
variant_test_case!(45, "Unshredding not yet supported for type: List(");
132+
variant_test_case!(45);
143133
variant_test_case!(46);
144134
variant_test_case!(47);
145135
variant_test_case!(48);
@@ -180,12 +170,11 @@ variant_test_case!(82);
180170
variant_test_case!(83);
181171
// Invalid case, implementations can choose to read the shredded value or error out
182172
variant_test_case!(84);
183-
// https://github.com/apache/arrow-rs/issues/8337
184-
variant_test_case!(85, "Unshredding not yet supported for type: List(");
185-
variant_test_case!(86, "Unshredding not yet supported for type: List(");
173+
variant_test_case!(85);
174+
variant_test_case!(86);
186175
// Is an error case (should be failing as the expected error message indicates)
187176
variant_test_case!(87, "Expected object in value field");
188-
variant_test_case!(88, "Unshredding not yet supported for type: List(");
177+
variant_test_case!(88);
189178
variant_test_case!(89);
190179
variant_test_case!(90);
191180
variant_test_case!(91);
@@ -224,7 +213,7 @@ variant_test_case!(123);
224213
variant_test_case!(124);
225214
// Is an error case (should be failing as the expected error message indicates)
226215
variant_test_case!(125, "Field 'b' appears in both typed_value and value");
227-
variant_test_case!(126, "Unshredding not yet supported for type: List(");
216+
variant_test_case!(126);
228217
// Is an error case (should be failing as the expected error message indicates)
229218
variant_test_case!(127, "Illegal shredded value type: UInt32");
230219
// Is an error case (should be failing as the expected error message indicates)
@@ -235,8 +224,8 @@ variant_test_case!(131);
235224
variant_test_case!(132);
236225
variant_test_case!(133);
237226
variant_test_case!(134);
238-
variant_test_case!(135, "Unshredding not yet supported for type: List(");
239-
variant_test_case!(136, "Unshredding not yet supported for type: List(");
227+
variant_test_case!(135);
228+
variant_test_case!(136);
240229
// Is an error case (should be failing as the expected error message indicates)
241230
variant_test_case!(137, "Illegal shredded value type: FixedSizeBinary(4)");
242231
variant_test_case!(138);

0 commit comments

Comments
 (0)