1717
1818//! [`VariantArray`] implementation
1919
20- use arrow:: array:: { Array , ArrayData , ArrayRef , AsArray , StructArray } ;
20+ use arrow:: array:: { Array , ArrayData , ArrayRef , AsArray , BinaryViewArray , StructArray } ;
2121use arrow:: buffer:: NullBuffer ;
22+ use arrow:: datatypes:: Int32Type ;
2223use arrow_schema:: { ArrowError , DataType } ;
2324use parquet_variant:: Variant ;
2425use std:: any:: Any ;
@@ -44,27 +45,90 @@ use std::sync::Arc;
4445/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
4546#[ derive( Debug ) ]
4647pub struct VariantArray {
47- /// StructArray of up to three fields:
48- ///
49- /// 1. A required field named `metadata` which is binary, large_binary, or
50- /// binary_view
51- ///
52- /// 2. An optional field named `value` that is binary, large_binary, or
53- /// binary_view
54- ///
55- /// 3. An optional field named `typed_value` which can be any primitive type
56- /// or be a list, large_list, list_view or struct
57- ///
58- /// NOTE: It is also permissible for the metadata field to be
59- /// Dictionary-Encoded, preferably (but not required) with an index type of
60- /// int8.
48+ /// The underlying `StructArray`, holding the `metadata` column and the optional `value` / `typed_value` columns
6149 inner : StructArray ,
6250
63- /// Reference to the metadata column of inner
64- metadata_ref : ArrayRef ,
51+ /// Shredding state: typed handles to whichever of the `metadata`, `value` and `typed_value` columns are present
52+ shredding_state : ShreddingState ,
53+ }
54+
55+ /// The shredding state of a variant array: one of three cases, depending on which of the `value` and `typed_value` fields are present
56+ #[ derive( Debug ) ]
57+ pub enum ShreddingState {
58+ /// This variant has a value field and no typed_value field (nothing is shredded)
59+ Unshredded {
60+ metadata : BinaryViewArray ,
61+ value : BinaryViewArray ,
62+ } ,
63+ /// This variant has a typed_value field and no value field,
64+ /// meaning it is fully shredded (i.e. the value is stored in typed_value)
65+ FullyShredded {
66+ metadata : BinaryViewArray ,
67+ typed_value : ArrayRef ,
68+ } ,
69+ /// This variant has both a value field and a typed_value field,
70+ /// meaning it is partially shredded: the typed_value is used first, and
71+ /// if that is null, the value field is used instead.
72+ PartiallyShredded {
73+ metadata : BinaryViewArray ,
74+ value : BinaryViewArray ,
75+ typed_value : ArrayRef ,
76+ } ,
77+ }
78+
79+ impl ShreddingState {
80+ /// Return a reference to the metadata field (present in every state)
81+ pub fn metadata_field ( & self ) -> & BinaryViewArray {
82+ match self {
83+ ShreddingState :: Unshredded { metadata, .. } => metadata,
84+ ShreddingState :: FullyShredded { metadata, .. } => metadata,
85+ ShreddingState :: PartiallyShredded { metadata, .. } => metadata,
86+ }
87+ }
88+
89+ /// Return a reference to the value field, or `None` if fully shredded
90+ pub fn value_field ( & self ) -> Option < & BinaryViewArray > {
91+ match self {
92+ ShreddingState :: Unshredded { value, .. } => Some ( value) ,
93+ ShreddingState :: FullyShredded { .. } => None ,
94+ ShreddingState :: PartiallyShredded { value, .. } => Some ( value) ,
95+ }
96+ }
97+
98+ /// Return a reference to the typed_value field, or `None` if unshredded
99+ pub fn typed_value_field ( & self ) -> Option < & ArrayRef > {
100+ match self {
101+ ShreddingState :: Unshredded { .. } => None ,
102+ ShreddingState :: FullyShredded { typed_value, .. } => Some ( typed_value) ,
103+ ShreddingState :: PartiallyShredded { typed_value, .. } => Some ( typed_value) ,
104+ }
105+ }
65106
66- /// Reference to the value column of inner
67- value_ref : ArrayRef ,
107+ /// Return the same state with every underlying array sliced by `offset` and `length`
108+ pub fn slice ( & self , offset : usize , length : usize ) -> Self {
109+ match self {
110+ ShreddingState :: Unshredded { metadata, value } => ShreddingState :: Unshredded {
111+ metadata : metadata. slice ( offset, length) ,
112+ value : value. slice ( offset, length) ,
113+ } ,
114+ ShreddingState :: FullyShredded {
115+ metadata,
116+ typed_value,
117+ } => ShreddingState :: FullyShredded {
118+ metadata : metadata. slice ( offset, length) ,
119+ typed_value : typed_value. slice ( offset, length) ,
120+ } ,
121+ ShreddingState :: PartiallyShredded {
122+ metadata,
123+ value,
124+ typed_value,
125+ } => ShreddingState :: PartiallyShredded {
126+ metadata : metadata. slice ( offset, length) ,
127+ value : value. slice ( offset, length) ,
128+ typed_value : typed_value. slice ( offset, length) ,
129+ } ,
130+ }
131+ }
68132}
69133
70134impl VariantArray {
@@ -79,12 +143,22 @@ impl VariantArray {
79143 /// # Errors:
80144 /// - If the `StructArray` does not contain the required fields
81145 ///
82- /// # Current support
83- /// This structure does not (yet) support the full Arrow Variant Array specification.
146+ /// # Requirements of the `StructArray`
147+ ///
148+ /// 1. A required field named `metadata` which is binary, large_binary, or
149+ /// binary_view
84150 ///
85- /// Only `StructArrays` with `metadata` and `value` fields that are
86- /// [`BinaryViewArray`] are supported. Shredded values are not currently supported
87- /// nor are using types other than `BinaryViewArray`
151+ /// 2. An optional field named `value` that is binary, large_binary, or
152+ /// binary_view
153+ ///
154+ /// 3. An optional field named `typed_value` which can be any primitive type
155+ /// or be a list, large_list, list_view or struct
156+ ///
157+ /// NOTE: It is also permissible for the metadata field to be
158+ /// Dictionary-Encoded, preferably (but not required) with an index type of
159+ /// int8.
160+ ///
161+ /// Currently, only `metadata` and `value` fields of type [`BinaryViewArray`] are supported.
88162 ///
89163 /// [`BinaryViewArray`]: arrow::array::BinaryViewArray
90164 pub fn try_new ( inner : ArrayRef ) -> Result < Self , ArrowError > {
@@ -93,35 +167,64 @@ impl VariantArray {
93167 "Invalid VariantArray: requires StructArray as input" . to_string ( ) ,
94168 ) ) ;
95169 } ;
96- // Ensure the StructArray has a metadata field of BinaryView
97170
98- let Some ( metadata_field) = VariantArray :: find_metadata_field ( inner) else {
171+ // Note the specification allows for any order so we must search by name
172+
173+ // Ensure the StructArray has a metadata field of BinaryView
174+ let Some ( metadata_field) = inner. column_by_name ( "metadata" ) else {
99175 return Err ( ArrowError :: InvalidArgumentError (
100176 "Invalid VariantArray: StructArray must contain a 'metadata' field" . to_string ( ) ,
101177 ) ) ;
102178 } ;
103- if metadata_field . data_type ( ) != & DataType :: BinaryView {
179+ let Some ( metadata ) = metadata_field . as_binary_view_opt ( ) else {
104180 return Err ( ArrowError :: NotYetImplemented ( format ! (
105181 "VariantArray 'metadata' field must be BinaryView, got {}" ,
106182 metadata_field. data_type( )
107183 ) ) ) ;
108- }
109- let Some ( value_field) = VariantArray :: find_value_field ( inner) else {
110- return Err ( ArrowError :: InvalidArgumentError (
111- "Invalid VariantArray: StructArray must contain a 'value' field" . to_string ( ) ,
112- ) ) ;
113184 } ;
114- if value_field. data_type ( ) != & DataType :: BinaryView {
115- return Err ( ArrowError :: NotYetImplemented ( format ! (
116- "VariantArray 'value' field must be BinaryView, got {}" ,
117- value_field. data_type( )
118- ) ) ) ;
119- }
185+
186+ // Find the value field, if present
187+ let value_field = inner. column_by_name ( "value" ) ;
188+ let value = value_field
189+ . map ( |v| match v. as_binary_view_opt ( ) {
190+ Some ( bv) => Ok ( bv) ,
191+ None => Err ( ArrowError :: NotYetImplemented ( format ! (
192+ "VariantArray 'value' field must be BinaryView, got {}" ,
193+ v. data_type( )
194+ ) ) ) ,
195+ } )
196+ . transpose ( ) ?;
197+
198+ // Find the typed_value field, if present
199+ let typed_value = inner. column_by_name ( "typed_value" ) ;
200+
201+ // Note these clones are cheap, they just bump the ref count
202+ let inner = inner. clone ( ) ;
203+ let metadata = metadata. clone ( ) ;
204+ let value = value. cloned ( ) ;
205+ let typed_value = typed_value. cloned ( ) ;
206+
207+ let shredding_state = match ( metadata, value, typed_value) {
208+ ( metadata, Some ( value) , Some ( typed_value) ) => ShreddingState :: PartiallyShredded {
209+ metadata,
210+ value,
211+ typed_value,
212+ } ,
213+ ( metadata, Some ( value) , None ) => ShreddingState :: Unshredded { metadata, value } ,
214+ ( metadata, None , Some ( typed_value) ) => ShreddingState :: FullyShredded {
215+ metadata,
216+ typed_value,
217+ } ,
218+ ( _metadata_field, None , None ) => {
219+ return Err ( ArrowError :: InvalidArgumentError ( String :: from (
220+ "VariantArray has neither value nor typed_value field" ,
221+ ) ) ) ;
222+ }
223+ } ;
120224
121225 Ok ( Self {
122- inner : inner. clone ( ) ,
123- metadata_ref : metadata_field,
124- value_ref : value_field,
226+ inner,
227+ shredding_state,
125228 } )
126229 }
127230
@@ -135,36 +238,87 @@ impl VariantArray {
135238 self . inner
136239 }
137240
241+ /// Return the [`ShreddingState`] of this `VariantArray`, i.e. which of the `value` / `typed_value` fields it holds
242+ pub fn shredding_state ( & self ) -> & ShreddingState {
243+ & self . shredding_state
244+ }
245+
138246 /// Return the [`Variant`] instance stored at the given row
139247 ///
140- /// Panics if the index is out of bounds.
248+ /// Consistent with other Arrow array types, this API requires you to
249+ /// check for nulls first using [`Self::is_valid`].
250+ ///
251+ /// # Panics
252+ /// * if the index is out of bounds
253+ /// * if the array value is null
254+ /// * if `typed_value` is non-null at `index` and its type is not `Int32` (not yet implemented)
255+ ///
256+ /// If `typed_value` is null at `index`, a fully shredded array returns [`Variant::Null`]
257+ /// and a partially shredded array falls back to the binary `value` field.
258+ ///
259+ /// # Performance Note
260+ ///
261+ /// This is certainly not the most efficient way to access values in a
262+ /// `VariantArray`, but it is useful for testing and debugging.
141263 ///
142264 /// Note: Does not do deep validation of the [`Variant`], so it is up to the
143265 /// caller to ensure that the metadata and value were constructed correctly.
144266 pub fn value ( & self , index : usize ) -> Variant {
145- let metadata = self . metadata_field ( ) . as_binary_view ( ) . value ( index) ;
146- let value = self . value_field ( ) . as_binary_view ( ) . value ( index) ;
147- Variant :: new ( metadata, value)
267+ match & self . shredding_state {
268+ ShreddingState :: Unshredded { metadata, value } => {
269+ Variant :: new ( metadata. value ( index) , value. value ( index) )
270+ }
271+ ShreddingState :: FullyShredded {
272+ metadata : _,
273+ typed_value,
274+ } => {
275+ if typed_value. is_null ( index) {
276+ Variant :: Null
277+ } else {
278+ typed_value_to_variant ( typed_value, index)
279+ }
280+ }
281+ ShreddingState :: PartiallyShredded {
282+ metadata,
283+ value,
284+ typed_value,
285+ } => {
286+ if typed_value. is_null ( index) {
287+ Variant :: new ( metadata. value ( index) , value. value ( index) )
288+ } else {
289+ typed_value_to_variant ( typed_value, index)
290+ }
291+ }
292+ }
148293 }
149294
150- fn find_metadata_field ( array : & StructArray ) -> Option < ArrayRef > {
151- array. column_by_name ( "metadata" ) . cloned ( )
295+ /// Return a reference to the metadata field of the [`StructArray`] (present in every shredding state)
296+ pub fn metadata_field ( & self ) -> & BinaryViewArray {
297+ self . shredding_state . metadata_field ( )
152298 }
153299
154- fn find_value_field ( array : & StructArray ) -> Option < ArrayRef > {
155- array. column_by_name ( "value" ) . cloned ( )
300+ /// Return a reference to the value field of the `StructArray`, if present
301+ pub fn value_field ( & self ) -> Option < & BinaryViewArray > {
302+ self . shredding_state . value_field ( )
156303 }
157304
158- /// Return a reference to the metadata field of the [`StructArray`]
159- pub fn metadata_field ( & self ) -> & ArrayRef {
160- // spec says fields order is not guaranteed, so we search by name
161- & self . metadata_ref
305+ /// Return a reference to the typed_value field of the `StructArray`, or `None` if the array is unshredded
306+ pub fn typed_value_field ( & self ) -> Option < & ArrayRef > {
307+ self . shredding_state . typed_value_field ( )
162308 }
309+ }
163310
164- /// Return a reference to the value field of the `StructArray`
165- pub fn value_field ( & self ) -> & ArrayRef {
166- // spec says fields order is not guaranteed, so we search by name
167- & self . value_ref
311+ /// Returns the non-null element of `typed_value` at `index` as a [`Variant`]. Panics (via `todo!`) for any data type other than `Int32`.
312+ fn typed_value_to_variant ( typed_value : & ArrayRef , index : usize ) -> Variant {
313+ match typed_value. data_type ( ) {
314+ DataType :: Int32 => {
315+ let typed_value = typed_value. as_primitive :: < Int32Type > ( ) ;
316+ Variant :: from ( typed_value. value ( index) )
317+ }
318+ // TODO: support the remaining typed_value types; until then they panic below
319+ _ => {
320+ todo ! ( "typed_value_to_variant: unsupported typed_value type {}" , typed_value. data_type ( ) ) ; // name the type so the panic is actionable
321+ }
168322 }
169323}
170324
@@ -186,13 +340,11 @@ impl Array for VariantArray {
186340 }
187341
188342 fn slice ( & self , offset : usize , length : usize ) -> ArrayRef {
189- let slice = self . inner . slice ( offset, length) ;
190- let met = self . metadata_ref . slice ( offset, length) ;
191- let val = self . value_ref . slice ( offset, length) ;
343+ let inner = self . inner . slice ( offset, length) ; // slice the underlying StructArray
344+ let shredding_state = self . shredding_state . slice ( offset, length) ; // slice the per-field arrays with the same offset/length so they stay aligned with `inner`
192345 Arc :: new ( Self {
193- inner : slice,
194- metadata_ref : met,
195- value_ref : val,
346+ inner,
347+ shredding_state,
196348 } )
197349 }
198350
@@ -258,7 +410,7 @@ mod test {
258410 let err = VariantArray :: try_new ( Arc :: new ( array) ) ;
259411 assert_eq ! (
260412 err. unwrap_err( ) . to_string( ) ,
261- "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field"
413+ "Invalid argument error: VariantArray has neither value nor typed_value field"
262414 ) ;
263415 }
264416
0 commit comments