@@ -25,6 +25,7 @@ use arrow_schema::ArrowError;
25
25
use chrono:: { DateTime , NaiveDate , NaiveDateTime , Utc } ;
26
26
use std:: num:: TryFromIntError ;
27
27
28
+ /// The number of bytes used to store offsets in the [`VariantMetadataHeader`]
28
29
#[ derive( Clone , Debug , Copy , PartialEq ) ]
29
30
enum OffsetSizeBytes {
30
31
One = 1 ,
@@ -91,7 +92,7 @@ impl OffsetSizeBytes {
91
92
}
92
93
}
93
94
94
- /// A parsed version of the variant metadata header byte.
95
+ /// Header structure for [`VariantMetadata`]
95
96
#[ derive( Clone , Debug , Copy , PartialEq ) ]
96
97
pub ( crate ) struct VariantMetadataHeader {
97
98
version : u8 ,
@@ -140,8 +141,12 @@ impl VariantMetadataHeader {
140
141
}
141
142
}
142
143
144
+ /// [`Variant`] Metadata
145
+ ///
146
+ /// See the [Variant Spec] for more information.
147
+ ///
148
+ /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
143
149
#[ derive( Clone , Copy , Debug , PartialEq ) ]
144
- /// Encodes the Variant Metadata, see the Variant spec file for more information
145
150
pub struct VariantMetadata < ' m > {
146
151
bytes : & ' m [ u8 ] ,
147
152
header : VariantMetadataHeader ,
@@ -238,7 +243,7 @@ impl<'m> VariantMetadata<'m> {
238
243
}
239
244
}
240
245
241
- /// A parsed version of the variant object value header byte.
246
+ /// Header structure for [`VariantObject`]
242
247
#[ derive( Clone , Debug , PartialEq ) ]
243
248
pub ( crate ) struct VariantObjectHeader {
244
249
field_offset_size : OffsetSizeBytes ,
@@ -262,6 +267,7 @@ impl VariantObjectHeader {
262
267
}
263
268
}
264
269
270
+ /// A [`Variant`] Object (struct with named fields).
265
271
#[ derive( Clone , Debug , PartialEq ) ]
266
272
pub struct VariantObject < ' m , ' v > {
267
273
pub metadata : VariantMetadata < ' m > ,
@@ -282,6 +288,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
282
288
/// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point
283
289
/// to valid objects.
284
290
// TODO: How to make the validation non-recursive while still making iterators safely infallible??
291
+ // See https://github.com/apache/arrow-rs/issues/7711
285
292
pub fn try_new ( metadata : VariantMetadata < ' m > , value : & ' v [ u8 ] ) -> Result < Self , ArrowError > {
286
293
let header_byte = first_byte_from_slice ( value) ?;
287
294
let header = VariantObjectHeader :: try_new ( header_byte) ?;
@@ -420,10 +427,10 @@ impl VariantListHeader {
420
427
}
421
428
}
422
429
423
- /// Represents a variant array .
430
+ /// A [`Variant`] Array (ordered list of values).
424
431
///
425
432
/// NOTE: The "list" naming differs from the variant spec -- which calls it "array" -- in order to be
426
- /// consistent with parquet and arrow type naming. Otherwise, the name would conflict with the
433
+ /// consistent with Parquet and Arrow type naming. Otherwise, the name would conflict with the
427
434
/// `VariantArray : Array` we must eventually define for variant-typed arrow arrays.
428
435
#[ derive( Clone , Debug , PartialEq ) ]
429
436
pub struct VariantList < ' m , ' v > {
@@ -443,6 +450,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
443
450
/// This constructor verifies that `value` points to a valid variant array value. In particular,
444
451
/// that all offsets are in-bounds and point to valid objects.
445
452
// TODO: How to make the validation non-recursive while still making iterators safely infallible??
453
+ // See https://github.com/apache/arrow-rs/issues/7711
446
454
pub fn try_new ( metadata : VariantMetadata < ' m > , value : & ' v [ u8 ] ) -> Result < Self , ArrowError > {
447
455
let header_byte = first_byte_from_slice ( value) ?;
448
456
let header = VariantListHeader :: try_new ( header_byte) ?;
@@ -536,33 +544,134 @@ impl<'m, 'v> VariantList<'m, 'v> {
536
544
}
537
545
}
538
546
539
- /// Variant value. May contain references to metadata and value
547
+ /// Represents a [Parquet Variant]
548
+ ///
549
+ /// The lifetimes `'m` and `'v` are for metadata and value buffers, respectively.
550
+ ///
551
+ /// # Background
552
+ ///
553
+ /// The [specification] says:
554
+ ///
555
+ /// The Variant Binary Encoding allows representation of semi-structured data
556
+ /// (e.g. JSON) in a form that can be efficiently queried by path. The design is
557
+ /// intended to allow efficient access to nested data even in the presence of
558
+ /// very wide or deep structures.
559
+ ///
560
+ /// Another motivation for the representation is that (aside from metadata) each
561
+ /// nested Variant value is contiguous and self-contained. For example, in a
562
+ /// Variant containing an Array of Variant values, the representation of an
563
+ /// inner Variant value, when paired with the metadata of the full variant, is
564
+ /// itself a valid Variant.
565
+ ///
566
+ /// When stored in Parquet files, Variant fields can also be *shredded*. Shredding
567
+ /// refers to extracting some elements of the variant into separate columns for
568
+ /// more efficient extraction/filter pushdown. The [Variant Shredding
569
+ /// specification] describes the details of shredding Variant values as typed
570
+ /// Parquet columns.
571
+ ///
572
+ /// A Variant represents a type that contains one of:
573
+ ///
574
+ /// * Primitive: A type and corresponding value (e.g. INT, STRING)
575
+ ///
576
+ /// * Array: An ordered list of Variant values
577
+ ///
578
+ /// * Object: An unordered collection of string/Variant pairs (i.e. key/value
579
+ /// pairs). An object may not contain duplicate keys.
580
+ ///
581
+ /// # Encoding
582
+ ///
583
+ /// A Variant is encoded with 2 binary values, the value and the metadata. The
584
+ /// metadata stores a header and an optional dictionary of field names which are
585
+ /// referred to by offset in the value. The value is a binary representation of
586
+ /// the actual data, and varies depending on the type.
587
+ ///
588
+ /// # Design Goals
589
+ ///
590
+ /// The design goals of the Rust API are as follows:
591
+ /// 1. Speed / Zero copy access (no `clone`ing is required)
592
+ /// 2. Safety
593
+ /// 3. Follow standard Rust conventions
594
+ ///
595
+ /// [Parquet Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
596
+ /// [specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
597
+ /// [Variant Shredding specification]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md
598
+ ///
599
+ /// # Examples
600
+ ///
601
+ /// ## Creating `Variant` from Rust Types
602
+ /// ```
603
+ /// # use parquet_variant::Variant;
604
+ /// // variants can be directly constructed
605
+ /// let variant = Variant::Int32(123);
606
+ /// // or constructed via `From` impls
607
+ /// assert_eq!(variant, Variant::from(123i32));
608
+ /// ```
609
+ /// ## Creating `Variant` from metadata and value
610
+ /// ```
611
+ /// # use parquet_variant::{Variant, VariantMetadata};
612
+ /// let metadata = [0x01, 0x00, 0x00];
613
+ /// let value = [0x09, 0x48, 0x49];
614
+ /// // parse the variant from its metadata and value buffers
615
+ /// assert_eq!(
616
+ /// Variant::ShortString("HI"),
617
+ /// Variant::try_new(&metadata, &value).unwrap()
618
+ /// );
619
+ /// ```
620
+ ///
621
+ /// ## Using `Variant` values
622
+ /// ```
623
+ /// # use parquet_variant::Variant;
624
+ /// # let variant = Variant::Int32(123);
625
+ /// // variants can be used in match statements like normal enums
626
+ /// match variant {
627
+ /// Variant::Int32(i) => println!("Integer: {}", i),
628
+ /// Variant::String(s) => println!("String: {}", s),
629
+ /// _ => println!("Other variant"),
630
+ /// }
631
+ /// ```
540
632
#[ derive( Clone , Debug , PartialEq ) ]
541
633
pub enum Variant < ' m , ' v > {
542
- // TODO: Add types for the rest of the primitive types, once API is agreed upon
634
+ /// Primitive (basic_type=0): NULL
543
635
Null ,
636
+ /// Primitive (basic_type=0): INT(8, SIGNED)
544
637
Int8 ( i8 ) ,
638
+ /// Primitive (basic_type=0): INT(16, SIGNED)
545
639
Int16 ( i16 ) ,
640
+ /// Primitive (basic_type=0): INT(32, SIGNED)
546
641
Int32 ( i32 ) ,
642
+ /// Primitive (basic_type=0): INT(64, SIGNED)
547
643
Int64 ( i64 ) ,
644
+ /// Primitive (basic_type=0): DATE
548
645
Date ( NaiveDate ) ,
646
+ /// Primitive (basic_type=0): TIMESTAMP(isAdjustedToUTC=true, MICROS)
549
647
TimestampMicros ( DateTime < Utc > ) ,
648
+ /// Primitive (basic_type=0): TIMESTAMP(isAdjustedToUTC=false, MICROS)
550
649
TimestampNtzMicros ( NaiveDateTime ) ,
650
+ /// Primitive (basic_type=0): DECIMAL(precision, scale) 32-bits
551
651
Decimal4 { integer : i32 , scale : u8 } ,
652
+ /// Primitive (basic_type=0): DECIMAL(precision, scale) 64-bits
552
653
Decimal8 { integer : i64 , scale : u8 } ,
654
+ /// Primitive (basic_type=0): DECIMAL(precision, scale) 128-bits
553
655
Decimal16 { integer : i128 , scale : u8 } ,
656
+ /// Primitive (basic_type=0): FLOAT
554
657
Float ( f32 ) ,
658
+ /// Primitive (basic_type=0): DOUBLE
555
659
Double ( f64 ) ,
660
+ /// Primitive (basic_type=0): BOOLEAN (true)
556
661
BooleanTrue ,
662
+ /// Primitive (basic_type=0): BOOLEAN (false)
557
663
BooleanFalse ,
558
-
559
- // Note: only need the *value* buffer
664
+ // Note: only need the *value* buffer for these types
665
+ /// Primitive (basic_type=0): BINARY
560
666
Binary ( & ' v [ u8 ] ) ,
667
+ /// Primitive (basic_type=0): STRING
561
668
String ( & ' v str ) ,
669
+ /// Short String (basic_type=1): STRING
562
670
ShortString ( & ' v str ) ,
563
-
564
671
// need both metadata & value
672
+ /// Object (basic_type=2): N/A
565
673
Object ( VariantObject < ' m , ' v > ) ,
674
+ /// Array (basic_type=3): N/A
566
675
List ( VariantList < ' m , ' v > ) ,
567
676
}
568
677
@@ -574,6 +683,7 @@ impl<'m, 'v> Variant<'m, 'v> {
574
683
/// # use parquet_variant::{Variant, VariantMetadata};
575
684
/// let metadata = [0x01, 0x00, 0x00];
576
685
/// let value = [0x09, 0x48, 0x49];
686
+ /// // parse the variant from its metadata and value buffers
577
687
/// assert_eq!(
578
688
/// Variant::ShortString("HI"),
579
689
/// Variant::try_new(&metadata, &value).unwrap()
@@ -629,7 +739,6 @@ impl<'m, 'v> Variant<'m, 'v> {
629
739
}
630
740
VariantPrimitiveType :: BooleanTrue => Variant :: BooleanTrue ,
631
741
VariantPrimitiveType :: BooleanFalse => Variant :: BooleanFalse ,
632
- // TODO: Add types for the rest, once API is agreed upon
633
742
VariantPrimitiveType :: Date => Variant :: Date ( decoder:: decode_date ( value_data) ?) ,
634
743
VariantPrimitiveType :: TimestampMicros => {
635
744
Variant :: TimestampMicros ( decoder:: decode_timestamp_micros ( value_data) ?)
0 commit comments