Skip to content

Commit fe65b8d

Browse files
alamb and scovich authored
[Variant] Add variant docs and examples (#7661)
# Which issue does this PR close? We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. - Follow on to #7644 - Part of #6736 # Rationale for this change Using the parquet APIs came up in #7644 (comment) so I wanted to help contribute some additional documentation / tests # What changes are included in this PR? Add documentation and tests about `Variant`, specifically some examples of how to create `Variant` values # Are there any user-facing changes? More docs --------- Co-authored-by: Ryan Johnson <scovich@users.noreply.github.com>
1 parent 20c1c34 commit fe65b8d

File tree

1 file changed

+120
-11
lines changed

1 file changed

+120
-11
lines changed

parquet-variant/src/variant.rs

Lines changed: 120 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ use arrow_schema::ArrowError;
2525
use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
2626
use std::num::TryFromIntError;
2727

28+
/// The number of bytes used to store each offset, as recorded in headers such as [`VariantMetadataHeader`] and [`VariantObjectHeader`]
2829
#[derive(Clone, Debug, Copy, PartialEq)]
2930
enum OffsetSizeBytes {
3031
One = 1,
@@ -91,7 +92,7 @@ impl OffsetSizeBytes {
9192
}
9293
}
9394

94-
/// A parsed version of the variant metadata header byte.
95+
/// Header structure for [`VariantMetadata`]
9596
#[derive(Clone, Debug, Copy, PartialEq)]
9697
pub(crate) struct VariantMetadataHeader {
9798
version: u8,
@@ -140,8 +141,12 @@ impl VariantMetadataHeader {
140141
}
141142
}
142143

144+
/// [`Variant`] Metadata
145+
///
146+
/// See the [Variant Spec] file for more information
147+
///
148+
/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
143149
#[derive(Clone, Copy, Debug, PartialEq)]
144-
/// Encodes the Variant Metadata, see the Variant spec file for more information
145150
pub struct VariantMetadata<'m> {
146151
bytes: &'m [u8],
147152
header: VariantMetadataHeader,
@@ -238,7 +243,7 @@ impl<'m> VariantMetadata<'m> {
238243
}
239244
}
240245

241-
/// A parsed version of the variant object value header byte.
246+
/// Header structure for [`VariantObject`]
242247
#[derive(Clone, Debug, PartialEq)]
243248
pub(crate) struct VariantObjectHeader {
244249
field_offset_size: OffsetSizeBytes,
@@ -262,6 +267,7 @@ impl VariantObjectHeader {
262267
}
263268
}
264269

270+
/// A [`Variant`] Object (struct with named fields).
265271
#[derive(Clone, Debug, PartialEq)]
266272
pub struct VariantObject<'m, 'v> {
267273
pub metadata: VariantMetadata<'m>,
@@ -282,6 +288,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
282288
/// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point
283289
/// to valid objects.
284290
// TODO: How to make the validation non-recursive while still making iterators safely infallible??
291+
// See https://github.com/apache/arrow-rs/issues/7711
285292
pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
286293
let header_byte = first_byte_from_slice(value)?;
287294
let header = VariantObjectHeader::try_new(header_byte)?;
@@ -420,10 +427,10 @@ impl VariantListHeader {
420427
}
421428
}
422429

423-
/// Represents a variant array.
430+
/// [`Variant`] Array.
424431
///
425432
/// NOTE: The "list" naming differs from the variant spec -- which calls it "array" -- in order to be
426-
/// consistent with parquet and arrow type naming. Otherwise, the name would conflict with the
433+
/// consistent with Parquet and Arrow type naming. Otherwise, the name would conflict with the
427434
/// `VariantArray : Array` we must eventually define for variant-typed arrow arrays.
428435
#[derive(Clone, Debug, PartialEq)]
429436
pub struct VariantList<'m, 'v> {
@@ -443,6 +450,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
443450
/// This constructor verifies that `value` points to a valid variant array value. In particular,
444451
/// that all offsets are in-bounds and point to valid objects.
445452
// TODO: How to make the validation non-recursive while still making iterators safely infallible??
453+
// See https://github.com/apache/arrow-rs/issues/7711
446454
pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
447455
let header_byte = first_byte_from_slice(value)?;
448456
let header = VariantListHeader::try_new(header_byte)?;
@@ -536,33 +544,134 @@ impl<'m, 'v> VariantList<'m, 'v> {
536544
}
537545
}
538546

539-
/// Variant value. May contain references to metadata and value
547+
/// Represents a [Parquet Variant]
548+
///
549+
/// The lifetimes `'m` and `'v` are for metadata and value buffers, respectively.
550+
///
551+
/// # Background
552+
///
553+
/// The [specification] says:
554+
///
555+
/// The Variant Binary Encoding allows representation of semi-structured data
556+
/// (e.g. JSON) in a form that can be efficiently queried by path. The design is
557+
/// intended to allow efficient access to nested data even in the presence of
558+
/// very wide or deep structures.
559+
///
560+
/// Another motivation for the representation is that (aside from metadata) each
561+
/// nested Variant value is contiguous and self-contained. For example, in a
562+
/// Variant containing an Array of Variant values, the representation of an
563+
/// inner Variant value, when paired with the metadata of the full variant, is
564+
/// itself a valid Variant.
565+
///
566+
/// When stored in Parquet files, Variant fields can also be *shredded*. Shredding
567+
/// refers to extracting some elements of the variant into separate columns for
568+
/// more efficient extraction/filter pushdown. The [Variant Shredding
569+
/// specification] describes the details of shredding Variant values as typed
570+
/// Parquet columns.
571+
///
572+
/// A Variant represents a type that contains one of:
573+
///
574+
/// * Primitive: A type and corresponding value (e.g. INT, STRING)
575+
///
576+
/// * Array: An ordered list of Variant values
577+
///
578+
/// * Object: An unordered collection of string/Variant pairs (i.e. key/value
579+
/// pairs). An object may not contain duplicate keys.
580+
///
581+
/// # Encoding
582+
///
583+
/// A Variant is encoded with 2 binary values, the value and the metadata. The
584+
/// metadata stores a header and an optional dictionary of field names which are
585+
/// referred to by offset in the value. The value is a binary representation of
586+
/// the actual data, and varies depending on the type.
587+
///
588+
/// # Design Goals
589+
///
590+
/// The design goals of the Rust API are as follows:
591+
/// 1. Speed / Zero copy access (no `clone`ing is required)
592+
/// 2. Safety
593+
/// 3. Follow standard Rust conventions
594+
///
595+
/// [Parquet Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
596+
/// [specification]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
597+
/// [Variant Shredding specification]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md
598+
///
599+
/// # Examples
600+
///
601+
/// ## Creating `Variant` from Rust Types
602+
/// ```
603+
/// # use parquet_variant::Variant;
604+
/// // variants can be directly constructed
605+
/// let variant = Variant::Int32(123);
606+
/// // or constructed via `From` impls
607+
/// assert_eq!(variant, Variant::from(123i32));
608+
/// ```
609+
/// ## Creating `Variant` from metadata and value
610+
/// ```
611+
/// # use parquet_variant::{Variant, VariantMetadata};
612+
/// let metadata = [0x01, 0x00, 0x00];
613+
/// let value = [0x09, 0x48, 0x49];
614+
/// // parse the metadata and value buffers into a `Variant`
615+
/// assert_eq!(
616+
/// Variant::ShortString("HI"),
617+
/// Variant::try_new(&metadata, &value).unwrap()
618+
/// );
619+
/// ```
620+
///
621+
/// ## Using `Variant` values
622+
/// ```
623+
/// # use parquet_variant::Variant;
624+
/// # let variant = Variant::Int32(123);
625+
/// // variants can be used in match statements like normal enums
626+
/// match variant {
627+
/// Variant::Int32(i) => println!("Integer: {}", i),
628+
/// Variant::String(s) => println!("String: {}", s),
629+
/// _ => println!("Other variant"),
630+
/// }
631+
/// ```
540632
#[derive(Clone, Debug, PartialEq)]
541633
pub enum Variant<'m, 'v> {
542-
// TODO: Add types for the rest of the primitive types, once API is agreed upon
634+
/// Primitive type: Null
543635
Null,
636+
/// Primitive (type_id=0): INT(8, SIGNED)
544637
Int8(i8),
638+
/// Primitive (type_id=0): INT(16, SIGNED)
545639
Int16(i16),
640+
/// Primitive (type_id=0): INT(32, SIGNED)
546641
Int32(i32),
642+
/// Primitive (type_id=0): INT(64, SIGNED)
547643
Int64(i64),
644+
/// Primitive (type_id=0): DATE
548645
Date(NaiveDate),
646+
/// Primitive (type_id=0): TIMESTAMP(isAdjustedToUTC=true, MICROS)
549647
TimestampMicros(DateTime<Utc>),
648+
/// Primitive (type_id=0): TIMESTAMP(isAdjustedToUTC=false, MICROS)
550649
TimestampNtzMicros(NaiveDateTime),
650+
/// Primitive (type_id=0): DECIMAL(precision, scale) 32-bits
551651
Decimal4 { integer: i32, scale: u8 },
652+
/// Primitive (type_id=0): DECIMAL(precision, scale) 64-bits
552653
Decimal8 { integer: i64, scale: u8 },
654+
/// Primitive (type_id=0): DECIMAL(precision, scale) 128-bits
553655
Decimal16 { integer: i128, scale: u8 },
656+
/// Primitive (type_id=0): FLOAT
554657
Float(f32),
658+
/// Primitive (type_id=0): DOUBLE
555659
Double(f64),
660+
/// Primitive (type_id=0): BOOLEAN (true)
556661
BooleanTrue,
662+
/// Primitive (type_id=0): BOOLEAN (false)
557663
BooleanFalse,
558-
559-
// Note: only need the *value* buffer
664+
// Note: only need the *value* buffer for these types
665+
/// Primitive (type_id=0): BINARY
560666
Binary(&'v [u8]),
667+
/// Primitive (type_id=0): STRING
561668
String(&'v str),
669+
/// Short String (type_id=1): STRING
562670
ShortString(&'v str),
563-
564671
// need both metadata & value
672+
/// Object (type_id=2): N/A
565673
Object(VariantObject<'m, 'v>),
674+
/// Array (type_id=3): N/A
566675
List(VariantList<'m, 'v>),
567676
}
568677

@@ -574,6 +683,7 @@ impl<'m, 'v> Variant<'m, 'v> {
574683
/// # use parquet_variant::{Variant, VariantMetadata};
575684
/// let metadata = [0x01, 0x00, 0x00];
576685
/// let value = [0x09, 0x48, 0x49];
686+
/// // parse the metadata and value buffers into a `Variant`
577687
/// assert_eq!(
578688
/// Variant::ShortString("HI"),
579689
/// Variant::try_new(&metadata, &value).unwrap()
@@ -629,7 +739,6 @@ impl<'m, 'v> Variant<'m, 'v> {
629739
}
630740
VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue,
631741
VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse,
632-
// TODO: Add types for the rest, once API is agreed upon
633742
VariantPrimitiveType::Date => Variant::Date(decoder::decode_date(value_data)?),
634743
VariantPrimitiveType::TimestampMicros => {
635744
Variant::TimestampMicros(decoder::decode_timestamp_micros(value_data)?)

0 commit comments

Comments
 (0)