Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet-variant-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ mod variant_array_builder;
pub mod variant_get;
mod variant_to_arrow;

pub use variant_array::{ShreddingState, VariantArray, VariantType};
pub use variant_array::{BorrowedShreddingState, ShreddingState, VariantArray, VariantType};
pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};

pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};
Expand Down
110 changes: 84 additions & 26 deletions parquet-variant-compute/src/variant_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,26 +272,11 @@ impl VariantArray {
)));
};

// Extract value and typed_value fields
let value = if let Some(value_col) = inner.column_by_name("value") {
if let Some(binary_view) = value_col.as_binary_view_opt() {
Some(binary_view.clone())
} else {
return Err(ArrowError::NotYetImplemented(format!(
"VariantArray 'value' field must be BinaryView, got {}",
value_col.data_type()
)));
}
} else {
None
};
let typed_value = inner.column_by_name("typed_value").cloned();
Comment on lines -275 to -288
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now encapsulated by ShreddingState::try_from


// Note these clones are cheap, they just bump the ref count
Ok(Self {
inner: inner.clone(),
metadata: metadata.clone(),
shredding_state: ShreddingState::new(value, typed_value),
shredding_state: ShreddingState::try_from(inner)?,
})
}

Expand Down Expand Up @@ -521,7 +506,7 @@ impl ShreddedVariantFieldArray {
// Note this clone is cheap, it just bumps the ref count
Ok(Self {
inner: inner_struct.clone(),
shredding_state: ShreddingState::from(inner_struct),
shredding_state: ShreddingState::try_from(inner_struct)?,
})
}

Expand Down Expand Up @@ -660,7 +645,7 @@ impl ShreddingState {
/// Create a new `ShreddingState` from the given `value` and `typed_value` fields
///
/// Note you can create a `ShreddingState` from a &[`StructArray`] using
/// `ShreddingState::from(&struct_array)`, for example:
/// `ShreddingState::try_from(&struct_array)`, for example:
///
/// ```no_run
/// # use arrow::array::StructArray;
Expand All @@ -669,7 +654,7 @@ impl ShreddingState {
/// # unimplemented!()
/// # }
/// let struct_array: StructArray = get_struct_array();
/// let shredding_state = ShreddingState::from(&struct_array);
/// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
/// ```
pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
Self { value, typed_value }
Expand All @@ -685,6 +670,14 @@ impl ShreddingState {
self.typed_value.as_ref()
}

/// Returns a borrowed version of this shredding state
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

pub fn borrow(&self) -> BorrowedShreddingState<'_> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally had an impl From instead, but this seems to be more readable/intuitive at call sites.

BorrowedShreddingState {
value: self.value_field(),
typed_value: self.typed_value_field(),
}
}

/// Slice all the underlying arrays
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
Expand All @@ -694,14 +687,79 @@ impl ShreddingState {
}
}

impl From<&StructArray> for ShreddingState {
fn from(inner_struct: &StructArray) -> Self {
let value = inner_struct
.column_by_name("value")
.and_then(|col| col.as_binary_view_opt().cloned());
let typed_value = inner_struct.column_by_name("typed_value").cloned();
/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
/// for avoiding clone operations when the caller does not need a self-standing shredding state.
#[derive(Clone, Debug)]
pub struct BorrowedShreddingState<'a> {
value: Option<&'a BinaryViewArray>,
typed_value: Option<&'a ArrayRef>,
}

ShreddingState::new(value, typed_value)
impl<'a> BorrowedShreddingState<'a> {
/// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
///
/// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
/// `BorrowedShreddingState::try_from(&struct_array)`, for example:
///
/// ```no_run
/// # use arrow::array::StructArray;
/// # use parquet_variant_compute::BorrowedShreddingState;
/// # fn get_struct_array() -> StructArray {
/// # unimplemented!()
/// # }
/// let struct_array: StructArray = get_struct_array();
/// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
/// ```
pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
Self { value, typed_value }
}

/// Return a reference to the value field, if present
pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
self.value
}

/// Return a reference to the typed_value field, if present
pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
self.typed_value
}
}

impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
type Error = ArrowError;

fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
// The `value` column need not exist, but if it does it must be a binary view.
let value = if let Some(value_col) = inner_struct.column_by_name("value") {
let Some(binary_view) = value_col.as_binary_view_opt() else {
return Err(ArrowError::NotYetImplemented(format!(
"VariantArray 'value' field must be BinaryView, got {}",
value_col.data_type()
)));
};
Some(binary_view)
} else {
None
};
let typed_value = inner_struct.column_by_name("typed_value");
Ok(BorrowedShreddingState::new(value, typed_value))
}
}

impl TryFrom<&StructArray> for ShreddingState {
type Error = ArrowError;

fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
}
}

impl From<BorrowedShreddingState<'_>> for ShreddingState {
fn from(state: BorrowedShreddingState<'_>) -> Self {
ShreddingState {
value: state.value_field().cloned(),
typed_value: state.typed_value_field().cloned(),
}
}
}

Expand Down
25 changes: 12 additions & 13 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@ use arrow::{
use arrow_schema::{ArrowError, DataType, FieldRef};
use parquet_variant::{VariantPath, VariantPathElement};

use crate::variant_array::ShreddingState;
use crate::variant_array::BorrowedShreddingState;
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
use crate::VariantArray;

use arrow::array::AsArray;
use std::sync::Arc;

pub(crate) enum ShreddedPathStep {
pub(crate) enum ShreddedPathStep<'a> {
/// Path step succeeded, return the new shredding state
Success(ShreddingState),
Success(BorrowedShreddingState<'a>),
/// The path element is not present in the `typed_value` column and there is no `value` column,
/// so we know it does not exist. It, and all paths under it, are all-NULL.
Missing,
Expand All @@ -47,18 +47,16 @@ pub(crate) enum ShreddedPathStep {
/// level, or if `typed_value` is not a struct, or if the requested field name does not exist.
///
/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible.
pub(crate) fn follow_shredded_path_element(
shredding_state: &ShreddingState,
pub(crate) fn follow_shredded_path_element<'a>(
shredding_state: &BorrowedShreddingState<'a>,
path_element: &VariantPathElement<'_>,
cast_options: &CastOptions,
) -> Result<ShreddedPathStep> {
) -> Result<ShreddedPathStep<'a>> {
// If the requested path element is not present in `typed_value`, and `value` is missing, then
// we know it does not exist; it, and all paths under it, are all-NULL.
let missing_path_step = || {
let Some(_value_field) = shredding_state.value_field() else {
return ShreddedPathStep::Missing;
};
ShreddedPathStep::NotShredded
let missing_path_step = || match shredding_state.value_field() {
Some(_) => ShreddedPathStep::NotShredded,
None => ShreddedPathStep::Missing,
};

let Some(typed_value) = shredding_state.typed_value_field() else {
Expand Down Expand Up @@ -98,7 +96,8 @@ pub(crate) fn follow_shredded_path_element(
))
})?;

Ok(ShreddedPathStep::Success(struct_array.into()))
let state = BorrowedShreddingState::try_from(struct_array)?;
Ok(ShreddedPathStep::Success(state))
}
VariantPathElement::Index { .. } => {
// TODO: Support array indexing. Among other things, it will require slicing not
Expand Down Expand Up @@ -152,7 +151,7 @@ fn shredded_get_path(

// Peel away the prefix of path elements that traverses the shredded parts of this variant
// column. Shredding will traverse the rest of the path on a per-row basis.
let mut shredding_state = input.shredding_state().clone();
let mut shredding_state = input.shredding_state().borrow();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎉

let mut accumulated_nulls = input.inner().nulls().cloned();
let mut path_index = 0;
for path_element in path {
Expand Down
Loading