-
Notifications
You must be signed in to change notification settings - Fork 1k
[Variant] Introduce new BorrowedShreddingState concept #8499
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -272,26 +272,11 @@ impl VariantArray { | |
))); | ||
}; | ||
|
||
// Extract value and typed_value fields | ||
let value = if let Some(value_col) = inner.column_by_name("value") { | ||
if let Some(binary_view) = value_col.as_binary_view_opt() { | ||
Some(binary_view.clone()) | ||
} else { | ||
return Err(ArrowError::NotYetImplemented(format!( | ||
"VariantArray 'value' field must be BinaryView, got {}", | ||
value_col.data_type() | ||
))); | ||
} | ||
} else { | ||
None | ||
}; | ||
let typed_value = inner.column_by_name("typed_value").cloned(); | ||
|
||
// Note these clones are cheap, they just bump the ref count | ||
Ok(Self { | ||
inner: inner.clone(), | ||
metadata: metadata.clone(), | ||
shredding_state: ShreddingState::new(value, typed_value), | ||
shredding_state: ShreddingState::try_from(inner)?, | ||
}) | ||
} | ||
|
||
|
@@ -521,7 +506,7 @@ impl ShreddedVariantFieldArray { | |
// Note this clone is cheap, it just bumps the ref count | ||
Ok(Self { | ||
inner: inner_struct.clone(), | ||
shredding_state: ShreddingState::from(inner_struct), | ||
shredding_state: ShreddingState::try_from(inner_struct)?, | ||
}) | ||
} | ||
|
||
|
@@ -660,7 +645,7 @@ impl ShreddingState { | |
/// Create a new `ShreddingState` from the given `value` and `typed_value` fields | ||
/// | ||
/// Note you can create a `ShreddingState` from a &[`StructArray`] using | ||
/// `ShreddingState::from(&struct_array)`, for example: | ||
/// `ShreddingState::try_from(&struct_array)`, for example: | ||
/// | ||
/// ```no_run | ||
/// # use arrow::array::StructArray; | ||
|
@@ -669,7 +654,7 @@ impl ShreddingState { | |
/// # unimplemented!() | ||
/// # } | ||
/// let struct_array: StructArray = get_struct_array(); | ||
/// let shredding_state = ShreddingState::from(&struct_array); | ||
/// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); | ||
/// ``` | ||
pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self { | ||
Self { value, typed_value } | ||
|
@@ -685,6 +670,14 @@ impl ShreddingState { | |
self.typed_value.as_ref() | ||
} | ||
|
||
/// Returns a borrowed version of this shredding state | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
pub fn borrow(&self) -> BorrowedShreddingState<'_> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I originally had an |
||
BorrowedShreddingState { | ||
value: self.value_field(), | ||
typed_value: self.typed_value_field(), | ||
} | ||
} | ||
|
||
/// Slice all the underlying arrays | ||
pub fn slice(&self, offset: usize, length: usize) -> Self { | ||
Self { | ||
|
@@ -694,14 +687,79 @@ impl ShreddingState { | |
} | ||
} | ||
|
||
impl From<&StructArray> for ShreddingState { | ||
fn from(inner_struct: &StructArray) -> Self { | ||
let value = inner_struct | ||
.column_by_name("value") | ||
.and_then(|col| col.as_binary_view_opt().cloned()); | ||
let typed_value = inner_struct.column_by_name("typed_value").cloned(); | ||
/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful | ||
/// for avoiding clone operations when the caller does not need a self-standing shredding state. | ||
#[derive(Clone, Debug)] | ||
pub struct BorrowedShreddingState<'a> { | ||
value: Option<&'a BinaryViewArray>, | ||
typed_value: Option<&'a ArrayRef>, | ||
} | ||
|
||
ShreddingState::new(value, typed_value) | ||
impl<'a> BorrowedShreddingState<'a> { | ||
/// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields | ||
/// | ||
/// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using | ||
/// `BorrowedShreddingState::try_from(&struct_array)`, for example: | ||
/// | ||
/// ```no_run | ||
/// # use arrow::array::StructArray; | ||
/// # use parquet_variant_compute::BorrowedShreddingState; | ||
/// # fn get_struct_array() -> StructArray { | ||
/// # unimplemented!() | ||
/// # } | ||
/// let struct_array: StructArray = get_struct_array(); | ||
/// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap(); | ||
/// ``` | ||
pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self { | ||
Self { value, typed_value } | ||
} | ||
|
||
/// Return a reference to the value field, if present | ||
pub fn value_field(&self) -> Option<&'a BinaryViewArray> { | ||
self.value | ||
} | ||
|
||
/// Return a reference to the typed_value field, if present | ||
pub fn typed_value_field(&self) -> Option<&'a ArrayRef> { | ||
self.typed_value | ||
} | ||
} | ||
|
||
impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> { | ||
type Error = ArrowError; | ||
|
||
fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> { | ||
// The `value` column need not exist, but if it does it must be a binary view. | ||
let value = if let Some(value_col) = inner_struct.column_by_name("value") { | ||
let Some(binary_view) = value_col.as_binary_view_opt() else { | ||
return Err(ArrowError::NotYetImplemented(format!( | ||
"VariantArray 'value' field must be BinaryView, got {}", | ||
value_col.data_type() | ||
))); | ||
}; | ||
Some(binary_view) | ||
} else { | ||
None | ||
}; | ||
let typed_value = inner_struct.column_by_name("typed_value"); | ||
Ok(BorrowedShreddingState::new(value, typed_value)) | ||
} | ||
} | ||
|
||
impl TryFrom<&StructArray> for ShreddingState { | ||
type Error = ArrowError; | ||
|
||
fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> { | ||
Ok(BorrowedShreddingState::try_from(inner_struct)?.into()) | ||
} | ||
} | ||
|
||
impl From<BorrowedShreddingState<'_>> for ShreddingState { | ||
fn from(state: BorrowedShreddingState<'_>) -> Self { | ||
ShreddingState { | ||
value: state.value_field().cloned(), | ||
typed_value: state.typed_value_field().cloned(), | ||
} | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,16 +23,16 @@ use arrow::{ | |
use arrow_schema::{ArrowError, DataType, FieldRef}; | ||
use parquet_variant::{VariantPath, VariantPathElement}; | ||
|
||
use crate::variant_array::ShreddingState; | ||
use crate::variant_array::BorrowedShreddingState; | ||
use crate::variant_to_arrow::make_variant_to_arrow_row_builder; | ||
use crate::VariantArray; | ||
|
||
use arrow::array::AsArray; | ||
use std::sync::Arc; | ||
|
||
pub(crate) enum ShreddedPathStep { | ||
pub(crate) enum ShreddedPathStep<'a> { | ||
/// Path step succeeded, return the new shredding state | ||
Success(ShreddingState), | ||
Success(BorrowedShreddingState<'a>), | ||
/// The path element is not present in the `typed_value` column and there is no `value` column, | ||
/// so we know it does not exist. It, and all paths under it, are all-NULL. | ||
Missing, | ||
|
@@ -47,18 +47,16 @@ pub(crate) enum ShreddedPathStep { | |
/// level, or if `typed_value` is not a struct, or if the requested field name does not exist. | ||
/// | ||
/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. | ||
pub(crate) fn follow_shredded_path_element( | ||
shredding_state: &ShreddingState, | ||
pub(crate) fn follow_shredded_path_element<'a>( | ||
shredding_state: &BorrowedShreddingState<'a>, | ||
path_element: &VariantPathElement<'_>, | ||
cast_options: &CastOptions, | ||
) -> Result<ShreddedPathStep> { | ||
) -> Result<ShreddedPathStep<'a>> { | ||
// If the requested path element is not present in `typed_value`, and `value` is missing, then | ||
// we know it does not exist; it, and all paths under it, are all-NULL. | ||
let missing_path_step = || { | ||
let Some(_value_field) = shredding_state.value_field() else { | ||
return ShreddedPathStep::Missing; | ||
}; | ||
ShreddedPathStep::NotShredded | ||
let missing_path_step = || match shredding_state.value_field() { | ||
Some(_) => ShreddedPathStep::NotShredded, | ||
None => ShreddedPathStep::Missing, | ||
}; | ||
|
||
let Some(typed_value) = shredding_state.typed_value_field() else { | ||
|
@@ -98,7 +96,8 @@ pub(crate) fn follow_shredded_path_element( | |
)) | ||
})?; | ||
|
||
Ok(ShreddedPathStep::Success(struct_array.into())) | ||
let state = BorrowedShreddingState::try_from(struct_array)?; | ||
Ok(ShreddedPathStep::Success(state)) | ||
} | ||
VariantPathElement::Index { .. } => { | ||
// TODO: Support array indexing. Among other things, it will require slicing not | ||
|
@@ -152,7 +151,7 @@ fn shredded_get_path( | |
|
||
// Peel away the prefix of path elements that traverses the shredded parts of this variant | ||
// column. Shredding will traverse the rest of the path on a per-row basis. | ||
let mut shredding_state = input.shredding_state().clone(); | ||
let mut shredding_state = input.shredding_state().borrow(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🎉 |
||
let mut accumulated_nulls = input.inner().nulls().cloned(); | ||
let mut path_index = 0; | ||
for path_element in path { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now encapsulated by
ShreddingState::try_from