-
Notifications
You must be signed in to change notification settings - Fork 130
feat[vortex-array]: expr array that represents lazy computation #5400
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
c13a536
feat[vortex-array]: add an expr array
joseph-isaacs 4e0c3c8
feat[vortex-array]: add an expr array
joseph-isaacs d2af6a0
feat[vortex-array]: add an expr array
joseph-isaacs 0ca42b1
feat[vortex-array]: add an expr array
joseph-isaacs 362c8c7
u
joseph-isaacs 3e37c28
u
joseph-isaacs bdd0a4a
u
joseph-isaacs acbb439
u
joseph-isaacs 0164659
u
joseph-isaacs File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use vortex_dtype::DType; | ||
| use vortex_error::{VortexResult, vortex_ensure}; | ||
|
|
||
| use crate::expr::Expression; | ||
| use crate::stats::ArrayStats; | ||
| use crate::{Array, ArrayRef}; | ||
|
|
||
| /// A array that represents an expression to be evaluated over a child array. | ||
| /// | ||
| /// `ExprArray` enables deferred evaluation of expressions by wrapping a child array | ||
| /// with an expression that operates on it. The expression is not evaluated until the | ||
| /// array is canonicalized/executed. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ```ignore | ||
| /// // Create an expression that filters an integer array | ||
| /// let data = PrimitiveArray::from_iter([1, 2, 3, 4, 5]); | ||
| /// let expr = gt(root(), lit(3)); // $ > 3 | ||
| /// let expr_array = ExprArray::new_infer_dtype(data.into_array(), expr)?; | ||
| /// | ||
| /// // The expression is evaluated when canonicalized | ||
| /// let result = expr_array.to_canonical(); // Returns BoolArray([false, false, false, true, true]) | ||
| /// ``` | ||
| /// | ||
| /// # Type Safety | ||
| /// | ||
| /// The `dtype` field must match `expr.return_dtype(child.dtype())`. This invariant | ||
| /// is enforced by the safe constructors ([`try_new`](ExprArray::try_new) and | ||
| /// [`new_infer_dtype`](ExprArray::new_infer_dtype)) but can be bypassed | ||
| /// with [`unchecked_new`](ExprArray::unchecked_new) for performance-critical code. | ||
| #[derive(Clone, Debug)] | ||
| pub struct ExprArray { | ||
| /// The underlying array that the expression will operate on. | ||
| pub(super) child: ArrayRef, | ||
| /// The expression to evaluate over the child array. | ||
| pub(super) expr: Expression, | ||
| /// The data type of the result after evaluating the expression. | ||
| pub(super) dtype: DType, | ||
| /// Statistics about the resulting array (may be computed lazily). | ||
| pub(super) stats: ArrayStats, | ||
| } | ||
|
|
||
| impl ExprArray { | ||
| /// Creates a new ExprArray with the dtype validated to match the expression's return type. | ||
| pub fn try_new(child: ArrayRef, expr: Expression, dtype: DType) -> VortexResult<Self> { | ||
| let expected_dtype = expr.return_dtype(child.dtype())?; | ||
| vortex_ensure!( | ||
| dtype == expected_dtype, | ||
| "ExprArray dtype mismatch: expected {}, got {}", | ||
| expected_dtype, | ||
| dtype | ||
| ); | ||
| Ok(unsafe { Self::unchecked_new(child, expr, dtype) }) | ||
| } | ||
|
|
||
| /// Create a new ExprArray without validating that the dtype matches the expression's return type. | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// The caller must ensure that `dtype` matches `expr.return_dtype(child.dtype())`. | ||
| /// Violating this invariant may lead to incorrect results or panics when the array is used. | ||
| pub unsafe fn unchecked_new(child: ArrayRef, expr: Expression, dtype: DType) -> Self { | ||
| Self { | ||
| child, | ||
| expr, | ||
| dtype, | ||
| // TODO(joe): Propagate or compute statistics from the child array and expression. | ||
| stats: ArrayStats::default(), | ||
| } | ||
| } | ||
|
|
||
| /// Creates a new ExprArray with the dtype inferred from the expression and child. | ||
| pub fn new_infer_dtype(child: ArrayRef, expr: Expression) -> VortexResult<Self> { | ||
| let dtype = expr.return_dtype(child.dtype())?; | ||
| Ok(unsafe { Self::unchecked_new(child, expr, dtype) }) | ||
| } | ||
|
|
||
| pub fn child(&self) -> &ArrayRef { | ||
| &self.child | ||
| } | ||
|
|
||
| pub fn expr(&self) -> &Expression { | ||
| &self.expr | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| mod array; | ||
| pub use array::ExprArray; | ||
|
|
||
| mod vtable; | ||
| pub use vtable::{ExprEncoding, ExprVTable}; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use std::hash::Hash; | ||
|
|
||
| use vortex_dtype::DType; | ||
|
|
||
| use crate::Precision; | ||
| use crate::arrays::expr::{ExprArray, ExprVTable}; | ||
| use crate::hash::{ArrayEq, ArrayHash}; | ||
| use crate::stats::StatsSetRef; | ||
| use crate::vtable::ArrayVTable; | ||
|
|
||
| impl ArrayVTable<ExprVTable> for ExprVTable { | ||
| fn len(array: &ExprArray) -> usize { | ||
| array.child.len() | ||
| } | ||
|
|
||
| fn dtype(array: &ExprArray) -> &DType { | ||
| &array.dtype | ||
| } | ||
|
|
||
| fn stats(array: &ExprArray) -> StatsSetRef<'_> { | ||
| array.stats.to_ref(array.as_ref()) | ||
| } | ||
|
|
||
| fn array_hash<H: std::hash::Hasher>(array: &ExprArray, state: &mut H, precision: Precision) { | ||
| array.child.array_hash(state, precision); | ||
| array.dtype.hash(state); | ||
| array.expr.hash(state) | ||
| } | ||
|
|
||
| fn array_eq(array: &ExprArray, other: &ExprArray, precision: Precision) -> bool { | ||
| array.child.array_eq(&other.child, precision) | ||
| && array.dtype == other.dtype | ||
| && array.expr == other.expr | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use vortex_error::VortexExpect; | ||
|
|
||
| use crate::Canonical; | ||
| use crate::arrays::expr::{ExprArray, ExprVTable}; | ||
| use crate::vtable::CanonicalVTable; | ||
|
|
||
| impl CanonicalVTable<ExprVTable> for ExprVTable { | ||
| fn canonicalize(array: &ExprArray) -> Canonical { | ||
| array | ||
| .expr | ||
| .evaluate(&array.child) | ||
| .vortex_expect("Failed to evaluate expression") | ||
| .to_canonical() | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use vortex_buffer::buffer; | ||
| use vortex_dtype::Nullability::NonNullable; | ||
| use vortex_dtype::{DType, PType}; | ||
|
|
||
| use crate::arrays::expr::ExprArray; | ||
| use crate::arrays::primitive::PrimitiveArray; | ||
| use crate::expr::binary::checked_add; | ||
| use crate::expr::literal::lit; | ||
| use crate::validity::Validity; | ||
| use crate::{Array, IntoArray, assert_arrays_eq}; | ||
|
|
||
| #[test] | ||
| fn test_expr_array_canonicalize() { | ||
| let child = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable).into_array(); | ||
|
|
||
| // This expression doesn't use the child, but demonstrates the ExprArray mechanics | ||
| let expr = checked_add(lit(10), lit(5)); | ||
|
|
||
| let dtype = DType::Primitive(PType::I32, NonNullable); | ||
| let expr_array = ExprArray::try_new(child, expr, dtype).unwrap(); | ||
|
|
||
| let actual = expr_array.to_canonical().into_array(); | ||
|
|
||
| let expect = (0..3).map(|_| 15i32).collect::<PrimitiveArray>(); | ||
| assert_arrays_eq!(expect, actual); | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| mod array; | ||
| mod canonical; | ||
| mod operations; | ||
| mod operator; | ||
| mod visitor; | ||
|
|
||
| use std::fmt::Debug; | ||
|
|
||
| use vortex_buffer::ByteBuffer; | ||
| use vortex_dtype::DType; | ||
| use vortex_error::{VortexResult, vortex_bail}; | ||
|
|
||
| use crate::arrays::expr::ExprArray; | ||
| use crate::expr::Expression; | ||
| use crate::serde::ArrayChildren; | ||
| use crate::vtable::{NotSupported, VTable}; | ||
| use crate::{EncodingId, EncodingRef, vtable}; | ||
|
|
||
| vtable!(Expr); | ||
|
|
||
| #[derive(Clone, Debug)] | ||
| pub struct ExprEncoding; | ||
|
|
||
| impl VTable for ExprVTable { | ||
| type Array = ExprArray; | ||
| type Encoding = ExprEncoding; | ||
| type Metadata = ExprArrayMetadata; | ||
|
|
||
| type ArrayVTable = Self; | ||
| type CanonicalVTable = Self; | ||
| type OperationsVTable = Self; | ||
| type ValidityVTable = NotSupported; | ||
| type VisitorVTable = Self; | ||
| type ComputeVTable = NotSupported; | ||
| type EncodeVTable = NotSupported; | ||
| type OperatorVTable = Self; | ||
|
|
||
| fn id(_encoding: &Self::Encoding) -> EncodingId { | ||
| EncodingId::new_ref("vortex.expr") | ||
| } | ||
|
|
||
| fn encoding(_array: &Self::Array) -> EncodingRef { | ||
| EncodingRef::new_ref(ExprEncoding.as_ref()) | ||
| } | ||
|
|
||
| fn metadata(array: &ExprArray) -> VortexResult<Self::Metadata> { | ||
| Ok(ExprArrayMetadata((array.expr.clone(), array.dtype.clone()))) | ||
| } | ||
|
|
||
| fn serialize(_metadata: Self::Metadata) -> VortexResult<Option<Vec<u8>>> { | ||
| Ok(None) | ||
| } | ||
|
|
||
| fn deserialize(_bytes: &[u8]) -> VortexResult<Self::Metadata> { | ||
| vortex_bail!("unsupported") | ||
| } | ||
|
|
||
| fn build( | ||
| _encoding: &ExprEncoding, | ||
| dtype: &DType, | ||
| len: usize, | ||
| ExprArrayMetadata((expr, root_dtype)): &Self::Metadata, | ||
| buffers: &[ByteBuffer], | ||
| children: &dyn ArrayChildren, | ||
| ) -> VortexResult<ExprArray> { | ||
| if !buffers.is_empty() { | ||
| vortex_bail!("Expected 0 buffers, got {}", buffers.len()); | ||
| } | ||
|
|
||
| let Ok(child) = children.get(0, root_dtype, len) else { | ||
| vortex_bail!("Expected 1 child, got {}", children.len()); | ||
| }; | ||
|
|
||
| ExprArray::try_new(child, expr.clone(), dtype.clone()) | ||
| } | ||
| } | ||
|
|
||
| pub struct ExprArrayMetadata((Expression, DType)); | ||
|
|
||
| impl Debug for ExprArrayMetadata { | ||
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
| // Since this is used in display method we can omit the dtype. | ||
| self.0.0.fmt_sql(f) | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use std::ops::Range; | ||
|
|
||
| use vortex_error::VortexExpect; | ||
| use vortex_scalar::Scalar; | ||
|
|
||
| use crate::arrays::ConstantArray; | ||
| use crate::arrays::expr::{ExprArray, ExprVTable}; | ||
| use crate::stats::ArrayStats; | ||
| use crate::vtable::OperationsVTable; | ||
| use crate::{Array, ArrayRef, IntoArray}; | ||
|
|
||
| impl OperationsVTable<ExprVTable> for ExprVTable { | ||
| fn slice(array: &ExprArray, range: Range<usize>) -> ArrayRef { | ||
| let child = array.child.slice(range); | ||
|
|
||
| ExprArray { | ||
| child, | ||
| expr: array.expr.clone(), | ||
| dtype: array.dtype.clone(), | ||
| stats: ArrayStats::default(), | ||
| } | ||
| .into_array() | ||
| } | ||
|
|
||
| fn scalar_at(array: &ExprArray, index: usize) -> Scalar { | ||
| // TODO(joe): this is unchecked | ||
| array | ||
| .expr | ||
| .evaluate(&ConstantArray::new(array.child.scalar_at(index), 1).into_array()) | ||
| .vortex_expect("cannot fail") | ||
| .as_constant() | ||
| .vortex_expect("expr are scalar so cannot fail") | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use vortex_error::VortexResult; | ||
|
|
||
| use crate::ArrayRef; | ||
| use crate::arrays::expr::{ExprArray, ExprVTable}; | ||
| use crate::expr::root; | ||
| use crate::expr::session::ExprSession; | ||
| use crate::expr::transform::ExprOptimizer; | ||
| use crate::vtable::OperatorVTable; | ||
|
|
||
| impl OperatorVTable<ExprVTable> for ExprVTable { | ||
| fn reduce(array: &ExprArray) -> VortexResult<Option<ArrayRef>> { | ||
| // Get the default expression session | ||
| let session = ExprSession::default(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe leave a louder fixme / todo that we have to inject this |
||
| let optimizer = ExprOptimizer::new(&session); | ||
|
|
||
| // Try to optimize the expression with type information | ||
| let optimized_expr = | ||
| optimizer.optimize_typed(array.expr().clone(), array.child().dtype())?; | ||
|
|
||
| if optimized_expr != *array.expr() { | ||
| // If the expression simplified to just root(), return the child directly | ||
| if optimized_expr == root() { | ||
| return Ok(Some(array.child().clone())); | ||
| } | ||
|
|
||
| let new_dtype = optimized_expr.return_dtype(array.child().dtype())?; | ||
| Ok(Some( | ||
| ExprArray::try_new(array.child().clone(), optimized_expr, new_dtype)?.into(), | ||
| )) | ||
| } else { | ||
| Ok(None) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
|
|
||
| use vortex_dtype::Nullability; | ||
| use vortex_error::VortexExpect; | ||
|
|
||
| use super::*; | ||
| use crate::IntoArray; | ||
| use crate::arrays::{PrimitiveArray, PrimitiveVTable}; | ||
| use crate::expr::{get_item, pack, root}; | ||
|
|
||
| #[test] | ||
| fn test_expr_array_reduce_pack_unpack() -> VortexResult<()> { | ||
| let array = PrimitiveArray::from_iter([1i32, 2, 3, 4, 5]); | ||
|
|
||
| let expr = get_item("a", pack([("a", root())], Nullability::NonNullable)); | ||
|
|
||
| let expr_array = ExprArray::new_infer_dtype(array.into_array(), expr)?; | ||
|
|
||
| // Call reduce - it should optimize pack(a: $).a to just $ | ||
| let reduced = expr_array.reduce()?.vortex_expect("reduce failed"); | ||
|
|
||
| assert!(reduced.is::<PrimitiveVTable>()); | ||
|
|
||
| Ok(()) | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use crate::arrays::expr::{ExprArray, ExprVTable}; | ||
| use crate::vtable::VisitorVTable; | ||
| use crate::{ArrayBufferVisitor, ArrayChildVisitor}; | ||
|
|
||
| impl VisitorVTable<ExprVTable> for ExprVTable { | ||
| fn visit_buffers(_array: &ExprArray, _visitor: &mut dyn ArrayBufferVisitor) {} | ||
|
|
||
| fn visit_children(array: &ExprArray, visitor: &mut dyn ArrayChildVisitor) { | ||
| visitor.visit_child("child", array.child.as_ref()); | ||
| } | ||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I quite like the pattern we have started to use in vortex-vector where we have
newthat callstry_new().expect(), andnew_unchecked(note not unchecked_new) hasif cfg(debug_assertions) { try_new().expect() } else { Self { ... }}