-
Notifications
You must be signed in to change notification settings - Fork 1k
Improve MutableArrayData Null Handling (#1224) (#1230) #1225
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bfa0024
f957077
338b703
41ad433
137433c
07e9460
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -19,12 +19,11 @@ use super::{ | |||||
| data::{into_buffers, new_buffers}, | ||||||
| ArrayData, ArrayDataBuilder, | ||||||
| }; | ||||||
| use crate::array::StringOffsetSizeTrait; | ||||||
| use crate::array::{BooleanBufferBuilder, StringOffsetSizeTrait}; | ||||||
| use crate::{ | ||||||
| buffer::MutableBuffer, | ||||||
| datatypes::DataType, | ||||||
| error::{ArrowError, Result}, | ||||||
| util::bit_util, | ||||||
| }; | ||||||
| use half::f16; | ||||||
| use std::mem; | ||||||
|
|
@@ -53,7 +52,7 @@ struct _MutableArrayData<'a> { | |||||
| pub null_count: usize, | ||||||
|
|
||||||
| pub len: usize, | ||||||
| pub null_buffer: MutableBuffer, | ||||||
| pub null_buffer: Option<BooleanBufferBuilder>, | ||||||
|
|
||||||
| // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). | ||||||
| // Thus, we place them in the stack to avoid bound checks and greater data locality. | ||||||
|
|
@@ -83,39 +82,35 @@ impl<'a> _MutableArrayData<'a> { | |||||
| .null_count(self.null_count) | ||||||
| .buffers(buffers) | ||||||
| .child_data(child_data); | ||||||
| if self.null_count > 0 { | ||||||
| array_data_builder = | ||||||
| array_data_builder.null_bit_buffer(self.null_buffer.into()); | ||||||
|
|
||||||
| if let Some(mut null_buffer) = self.null_buffer { | ||||||
| assert_eq!(null_buffer.len(), self.len, "Inconsistent bitmap"); | ||||||
| if self.null_count > 0 { | ||||||
| array_data_builder = | ||||||
| array_data_builder.null_bit_buffer(null_buffer.finish()); | ||||||
| } | ||||||
| } else { | ||||||
| assert_eq!(self.null_count, 0, "Non-zero null count but no null bitmap") | ||||||
| } | ||||||
|
|
||||||
| array_data_builder | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { | ||||||
| /// Build a function to extend a `MutableArrayData` with a slice of the null bitmap of `array` | ||||||
| fn build_extend_null_bits(array: &ArrayData) -> ExtendNullBits { | ||||||
| if let Some(bitmap) = array.null_bitmap() { | ||||||
| let bytes = bitmap.bits.as_slice(); | ||||||
| Box::new(move |mutable, start, len| { | ||||||
| utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); | ||||||
| mutable.null_count += crate::util::bit_mask::set_bits( | ||||||
| mutable.null_buffer.as_slice_mut(), | ||||||
| bytes, | ||||||
| mutable.len, | ||||||
| array.offset() + start, | ||||||
| len, | ||||||
| ); | ||||||
| let nulls = mutable.null_buffer.as_mut().expect("expected null buffer"); | ||||||
| let start = array.offset() + start; | ||||||
| mutable.null_count += nulls.append_packed_range(start..start + len, bytes); | ||||||
| }) | ||||||
| } else if use_nulls { | ||||||
| } else { | ||||||
| Box::new(|mutable, _, len| { | ||||||
| utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); | ||||||
| let write_data = mutable.null_buffer.as_slice_mut(); | ||||||
| let offset = mutable.len; | ||||||
| (0..len).for_each(|i| { | ||||||
| bit_util::set_bit(write_data, offset + i); | ||||||
| }); | ||||||
| let nulls = mutable.null_buffer.as_mut().expect("expected null buffer"); | ||||||
| nulls.append_n(len, true) | ||||||
| }) | ||||||
| } else { | ||||||
| Box::new(|_, _, _| {}) | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -141,27 +136,30 @@ fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits | |||||
| /// ``` | ||||||
| pub struct MutableArrayData<'a> { | ||||||
| arrays: Vec<&'a ArrayData>, | ||||||
| // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to | ||||||
| // mutability invariants (interior mutability): | ||||||
| // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not | ||||||
| // [MutableArrayData] itself | ||||||
| /// The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to | ||||||
| /// mutability invariants (interior mutability): | ||||||
| /// [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not | ||||||
| /// [MutableArrayData] itself | ||||||
| data: _MutableArrayData<'a>, | ||||||
|
|
||||||
| // the child data of the `Array` in Dictionary arrays. | ||||||
| // This is not stored in `MutableArrayData` because these values constant and only needed | ||||||
| // at the end, when freezing [_MutableArrayData]. | ||||||
| /// the child data of the `Array` in Dictionary arrays. | ||||||
| /// This is not stored in `MutableArrayData` because these values constant and only needed | ||||||
| /// at the end, when freezing [_MutableArrayData]. | ||||||
| dictionary: Option<ArrayData>, | ||||||
|
|
||||||
| // function used to extend values from arrays. This function's lifetime is bound to the array | ||||||
| // because it reads values from it. | ||||||
| /// function used to extend values from arrays. This function's lifetime is bound to the array | ||||||
| /// because it reads values from it. | ||||||
| extend_values: Vec<Extend<'a>>, | ||||||
| // function used to extend nulls from arrays. This function's lifetime is bound to the array | ||||||
| // because it reads nulls from it. | ||||||
|
|
||||||
| /// function used to extend nulls from arrays. This function's lifetime is bound to the array | ||||||
| /// because it reads nulls from it. | ||||||
| extend_null_bits: Vec<ExtendNullBits<'a>>, | ||||||
|
|
||||||
| // function used to extend nulls. | ||||||
| // this is independent of the arrays and therefore has no lifetime. | ||||||
| extend_nulls: ExtendNulls, | ||||||
| /// function used to extend nulls values | ||||||
| /// this is independent of the arrays and therefore has no lifetime. | ||||||
| /// | ||||||
| /// Note: this does not extend the null bitmask | ||||||
| extend_nulls: Option<ExtendNulls>, | ||||||
| } | ||||||
|
|
||||||
| impl<'a> std::fmt::Debug for MutableArrayData<'a> { | ||||||
|
|
@@ -377,11 +375,14 @@ impl<'a> MutableArrayData<'a> { | |||||
| /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an | ||||||
| /// [ArrayData] from multiple `arrays`. | ||||||
| /// | ||||||
| /// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls | ||||||
| /// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls]. | ||||||
| /// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used. | ||||||
| pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { | ||||||
| Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) | ||||||
| /// `compute_nulls` is a flag used to optimize insertions, if `compute_nulls` is `true` a null | ||||||
| /// bitmap will be created regardless of the contents of `arrays`, otherwise a null bitmap will | ||||||
| /// be computed only if `arrays` contains nulls. | ||||||
| /// | ||||||
| /// Code that plans to call [MutableArrayData::extend_nulls] MUST set `compute_nulls` to `true`, | ||||||
| /// in order to ensure that a null bitmap is computed. | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it will panic otherwise, right? Rather than get undefined behavior? |
||||||
| pub fn new(arrays: Vec<&'a ArrayData>, compute_nulls: bool, capacity: usize) -> Self { | ||||||
| Self::with_capacities(arrays, compute_nulls, Capacities::Array(capacity)) | ||||||
| } | ||||||
|
|
||||||
| /// Similar to [MutableArray::new], but lets users define the preallocated capacities of the array. | ||||||
|
|
@@ -392,7 +393,7 @@ impl<'a> MutableArrayData<'a> { | |||||
| /// a [Capacities] variant is not yet supported. | ||||||
| pub fn with_capacities( | ||||||
| arrays: Vec<&'a ArrayData>, | ||||||
| mut use_nulls: bool, | ||||||
| mut compute_nulls: bool, | ||||||
| capacities: Capacities, | ||||||
| ) -> Self { | ||||||
| let data_type = arrays[0].data_type(); | ||||||
|
|
@@ -401,7 +402,7 @@ impl<'a> MutableArrayData<'a> { | |||||
| // if any of the arrays has nulls, insertions from any array requires setting bits | ||||||
| // as there is at least one array with nulls. | ||||||
| if arrays.iter().any(|array| array.null_count() > 0) { | ||||||
| use_nulls = true; | ||||||
| compute_nulls = true; | ||||||
| }; | ||||||
|
|
||||||
| let mut array_capacity; | ||||||
|
|
@@ -470,7 +471,9 @@ impl<'a> MutableArrayData<'a> { | |||||
| }; | ||||||
|
|
||||||
| vec![MutableArrayData::with_capacities( | ||||||
| childs, use_nulls, capacities, | ||||||
| childs, | ||||||
| compute_nulls, | ||||||
| capacities, | ||||||
| )] | ||||||
| } | ||||||
| // the dictionary type just appends keys and clones the values. | ||||||
|
|
@@ -487,7 +490,7 @@ impl<'a> MutableArrayData<'a> { | |||||
| .collect::<Vec<_>>(); | ||||||
| MutableArrayData::with_capacities( | ||||||
| child_arrays, | ||||||
| use_nulls, | ||||||
| compute_nulls, | ||||||
| child_cap.clone(), | ||||||
| ) | ||||||
| }) | ||||||
|
|
@@ -501,7 +504,7 @@ impl<'a> MutableArrayData<'a> { | |||||
| .iter() | ||||||
| .map(|array| &array.child_data()[i]) | ||||||
| .collect::<Vec<_>>(); | ||||||
| MutableArrayData::new(child_arrays, use_nulls, capacity) | ||||||
| MutableArrayData::new(child_arrays, compute_nulls, capacity) | ||||||
| }) | ||||||
| .collect::<Vec<_>>() | ||||||
| } | ||||||
|
|
@@ -511,7 +514,7 @@ impl<'a> MutableArrayData<'a> { | |||||
| .iter() | ||||||
| .map(|array| &array.child_data()[i]) | ||||||
| .collect::<Vec<_>>(); | ||||||
| MutableArrayData::new(child_arrays, use_nulls, array_capacity) | ||||||
| MutableArrayData::new(child_arrays, compute_nulls, array_capacity) | ||||||
| }) | ||||||
| .collect::<Vec<_>>(), | ||||||
| }, | ||||||
|
|
@@ -556,19 +559,20 @@ impl<'a> MutableArrayData<'a> { | |||||
| _ => (None, false), | ||||||
| }; | ||||||
|
|
||||||
| let extend_nulls = build_extend_nulls(data_type); | ||||||
|
|
||||||
| let extend_null_bits = arrays | ||||||
| .iter() | ||||||
| .map(|array| build_extend_null_bits(array, use_nulls)) | ||||||
| .collect(); | ||||||
|
|
||||||
| let null_buffer = if use_nulls { | ||||||
| let null_bytes = bit_util::ceil(array_capacity, 8); | ||||||
| MutableBuffer::from_len_zeroed(null_bytes) | ||||||
| let (null_buffer, extend_nulls, extend_null_bits) = if compute_nulls { | ||||||
| let extend_null_bits = arrays | ||||||
| .iter() | ||||||
| .map(|array| build_extend_null_bits(array)) | ||||||
| .collect(); | ||||||
|
|
||||||
| ( | ||||||
| Some(BooleanBufferBuilder::new(array_capacity)), | ||||||
| Some(build_extend_nulls(data_type)), | ||||||
| extend_null_bits, | ||||||
| ) | ||||||
| } else { | ||||||
| // create 0 capacity mutable buffer with the intention that it won't be used | ||||||
| MutableBuffer::with_capacity(0) | ||||||
| // create no null buffer and no extend_null_bits | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice 👍 |
||||||
| (None, None, vec![]) | ||||||
| }; | ||||||
|
|
||||||
| let extend_values = match &data_type { | ||||||
|
|
@@ -619,18 +623,29 @@ impl<'a> MutableArrayData<'a> { | |||||
| /// This function panics if the range is out of bounds, i.e. if `start + len >= array.len()`. | ||||||
| pub fn extend(&mut self, index: usize, start: usize, end: usize) { | ||||||
| let len = end - start; | ||||||
| (self.extend_null_bits[index])(&mut self.data, start, len); | ||||||
| if !self.extend_null_bits.is_empty() { | ||||||
| (self.extend_null_bits[index])(&mut self.data, start, len); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the name of the
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looking at There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes so if it is true it cannot assume that a bitmask won't be needed due to a call to extend_nulls. Otherwise if it is false and the arrays don't contain nulls, it knows it doesn't need to compute a null bitmask? At least that was my reading of it, although the fact you can call extend_nulls having specified use_nulls as false and get something other than a panic suggests maybe I'm missing something 😅 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This is a good point, the particular case the filter benchmarks hit is where the array has a null bitmap, but a zero null count. So an alternative fix might be to fix There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. my reading of the code (and the little documentation that accompanies it) is that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would agree with you if it weren't for this code Effectively it changes meaning part way through 😅 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I agree - this needs clarification or fixing, to make it less confusing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe @tustvold has resolved this in the latest version of this PR -- by making it clear that |
||||||
| } | ||||||
| (self.extend_values[index])(&mut self.data, index, start, len); | ||||||
| self.data.len += len; | ||||||
| } | ||||||
|
|
||||||
| /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays | ||||||
| /// | ||||||
| /// # Panic | ||||||
| /// This function panics if [`MutableArrayData`] is not computing nulls | ||||||
| pub fn extend_nulls(&mut self, len: usize) { | ||||||
| // TODO: null_buffer should probably be extended here as well | ||||||
| // otherwise is_valid() could later panic | ||||||
| // add test to confirm | ||||||
| let nulls = self.data.null_buffer.as_mut().expect( | ||||||
| "Cannot append nulls to MutableArrayData created with nulls disabled", | ||||||
| ); | ||||||
| let extend_nulls = self.extend_nulls.as_mut().expect("extend nulls"); | ||||||
|
|
||||||
| // Extend null bitmap | ||||||
| nulls.append_n(len, false); | ||||||
| self.data.null_count += len; | ||||||
| (self.extend_nulls)(&mut self.data, len); | ||||||
|
|
||||||
| // Extend values | ||||||
| extend_nulls(&mut self.data, len); | ||||||
| self.data.len += len; | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -1452,6 +1467,30 @@ mod tests { | |||||
| assert_eq!(&result, expected.data()); | ||||||
| } | ||||||
|
|
||||||
| #[test] | ||||||
| #[should_panic( | ||||||
| expected = "Cannot append nulls to MutableArrayData created with nulls disabled" | ||||||
| )] | ||||||
| fn test_disabled_nulls() { | ||||||
| let ints: ArrayRef = | ||||||
| Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(4), Some(5)])); | ||||||
|
|
||||||
| let mut data = MutableArrayData::new(vec![ints.data()], false, 3); | ||||||
| data.extend(0, 1, 2); | ||||||
| data.extend_nulls(1); | ||||||
| } | ||||||
|
|
||||||
| #[test] | ||||||
| fn test_nulls() { | ||||||
| let ints: ArrayRef = Arc::new(Int32Array::from(vec![1])); | ||||||
| let mut data = MutableArrayData::new(vec![ints.data()], true, 3); | ||||||
| data.extend_nulls(9); | ||||||
| let data = data.freeze(); | ||||||
|
|
||||||
| assert_eq!(data.len(), 9); | ||||||
| assert_eq!(data.null_buffer().unwrap().len(), 2); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 2 because there are two bytes needed to hold the bitmap? |
||||||
| } | ||||||
|
|
||||||
| /* | ||||||
| // this is an old test used on a meanwhile removed dead code | ||||||
| // that is still useful when `MutableArrayData` supports fixed-size lists. | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than returning a no-op if use_nulls is false, simply don't construct. An ExtendNullBits that does nothing feels like a potential footgun that is better to handle in a way that will fail loudly