Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions arrow/src/array/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,10 +412,12 @@ impl BooleanBufferBuilder {
///
/// `to_set` is a slice of bits packed LSB-first into `[u8]`
///
/// Returns the number of `0` bits written
///
/// # Panics
///
/// Panics if `to_set` does not contain `ceil(range.end / 8)` bytes
pub fn append_packed_range(&mut self, range: Range<usize>, to_set: &[u8]) {
pub fn append_packed_range(&mut self, range: Range<usize>, to_set: &[u8]) -> usize {
let offset_write = self.len;
let len = range.end - range.start;
self.advance(len);
Expand All @@ -425,7 +427,7 @@ impl BooleanBufferBuilder {
offset_write,
range.start,
len,
);
)
}

/// Returns the packed bits
Expand Down
171 changes: 105 additions & 66 deletions arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@ use super::{
data::{into_buffers, new_buffers},
ArrayData, ArrayDataBuilder,
};
use crate::array::StringOffsetSizeTrait;
use crate::array::{BooleanBufferBuilder, StringOffsetSizeTrait};
use crate::{
buffer::MutableBuffer,
datatypes::DataType,
error::{ArrowError, Result},
util::bit_util,
};
use half::f16;
use std::mem;
Expand Down Expand Up @@ -53,7 +52,7 @@ struct _MutableArrayData<'a> {
pub null_count: usize,

pub len: usize,
pub null_buffer: MutableBuffer,
pub null_buffer: Option<BooleanBufferBuilder>,

// arrow specification only allows up to 3 buffers (2 ignoring the nulls above).
// Thus, we place them in the stack to avoid bound checks and greater data locality.
Expand Down Expand Up @@ -83,39 +82,35 @@ impl<'a> _MutableArrayData<'a> {
.null_count(self.null_count)
.buffers(buffers)
.child_data(child_data);
if self.null_count > 0 {
array_data_builder =
array_data_builder.null_bit_buffer(self.null_buffer.into());

if let Some(mut null_buffer) = self.null_buffer {
assert_eq!(null_buffer.len(), self.len, "Inconsistent bitmap");
if self.null_count > 0 {
array_data_builder =
array_data_builder.null_bit_buffer(null_buffer.finish());
}
} else {
assert_eq!(self.null_count, 0, "Non-zero null count but no null bitmap")
}

array_data_builder
}
}

fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than returning a no-op if use_nulls is false, simply don't construct. An ExtendNullBits that does nothing feels like a potential footgun that is better to handle in a way that will fail loudly

/// Build a function to extend a `MutableArrayData` with a slice of the null bitmap of `array`
fn build_extend_null_bits(array: &ArrayData) -> ExtendNullBits {
if let Some(bitmap) = array.null_bitmap() {
let bytes = bitmap.bits.as_slice();
Box::new(move |mutable, start, len| {
utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len);
mutable.null_count += crate::util::bit_mask::set_bits(
mutable.null_buffer.as_slice_mut(),
bytes,
mutable.len,
array.offset() + start,
len,
);
let nulls = mutable.null_buffer.as_mut().expect("expected null buffer");
let start = array.offset() + start;
mutable.null_count += nulls.append_packed_range(start..start + len, bytes);
})
} else if use_nulls {
} else {
Box::new(|mutable, _, len| {
utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len);
let write_data = mutable.null_buffer.as_slice_mut();
let offset = mutable.len;
(0..len).for_each(|i| {
bit_util::set_bit(write_data, offset + i);
});
let nulls = mutable.null_buffer.as_mut().expect("expected null buffer");
nulls.append_n(len, true)
})
} else {
Box::new(|_, _, _| {})
}
}

Expand All @@ -141,27 +136,30 @@ fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits
/// ```
pub struct MutableArrayData<'a> {
arrays: Vec<&'a ArrayData>,
// The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to
// mutability invariants (interior mutability):
// [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not
// [MutableArrayData] itself
/// The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to
/// mutability invariants (interior mutability):
/// [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not
/// [MutableArrayData] itself
data: _MutableArrayData<'a>,

// the child data of the `Array` in Dictionary arrays.
// This is not stored in `MutableArrayData` because these values constant and only needed
// at the end, when freezing [_MutableArrayData].
/// the child data of the `Array` in Dictionary arrays.
/// This is not stored in `MutableArrayData` because these values constant and only needed
/// at the end, when freezing [_MutableArrayData].
dictionary: Option<ArrayData>,

// function used to extend values from arrays. This function's lifetime is bound to the array
// because it reads values from it.
/// function used to extend values from arrays. This function's lifetime is bound to the array
/// because it reads values from it.
extend_values: Vec<Extend<'a>>,
// function used to extend nulls from arrays. This function's lifetime is bound to the array
// because it reads nulls from it.

/// function used to extend nulls from arrays. This function's lifetime is bound to the array
/// because it reads nulls from it.
extend_null_bits: Vec<ExtendNullBits<'a>>,

// function used to extend nulls.
// this is independent of the arrays and therefore has no lifetime.
extend_nulls: ExtendNulls,
/// function used to extend nulls values
/// this is independent of the arrays and therefore has no lifetime.
///
/// Note: this does not extend the null bitmask
extend_nulls: Option<ExtendNulls>,
}

impl<'a> std::fmt::Debug for MutableArrayData<'a> {
Expand Down Expand Up @@ -377,11 +375,14 @@ impl<'a> MutableArrayData<'a> {
/// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an
/// [ArrayData] from multiple `arrays`.
///
/// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls
/// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls].
/// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used.
pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self {
Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity))
/// `compute_nulls` is a flag used to optimize insertions, if `compute_nulls` is `true` a null
/// bitmap will be created regardless of the contents of `arrays`, otherwise a null bitmap will
/// be computed only if `arrays` contains nulls.
///
/// Code that plans to call [MutableArrayData::extend_nulls] MUST set `compute_nulls` to `true`,
/// in order to ensure that a null bitmap is computed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// in order to ensure that a null bitmap is computed.
/// in order to ensure that a null bitmap is computed, otherwise a panic will result.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it will panic otherwise, right? Rather than get undefined behavior?

pub fn new(arrays: Vec<&'a ArrayData>, compute_nulls: bool, capacity: usize) -> Self {
Self::with_capacities(arrays, compute_nulls, Capacities::Array(capacity))
}

/// Similar to [MutableArray::new], but lets users define the preallocated capacities of the array.
Expand All @@ -392,7 +393,7 @@ impl<'a> MutableArrayData<'a> {
/// a [Capacities] variant is not yet supported.
pub fn with_capacities(
arrays: Vec<&'a ArrayData>,
mut use_nulls: bool,
mut compute_nulls: bool,
capacities: Capacities,
) -> Self {
let data_type = arrays[0].data_type();
Expand All @@ -401,7 +402,7 @@ impl<'a> MutableArrayData<'a> {
// if any of the arrays has nulls, insertions from any array requires setting bits
// as there is at least one array with nulls.
if arrays.iter().any(|array| array.null_count() > 0) {
use_nulls = true;
compute_nulls = true;
};

let mut array_capacity;
Expand Down Expand Up @@ -470,7 +471,9 @@ impl<'a> MutableArrayData<'a> {
};

vec![MutableArrayData::with_capacities(
childs, use_nulls, capacities,
childs,
compute_nulls,
capacities,
)]
}
// the dictionary type just appends keys and clones the values.
Expand All @@ -487,7 +490,7 @@ impl<'a> MutableArrayData<'a> {
.collect::<Vec<_>>();
MutableArrayData::with_capacities(
child_arrays,
use_nulls,
compute_nulls,
child_cap.clone(),
)
})
Expand All @@ -501,7 +504,7 @@ impl<'a> MutableArrayData<'a> {
.iter()
.map(|array| &array.child_data()[i])
.collect::<Vec<_>>();
MutableArrayData::new(child_arrays, use_nulls, capacity)
MutableArrayData::new(child_arrays, compute_nulls, capacity)
})
.collect::<Vec<_>>()
}
Expand All @@ -511,7 +514,7 @@ impl<'a> MutableArrayData<'a> {
.iter()
.map(|array| &array.child_data()[i])
.collect::<Vec<_>>();
MutableArrayData::new(child_arrays, use_nulls, array_capacity)
MutableArrayData::new(child_arrays, compute_nulls, array_capacity)
})
.collect::<Vec<_>>(),
},
Expand Down Expand Up @@ -556,19 +559,20 @@ impl<'a> MutableArrayData<'a> {
_ => (None, false),
};

let extend_nulls = build_extend_nulls(data_type);

let extend_null_bits = arrays
.iter()
.map(|array| build_extend_null_bits(array, use_nulls))
.collect();

let null_buffer = if use_nulls {
let null_bytes = bit_util::ceil(array_capacity, 8);
MutableBuffer::from_len_zeroed(null_bytes)
let (null_buffer, extend_nulls, extend_null_bits) = if compute_nulls {
let extend_null_bits = arrays
.iter()
.map(|array| build_extend_null_bits(array))
.collect();

(
Some(BooleanBufferBuilder::new(array_capacity)),
Some(build_extend_nulls(data_type)),
extend_null_bits,
)
} else {
// create 0 capacity mutable buffer with the intention that it won't be used
MutableBuffer::with_capacity(0)
// create no null buffer and no extend_null_bits
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice 👍

(None, None, vec![])
};

let extend_values = match &data_type {
Expand Down Expand Up @@ -619,18 +623,29 @@ impl<'a> MutableArrayData<'a> {
/// This function panics if the range is out of bounds, i.e. if `start + len >= array.len()`.
pub fn extend(&mut self, index: usize, start: usize, end: usize) {
let len = end - start;
(self.extend_null_bits[index])(&mut self.data, start, len);
if !self.extend_null_bits.is_empty() {
(self.extend_null_bits[index])(&mut self.data, start, len);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the name of the use_nulls parameter is confusing and is not necessarily related to whether nulls are present or not, here is an excerpt of the method's doc comment:

use_nulls is a flag used to optimize insertions. It should be false if the only source of nulls are the arrays themselves

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looking at build_extend_null_bits this is already optimized by returning a no-op function ( Box::new(|_, _, _| {}) ) if use_nulls is false and the array doesn't have a null bitmap

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes so if it is true it cannot assume that a bitmask won't be needed due to a call to extend_nulls. Otherwise if it is false and the arrays don't contain nulls, it knows it doesn't need to compute a null bitmask?

At least that was my reading of it, although the fact you can call extend_nulls having specified use_nulls as false and get something other than a panic suggests maybe I'm missing something 😅

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no-op function ( Box::new(|_, _, _| {}) ) if use_nulls is false and the array doesn't have a null bitmap

This is a good point, the particular case the filter benchmarks hit is where the array has a null bitmap, but a zero null count. So an alternative fix might be to fix extend_null_bits. Although now that I think about it, I'm not sure that no-op function is correct in the event of mixed array nullability 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my reading of the code (and the little documentation that accompanies it) is that use_nulls is used to determine where the nulls will be coming from, not whether nulls are present or not

Copy link
Contributor Author

@tustvold tustvold Jan 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would agree with you if it weren't for this code

// if any of the arrays has nulls, insertions from any array requires setting bits
// as there is at least one array with nulls.
if arrays.iter().any(|array| array.null_count() > 0) {
    use_nulls = true;
};

Effectively it changes meaning part way through 😅

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I agree - this needs clarification or fixing, to make it less confusing

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe @tustvold has resolved this in the latest version of this PR -- by making it clear that use_nulls must be set to true otherwise a panic will result if the buffer is extended with nulls

}
(self.extend_values[index])(&mut self.data, index, start, len);
self.data.len += len;
}

/// Extends this [MutableArrayData] with null elements, disregarding the bound arrays
///
/// # Panic
/// This function panics if [`MutableArrayData`] is not computing nulls
pub fn extend_nulls(&mut self, len: usize) {
// TODO: null_buffer should probably be extended here as well
// otherwise is_valid() could later panic
// add test to confirm
let nulls = self.data.null_buffer.as_mut().expect(
"Cannot append nulls to MutableArrayData created with nulls disabled",
);
let extend_nulls = self.extend_nulls.as_mut().expect("extend nulls");

// Extend null bitmap
nulls.append_n(len, false);
self.data.null_count += len;
(self.extend_nulls)(&mut self.data, len);

// Extend values
extend_nulls(&mut self.data, len);
self.data.len += len;
}

Expand Down Expand Up @@ -1452,6 +1467,30 @@ mod tests {
assert_eq!(&result, expected.data());
}

#[test]
#[should_panic(
expected = "Cannot append nulls to MutableArrayData created with nulls disabled"
)]
fn test_disabled_nulls() {
let ints: ArrayRef =
Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(4), Some(5)]));

let mut data = MutableArrayData::new(vec![ints.data()], false, 3);
data.extend(0, 1, 2);
data.extend_nulls(1);
}

#[test]
fn test_nulls() {
let ints: ArrayRef = Arc::new(Int32Array::from(vec![1]));
let mut data = MutableArrayData::new(vec![ints.data()], true, 3);
data.extend_nulls(9);
let data = data.freeze();

assert_eq!(data.len(), 9);
assert_eq!(data.null_buffer().unwrap().len(), 2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2 because there are two bytes needed to hold the bitmap?

}

/*
// this is an old test used on a meanwhile removed dead code
// that is still useful when `MutableArrayData` supports fixed-size lists.
Expand Down