Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved filter performance via SIMD selection [3x] #871

Merged
merged 2 commits into from
Feb 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ target-tarpaulin
venv
lcov.info
Cargo.lock
example.arrow
fixtures
settings.json
dev/
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ harness = false
name = "comparison_kernels"
harness = false


Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A blank line slipped in

[[bench]]
jorgecarleitao marked this conversation as resolved.
Show resolved Hide resolved
name = "read_parquet"
harness = false
Expand Down
9 changes: 9 additions & 0 deletions benches/filter_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,21 @@ fn add_benchmark(c: &mut Criterion) {
});

let data_array = create_primitive_array::<f32>(size, 0.5);
let data_array_nonull = create_primitive_array::<f32>(size, 0.0);
c.bench_function("filter f32", |b| {
b.iter(|| bench_filter(&data_array, &filter_array))
});
c.bench_function("filter f32 high selectivity", |b| {
b.iter(|| bench_filter(&data_array, &dense_filter_array))
});

c.bench_function("filter f32 nonull", |b| {
b.iter(|| bench_filter(&data_array_nonull, &filter_array))
});
c.bench_function("filter f32 nonull high selectivity", |b| {
b.iter(|| bench_filter(&data_array_nonull, &dense_filter_array))
});

c.bench_function("filter context f32", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand Down
57 changes: 37 additions & 20 deletions src/compute/filter.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
//! Contains operators to filter arrays such as [`filter`].
use crate::array::growable::{make_growable, Growable};
use crate::bitmap::utils::{BitChunkIterExact, BitChunksExact};
use crate::bitmap::utils::{BitChunk, BitChunkIterExact, BitChunksExact};
use crate::bitmap::{utils::SlicesIterator, Bitmap, MutableBitmap};
use crate::chunk::Chunk;
use crate::datatypes::DataType;
use crate::error::Result;
use crate::types::simd::{NativeSimd, Simd};
use crate::types::BitChunkIter;
use crate::types::BitChunkOnes;
use crate::{array::*, types::NativeType};
use num_traits::One;
use num_traits::Zero;

/// Function that can filter arbitrary arrays
pub type Filter<'a> = Box<dyn Fn(&dyn Array) -> Box<dyn Array> + 'a + Send + Sync>;
Expand All @@ -21,20 +23,25 @@ where
I: BitChunkIterExact<<<T as Simd>::Simd as NativeSimd>::Chunk>,
{
let mut chunks = values.chunks_exact(T::Simd::LANES);

let mut new = Vec::<T>::with_capacity(filter_count);
let mut dst = new.as_mut_ptr();
chunks
.by_ref()
.zip(mask_chunks.by_ref())
.for_each(|(chunk, validity_chunk)| {
let iter = BitChunkIter::new(validity_chunk, T::Simd::LANES);
for (value, b) in chunk.iter().zip(iter) {
if b {
unsafe {
dst.write(*value);
dst = dst.add(1);
};
let ones_iter = BitChunkOnes::new(validity_chunk);

let (size, _) = ones_iter.size_hint();
if size == T::Simd::LANES {
// Fast path: all lanes are set
unsafe {
std::ptr::copy(chunk.as_ptr(), dst, size);
dst = dst.add(size);
}
} else {
for pos in ones_iter {
dst.write(chunk[pos]);
dst = dst.add(1);
}
}
});
Expand Down Expand Up @@ -74,22 +81,32 @@ where
let mut validity_chunks = validity.chunks::<<T::Simd as NativeSimd>::Chunk>();

let mut new = Vec::<T>::with_capacity(filter_count);
let mut new_validity = MutableBitmap::with_capacity(filter_count);
let mut dst = new.as_mut_ptr();
let mut new_validity = MutableBitmap::with_capacity(filter_count);

chunks
.by_ref()
.zip(validity_chunks.by_ref())
.zip(mask_chunks.by_ref())
.for_each(|((chunk, validity_chunk), mask_chunk)| {
let mask_iter = BitChunkIter::new(mask_chunk, T::Simd::LANES);
let validity_iter = BitChunkIter::new(validity_chunk, T::Simd::LANES);
for ((value, is_valid), is_selected) in chunk.iter().zip(validity_iter).zip(mask_iter) {
if is_selected {
unsafe {
dst.write(*value);
dst = dst.add(1);
new_validity.push_unchecked(is_valid);
};
let ones_iter = BitChunkOnes::new(mask_chunk);
let (size, _) = ones_iter.size_hint();

if size == T::Simd::LANES {
// Fast path: all lanes are set
unsafe {
std::ptr::copy(chunk.as_ptr(), dst, size);
dst = dst.add(size);
new_validity.extend_from_slice(validity_chunk.to_ne_bytes().as_ref(), 0, size);
}
} else {
for pos in ones_iter {
dst.write(chunk[pos]);
dst = dst.add(1);
new_validity.push(
validity_chunk & (<<<T as Simd>::Simd as NativeSimd>::Chunk>::one() << pos)
> <<<T as Simd>::Simd as NativeSimd>::Chunk>::zero(),
);
}
}
});
Expand Down
88 changes: 70 additions & 18 deletions src/types/bit_chunk.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,26 @@
use std::{
fmt::Binary,
ops::{BitAnd, BitAndAssign, BitOr, Not, Shl, ShlAssign, ShrAssign},
ops::{BitAndAssign, Not, Shl, ShlAssign, ShrAssign},
};

use num_traits::PrimInt;

use super::NativeType;

/// A chunk of bits. This is used to create masks of a given length
/// whose width is `1` bit. In `simd_packed` notation, this corresponds to `m1xY`.
pub trait BitChunk:
super::private::Sealed
+ PrimInt
+ NativeType
+ Binary
+ BitAnd<Output = Self>
+ ShlAssign
+ Not<Output = Self>
+ ShrAssign<usize>
+ ShlAssign<usize>
+ Shl<usize, Output = Self>
+ Eq
+ BitAndAssign
+ BitOr<Output = Self>
{
/// A value with a single bit set at the most right position.
fn one() -> Self;
/// A value with no bits set.
fn zero() -> Self;
/// convert itself into bytes.
fn to_ne_bytes(self) -> Self::Bytes;
/// convert itself from bytes.
Expand All @@ -34,11 +30,6 @@ pub trait BitChunk:
macro_rules! bit_chunk {
($ty:ty) => {
impl BitChunk for $ty {
#[inline(always)]
fn zero() -> Self {
0
}

#[inline(always)]
fn to_ne_bytes(self) -> Self::Bytes {
self.to_ne_bytes()
Expand All @@ -48,11 +39,6 @@ macro_rules! bit_chunk {
fn from_ne_bytes(v: Self::Bytes) -> Self {
Self::from_ne_bytes(v)
}

#[inline(always)]
fn one() -> Self {
1
}
}
};
}
Expand Down Expand Up @@ -113,6 +99,62 @@ impl<T: BitChunk> Iterator for BitChunkIter<T> {
}
}

// # Safety
// a mathematical invariant of this iterator
unsafe impl<T: BitChunk> crate::trusted_len::TrustedLen for BitChunkIter<T> {}

/// An [`Iterator<Item=usize>`] over a [`BitChunk`].
/// This iterator returns the positions of the set bits, from least to most significant.
/// Reference: <https://lemire.me/blog/2018/03/08/iterating-over-set-bits-quickly-simd-edition/>
/// # Example
/// ```
/// use arrow2::types::BitChunkOnes;
/// let a = 0b00010000u8;
/// let iter = BitChunkOnes::new(a);
/// let r = iter.collect::<Vec<_>>();
/// assert_eq!(r, vec![4]);
/// ```
pub struct BitChunkOnes<T: BitChunk> {
jorgecarleitao marked this conversation as resolved.
Show resolved Hide resolved
value: T,
remaining: usize,
}

impl<T: BitChunk> BitChunkOnes<T> {
    /// Creates a new [`BitChunkOnes`] that iterates over the set bits of `value`.
    ///
    /// The number of set bits is counted once here, so the iterator knows its
    /// exact length up front.
    #[inline]
    pub fn new(value: T) -> Self {
        let remaining = value.count_ones() as usize;
        Self { value, remaining }
    }
}

impl<T: BitChunk> Iterator for BitChunkOnes<T> {
    type Item = usize;

    /// Returns the position of the lowest bit still set, or `None` once every
    /// set bit has been yielded.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // `checked_sub` fails exactly when no set bits remain.
        let left = self.remaining.checked_sub(1)?;

        // The lowest set bit sits right after the run of trailing zeros.
        let position = self.value.trailing_zeros() as usize;

        // `x & (x - 1)` clears the lowest set bit; `value` is non-zero here
        // because `remaining` was non-zero, so the subtraction cannot underflow.
        self.value &= self.value - T::one();
        self.remaining = left;

        Some(position)
    }

    /// The exact number of positions still to be yielded.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let left = self.remaining;
        (left, Some(left))
    }
}
jorgecarleitao marked this conversation as resolved.
Show resolved Hide resolved

// # Safety
// `size_hint` reports `(remaining, Some(remaining))`, where `remaining` starts as the
// number of set bits counted in `new` and is decremented once per item yielded by
// `next`, so the iterator yields exactly the advertised number of items.
unsafe impl<T: BitChunk> crate::trusted_len::TrustedLen for BitChunkOnes<T> {}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -125,4 +167,14 @@ mod tests {
let r = iter.collect::<Vec<_>>();
assert_eq!(r, (0..16).map(|x| x == 0 || x == 12).collect::<Vec<_>>(),);
}

#[test]
fn test_ones() {
let a = [0b00000001, 0b00010000]; // 0th and 13th entry
let a = u16::from_ne_bytes(a);
let mut iter = BitChunkOnes::new(a);
jorgecarleitao marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!(iter.size_hint(), (2, Some(2)));
assert_eq!(iter.next(), Some(0));
assert_eq!(iter.next(), Some(12));
}
}
2 changes: 1 addition & 1 deletion src/types/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
//! for SIMD, at [`mod@simd`].

mod bit_chunk;
pub use bit_chunk::{BitChunk, BitChunkIter};
pub use bit_chunk::{BitChunk, BitChunkIter, BitChunkOnes};
mod index;
pub mod simd;
pub use index::*;
Expand Down