Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions vortex-btrblocks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ name = "compress"
harness = false
test = false

[[bench]]
name = "compress_listview"
harness = false
test = false

[[bench]]
name = "dict_encode"
harness = false
Expand Down
190 changes: 190 additions & 0 deletions vortex-btrblocks/benches/compress_listview.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]
#![allow(unexpected_cfgs)]

#[cfg(not(codspeed))]
mod benchmarks {
use divan::Bencher;
use divan::counter::BytesCount;
use divan::counter::ItemsCount;
use rand::Rng;
use rand::SeedableRng;
use rand::prelude::StdRng;
use vortex_array::ArrayRef;
use vortex_array::IntoArray;
use vortex_array::arrays::ListViewArray;
use vortex_array::arrays::StructArray;
use vortex_array::arrays::VarBinViewArray;
use vortex_array::validity::Validity;
use vortex_btrblocks::BtrBlocksCompressor;
use vortex_buffer::buffer_mut;
use vortex_dtype::FieldNames;

/// Number of outer list rows in the benchmarked array; also reported to divan as the
/// per-iteration item count.
const NUM_ROWS: usize = 8192;
/// Fixed seed for the data-generating RNG, so every run builds the same input array.
const SEED: u64 = 42;

/// Pool of ten short identifier-like strings. `random_str_array` picks from this pool at
/// random (with repetition) to fill the `str_a` field of the innermost struct.
const SHORT_STRINGS: &[&str] = &[
    "alpha_one",
    "bravo_two",
    "charlie_three",
    "delta_four",
    "echo_five",
    "foxtrot_six",
    "golf_seven",
    "hotel_eight",
    "india_nine",
    "juliet_ten",
];

/// Pool of ten longer, path-like strings, several of which share a common prefix.
/// `random_str_array` picks from this pool at random (with repetition) to fill the
/// `str_b` and `str_c` fields.
const LONG_STRINGS: &[&str] = &[
    "/path/to/some/deeply/nested/resource_a",
    "/path/to/some/deeply/nested/resource_b",
    "/path/to/another/location/item_c",
    "/data/archive/2024/collection/entry_d",
    "/data/archive/2024/collection/entry_e",
    "/workspace/project/src/module_f",
    "/workspace/project/src/module_g",
    "/workspace/project/test/fixture_h",
    "/var/log/service/output_i",
    "/tmp/scratch/workspace/temp_j",
];

/// Wrap `elements` into a `ListViewArray` whose entry lengths are driven by `counts`.
///
/// The base offset of entry `i` is the running sum of `counts[..i]`.
///
/// * When `zctl` is true, entries are laid out back to back (sorted, non-overlapping) and
///   the result is flagged as zero-copy-to-list.
/// * Otherwise, every entry that starts past element 0 is extended backwards by one
///   element, so it shares that element with the entry before it:
///
/// ```text
/// elements: [a, b, c, d, e, f, g, h, i]     counts  = [3, 3, 3]
/// row 0:     ├─────┤                        offsets = [0, 2, 5]
/// row 1:           ├────────┤               sizes   = [3, 4, 4]
/// row 2:                    ├────────┤
///                  ^        ^
///                shared   shared
/// ```
///
/// # Panics
///
/// Panics if any single count, or the running total of all counts, does not fit in `u32`.
fn wrap_listview(elements: ArrayRef, counts: &[usize], zctl: bool) -> ListViewArray {
    let mut offsets = buffer_mut![0u32; counts.len()];
    let mut sizes = buffer_mut![0u32; counts.len()];
    let mut offset = 0u32;

    for (i, &count) in counts.iter().enumerate() {
        // Checked narrowing and addition: a silently truncated count or a wrapped running
        // offset (benches are built in release mode, where `+=` wraps) would produce
        // unsorted offsets and invalidate the zero-copy claim made in the `unsafe` block
        // below. Fail loudly instead.
        let count = u32::try_from(count).unwrap();
        let next_offset = offset.checked_add(count).unwrap();

        // In the overlapping layout, reach back one element into the previous entry.
        // `offset > 0` guarantees the subtraction cannot underflow.
        let overlap = u32::from(!zctl && i > 0 && offset > 0);
        offsets[i] = offset - overlap;
        // Cannot overflow: `overlap == 1` implies `offset >= 1`, and `offset + count` fit.
        sizes[i] = count + overlap;
        offset = next_offset;
    }

    let lv = ListViewArray::new(
        elements,
        offsets.freeze().into_array(),
        sizes.freeze().into_array(),
        Validity::NonNullable,
    );
    if zctl {
        // SAFETY: on the `zctl` path no overlap is ever applied, so `offsets` is exactly the
        // overflow-checked running sum of `counts` and `sizes[i] == counts[i]`. Entries are
        // therefore sorted, contiguous and non-overlapping.
        // NOTE(review): confirm this matches the full contract of `with_zero_copy_to_list`.
        unsafe { lv.with_zero_copy_to_list(true) }
    } else {
        lv
    }
}

/// Draw `n` values from `range` using `rng`, returned in the order they were drawn.
fn random_counts(
    rng: &mut StdRng,
    n: usize,
    range: std::ops::RangeInclusive<usize>,
) -> Vec<usize> {
    let mut drawn = Vec::with_capacity(n);
    for _ in 0..n {
        drawn.push(rng.random_range(range.clone()));
    }
    drawn
}

/// Build an `i64` array of length `len` whose values are drawn from `range` using `rng`.
fn random_i64_array(rng: &mut StdRng, len: usize, range: std::ops::Range<i64>) -> ArrayRef {
    let mut values = buffer_mut![0i64; len];
    // Overwrite every zero-initialised slot, front to back, with a fresh draw.
    values
        .iter_mut()
        .for_each(|slot| *slot = rng.random_range(range.clone()));
    values.freeze().into_array()
}

/// Build a string array of length `len` by picking entries of `pool` at random
/// (with repetition) using `rng`.
fn random_str_array(rng: &mut StdRng, len: usize, pool: &[&str]) -> ArrayRef {
    let mut picked: Vec<&str> = Vec::with_capacity(len);
    for _ in 0..len {
        let idx = rng.random_range(0..pool.len());
        picked.push(pool[idx]);
    }
    VarBinViewArray::from_iter_str(picked).into_array()
}

/// Assemble a non-nullable struct array of length `len` from `fields`, labelled by `names`.
///
/// Panics if `StructArray::try_new` rejects the inputs.
fn make_struct(names: impl Into<FieldNames>, fields: Vec<ArrayRef>, len: usize) -> ArrayRef {
    let field_names: FieldNames = names.into();
    let assembled =
        StructArray::try_new(field_names, fields, len, Validity::NonNullable).unwrap();
    assembled.into_array()
}

/// Build the flat innermost elements, typed `Struct<i64, Struct<utf8, utf8, i64>>`.
///
/// Draws one list length in `1..=3` for each of the `total_mid` mid-level elements, then
/// generates as many struct rows as those lengths sum to. Returns the struct array together
/// with the per-entry lengths, ready to be passed to `wrap_listview`.
fn build_inner_elements(rng: &mut StdRng, total_mid: usize) -> (ArrayRef, Vec<usize>) {
    let counts = random_counts(rng, total_mid, 1..=3);
    let n = counts.iter().copied().sum::<usize>();

    // The columns are generated in a fixed order (str_a, str_b, int_b, then int_a) so the
    // seeded RNG yields the same data on every run.
    let str_a = random_str_array(rng, n, SHORT_STRINGS);
    let str_b = random_str_array(rng, n, LONG_STRINGS);
    let int_b = random_i64_array(rng, n, 1..200);
    let nested = make_struct(["str_a", "str_b", "int_b"], vec![str_a, str_b, int_b], n);

    let int_a = random_i64_array(rng, n, 1..500);
    let inner = make_struct(["int_a", "nested"], vec![int_a, nested], n);

    (inner, counts)
}

/// Build the flat mid-level elements, typed `Struct<i64, utf8, ListView<Struct<...>>>`.
///
/// Draws one list length in `3..=10` for each of the `num_rows` outer rows, then generates
/// as many mid-level struct rows as those lengths sum to. The `inner_list` field is a
/// list view over the innermost elements, laid out according to `zctl`. Returns the struct
/// array together with the per-row lengths, ready to be passed to `wrap_listview`.
fn build_mid_elements(rng: &mut StdRng, num_rows: usize, zctl: bool) -> (ArrayRef, Vec<usize>) {
    let outer_counts = random_counts(rng, num_rows, 3..=10);
    let n = outer_counts.iter().copied().sum::<usize>();

    // The inner list is generated before the sibling columns so the seeded RNG yields the
    // same data on every run.
    let (inner_elements, inner_counts) = build_inner_elements(rng, n);
    let inner_list = wrap_listview(inner_elements, &inner_counts, zctl).into_array();

    let int_c = random_i64_array(rng, n, 0x400000..0x7FFFFF);
    let str_c = random_str_array(rng, n, LONG_STRINGS);
    let mid = make_struct(
        ["int_c", "str_c", "inner_list"],
        vec![int_c, str_c, inner_list],
        n,
    );

    (mid, outer_counts)
}

/// Build the benchmark input, typed
/// `ListView<Struct<i64, utf8, ListView<Struct<i64, Struct<utf8, utf8, i64>>>>>`.
///
/// The array has `num_rows` outer rows and is generated from an RNG seeded with `SEED`.
/// `layout` selects the offset layout used for both the outer and the inner list view.
fn build_nested_listview(num_rows: usize, layout: OffsetLayout) -> ArrayRef {
    let zctl = match layout {
        OffsetLayout::Zctl => true,
        OffsetLayout::Overlapping => false,
    };

    let mut rng = StdRng::seed_from_u64(SEED);
    let (mid_elements, outer_counts) = build_mid_elements(&mut rng, num_rows, zctl);
    let outer = wrap_listview(mid_elements, &outer_counts, zctl);
    outer.into_array()
}

/// Offset layout of the generated list views; used as the benchmark argument.
#[derive(Debug, Clone, Copy)]
enum OffsetLayout {
    /// Entries are laid out back to back (sorted, non-overlapping) and the list views are
    /// flagged as zero-copy-to-list.
    Zctl,
    /// Each entry that starts past element 0 shares one element with the entry before it.
    Overlapping,
}

/// Benchmark `BtrBlocksCompressor::compress` on the nested list-view array, once per
/// offset layout.
///
/// The array is built once, outside the measured closure; every iteration receives a
/// reference to it. Throughput is reported both in rows (`NUM_ROWS`) and in input bytes.
#[divan::bench(args = [OffsetLayout::Zctl, OffsetLayout::Overlapping])]
fn compress_listview(bencher: Bencher, layout: OffsetLayout) {
    let compressor = BtrBlocksCompressor::default();
    let array = build_nested_listview(NUM_ROWS, layout);
    let total_bytes = array.nbytes() as usize;

    let with_counters = bencher
        .with_inputs(|| &array)
        .input_counter(|_| ItemsCount::new(NUM_ROWS))
        .input_counter(move |_| BytesCount::new(total_bytes));
    with_counters.bench_refs(|input| compressor.compress(input.as_ref()).unwrap());
}
}

/// Benchmark entry point: hands control to divan's runner.
///
/// The `benchmarks` module is compiled only when `cfg(codspeed)` is not set, so under
/// that cfg this file contributes no benchmark functions of its own.
fn main() {
    divan::main()
}
117 changes: 27 additions & 90 deletions vortex-btrblocks/src/canonical_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ use vortex_array::arrays::list_from_list_view;
use vortex_array::compute::Cost;
use vortex_array::compute::IsConstantOpts;
use vortex_array::compute::is_constant_opts;
use vortex_array::compute::sum;
use vortex_array::vtable::ValidityHelper;
use vortex_dtype::DType;
use vortex_dtype::Nullability;
Expand All @@ -39,13 +38,6 @@ use crate::compressor::float::FloatScheme;
use crate::compressor::integer::IntegerScheme;
use crate::compressor::string::StringScheme;
use crate::compressor::temporal::compress_temporal;
use crate::sample::sample;
use crate::sample::sample_count_approx_one_percent;
use crate::stats::SAMPLE_SIZE;

/// Maximum ratio of expanded (List) element count to shared (ListView) element count
/// below which we prefer List encoding over ListView.
const MAX_LIST_EXPANSION_RATIO: f64 = 1.5;

/// Trait for compressors that can compress canonical arrays.
///
Expand Down Expand Up @@ -245,32 +237,9 @@ impl CanonicalCompressor for BtrBlocksCompressor {
.into_array())
}
Canonical::List(list_view_array) => {
let elements_len = list_view_array.elements().len();
if list_view_array.is_zero_copy_to_list() || elements_len == 0 {
// We can avoid the sizes array.
let list_array = list_from_list_view(list_view_array)?;
return self.compress_list_array(list_array, ctx);
}

// Sample the sizes to estimate the total expanded element
// count, then decide List vs ListView with the expansion
// threshold.
let sampled_sizes = sample(
list_view_array.sizes(),
SAMPLE_SIZE,
sample_count_approx_one_percent(list_view_array.len()),
);
let sampled_sum = sum(&*sampled_sizes)?
.as_primitive()
.as_::<usize>()
.unwrap_or(0);

let estimated_expanded_elements_len =
sampled_sum * list_view_array.len() / sampled_sizes.len();

if estimated_expanded_elements_len as f64
<= elements_len as f64 * MAX_LIST_EXPANSION_RATIO
{
if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() {
// Offsets are already monotonic and non-overlapping, so we
// can drop the sizes array and compress as a ListArray.
let list_array = list_from_list_view(list_view_array)?;
self.compress_list_array(list_array, ctx)
} else {
Expand Down Expand Up @@ -361,69 +330,37 @@ mod tests {

use crate::BtrBlocksCompressor;

/// ZCTL: [[1,2,3], [4,5], [6,7,8,9]]. Monotonic offsets, no overlap.
fn zctl_listview() -> ListViewArray {
    let elements = buffer![1i32, 2, 3, 4, 5, 6, 7, 8, 9].into_array();
    let offsets = buffer![0i32, 3, 5].into_array();
    let sizes = buffer![3i32, 2, 4].into_array();
    // SAFETY: the three entries cover elements [0, 3), [3, 5) and [5, 9) of a 9-element
    // array, so every view is in bounds and the offsets are sorted, contiguous and
    // non-overlapping.
    // NOTE(review): confirm this covers the full contracts of `new_unchecked` and
    // `with_zero_copy_to_list`.
    unsafe {
        ListViewArray::new_unchecked(elements, offsets, sizes, Validity::NonNullable)
            .with_zero_copy_to_list(true)
    }
}

/// Not zero-copy-to-list, low duplication: rows are [[7,8,9], [1,2,3], [4,5,6]].
/// The offsets are out of order, but no element is referenced by more than one row.
fn non_zctl_low_dup_listview() -> ListViewArray {
    ListViewArray::new(
        buffer![1i32, 2, 3, 4, 5, 6, 7, 8, 9].into_array(),
        buffer![6i32, 0, 3].into_array(),
        buffer![3i32, 3, 3].into_array(),
        Validity::NonNullable,
    )
}

/// Not zero-copy-to-list, high duplication: four rows that all view the same three
/// elements, i.e. [[1,2,3]] x 4.
fn non_zctl_high_dup_listview() -> ListViewArray {
    ListViewArray::new(
        buffer![1i32, 2, 3].into_array(),
        buffer![0i32, 0, 0, 0].into_array(),
        buffer![3i32, 3, 3, 3].into_array(),
        Validity::NonNullable,
    )
}

/// Nullable rows with overlap: [[1,2,3], null, [1,2,3], [1,2,3]].
/// The null row has size 0; the three valid rows all view the same elements.
fn nullable_overlap_listview() -> ListViewArray {
    ListViewArray::new(
        buffer![1i32, 2, 3].into_array(),
        buffer![0i32, 0, 0, 0].into_array(),
        buffer![3i32, 0, 3, 3].into_array(),
        Validity::from_iter([true, false, true, true]),
    )
}

/// Tests that each ListView variant compresses to the expected encoding and roundtrips.
#[rstest]
#[case::zctl(zctl_listview(), true)]
#[case::non_zctl_low_dup(non_zctl_low_dup_listview(), true)]
#[case::non_zctl_high_dup(non_zctl_high_dup_listview(), false)]
#[case::nullable_overlap(nullable_overlap_listview(), false)]
fn list_view_compress_roundtrip(
#[case::zctl(
unsafe {
ListViewArray::new_unchecked(
buffer![1i32, 2, 3, 4, 5].into_array(),
buffer![0i32, 3].into_array(),
buffer![3i32, 2].into_array(),
Validity::NonNullable,
).with_zero_copy_to_list(true)
},
true,
)]
#[case::overlapping(
ListViewArray::new(
buffer![1i32, 2, 3].into_array(),
buffer![0i32, 0, 0].into_array(),
buffer![3i32, 3, 3].into_array(),
Validity::NonNullable,
),
false,
)]
fn listview_compress_roundtrip(
#[case] input: ListViewArray,
#[case] expect_list: bool,
) -> VortexResult<()> {
let compressor = BtrBlocksCompressor::default();
let result = compressor.compress(input.as_ref())?;

let result = BtrBlocksCompressor::default().compress(input.as_ref())?;
if expect_list {
assert!(
result.as_opt::<ListVTable>().is_some(),
"Expected ListArray, got: {}",
result.encoding_id()
);
assert!(result.as_opt::<ListVTable>().is_some());
} else {
assert!(
result.as_opt::<ListViewVTable>().is_some(),
"Expected ListViewArray, got: {}",
result.encoding_id()
);
assert!(result.as_opt::<ListViewVTable>().is_some());
}

assert_arrays_eq!(result, input);
Ok(())
}
Expand Down
Loading