feat: eagerly compute pruning stats during compression
lwwmanning committed Nov 7, 2024
1 parent c1cee33 commit 31513b9
Showing 2 changed files with 16 additions and 1 deletion.
7 changes: 7 additions & 0 deletions vortex-sampling-compressor/src/compressors/chunked.rs
@@ -4,6 +4,7 @@ use std::sync::Arc;
 use log::warn;
 use vortex_array::aliases::hash_set::HashSet;
 use vortex_array::array::{Chunked, ChunkedArray};
+use vortex_array::compress::compute_pruning_stats;
 use vortex_array::encoding::EncodingRef;
 use vortex_array::stats::ArrayStatistics as _;
 use vortex_array::{Array, ArrayDType, ArrayDef, IntoArray};
@@ -116,6 +117,12 @@ impl ChunkedCompressor {
         )?;
         let mut compressed_chunks = Vec::with_capacity(less_chunked.nchunks());
         for (index, chunk) in less_chunked.chunks().enumerate() {
+            // these are extremely valuable when reading/writing, but are potentially much more expensive
+            // to compute post-compression. That's because not all encodings implement stats, so we would
+            // potentially have to canonicalize during writes just to get stats, which would be silly.
+            // Also, we only really require them for column chunks, not for every array.
+            compute_pruning_stats(&chunk)?;
+
             let like = previous.as_ref().map(|(like, _)| like);
             let (compressed_chunk, tree) = ctx
                 .named(&format!("chunk-{}", index))
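
For context, this hunk applies a simple pattern: compute pruning statistics while the chunk is still in a canonical, easily inspected form, and cache them on the array so the write path can read them back instead of having to canonicalize a compressed encoding later. Below is a minimal, self-contained sketch of that idea; the `Chunk` and `Stats` types and the body of `compute_pruning_stats` are hypothetical stand-ins, not the vortex_array API.

```rust
// Hypothetical sketch of eager pruning-stats computation; not the vortex API.
#[derive(Debug, Clone)]
struct Stats {
    min: i64,
    max: i64,
    null_count: usize,
}

struct Chunk {
    values: Vec<Option<i64>>,
    stats: Option<Stats>, // cached once computed
}

impl Chunk {
    // Compute pruning stats while the data is still canonical (plain vectors),
    // and cache them on the chunk so later stages can reuse them for free.
    fn compute_pruning_stats(&mut self) {
        if self.stats.is_some() {
            return; // already cached
        }
        let present: Vec<i64> = self.values.iter().flatten().copied().collect();
        let null_count = self.values.len() - present.len();
        if let (Some(&min), Some(&max)) = (present.iter().min(), present.iter().max()) {
            self.stats = Some(Stats { min, max, null_count });
        }
    }
}

fn main() {
    let mut chunk = Chunk {
        values: vec![Some(3), None, Some(7), Some(1)],
        stats: None,
    };
    chunk.compute_pruning_stats();
    println!("{:?}", chunk.stats); // Some(Stats { min: 1, max: 7, null_count: 1 })
}
```

The real `compute_pruning_stats(&chunk)?` presumably populates vortex's per-array statistics cache in the same spirit, before compression swaps in an encoding that may not support stats.
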
10 changes: 9 additions & 1 deletion vortex-sampling-compressor/src/compressors/struct_.rs
@@ -1,6 +1,7 @@
 use itertools::Itertools;
 use vortex_array::aliases::hash_set::HashSet;
 use vortex_array::array::{Struct, StructArray};
+use vortex_array::compress::compute_pruning_stats;
 use vortex_array::encoding::EncodingRef;
 use vortex_array::stats::ArrayStatistics as _;
 use vortex_array::variants::StructArrayTrait;
@@ -45,7 +46,14 @@ impl EncodingCompressor for StructCompressor {
         let (arrays, trees) = array
             .children()
             .zip_eq(children_trees)
-            .map(|(array, like)| ctx.compress(&array, like.as_ref()))
+            .map(|(array, like)| {
+                // these are extremely valuable when reading/writing, but are potentially much more expensive
+                // to compute post-compression. That's because not all encodings implement stats, so we would
+                // potentially have to canonicalize during writes just to get stats, which would be silly.
+                // Also, we only really require them for column chunks, not for every array.
+                compute_pruning_stats(&array)?;
+                ctx.compress(&array, like.as_ref())
+            })
             .process_results(|iter| iter.map(|x| (x.array, x.path)).unzip())?;
 
         Ok(CompressedArray::compressed(
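
The read-side payoff of these stats is chunk pruning: a scan can skip a column chunk outright when the cached min/max cannot intersect the query predicate. A hedged, self-contained sketch of that check; `chunks_to_scan` and the per-chunk `(min, max)` pairs are hypothetical illustrations, not a vortex API.

```rust
// Sketch of the read-side payoff: skip a chunk whenever its cached
// [min, max] range cannot intersect the query range [lo, hi].
// `chunk_stats` is a hypothetical list of per-chunk (min, max) pairs.
fn chunks_to_scan(chunk_stats: &[(i64, i64)], lo: i64, hi: i64) -> Vec<usize> {
    chunk_stats
        .iter()
        .enumerate()
        .filter(|(_, (min, max))| *min <= hi && *max >= lo)
        .map(|(i, _)| i)
        .collect()
}

fn main() {
    // Three chunks with cached min/max; only the middle one can match 10..=20.
    let stats = [(0, 5), (8, 25), (30, 40)];
    assert_eq!(chunks_to_scan(&stats, 10, 20), vec![1]);
    println!("chunks to scan: {:?}", chunks_to_scan(&stats, 10, 20));
}
```
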
