diff --git a/vortex-sampling-compressor/src/compressors/chunked.rs b/vortex-sampling-compressor/src/compressors/chunked.rs
index aabfbd47b2..ad6b3c1ed0 100644
--- a/vortex-sampling-compressor/src/compressors/chunked.rs
+++ b/vortex-sampling-compressor/src/compressors/chunked.rs
@@ -4,6 +4,7 @@ use std::sync::Arc;
 use log::warn;
 use vortex_array::aliases::hash_set::HashSet;
 use vortex_array::array::{Chunked, ChunkedArray};
+use vortex_array::compress::compute_pruning_stats;
 use vortex_array::encoding::EncodingRef;
 use vortex_array::stats::ArrayStatistics as _;
 use vortex_array::{Array, ArrayDType, ArrayDef, IntoArray};
@@ -116,6 +117,12 @@ impl ChunkedCompressor {
         )?;
         let mut compressed_chunks = Vec::with_capacity(less_chunked.nchunks());
         for (index, chunk) in less_chunked.chunks().enumerate() {
+            // Pruning stats are extremely valuable when reading/writing, but are potentially much more
+            // expensive to compute post-compression: not all encodings implement stats, so we would
+            // potentially have to canonicalize during writes just to get stats, which would be silly.
+            // Also, we only really require them for column chunks, not for every array.
+            compute_pruning_stats(&chunk)?;
+
             let like = previous.as_ref().map(|(like, _)| like);
             let (compressed_chunk, tree) = ctx
                 .named(&format!("chunk-{}", index))
diff --git a/vortex-sampling-compressor/src/compressors/struct_.rs b/vortex-sampling-compressor/src/compressors/struct_.rs
index ba4e0bd073..6f42aac098 100644
--- a/vortex-sampling-compressor/src/compressors/struct_.rs
+++ b/vortex-sampling-compressor/src/compressors/struct_.rs
@@ -1,6 +1,7 @@
 use itertools::Itertools;
 use vortex_array::aliases::hash_set::HashSet;
 use vortex_array::array::{Struct, StructArray};
+use vortex_array::compress::compute_pruning_stats;
 use vortex_array::encoding::EncodingRef;
 use vortex_array::stats::ArrayStatistics as _;
 use vortex_array::variants::StructArrayTrait;
@@ -45,7 +46,14 @@ impl EncodingCompressor for StructCompressor {
         let (arrays, trees) = array
             .children()
             .zip_eq(children_trees)
-            .map(|(array, like)| ctx.compress(&array, like.as_ref()))
+            .map(|(array, like)| {
+                // Pruning stats are extremely valuable when reading/writing, but are potentially much more
+                // expensive to compute post-compression: not all encodings implement stats, so we would
+                // potentially have to canonicalize during writes just to get stats, which would be silly.
+                // Also, we only really require them for column chunks, not for every array.
+                compute_pruning_stats(&array)?;
+                ctx.compress(&array, like.as_ref())
+            })
             .process_results(|iter| iter.map(|x| (x.array, x.path)).unzip())?;
 
         Ok(CompressedArray::compressed(
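
The diff only shows the call sites; `compute_pruning_stats` itself lives in `vortex_array::compress` and is not part of this change. For illustration, here is a minimal sketch of what such a helper could look like, assuming the `statistics().compute(..)` caching API and the `Stat::Min`/`Stat::Max`/`Stat::NullCount` variants from the surrounding crate; none of this is taken from the diff itself:

```rust
use vortex_array::Array;
use vortex_array::stats::{ArrayStatistics as _, Stat};
use vortex_error::VortexResult;

/// Sketch only: eagerly compute the stats used for chunk pruning while the
/// array is still in an encoding that can produce them cheaply. Each computed
/// value is cached in the array's statistics set, so it survives compression
/// without forcing a canonicalize at write time.
fn compute_pruning_stats(array: &Array) -> VortexResult<()> {
    // Assumed pruning-stat set; the real list is defined in vortex-array.
    for stat in [Stat::Min, Stat::Max, Stat::NullCount] {
        // `compute` returns `None` when a stat is undefined for this dtype;
        // we only care about the caching side effect here.
        let _ = array.statistics().compute(stat);
    }
    Ok(())
}
```

Computing these stats before compression means the cheap-to-scan input encoding does the work once, and the compressed chunk can inherit the cached values instead of recomputing them at write time.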