Skip to content

Commit ba5e7b4

Browse files
authored Dec 4, 2024
Narrow indices types during compression (spiraldb#1558)
Fixes spiraldb#1557
1 parent b5127b8 commit ba5e7b4

File tree

19 files changed

+240
-78
lines changed

19 files changed

+240
-78
lines changed
 

‎Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎docs/quickstart.rst

+2-2
Original file line number | Diff line number | Diff line change
@@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the
4646

4747
>>> cvtx = vortex.compress(vtx)
4848
>>> cvtx.nbytes
49-
17780
49+
16835
5050
>>> cvtx.nbytes / vtx.nbytes
51-
0.126...
51+
0.119...
5252

5353
Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
5454
cache and RAM.

‎encodings/datetime-parts/src/compute/mod.rs

+54-16
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,17 @@
11
mod filter;
22
mod take;
33

4-
use itertools::Itertools as _;
54
use vortex_array::array::{PrimitiveArray, TemporalArray};
65
use vortex_array::compute::{
7-
scalar_at, slice, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn,
6+
scalar_at, slice, try_cast, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn,
87
};
98
use vortex_array::validity::ArrayValidity;
109
use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
1110
use vortex_datetime_dtype::{TemporalMetadata, TimeUnit};
12-
use vortex_dtype::DType;
13-
use vortex_error::{vortex_bail, VortexResult};
14-
use vortex_scalar::Scalar;
11+
use vortex_dtype::Nullability::NonNullable;
12+
use vortex_dtype::{DType, PType};
13+
use vortex_error::{vortex_bail, VortexExpect, VortexResult};
14+
use vortex_scalar::{PrimitiveScalar, Scalar};
1515

1616
use crate::{DateTimePartsArray, DateTimePartsEncoding};
1717

@@ -106,17 +106,55 @@ pub fn decode_to_temporal(array: &DateTimePartsArray) -> VortexResult<TemporalAr
106106
TimeUnit::D => vortex_bail!(InvalidArgument: "cannot decode into TimeUnit::D"),
107107
};
108108

109-
let days_buf = array.days().into_primitive()?;
110-
let seconds_buf = array.seconds().into_primitive()?;
111-
let subsecond_buf = array.subsecond().into_primitive()?;
112-
113-
let values = days_buf
114-
.maybe_null_slice::<i64>()
115-
.iter()
116-
.zip_eq(seconds_buf.maybe_null_slice::<i64>().iter())
117-
.zip_eq(subsecond_buf.maybe_null_slice::<i64>().iter())
118-
.map(|((d, s), ss)| d * 86_400 * divisor + s * divisor + ss)
119-
.collect::<Vec<_>>();
109+
let days_buf = try_cast(
110+
array.days(),
111+
&DType::Primitive(PType::I64, array.dtype().nullability()),
112+
)?
113+
.into_primitive()?;
114+
let mut values: Vec<i64> = days_buf
115+
.into_maybe_null_slice::<i64>()
116+
.into_iter()
117+
.map(|d| d * 86_400 * divisor)
118+
.collect();
119+
120+
if let Some(seconds) = array.seconds().as_constant() {
121+
let seconds =
122+
PrimitiveScalar::try_from(&seconds.cast(&DType::Primitive(PType::I64, NonNullable))?)?
123+
.typed_value::<i64>()
124+
.vortex_expect("non-nullable");
125+
for v in values.iter_mut() {
126+
*v += seconds * divisor;
127+
}
128+
} else {
129+
let seconds_buf = try_cast(array.seconds(), &DType::Primitive(PType::U32, NonNullable))?
130+
.into_primitive()?;
131+
for (v, second) in values.iter_mut().zip(seconds_buf.maybe_null_slice::<u32>()) {
132+
*v += (*second as i64) * divisor;
133+
}
134+
}
135+
136+
if let Some(subseconds) = array.subsecond().as_constant() {
137+
let subseconds = PrimitiveScalar::try_from(
138+
&subseconds.cast(&DType::Primitive(PType::I64, NonNullable))?,
139+
)?
140+
.typed_value::<i64>()
141+
.vortex_expect("non-nullable");
142+
for v in values.iter_mut() {
143+
*v += subseconds;
144+
}
145+
} else {
146+
let subsecond_buf = try_cast(
147+
array.subsecond(),
148+
&DType::Primitive(PType::I64, NonNullable),
149+
)?
150+
.into_primitive()?;
151+
for (v, subsecond) in values
152+
.iter_mut()
153+
.zip(subsecond_buf.maybe_null_slice::<i64>())
154+
{
155+
*v += *subsecond;
156+
}
157+
}
120158

121159
Ok(TemporalArray::new_timestamp(
122160
PrimitiveArray::from_vec(values, array.validity()).into_array(),

‎encodings/fsst/src/canonical.rs

+18-14
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,11 @@
11
use arrow_array::builder::make_view;
22
use arrow_buffer::Buffer;
33
use vortex_array::array::{PrimitiveArray, VarBinArray, VarBinViewArray};
4+
use vortex_array::variants::PrimitiveArrayTrait;
45
use vortex_array::{
56
ArrayDType, ArrayData, Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical,
67
};
8+
use vortex_dtype::match_each_integer_ptype;
79
use vortex_error::VortexResult;
810

911
use crate::FSSTArray;
@@ -33,24 +35,26 @@ impl IntoCanonical for FSSTArray {
3335
.uncompressed_lengths()
3436
.into_canonical()?
3537
.into_primitive()?;
36-
let uncompressed_lens_slice = uncompressed_lens_array.maybe_null_slice::<i32>();
3738

3839
// Directly create the binary views.
39-
let views: Vec<u128> = uncompressed_lens_slice
40-
.iter()
41-
.scan(0, |offset, len| {
42-
let str_start = *offset;
43-
let str_end = *offset + len;
40+
let views: Vec<u128> = match_each_integer_ptype!(uncompressed_lens_array.ptype(), |$P| {
41+
uncompressed_lens_array.maybe_null_slice::<$P>()
42+
.iter()
43+
.map(|&len| len as usize)
44+
.scan(0, |offset, len| {
45+
let str_start = *offset;
46+
let str_end = *offset + len;
4447

45-
*offset += len;
48+
*offset += len;
4649

47-
Some(make_view(
48-
&uncompressed_bytes[(str_start as usize)..(str_end as usize)],
49-
0u32,
50-
str_start as u32,
51-
))
52-
})
53-
.collect();
50+
Some(make_view(
51+
&uncompressed_bytes[str_start..str_end],
52+
0u32,
53+
str_start as u32,
54+
))
55+
})
56+
.collect()
57+
});
5458

5559
let views_array: ArrayData = Buffer::from(views).into();
5660
let uncompressed_bytes_array = PrimitiveArray::from(uncompressed_bytes).into_array();

‎vortex-array/src/array/sparse/mod.rs

+7-11
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
use std::fmt::{Debug, Display};
22

33
use ::serde::{Deserialize, Serialize};
4-
use vortex_dtype::{match_each_integer_ptype, DType};
4+
use vortex_dtype::Nullability::NonNullable;
5+
use vortex_dtype::{match_each_integer_ptype, DType, PType};
56
use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult};
67
use vortex_scalar::{Scalar, ScalarValue};
78

@@ -27,8 +28,8 @@ pub struct SparseMetadata {
2728
// Offset value for patch indices as a result of slicing
2829
indices_offset: usize,
2930
indices_len: usize,
31+
indices_ptype: PType,
3032
fill_value: ScalarValue,
31-
u64_indices: bool,
3233
}
3334

3435
impl Display for SparseMetadata {
@@ -54,9 +55,6 @@ impl SparseArray {
5455
indices_offset: usize,
5556
fill_value: Scalar,
5657
) -> VortexResult<Self> {
57-
if !matches!(indices.dtype(), &DType::IDX | &DType::IDX_32) {
58-
vortex_bail!("Cannot use {} as indices", indices.dtype());
59-
}
6058
if fill_value.dtype() != values.dtype() {
6159
vortex_bail!(
6260
"fill value, {:?}, should be instance of values dtype, {}",
@@ -80,14 +78,16 @@ impl SparseArray {
8078
}
8179
}
8280

81+
let indices_ptype = PType::try_from(indices.dtype())?;
82+
8383
Self::try_from_parts(
8484
values.dtype().clone(),
8585
len,
8686
SparseMetadata {
8787
indices_offset,
8888
indices_len: indices.len(),
89+
indices_ptype,
8990
fill_value: fill_value.into_value(),
90-
u64_indices: matches!(indices.dtype(), &DType::IDX),
9191
},
9292
[indices, values].into(),
9393
StatsSet::default(),
@@ -111,11 +111,7 @@ impl SparseArray {
111111
self.as_ref()
112112
.child(
113113
0,
114-
if self.metadata().u64_indices {
115-
&DType::IDX
116-
} else {
117-
&DType::IDX_32
118-
},
114+
&DType::Primitive(self.metadata().indices_ptype, NonNullable),
119115
self.metadata().indices_len,
120116
)
121117
.vortex_expect("Missing indices array in SparseArray")

‎vortex-array/src/compute/cast.rs

+23-7
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
use vortex_dtype::DType;
2-
use vortex_error::{vortex_err, VortexError, VortexResult};
2+
use vortex_error::{vortex_bail, vortex_err, VortexError, VortexResult};
33

44
use crate::encoding::Encoding;
5-
use crate::{ArrayDType, ArrayData};
5+
use crate::{ArrayDType, ArrayData, IntoArrayData, IntoCanonical};
66

77
pub trait CastFn<Array> {
88
fn cast(&self, array: &Array, dtype: &DType) -> VortexResult<ArrayData>;
@@ -34,9 +34,25 @@ pub fn try_cast(array: impl AsRef<ArrayData>, dtype: &DType) -> VortexResult<Arr
3434
}
3535

3636
// TODO(ngates): check for null_count if dtype is non-nullable
37-
array
38-
.encoding()
39-
.cast_fn()
40-
.map(|f| f.cast(array, dtype))
41-
.unwrap_or_else(|| Err(vortex_err!(NotImplemented: "cast", array.encoding().id())))
37+
if let Some(f) = array.encoding().cast_fn() {
38+
return f.cast(array, dtype);
39+
}
40+
41+
// Otherwise, we fall back to the canonical implementations.
42+
log::debug!(
43+
"Falling back to canonical cast for encoding {} and dtype {} to {}",
44+
array.encoding().id(),
45+
array.dtype(),
46+
dtype
47+
);
48+
let canonicalized = array.clone().into_canonical()?.into_array();
49+
if let Some(f) = canonicalized.encoding().cast_fn() {
50+
return f.cast(&canonicalized, dtype);
51+
}
52+
53+
vortex_bail!(
54+
"No compute kernel to cast array from {} to {}",
55+
array.dtype(),
56+
dtype
57+
)
4258
}

‎vortex-dtype/src/dtype.rs

-6
Original file line number | Diff line number | Diff line change
@@ -45,12 +45,6 @@ impl DType {
4545
/// The default DType for bytes
4646
pub const BYTES: Self = Primitive(PType::U8, Nullability::NonNullable);
4747

48-
/// The default DType for indices
49-
pub const IDX: Self = Primitive(PType::U64, Nullability::NonNullable);
50-
51-
/// The DType for small indices (primarily created from bitmaps)
52-
pub const IDX_32: Self = Primitive(PType::U32, Nullability::NonNullable);
53-
5448
/// Get the nullability of the DType
5549
pub fn nullability(&self) -> Nullability {
5650
self.is_nullable().into()

‎vortex-sampling-compressor/Cargo.toml

+1
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ arbitrary = { workspace = true, optional = true }
1818
fsst-rs = { workspace = true }
1919
itertools = { workspace = true }
2020
log = { workspace = true }
21+
num-traits = { workspace = true }
2122
rand = { workspace = true }
2223
vortex-alp = { workspace = true }
2324
vortex-array = { workspace = true }

‎vortex-sampling-compressor/src/compressors/date_time_parts.rs

+13-9
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ use vortex_datetime_parts::{
1010
use vortex_error::VortexResult;
1111

1212
use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
13+
use crate::downscale::downscale_integer_array;
1314
use crate::{constants, SamplingCompressor};
1415

1516
#[derive(Debug)]
@@ -48,15 +49,18 @@ impl EncodingCompressor for DateTimePartsCompressor {
4849
subseconds,
4950
} = split_temporal(TemporalArray::try_from(array.clone())?)?;
5051

51-
let days = ctx
52-
.named("days")
53-
.compress(&days, like.as_ref().and_then(|l| l.child(0)))?;
54-
let seconds = ctx
55-
.named("seconds")
56-
.compress(&seconds, like.as_ref().and_then(|l| l.child(1)))?;
57-
let subsecond = ctx
58-
.named("subsecond")
59-
.compress(&subseconds, like.as_ref().and_then(|l| l.child(2)))?;
52+
let days = ctx.named("days").compress(
53+
&downscale_integer_array(days)?,
54+
like.as_ref().and_then(|l| l.child(0)),
55+
)?;
56+
let seconds = ctx.named("seconds").compress(
57+
&downscale_integer_array(seconds)?,
58+
like.as_ref().and_then(|l| l.child(1)),
59+
)?;
60+
let subsecond = ctx.named("subsecond").compress(
61+
&downscale_integer_array(subseconds)?,
62+
like.as_ref().and_then(|l| l.child(2)),
63+
)?;
6064
Ok(CompressedArray::compressed(
6165
DateTimePartsArray::try_new(
6266
array.dtype().clone(),

‎vortex-sampling-compressor/src/compressors/dict.rs

+5-3
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ use vortex_dict::{
1212
use vortex_error::VortexResult;
1313

1414
use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
15+
use crate::downscale::downscale_integer_array;
1516
use crate::{constants, SamplingCompressor};
1617

1718
#[derive(Debug)]
@@ -70,9 +71,10 @@ impl EncodingCompressor for DictCompressor {
7071
};
7172

7273
let (codes, values) = (
73-
ctx.auxiliary("codes")
74-
.excluding(self)
75-
.compress(&codes, like.as_ref().and_then(|l| l.child(0)))?,
74+
ctx.auxiliary("codes").excluding(self).compress(
75+
&downscale_integer_array(codes)?,
76+
like.as_ref().and_then(|l| l.child(0)),
77+
)?,
7678
ctx.named("values")
7779
.excluding(self)
7880
.compress(&values, like.as_ref().and_then(|l| l.child(1)))?,

‎vortex-sampling-compressor/src/compressors/fsst.rs

+2-1
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ use super::delta::DeltaCompressor;
1717
use super::r#for::FoRCompressor;
1818
use super::varbin::VarBinCompressor;
1919
use super::{CompressedArray, CompressionTree, EncoderMetadata, EncodingCompressor};
20+
use crate::downscale::downscale_integer_array;
2021
use crate::{constants, SamplingCompressor};
2122

2223
#[derive(Debug)]
@@ -109,7 +110,7 @@ impl EncodingCompressor for FSSTCompressor {
109110
.auxiliary("uncompressed_lengths")
110111
.excluding(self)
111112
.compress(
112-
&fsst_array.uncompressed_lengths(),
113+
&downscale_integer_array(fsst_array.uncompressed_lengths())?,
113114
like.as_ref().and_then(|l| l.child(3)),
114115
)?;
115116

‎vortex-sampling-compressor/src/compressors/list.rs

+2-1
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ use vortex_array::{ArrayData, IntoArrayData};
66
use vortex_error::VortexResult;
77

88
use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
9+
use crate::downscale::downscale_integer_array;
910
use crate::{constants, SamplingCompressor};
1011

1112
#[derive(Debug)]
@@ -36,7 +37,7 @@ impl EncodingCompressor for ListCompressor {
3637
like.as_ref().and_then(|l| l.child(0)),
3738
)?;
3839
let compressed_offsets = ctx.auxiliary("offsets").compress(
39-
&list_array.offsets(),
40+
&downscale_integer_array(list_array.offsets())?,
4041
like.as_ref().and_then(|l| l.child(1)),
4142
)?;
4243
Ok(CompressedArray::compressed(

0 commit comments

Comments
 (0)