Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions encodings/fsst/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ where
Compressor::train(&lines)
}

/// Most strings are small in practice. If we encounter a larger string, we reallocate
/// the buffer to hold enough capacity for the worst-case compressed value.
const DEFAULT_BUFFER_LEN: usize = 1024 * 1024;

/// Compress from an iterator of bytestrings using FSST.
pub fn fsst_compress_iter<'a, I>(
iter: I,
Expand All @@ -62,8 +66,7 @@ pub fn fsst_compress_iter<'a, I>(
where
I: Iterator<Item = Option<&'a [u8]>>,
{
// TODO(aduffy): this might be too small.
let mut buffer = Vec::with_capacity(16 * 1024 * 1024);
let mut buffer = Vec::with_capacity(DEFAULT_BUFFER_LEN);
let mut builder = VarBinBuilder::<i32>::with_capacity(len);
let mut uncompressed_lengths: BufferMut<i32> = BufferMut::with_capacity(len);
for string in iter {
Expand All @@ -79,7 +82,14 @@ where
.vortex_expect("string length must fit in i32"),
);

// SAFETY: buffer is large enough
// make sure the buffer is 2x+7 larger than the input
let target_size = 2 * s.len() + 7;
if target_size > buffer.len() {
let additional_capacity = target_size - buffer.len();
buffer.reserve(additional_capacity);
}

// SAFETY: buffer is always sized to be large enough
unsafe { compressor.compress_into(s, &mut buffer) };

builder.append_value(&buffer);
Expand All @@ -96,3 +106,38 @@ where
FSSTArray::try_new(dtype, symbols, symbol_lengths, codes, uncompressed_lengths)
.vortex_expect("building FSSTArray from parts")
}

#[cfg(test)]
mod tests {
use fsst::CompressorBuilder;
use vortex_array::dtype::DType;
use vortex_array::dtype::Nullability;
use vortex_array::scalar::Scalar;

use crate::compress::DEFAULT_BUFFER_LEN;
use crate::fsst_compress_iter;

#[test]
fn test_large_string() {
let big_string: String = "abc"
.chars()
.cycle()
.take(10 * DEFAULT_BUFFER_LEN)
.collect();

let compressor = CompressorBuilder::default().build();

let compressed = fsst_compress_iter(
[Some(big_string.as_bytes())].into_iter(),
1,
DType::Utf8(Nullability::NonNullable),
&compressor,
);

let decoded = compressed.scalar_at(0).unwrap();

let expected = Scalar::utf8(big_string, Nullability::NonNullable);

assert_eq!(decoded, expected);
}
}
Loading