Skip to content

Commit fa6233a

Browse files
committed
Vectorized DeltaBitPackDecoder (#1281)
1 parent 936ed5e commit fa6233a

File tree

4 files changed

+277
-160
lines changed

4 files changed

+277
-160
lines changed

parquet/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ flate2 = { version = "1.0", optional = true }
3939
lz4 = { version = "1.23", optional = true }
4040
zstd = { version = "0.10", optional = true }
4141
chrono = { version = "0.4", default-features = false }
42+
num = "0.4"
4243
num-bigint = "0.4"
4344
arrow = { path = "../arrow", version = "9.0.0", optional = true, default-features = false, features = ["ipc"] }
4445
base64 = { version = "0.13", optional = true }

parquet/benches/arrow_reader.rs

Lines changed: 67 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -54,17 +54,17 @@ pub fn seedable_rng() -> StdRng {
5454
StdRng::seed_from_u64(42)
5555
}
5656

57-
fn build_plain_encoded_int32_page_iterator(
57+
fn build_encoded_int32_page_iterator(
5858
schema: SchemaDescPtr,
5959
column_desc: ColumnDescPtr,
6060
null_density: f32,
61+
encoding: Encoding,
6162
) -> impl PageIterator + Clone {
6263
let max_def_level = column_desc.max_def_level();
6364
let max_rep_level = column_desc.max_rep_level();
6465
let rep_levels = vec![0; VALUES_PER_PAGE];
6566
let mut rng = seedable_rng();
6667
let mut pages: Vec<Vec<parquet::column::page::Page>> = Vec::new();
67-
let mut int32_value = 0;
6868
for _i in 0..NUM_ROW_GROUPS {
6969
let mut column_chunk_pages = Vec::new();
7070
for _j in 0..PAGES_PER_GROUP {
@@ -78,16 +78,15 @@ fn build_plain_encoded_int32_page_iterator(
7878
max_def_level
7979
};
8080
if def_level == max_def_level {
81-
int32_value += 1;
82-
values.push(int32_value);
81+
values.push(rng.gen_range(0..1000));
8382
}
8483
def_levels.push(def_level);
8584
}
8685
let mut page_builder =
8786
DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true);
8887
page_builder.add_rep_levels(max_rep_level, &rep_levels);
8988
page_builder.add_def_levels(max_def_level, &def_levels);
90-
page_builder.add_values::<Int32Type>(Encoding::PLAIN, &values);
89+
page_builder.add_values::<Int32Type>(encoding, &values);
9190
column_chunk_pages.push(page_builder.consume());
9291
}
9392
pages.push(column_chunk_pages);
@@ -332,9 +331,7 @@ fn create_complex_object_byte_array_dictionary_reader(
332331
page_iterator: impl PageIterator + 'static,
333332
column_desc: ColumnDescPtr,
334333
) -> Box<dyn ArrayReader> {
335-
use parquet::arrow::array_reader::{
336-
make_byte_array_dictionary_reader, ComplexObjectArrayReader,
337-
};
334+
use parquet::arrow::array_reader::ComplexObjectArrayReader;
338335
use parquet::arrow::converter::{Utf8ArrayConverter, Utf8Converter};
339336
let arrow_type =
340337
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
@@ -367,10 +364,11 @@ fn add_benches(c: &mut Criterion) {
367364
// =============================
368365

369366
// int32, plain encoded, no NULLs
370-
let plain_int32_no_null_data = build_plain_encoded_int32_page_iterator(
367+
let plain_int32_no_null_data = build_encoded_int32_page_iterator(
371368
schema.clone(),
372369
mandatory_int32_column_desc.clone(),
373370
0.0,
371+
Encoding::PLAIN,
374372
);
375373
group.bench_function("read Int32Array, plain encoded, mandatory, no NULLs", |b| {
376374
b.iter(|| {
@@ -383,10 +381,11 @@ fn add_benches(c: &mut Criterion) {
383381
assert_eq!(count, EXPECTED_VALUE_COUNT);
384382
});
385383

386-
let plain_int32_no_null_data = build_plain_encoded_int32_page_iterator(
384+
let plain_int32_no_null_data = build_encoded_int32_page_iterator(
387385
schema.clone(),
388386
optional_int32_column_desc.clone(),
389387
0.0,
388+
Encoding::PLAIN,
390389
);
391390
group.bench_function("read Int32Array, plain encoded, optional, no NULLs", |b| {
392391
b.iter(|| {
@@ -400,10 +399,11 @@ fn add_benches(c: &mut Criterion) {
400399
});
401400

402401
// int32, plain encoded, half NULLs
403-
let plain_int32_half_null_data = build_plain_encoded_int32_page_iterator(
402+
let plain_int32_half_null_data = build_encoded_int32_page_iterator(
404403
schema.clone(),
405404
optional_int32_column_desc.clone(),
406405
0.5,
406+
Encoding::PLAIN,
407407
);
408408
group.bench_function(
409409
"read Int32Array, plain encoded, optional, half NULLs",
@@ -419,6 +419,62 @@ fn add_benches(c: &mut Criterion) {
419419
},
420420
);
421421

422+
// int32, binary packed, no NULLs
423+
let plain_int32_no_null_data = build_encoded_int32_page_iterator(
424+
schema.clone(),
425+
mandatory_int32_column_desc.clone(),
426+
0.0,
427+
Encoding::DELTA_BINARY_PACKED,
428+
);
429+
group.bench_function("read Int32Array, binary packed, mandatory, no NULLs", |b| {
430+
b.iter(|| {
431+
let array_reader = create_int32_primitive_array_reader(
432+
plain_int32_no_null_data.clone(),
433+
mandatory_int32_column_desc.clone(),
434+
);
435+
count = bench_array_reader(array_reader);
436+
});
437+
assert_eq!(count, EXPECTED_VALUE_COUNT);
438+
});
439+
440+
let plain_int32_no_null_data = build_encoded_int32_page_iterator(
441+
schema.clone(),
442+
optional_int32_column_desc.clone(),
443+
0.0,
444+
Encoding::DELTA_BINARY_PACKED,
445+
);
446+
group.bench_function("read Int32Array, binary packed, optional, no NULLs", |b| {
447+
b.iter(|| {
448+
let array_reader = create_int32_primitive_array_reader(
449+
plain_int32_no_null_data.clone(),
450+
optional_int32_column_desc.clone(),
451+
);
452+
count = bench_array_reader(array_reader);
453+
});
454+
assert_eq!(count, EXPECTED_VALUE_COUNT);
455+
});
456+
457+
// int32, binary packed, half NULLs
458+
let plain_int32_half_null_data = build_encoded_int32_page_iterator(
459+
schema.clone(),
460+
optional_int32_column_desc.clone(),
461+
0.5,
462+
Encoding::DELTA_BINARY_PACKED,
463+
);
464+
group.bench_function(
465+
"read Int32Array, binary packed, optional, half NULLs",
466+
|b| {
467+
b.iter(|| {
468+
let array_reader = create_int32_primitive_array_reader(
469+
plain_int32_half_null_data.clone(),
470+
optional_int32_column_desc.clone(),
471+
);
472+
count = bench_array_reader(array_reader);
473+
});
474+
assert_eq!(count, EXPECTED_VALUE_COUNT);
475+
},
476+
);
477+
422478
// int32, dictionary encoded, no NULLs
423479
let dictionary_int32_no_null_data = build_dictionary_encoded_int32_page_iterator(
424480
schema.clone(),

0 commit comments

Comments (0)