Support Parquet Byte Stream Split Encoding (#5293)
* wip byte-stream-split

* decoding works

* impl split

* clean up

* whitespace

* remove println

* get compiling after rebase

* integration test, as one might call it

* update parquet-testing revision

* encoding bench

* improve performance

* test fix

* add apache headers

* one more test and readme update

---------

Co-authored-by: Simon Vandel Sillesen <simon.vandel@gmail.com>
mwlon and simonvandel authored Jan 12, 2024
1 parent 4a6ae68 commit 4c3e9be
Showing 10 changed files with 499 additions and 92 deletions.
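
For context on what the new encoding does: BYTE_STREAM_SPLIT stores the k-th byte of every value in the k-th of N contiguous streams (N = the width of the type in bytes). It does not shrink the data by itself, but grouping bytes of similar significance together typically helps a downstream compressor such as ZSTD on float columns. A minimal sketch of the byte shuffle for `f32` values — illustrative only, not the crate's implementation:

```rust
/// Scatter the k-th byte of each f32 into stream k (byte stream split).
fn bss_encode(values: &[f32]) -> Vec<u8> {
    let n = values.len();
    let mut out = vec![0u8; n * 4];
    for (i, v) in values.iter().enumerate() {
        for (k, b) in v.to_le_bytes().iter().enumerate() {
            out[k * n + i] = *b; // position i within stream k
        }
    }
    out
}

/// Reassemble each value by taking one byte from each stream.
fn bss_decode(encoded: &[u8]) -> Vec<f32> {
    let n = encoded.len() / 4;
    (0..n)
        .map(|i| {
            f32::from_le_bytes([
                encoded[i],
                encoded[n + i],
                encoded[2 * n + i],
                encoded[3 * n + i],
            ])
        })
        .collect()
}
```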
5 changes: 5 additions & 0 deletions parquet/Cargo.toml
@@ -175,6 +175,11 @@ name = "compression"
 required-features = ["experimental", "default"]
 harness = false
 
+[[bench]]
+name = "encoding"
+required-features = ["experimental", "default"]
+harness = false
+
 
 [[bench]]
 name = "metadata"
2 changes: 1 addition & 1 deletion parquet/README.md
@@ -55,7 +55,7 @@ The `parquet` crate provides the following features which may be enabled in your

 ## Parquet Feature Status
 
-- [x] All encodings supported, except for BYTE_STREAM_SPLIT ([#4102](https://github.com/apache/arrow-rs/issues/4102))
+- [x] All encodings supported
 - [x] All compression codecs supported
 - [x] Read support
 - [x] Primitive column value readers
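
With this change, the encoding can be requested through the standard writer properties like any other encoding. A hedged usage sketch (the file path and column name are illustrative; dictionary encoding takes precedence over a requested encoding, so it is disabled here):

```rust
use std::{fs::File, sync::Arc};

use arrow_array::{ArrayRef, Float32Array, RecordBatch};
use parquet::arrow::ArrowWriter;
use parquet::basic::Encoding;
use parquet::file::properties::WriterProperties;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let batch = RecordBatch::try_from_iter([(
        "x",
        Arc::new(Float32Array::from(vec![1.0, 2.5, -3.75])) as ArrayRef,
    )])?;

    let props = WriterProperties::builder()
        .set_dictionary_enabled(false) // dictionary would otherwise win
        .set_encoding(Encoding::BYTE_STREAM_SPLIT)
        .build();

    let file = File::create("floats.parquet")?; // illustrative path
    let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props))?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(())
}
```

At the time of this commit, BYTE_STREAM_SPLIT applies to the FLOAT and DOUBLE physical types.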
83 changes: 83 additions & 0 deletions parquet/benches/encoding.rs
@@ -0,0 +1,83 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use criterion::*;
use parquet::basic::Encoding;
use parquet::data_type::{DataType, DoubleType, FloatType};
use parquet::decoding::{get_decoder, Decoder};
use parquet::encoding::get_encoder;
use parquet::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type};
use rand::prelude::*;
use std::sync::Arc;

fn bench_typed<T: DataType>(c: &mut Criterion, values: &[T::T], encoding: Encoding) {
    let name = format!(
        "dtype={}, encoding={:?}",
        std::any::type_name::<T::T>(),
        encoding
    );
    // Each iteration builds a fresh encoder so `put` always starts from empty state.
    c.bench_function(&format!("encoding: {}", name), |b| {
        b.iter(|| {
            let mut encoder = get_encoder::<T>(encoding).unwrap();
            encoder.put(values).unwrap();
            encoder.flush_buffer().unwrap();
        });
    });

    let mut encoder = get_encoder::<T>(encoding).unwrap();
    encoder.put(values).unwrap();
    let encoded = encoder.flush_buffer().unwrap();
    println!("{} encoded as {} bytes", name, encoded.len());

    let mut buffer = vec![T::T::default(); values.len()];
    // get_decoder needs a column descriptor; a minimal unnamed primitive column
    // of the matching physical type is enough here.
    let column_desc_ptr = ColumnDescPtr::new(ColumnDescriptor::new(
        Arc::new(
            Type::primitive_type_builder("", T::get_physical_type())
                .build()
                .unwrap(),
        ),
        0,
        0,
        ColumnPath::new(vec![]),
    ));
    c.bench_function(&format!("decoding: {}", name), |b| {
        b.iter(|| {
            let mut decoder: Box<dyn Decoder<T>> =
                get_decoder(column_desc_ptr.clone(), encoding).unwrap();
            decoder.set_data(encoded.clone(), values.len()).unwrap();
            decoder.get(&mut buffer).unwrap();
        });
    });
}

fn criterion_benchmark(c: &mut Criterion) {
    let mut rng = StdRng::seed_from_u64(0);
    let n = 16 * 1024;

    let mut f32s = Vec::new();
    let mut f64s = Vec::new();
    for _ in 0..n {
        f32s.push(rng.gen::<f32>());
        f64s.push(rng.gen::<f64>());
    }

    bench_typed::<FloatType>(c, &f32s, Encoding::BYTE_STREAM_SPLIT);
    bench_typed::<DoubleType>(c, &f64s, Encoding::BYTE_STREAM_SPLIT);
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
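
Assuming a standard arrow-rs checkout, the benchmark above should be runnable with something like `cargo bench -p parquet --features experimental --bench encoding`, matching the `required-features` declared in Cargo.toml. It reports encode and decode throughput and prints the encoded size; since byte stream split is size-preserving, the 16 * 1024 random `f32` values (64 KiB) encode to exactly 64 KiB.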
42 changes: 40 additions & 2 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -737,7 +737,9 @@ mod tests {

     use arrow_array::builder::*;
     use arrow_array::cast::AsArray;
-    use arrow_array::types::{Decimal128Type, Decimal256Type, DecimalType, Float16Type};
+    use arrow_array::types::{
+        Decimal128Type, Decimal256Type, DecimalType, Float16Type, Float32Type, Float64Type,
+    };
     use arrow_array::*;
     use arrow_array::{RecordBatch, RecordBatchReader};
     use arrow_buffer::{i256, ArrowNativeType, Buffer};
@@ -755,7 +757,7 @@
     use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE;
     use crate::data_type::{
         BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType,
-        Int32Type, Int64Type, Int96Type,
+        FloatType, Int32Type, Int64Type, Int96Type,
     };
     use crate::errors::Result;
     use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion};
@@ -861,6 +863,13 @@
                 Encoding::DELTA_BINARY_PACKED,
             ],
         );
+        run_single_column_reader_tests::<FloatType, _, FloatType>(
+            2,
+            ConvertedType::NONE,
+            None,
+            |vals| Arc::new(Float32Array::from_iter(vals.iter().cloned())),
+            &[Encoding::PLAIN, Encoding::BYTE_STREAM_SPLIT],
+        );
     }
 
     #[test]
@@ -1390,6 +1399,35 @@
         assert!(col.value(2).is_nan());
     }
 
+    #[test]
+    fn test_read_float32_float64_byte_stream_split() {
+        let path = format!(
+            "{}/byte_stream_split.zstd.parquet",
+            arrow::util::test_util::parquet_test_data(),
+        );
+        let file = File::open(path).unwrap();
+        let record_reader = ParquetRecordBatchReader::try_new(file, 128).unwrap();
+
+        let mut row_count = 0;
+        for batch in record_reader {
+            let batch = batch.unwrap();
+            row_count += batch.num_rows();
+            let f32_col = batch.column(0).as_primitive::<Float32Type>();
+            let f64_col = batch.column(1).as_primitive::<Float64Type>();
+
+            // This file contains floats from a standard normal distribution
+            for &x in f32_col.values() {
+                assert!(x > -10.0);
+                assert!(x < 10.0);
+            }
+            for &x in f64_col.values() {
+                assert!(x > -10.0);
+                assert!(x < 10.0);
+            }
+        }
+        assert_eq!(row_count, 300);
+    }
+
     /// Parameters for single_column_reader_test
     #[derive(Clone)]
     struct TestOptions {
3 changes: 3 additions & 0 deletions parquet/src/arrow/arrow_writer/mod.rs
@@ -1579,6 +1579,9 @@
             | DataType::UInt32
             | DataType::UInt16
             | DataType::UInt8 => vec![Encoding::PLAIN, Encoding::DELTA_BINARY_PACKED],
+            DataType::Float32 | DataType::Float64 => {
+                vec![Encoding::PLAIN, Encoding::BYTE_STREAM_SPLIT]
+            }
             _ => vec![Encoding::PLAIN],
         };
 
