Skip to content

Soundness: reading parquet with invalid utf8 results in UB #786

@jorgecarleitao

Description

@jorgecarleitao
use std::sync::Arc;

use parquet::arrow::*;
use parquet::file::reader::SerializedFileReader;
use parquet::file::serialized_reader::SliceableCursor;

/// Decodes one column of an in-memory Parquet file and prints it.
///
/// `buffer` holds the complete bytes of the Parquet file and `column` is the
/// index of the column to project. The first record batch produced by the
/// Arrow reader is fetched and its first (and only projected) column is
/// written to stdout using `Debug` formatting.
///
/// # Panics
///
/// Panics if the file reader cannot be created, if the record reader cannot
/// be built, if the reader yields no batch, or if the first batch is an error
/// (every fallible step is unwrapped on purpose: this is a minimal reproducer).
fn read(buffer: Arc<Vec<u8>>, column: usize) {
    // Wrap the raw bytes so the Parquet reader can consume them like a file.
    let cursor = SliceableCursor::new(buffer);
    let parquet_reader = Arc::new(SerializedFileReader::new(cursor).unwrap());

    let mut arrow_reader = ParquetFileArrowReader::new(parquet_reader);

    // Project only the requested column; the second argument is the batch size.
    let projection = vec![column];
    let batch_size = 10;
    let mut batches = arrow_reader
        .get_record_reader_by_columns(projection, batch_size)
        .unwrap();

    // Only the first batch is needed to trigger the problem.
    let batch = batches.next().unwrap().unwrap();
    println!("{:?}", batch.column(0));
}

/// Entry point of the reproducer.
///
/// Builds the bytes of a small Parquet file in memory and passes them to
/// `read`, asking for column 0.
fn main() {
    // a parquet file with 1 column with invalid utf8
    //
    // The buffer starts and ends with `80, 65, 82, 49` (ASCII "PAR1").
    // The string value stored in the column is `104, 101, 255, 108, 111`,
    // i.e. "he", the byte 0xFF, then "lo"; 0xFF can never occur in valid UTF-8.
    let data = vec![
        80, 65, 82, 49, 21, 6, 21, 22, 21, 22, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21,
        0, 18, 28, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111,
        0, 0, 0, 3, 1, 5, 0, 0, 0, 104, 101, 255, 108, 111, 38, 110, 28, 21, 12, 25, 37,
        6, 0, 25, 24, 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102, 38, 8, 60, 54, 0, 40, 5,
        104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111, 0, 0, 0, 21, 4, 25, 44,
        72, 4, 114, 111, 111, 116, 21, 2, 0, 21, 12, 37, 2, 24, 2, 99, 49, 37, 0, 76, 28,
        0, 0, 0, 22, 2, 25, 28, 25, 28, 38, 110, 28, 21, 12, 25, 37, 6, 0, 25, 24, 2, 99,
        49, 21, 0, 22, 2, 22, 102, 22, 102, 38, 8, 60, 54, 0, 40, 5, 104, 101, 255, 108,
        111, 24, 5, 104, 101, 255, 108, 111, 0, 0, 0, 22, 102, 22, 2, 0, 40, 44, 65, 114,
        114, 111, 119, 50, 32, 45, 32, 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116,
        32, 105, 109, 112, 108, 101, 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111,
        102, 32, 65, 114, 114, 111, 119, 0, 130, 0, 0, 0, 80, 65, 82, 49,
    ];

    // Read and print the first (and only) column of the file.
    read(Arc::new(data), 0)
}

Miri output (undefined behavior is reported while `Debug`-formatting the resulting `StringArray`):

cargo miri run --example unsafe

StringArray
[
error: Undefined Behavior: type validation failed: encountered 0x001ecbc0, but expected a valid unicode scalar value (in `0..=0x10FFFF` but not in `0xD800..=0xDFFF`)
  --> /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/char/convert.rs:94:78
   |
94 |     if cfg!(debug_assertions) { char::from_u32(i).unwrap() } else { unsafe { transmute(i) } }
   |                                                                              ^^^^^^^^^^^^ type validation failed: encountered 0x001ecbc0, but expected a valid unicode scalar value (in `0..=0x10FFFF` but not in `0xD800..=0xDFFF`)
   |
   = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
   = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
           
   = note: inside `std::char::from_u32_unchecked` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/char/convert.rs:94:78
   = note: inside closure at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/str/iter.rs:43:22
   = note: inside `std::option::Option::<u32>::map::<char, [closure@<std::str::Chars as std::iter::Iterator>::next::{closure#0}]>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/option.rs:489:29
   = note: inside `<std::str::Chars as std::iter::Iterator>::next` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/str/iter.rs:41:9
   = note: inside `<std::str::CharIndices as std::iter::Iterator>::next` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/str/iter.rs:140:15
   = note: inside `<str as std::fmt::Debug>::fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/fmt/mod.rs:2079:23
   = note: inside `<&str as std::fmt::Debug>::fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/fmt/mod.rs:2033:62
   = note: inside closure at /home/azureuser/projects/arrow-rs/arrow/src/array/array_string.rs:277:13
   = note: inside `arrow::array::array::print_long_array::<arrow::array::array_string::GenericStringArray<i32>, [closure@<arrow::array::array_string::GenericStringArray<i32> as std::fmt::Debug>::fmt::{closure#0}]>` at /home/azureuser/projects/arrow-rs/arrow/src/array/array.rs:561:13
   = note: inside `<arrow::array::array_string::GenericStringArray<i32> as std::fmt::Debug>::fmt` at /home/azureuser/projects/arrow-rs/arrow/src/array/array_string.rs:276:9
   = note: inside `<std::sync::Arc<dyn arrow::array::array::Array> as std::fmt::Debug>::fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/alloc/src/sync.rs:2266:9
   = note: inside `<&std::sync::Arc<dyn arrow::array::array::Array> as std::fmt::Debug>::fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/fmt/mod.rs:2033:62
   = note: inside `std::fmt::write` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/fmt/mod.rs:1112:17
   = note: inside `<std::io::StdoutLock as std::io::Write>::write_fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/io/mod.rs:1640:15
   = note: inside `<&std::io::Stdout as std::io::Write>::write_fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/io/stdio.rs:657:9
   = note: inside `<std::io::Stdout as std::io::Write>::write_fmt` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/io/stdio.rs:631:9
   = note: inside `std::io::stdio::print_to::<std::io::Stdout>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/io/stdio.rs:934:21
   = note: inside `std::io::_print` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/io/stdio.rs:947:5
note: inside `read` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/macros.rs:97:9
  --> parquet/examples/unsafe.rs:18:5
   |
18 |     println!("{:?}", batch.column(0));
   |     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
note: inside `main` at parquet/examples/unsafe.rs:38:5
  --> parquet/examples/unsafe.rs:38:5
   |
38 |     read(Arc::new(data), 0)
   |     ^^^^^^^^^^^^^^^^^^^^^^^
   = note: inside `<fn() as std::ops::FnOnce<()>>::call_once - shim(fn())` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ops/function.rs:227:5
   = note: inside `std::sys_common::backtrace::__rust_begin_short_backtrace::<fn(), ()>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/sys_common/backtrace.rs:125:18
   = note: inside closure at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/rt.rs:63:18
   = note: inside `std::ops::function::impls::<impl std::ops::FnOnce<()> for &dyn std::ops::Fn() -> i32 + std::marker::Sync + std::panic::RefUnwindSafe>::call_once` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ops/function.rs:259:13
   = note: inside `std::panicking::r#try::do_call::<&dyn std::ops::Fn() -> i32 + std::marker::Sync + std::panic::RefUnwindSafe, i32>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:401:40
   = note: inside `std::panicking::r#try::<i32, &dyn std::ops::Fn() -> i32 + std::marker::Sync + std::panic::RefUnwindSafe>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:365:19
   = note: inside `std::panic::catch_unwind::<&dyn std::ops::Fn() -> i32 + std::marker::Sync + std::panic::RefUnwindSafe, i32>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panic.rs:434:14
   = note: inside closure at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/rt.rs:45:48
   = note: inside `std::panicking::r#try::do_call::<[closure@std::rt::lang_start_internal::{closure#2}], isize>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:401:40
   = note: inside `std::panicking::r#try::<isize, [closure@std::rt::lang_start_internal::{closure#2}]>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panicking.rs:365:19
   = note: inside `std::panic::catch_unwind::<[closure@std::rt::lang_start_internal::{closure#2}], isize>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/panic.rs:434:14
   = note: inside `std::rt::lang_start_internal` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/rt.rs:45:20
   = note: inside `std::rt::lang_start::<()>` at /home/azureuser/.rustup/toolchains/nightly-2021-07-03-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/std/src/rt.rs:62:5
   = note: this error originates in the macro `println` (in Nightly builds, run with -Z macro-backtrace for more info)

error: aborting due to previous error

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions