Skip to content

Commit 28c3cb9

Browse files
mkarbo, alamb, scovich
authored
Initial API for reading Variant data and metadata (#7535)
# Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> Closes #7423 # Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> We need to agree on an API for reading Variant metadata. Based on the work and discussions in #7452, in this PR we propose an API plus an implementation (WIP while draft) for reading variant metadata in the parquet-variant crate. A lot of the work is based on the work in #7452 by @PinkCrow007 and feedback from @alamb, @scovich, and @Weijun-H. # What changes are included in this PR? - Adds Variant enum (and associated structs) - Adds an API for parsing and reading metadata - Adds an API for parsing and reading Variant values of various types We attempt to be result- and validation-driven while ensuring zero allocations, and we do so avoiding `serde_json`. We tried to keep the Variant API similar to the `Json::Value` API. <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> # Are there any user-facing changes? The new APIs added in parquet-variant will be user-facing. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> Co-authored-by: Ryan Johnson <scovich@users.noreply.github.com>
1 parent d17dce0 commit 28c3cb9

File tree

6 files changed

+1051
-0
lines changed

6 files changed

+1051
-0
lines changed

parquet-variant/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ edition = { workspace = true }
3131
rust-version = { workspace = true }
3232

3333
[dependencies]
34+
arrow-schema = "55.1.0"
3435

3536
[lib]
3637

parquet-variant/src/decoder.rs

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
use arrow_schema::ArrowError;
18+
use std::array::TryFromSliceError;
19+
20+
use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice};
21+
22+
/// The basic type of a Variant value, as encoded in the two least-significant bits of the
/// value-header byte.
///
/// The discriminants are the 2-bit codes that `get_basic_type` matches on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantBasicType {
    /// Basic-type code 0.
    Primitive = 0,
    /// Basic-type code 1.
    ShortString = 1,
    /// Basic-type code 2.
    Object = 2,
    /// Basic-type code 3.
    Array = 3,
}
29+
30+
/// The primitive type id of a Variant value whose basic type is `Primitive`.
///
/// The discriminants are the ids accepted by the `TryFrom<u8>` implementation for this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantPrimitiveType {
    /// Primitive type id 0.
    Null = 0,
    /// Primitive type id 1.
    BooleanTrue = 1,
    /// Primitive type id 2.
    BooleanFalse = 2,
    /// Primitive type id 3.
    Int8 = 3,
    // TODO: Add types for the rest of primitives, once API is agreed upon
    /// Primitive type id 16.
    String = 16,
}
39+
40+
/// Extracts the basic type from a header byte
41+
pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> {
42+
// See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
43+
let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits
44+
let basic_type = match basic_type {
45+
0 => VariantBasicType::Primitive,
46+
1 => VariantBasicType::ShortString,
47+
2 => VariantBasicType::Object,
48+
3 => VariantBasicType::Array,
49+
_ => {
50+
//NOTE: A 2-bit value has a max of 4 different values (0-3), hence this is unreachable as we
51+
// masked `basic_type` with 0x03 above.
52+
unreachable!();
53+
}
54+
};
55+
Ok(basic_type)
56+
}
57+
58+
impl TryFrom<u8> for VariantPrimitiveType {
59+
type Error = ArrowError;
60+
61+
fn try_from(value: u8) -> Result<Self, Self::Error> {
62+
match value {
63+
0 => Ok(VariantPrimitiveType::Null),
64+
1 => Ok(VariantPrimitiveType::BooleanTrue),
65+
2 => Ok(VariantPrimitiveType::BooleanFalse),
66+
3 => Ok(VariantPrimitiveType::Int8),
67+
// TODO: Add types for the rest, once API is agreed upon
68+
16 => Ok(VariantPrimitiveType::String),
69+
_ => Err(ArrowError::InvalidArgumentError(format!(
70+
"unknown primitive type: {}",
71+
value
72+
))),
73+
}
74+
}
75+
}
76+
/// Extract the primitive type from a Variant value-header byte
77+
pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> {
78+
// last 6 bits contain the primitive-type, see spec
79+
VariantPrimitiveType::try_from(header >> 2)
80+
}
81+
82+
/// To be used in `map_err` when unpacking an integer from a slice of bytes.
83+
fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
84+
ArrowError::InvalidArgumentError(e.to_string())
85+
}
86+
87+
/// Decodes an Int8 from the value section of a variant.
88+
pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> {
89+
let value = i8::from_le_bytes(array_from_slice(value, 1)?);
90+
Ok(value)
91+
}
92+
93+
/// Decodes a long string from the value section of a variant.
94+
pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> {
95+
let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize;
96+
let string = string_from_slice(value, 5..5 + len)?;
97+
Ok(string)
98+
}
99+
100+
/// Decodes a short string from the value section of a variant.
101+
pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> {
102+
let len = (first_byte_from_slice(value)? >> 2) as usize;
103+
104+
let string = string_from_slice(value, 1..1 + len)?;
105+
Ok(string)
106+
}
107+
108+
#[cfg(test)]
mod tests {
    use super::*;

    /// An Int8 value is one header byte followed by the integer byte.
    #[test]
    fn test_i8() -> Result<(), ArrowError> {
        // Basic type 0 (primitive) in the low two bits, primitive type 3 (Int8) above them.
        let header: u8 = 3 << 2;
        let value = [header, 42];
        assert_eq!(decode_int8(&value)?, 42);
        Ok(())
    }

    /// A short string carries its length in the header; bytes past that length are ignored.
    #[test]
    fn test_short_string() -> Result<(), ArrowError> {
        // Basic type 1 (short string) in the low two bits, length 5 above them.
        let header: u8 = (5 << 2) | 1;
        // The final `o` lies beyond the declared length and must not be decoded.
        let value = [header, b'H', b'e', b'l', b'l', b'o', b'o'];
        assert_eq!(decode_short_string(&value)?, "Hello");
        Ok(())
    }

    /// A long string carries a 4-byte little-endian length after the header.
    #[test]
    fn test_string() -> Result<(), ArrowError> {
        // Basic type 0 (primitive) in the low two bits, primitive type 16 (String) above them.
        let header: u8 = 16 << 2;
        let value = [
            header, //
            5, 0, 0, 0, // Length of string
            b'H', b'e', b'l', b'l', b'o', //
            b'o', // Beyond the declared length; must not be decoded.
        ];
        assert_eq!(decode_long_string(&value)?, "Hello");
        Ok(())
    }
}

parquet-variant/src/lib.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,16 @@
2626
//! If you are interested in helping, you can find more information on the GitHub [Variant issue]
2727
//!
2828
//! [Variant issue]: https://github.com/apache/arrow-rs/issues/6736
29+
30+
// TODO: dead code removal
31+
#[allow(dead_code)]
32+
mod decoder;
33+
// TODO: dead code removal
34+
#[allow(dead_code)]
35+
mod variant;
36+
// TODO: dead code removal
37+
#[allow(dead_code)]
38+
mod utils;
39+
40+
#[cfg(test)]
41+
mod test_variant;
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! End-to-end check: (almost) every sample from apache/parquet-testing/variant
19+
//! can be parsed into our `Variant`.
20+
21+
// NOTE: We keep this file separate rather than a test mod inside variant.rs because it should be
22+
// moved to the test folder later
23+
use std::fs;
24+
use std::path::{Path, PathBuf};
25+
26+
use crate::variant::{Variant, VariantMetadata};
27+
use arrow_schema::ArrowError;
28+
29+
fn cases_dir() -> PathBuf {
30+
Path::new(env!("CARGO_MANIFEST_DIR"))
31+
.join("..")
32+
.join("parquet-testing")
33+
.join("variant")
34+
}
35+
36+
/// Reads the `<name>.metadata` and `<name>.value` files of a sample case from `cases_dir()`.
///
/// Returns `(metadata bytes, value bytes)`; a failure to read either file is propagated as
/// an `ArrowError`.
fn load_case(name: &str) -> Result<(Vec<u8>, Vec<u8>), ArrowError> {
    let dir = cases_dir();
    let metadata_path = dir.join(format!("{name}.metadata"));
    let value_path = dir.join(format!("{name}.value"));
    // Metadata is read first so a missing metadata file is the error that gets reported.
    let metadata_bytes = fs::read(metadata_path)?;
    let value_bytes = fs::read(value_path)?;
    Ok((metadata_bytes, value_bytes))
}
42+
43+
fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> {
44+
vec![
45+
("primitive_boolean_false", Variant::BooleanFalse),
46+
("primitive_boolean_true", Variant::BooleanTrue),
47+
("primitive_int8", Variant::Int8(42)),
48+
// Using the From<String> trait
49+
("primitive_string", Variant::from("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")),
50+
// Using the From<String> trait
51+
("short_string", Variant::from("Less than 64 bytes (❤\u{fe0f} with utf8)")),
52+
// TODO Reenable when https://github.com/apache/parquet-testing/issues/81 is fixed
53+
// ("primitive_null", Variant::Null),
54+
]
55+
}
56+
57+
/// Names of the non-primitive (object and array) sample cases.
fn get_non_primitive_cases() -> Vec<&'static str> {
    ["object_primitive", "array_primitive"].to_vec()
}
60+
61+
/// Every primitive sample must decode to exactly the expected `Variant`.
#[test]
fn variant_primitive() -> Result<(), ArrowError> {
    for (name, expected) in get_primitive_cases() {
        let (metadata_bytes, value_bytes) = load_case(name)?;
        let metadata = VariantMetadata::try_new(&metadata_bytes)?;
        let actual = Variant::try_new(&metadata, &value_bytes)?;
        assert_eq!(actual, expected);
    }
    Ok(())
}
72+
73+
/// Object and array samples must parse, and a few of their contents are spot-checked.
#[test]
fn variant_non_primitive() -> Result<(), ArrowError> {
    for name in get_non_primitive_cases() {
        let (metadata_bytes, value_bytes) = load_case(name)?;
        let metadata = VariantMetadata::try_new(&metadata_bytes)?;
        let variant = Variant::try_new(&metadata, &value_bytes)?;

        match (name, variant) {
            ("object_primitive", parsed) => {
                assert!(matches!(parsed, Variant::Object(_)));
                assert_eq!(metadata.dictionary_size(), 7);
                let first_field = metadata.get_field_by(0)?;
                assert_eq!(first_field, "int_field");
            }
            ("array_primitive", Variant::Array(elements)) => {
                let first = elements.get(0)?;
                assert!(matches!(first, Variant::Int8(2)));
                let second = elements.get(1)?;
                assert!(matches!(second, Variant::Int8(1)));
            }
            ("array_primitive", _) => panic!("expected an array"),
            // Only the two names returned by `get_non_primitive_cases` are iterated.
            _ => unreachable!(),
        }
    }
    Ok(())
}

parquet-variant/src/utils.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
use std::{array::TryFromSliceError, ops::Range, str};
18+
19+
use arrow_schema::ArrowError;
20+
21+
use std::fmt::Debug;
22+
use std::slice::SliceIndex;
23+
24+
#[inline]
25+
26+
pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
27+
bytes: &[u8],
28+
index: I,
29+
) -> Result<&I::Output, ArrowError> {
30+
bytes.get(index.clone()).ok_or_else(|| {
31+
ArrowError::InvalidArgumentError(format!(
32+
"Tried to extract byte(s) {index:?} from {}-byte buffer",
33+
bytes.len(),
34+
))
35+
})
36+
}
37+
pub(crate) fn array_from_slice<const N: usize>(
38+
bytes: &[u8],
39+
offset: usize,
40+
) -> Result<[u8; N], ArrowError> {
41+
let bytes = slice_from_slice(bytes, offset..offset + N)?;
42+
bytes.try_into().map_err(map_try_from_slice_error)
43+
}
44+
45+
/// To be used in `map_err` when unpacking an integer from a slice of bytes.
46+
pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
47+
ArrowError::InvalidArgumentError(e.to_string())
48+
}
49+
50+
pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<&u8, ArrowError> {
51+
slice
52+
.get(0)
53+
.ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string()))
54+
}
55+
56+
/// Helper to get a &str from a slice based on range, if it's valid or an error otherwise
57+
pub(crate) fn string_from_slice(slice: &[u8], range: Range<usize>) -> Result<&str, ArrowError> {
58+
str::from_utf8(slice_from_slice(slice, range)?)
59+
.map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()))
60+
}

0 commit comments

Comments
 (0)