Skip to content

Commit 000353d

Browse files
authored
Add support for de/serializing list-encoded JSON structs [#6558] (#6643)
Currently, a StructArray can only be deserialized from or serialized to a JSON object (e.g. `{"a": 1, "b": "c"}`), but some services (e.g. Presto and Trino) encode ROW types as JSON lists (e.g. `[1, "c"]`) because this is more compact, and the schema is known. This PR adds the ability to encode and decode JSON lists from and to StructArrays, if StructMode is set to ListOnly. In ListOnly mode, object-encoded structs raise an error. Setting to ObjectOnly (the default) has the original parsing behavior. Some notes/questions/points for discussion: 1. I've made a StructMode enum instead of a bool flag for two reasons. One is that it's self-descriptive (what would `true` be?), and the other is that it allows a future Mixed mode that could deserialize either. The latter isn't currently requested by anyone. 2. I kept the error messages as similar to the old messages as possible. I considered having more specific error messages (like "Encountered a '[' when parsing a Struct, but the StructMode is ObjectOnly" or similar), but wanted to hear opinions before I went that route. 3. I'm not attached to any name/code-style/etc, so happy to modify to fit local conventions. Fixes #6558
1 parent f59b94f commit 000353d

File tree

7 files changed

+579
-46
lines changed

7 files changed

+579
-46
lines changed

arrow-json/src/lib.rs

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,35 @@ pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder};
7474
use half::f16;
7575
use serde_json::{Number, Value};
7676

77+
/// Specifies what is considered valid JSON when reading or writing
78+
/// RecordBatches or StructArrays.
79+
///
80+
/// This enum controls which form(s) the Reader will accept and which form the
81+
/// Writer will produce. For example, if the RecordBatch Schema is
82+
/// `[("a", Int32), ("r", Struct([("b", Boolean), ("c", Utf8)]))]`
83+
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
84+
/// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with [`StructMode::ListOnly`]
85+
/// it would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
86+
/// rows formatted similarly.
87+
///
88+
/// The list encoding is more compact if the schema is known, and is used by
89+
/// tools such as [Presto] and [Trino].
90+
///
91+
/// When reading objects, the order of the keys does not matter. When reading
92+
/// lists, the entries must be the same number and in the same order as the
93+
/// struct fields. Map columns are not affected by this option.
94+
///
95+
/// [Presto]: https://prestodb.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
96+
/// [Trino]: https://trino.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
97+
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
98+
pub enum StructMode {
99+
#[default]
100+
/// Encode/decode structs as objects (e.g., `{"a": 1, "b": "c"}`)
101+
ObjectOnly,
102+
/// Encode/decode structs as lists (e.g., `[1, "c"]`)
103+
ListOnly,
104+
}
105+
77106
/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
78107
pub trait JsonSerializable: 'static {
79108
/// Converts self into json value if it's possible
@@ -156,4 +185,72 @@ mod tests {
156185
);
157186
assert_eq!(None, f32::NAN.into_json_value());
158187
}
188+
189+
#[test]
190+
fn test_json_roundtrip_structs() {
191+
use crate::writer::LineDelimited;
192+
use arrow_schema::DataType;
193+
use arrow_schema::Field;
194+
use arrow_schema::Fields;
195+
use arrow_schema::Schema;
196+
use std::sync::Arc;
197+
198+
let schema = Arc::new(Schema::new(vec![
199+
Field::new(
200+
"c1",
201+
DataType::Struct(Fields::from(vec![
202+
Field::new("c11", DataType::Int32, true),
203+
Field::new(
204+
"c12",
205+
DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()),
206+
false,
207+
),
208+
])),
209+
false,
210+
),
211+
Field::new("c2", DataType::Utf8, false),
212+
]));
213+
214+
{
215+
let object_input = r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
216+
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
217+
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
218+
"#
219+
.as_bytes();
220+
let object_reader = ReaderBuilder::new(schema.clone())
221+
.with_struct_mode(StructMode::ObjectOnly)
222+
.build(object_input)
223+
.unwrap();
224+
225+
let mut object_output: Vec<u8> = Vec::new();
226+
let mut object_writer = WriterBuilder::new()
227+
.with_struct_mode(StructMode::ObjectOnly)
228+
.build::<_, LineDelimited>(&mut object_output);
229+
for batch_res in object_reader {
230+
object_writer.write(&batch_res.unwrap()).unwrap();
231+
}
232+
assert_eq!(object_input, &object_output);
233+
}
234+
235+
{
236+
let list_input = r#"[[1,["e"]],"a"]
237+
[[null,["f"]],"b"]
238+
[[5,["g"]],"c"]
239+
"#
240+
.as_bytes();
241+
let list_reader = ReaderBuilder::new(schema.clone())
242+
.with_struct_mode(StructMode::ListOnly)
243+
.build(list_input)
244+
.unwrap();
245+
246+
let mut list_output: Vec<u8> = Vec::new();
247+
let mut list_writer = WriterBuilder::new()
248+
.with_struct_mode(StructMode::ListOnly)
249+
.build::<_, LineDelimited>(&mut list_output);
250+
for batch_res in list_reader {
251+
list_writer.write(&batch_res.unwrap()).unwrap();
252+
}
253+
assert_eq!(list_input, &list_output);
254+
}
255+
}
159256
}

arrow-json/src/reader/list_array.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use crate::reader::tape::{Tape, TapeElement};
1919
use crate::reader::{make_decoder, ArrayDecoder};
20+
use crate::StructMode;
2021
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
2122
use arrow_array::OffsetSizeTrait;
2223
use arrow_buffer::buffer::NullBuffer;
@@ -37,6 +38,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
3738
coerce_primitive: bool,
3839
strict_mode: bool,
3940
is_nullable: bool,
41+
struct_mode: StructMode,
4042
) -> Result<Self, ArrowError> {
4143
let field = match &data_type {
4244
DataType::List(f) if !O::IS_LARGE => f,
@@ -48,6 +50,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
4850
coerce_primitive,
4951
strict_mode,
5052
field.is_nullable(),
53+
struct_mode,
5154
)?;
5255

5356
Ok(Self {

arrow-json/src/reader/map_array.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use crate::reader::tape::{Tape, TapeElement};
1919
use crate::reader::{make_decoder, ArrayDecoder};
20+
use crate::StructMode;
2021
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
2122
use arrow_buffer::buffer::NullBuffer;
2223
use arrow_buffer::ArrowNativeType;
@@ -36,6 +37,7 @@ impl MapArrayDecoder {
3637
coerce_primitive: bool,
3738
strict_mode: bool,
3839
is_nullable: bool,
40+
struct_mode: StructMode,
3941
) -> Result<Self, ArrowError> {
4042
let fields = match &data_type {
4143
DataType::Map(_, true) => {
@@ -59,12 +61,14 @@ impl MapArrayDecoder {
5961
coerce_primitive,
6062
strict_mode,
6163
fields[0].is_nullable(),
64+
struct_mode,
6265
)?;
6366
let values = make_decoder(
6467
fields[1].data_type().clone(),
6568
coerce_primitive,
6669
strict_mode,
6770
fields[1].is_nullable(),
71+
struct_mode,
6872
)?;
6973

7074
Ok(Self {

0 commit comments

Comments
 (0)