Commit 8fc7592
Add support for deserializing list-encoded JSON structs [#6558]
Currently, a StructArray can only be deserialized from a JSON object (e.g. `{a: 1, b: "c"}`), but some services (e.g. Presto and Trino) encode ROW types as JSON lists (e.g. `[1, "c"]`) because this is more compact and the schema is known. This PR adds the ability to parse JSON lists into StructArrays when the StructParseMode is set to ListOnly. In ListOnly mode, object-encoded structs raise an error; setting it to ObjectOnly (the default) preserves the original parsing behavior.

Some notes/questions/points for discussion:

1. I've made a StructParseMode enum instead of a bool flag for two reasons. One is that it's self-descriptive (what would `true` mean?), and the other is that it allows a future Mixed mode that could deserialize either form. The latter isn't currently requested by anyone.
2. I kept the error messages as similar to the old messages as possible. I considered more specific messages (like "Encountered a '[' when parsing a Struct, but the StructParseMode is ObjectOnly" or similar), but wanted to hear opinions before I went that route.
3. I'm not attached to any name/code style/etc., so I'm happy to modify to fit local conventions.

Fixes #6558
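For context, a minimal usage sketch of the new option (not part of this commit; the import paths and field nullability below are assumptions, the API calls are the ones added in this diff):

use std::io::Cursor;
use std::sync::Arc;

use arrow_json::reader::{ReaderBuilder, StructParseMode};
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    // Schema matching the example above: ("a", Int32) and ("b", Utf8).
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Utf8, true),
    ]));

    // A list-encoded row, as Presto/Trino emit for ROW types.
    let json = r#"[1, "c"]"#;

    let mut reader = ReaderBuilder::new(schema)
        .with_struct_parse_mode(StructParseMode::ListOnly) // default is ObjectOnly
        .build(Cursor::new(json))?;

    // In the default ObjectOnly mode this input would instead fail with
    // "Json error: expected { got ...".
    let batch = reader.next().unwrap()?;
    assert_eq!(batch.num_rows(), 1);
    Ok(())
}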
1 parent 9f889aa commit 8fc7592

File tree

4 files changed (+322, -33 lines)

arrow-json/src/reader/list_array.rs

Lines changed: 3 additions & 1 deletion
@@ -16,7 +16,7 @@
 // under the License.

 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder, StructParseMode};
 use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
 use arrow_array::OffsetSizeTrait;
 use arrow_buffer::buffer::NullBuffer;
@@ -37,6 +37,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
         coerce_primitive: bool,
         strict_mode: bool,
         is_nullable: bool,
+        struct_parse_mode: StructParseMode,
     ) -> Result<Self, ArrowError> {
         let field = match &data_type {
             DataType::List(f) if !O::IS_LARGE => f,
@@ -48,6 +49,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
             coerce_primitive,
             strict_mode,
             field.is_nullable(),
+            struct_parse_mode,
         )?;

         Ok(Self {

arrow-json/src/reader/map_array.rs

Lines changed: 4 additions & 1 deletion
@@ -16,7 +16,7 @@
 // under the License.

 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder, StructParseMode};
 use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
 use arrow_buffer::buffer::NullBuffer;
 use arrow_buffer::ArrowNativeType;
@@ -36,6 +36,7 @@ impl MapArrayDecoder {
         coerce_primitive: bool,
         strict_mode: bool,
         is_nullable: bool,
+        struct_parse_mode: StructParseMode,
     ) -> Result<Self, ArrowError> {
         let fields = match &data_type {
             DataType::Map(_, true) => {
@@ -59,12 +60,14 @@ impl MapArrayDecoder {
             coerce_primitive,
             strict_mode,
             fields[0].is_nullable(),
+            struct_parse_mode,
         )?;
         let values = make_decoder(
             fields[1].data_type().clone(),
             coerce_primitive,
             strict_mode,
             fields[1].is_nullable(),
+            struct_parse_mode,
         )?;

         Ok(Self {

arrow-json/src/reader/mod.rs

Lines changed: 258 additions & 6 deletions
@@ -170,12 +170,30 @@ mod struct_array;
 mod tape;
 mod timestamp_array;

+/// Specifies what is considered valid JSON when parsing StructArrays.
+///
+/// A struct with fields `("a", Int32)` and `("b", Utf8)` could be represented as
+/// a JSON object (`{"a": 1, "b": "c"}`) or a JSON list (`[1, "c"]`). This enum controls
+/// which form(s) the Reader will accept.
+///
+/// For objects, the order of the keys does not matter; extra keys are handled according to `strict_mode`.
+/// For lists, the entries must match the struct fields in both number and order.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+pub enum StructParseMode {
+    /// Only parse objects (e.g., `{"a": 1, "b": "c"}`)
+    #[default]
+    ObjectOnly,
+    /// Only parse lists (e.g., `[1, "c"]`)
+    ListOnly,
+}
+
 /// A builder for [`Reader`] and [`Decoder`]
 pub struct ReaderBuilder {
     batch_size: usize,
     coerce_primitive: bool,
     strict_mode: bool,
     is_field: bool,
+    struct_parse_mode: StructParseMode,

     schema: SchemaRef,
 }
@@ -195,6 +213,7 @@ impl ReaderBuilder {
             coerce_primitive: false,
             strict_mode: false,
             is_field: false,
+            struct_parse_mode: StructParseMode::ObjectOnly,
             schema,
         }
     }
@@ -235,6 +254,7 @@ impl ReaderBuilder {
             coerce_primitive: false,
             strict_mode: false,
             is_field: true,
+            struct_parse_mode: StructParseMode::ObjectOnly,
             schema: Arc::new(Schema::new([field.into()])),
         }
     }
@@ -269,6 +289,15 @@ impl ReaderBuilder {
         }
     }

+    /// Set the [`StructParseMode`] for the reader, which determines whether
+    /// structs can be represented by JSON objects, lists, or either.
+    pub fn with_struct_parse_mode(self, struct_parse_mode: StructParseMode) -> Self {
+        Self {
+            struct_parse_mode,
+            ..self
+        }
+    }
+
     /// Create a [`Reader`] with the provided [`BufRead`]
     pub fn build<R: BufRead>(self, reader: R) -> Result<Reader<R>, ArrowError> {
         Ok(Reader {
@@ -287,7 +316,13 @@ impl ReaderBuilder {
             }
         };

-        let decoder = make_decoder(data_type, self.coerce_primitive, self.strict_mode, nullable)?;
+        let decoder = make_decoder(
+            data_type,
+            self.coerce_primitive,
+            self.strict_mode,
+            nullable,
+            self.struct_parse_mode,
+        )?;

         let num_fields = self.schema.flattened_fields().len();

@@ -650,6 +685,7 @@ fn make_decoder(
     coerce_primitive: bool,
     strict_mode: bool,
     is_nullable: bool,
+    struct_parse_mode: StructParseMode,
 ) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
     downcast_integer! {
         data_type => (primitive_decoder, data_type),
@@ -696,13 +732,13 @@ fn make_decoder(
         DataType::Boolean => Ok(Box::<BooleanArrayDecoder>::default()),
         DataType::Utf8 => Ok(Box::new(StringArrayDecoder::<i32>::new(coerce_primitive))),
         DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::<i64>::new(coerce_primitive))),
-        DataType::List(_) => Ok(Box::new(ListArrayDecoder::<i32>::new(data_type, coerce_primitive, strict_mode, is_nullable)?)),
-        DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::<i64>::new(data_type, coerce_primitive, strict_mode, is_nullable)?)),
-        DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, coerce_primitive, strict_mode, is_nullable)?)),
+        DataType::List(_) => Ok(Box::new(ListArrayDecoder::<i32>::new(data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode)?)),
+        DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::<i64>::new(data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode)?)),
+        DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode)?)),
         DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => {
             Err(ArrowError::JsonError(format!("{data_type} is not supported by JSON")))
         }
-        DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, coerce_primitive, strict_mode, is_nullable)?)),
+        DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode)?)),
         d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in JSON reader")))
     }
 }
@@ -718,7 +754,7 @@ mod tests {
     use arrow_buffer::{ArrowNativeType, Buffer};
     use arrow_cast::display::{ArrayFormatter, FormatOptions};
     use arrow_data::ArrayDataBuilder;
-    use arrow_schema::Field;
+    use arrow_schema::{Field, Fields};

     use super::*;

@@ -2316,4 +2352,220 @@ mod tests {
             .unwrap()
         );
     }
+
+    #[test]
+    fn test_struct_decoding_list_length() {
+        use arrow_array::array;
+
+        let row = "[1, 2]";
+
+        let mut fields = vec![Field::new("a", DataType::Int32, true)];
+        let too_few_fields = Fields::from(fields.clone());
+        fields.push(Field::new("b", DataType::Int32, true));
+        let correct_fields = Fields::from(fields.clone());
+        fields.push(Field::new("c", DataType::Int32, true));
+        let too_many_fields = Fields::from(fields.clone());
+
+        let parse = |fields: Fields, as_field: bool| {
+            let builder = if as_field {
+                ReaderBuilder::new_with_field(Field::new("r", DataType::Struct(fields), true))
+            } else {
+                ReaderBuilder::new(Arc::new(Schema::new(fields)))
+            };
+            builder
+                .with_struct_parse_mode(StructParseMode::ListOnly)
+                .build(Cursor::new(row.as_bytes()))
+                .unwrap()
+                .next()
+                .unwrap()
+        };
+
+        let expected_row = StructArray::new(
+            correct_fields.clone(),
+            vec![
+                Arc::new(array::Int32Array::from(vec![1])),
+                Arc::new(array::Int32Array::from(vec![2])),
+            ],
+            None,
+        );
+        let row_field = Field::new("r", DataType::Struct(correct_fields.clone()), true);
+
+        assert_eq!(
+            parse(too_few_fields.clone(), true).unwrap_err().to_string(),
+            "Json error: found extra columns for 1 fields".to_string()
+        );
+        assert_eq!(
+            parse(too_few_fields, false).unwrap_err().to_string(),
+            "Json error: found extra columns for 1 fields".to_string()
+        );
+        assert_eq!(
+            parse(correct_fields.clone(), true).unwrap(),
+            RecordBatch::try_new(
+                Arc::new(Schema::new(vec![row_field])),
+                vec![Arc::new(expected_row.clone())]
+            )
+            .unwrap()
+        );
+        assert_eq!(
+            parse(correct_fields, false).unwrap(),
+            RecordBatch::from(expected_row)
+        );
+        assert_eq!(
+            parse(too_many_fields.clone(), true)
+                .unwrap_err()
+                .to_string(),
+            "Json error: found 2 columns for 3 fields".to_string()
+        );
+        assert_eq!(
+            parse(too_many_fields, false).unwrap_err().to_string(),
+            "Json error: found 2 columns for 3 fields".to_string()
+        );
+    }
+
+    #[test]
+    fn test_struct_decoding() {
+        use arrow_array::builder;
+
+        let nested_object_json = r#"{"a": {"b": [1, 2], "c": {"d": 3}}}"#;
+        let nested_list_json = r#"[[[1, 2], {"d": 3}]]"#;
+        let nested_mixed_json = r#"{"a": [[1, 2], {"d": 3}]}"#;
+
+        let struct_fields = Fields::from(vec![
+            Field::new("b", DataType::new_list(DataType::Int32, true), true),
+            Field::new_map(
+                "c",
+                "entries",
+                Field::new("keys", DataType::Utf8, false),
+                Field::new("values", DataType::Int32, true),
+                false,
+                false,
+            ),
+        ]);
+
+        let list_array =
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![Some(1), Some(2)])]);
+
+        let map_array = {
+            let mut map_builder = builder::MapBuilder::new(
+                None,
+                builder::StringBuilder::new(),
+                builder::Int32Builder::new(),
+            );
+            map_builder.keys().append_value("d");
+            map_builder.values().append_value(3);
+            map_builder.append(true).unwrap();
+            map_builder.finish()
+        };
+
+        let struct_array = StructArray::new(
+            struct_fields.clone(),
+            vec![Arc::new(list_array), Arc::new(map_array)],
+            None,
+        );
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "a",
+            DataType::Struct(struct_fields),
+            true,
+        )]));
+        let expected = RecordBatch::try_new(schema.clone(), vec![Arc::new(struct_array)]).unwrap();
+
+        let parse = |s: &str, mode: StructParseMode| {
+            ReaderBuilder::new(schema.clone())
+                .with_struct_parse_mode(mode)
+                .build(Cursor::new(s.as_bytes()))
+                .unwrap()
+                .next()
+                .unwrap()
+        };
+
+        assert_eq!(
+            parse(nested_object_json, StructParseMode::ObjectOnly).unwrap(),
+            expected
+        );
+        assert_eq!(
+            parse(nested_list_json, StructParseMode::ObjectOnly)
+                .unwrap_err()
+                .to_string(),
+            "Json error: expected { got [[[1, 2], {\"d\": 3}]]".to_owned()
+        );
+        assert_eq!(
+            parse(nested_mixed_json, StructParseMode::ObjectOnly)
+                .unwrap_err()
+                .to_string(),
+            "Json error: whilst decoding field 'a': expected { got [[1, 2], {\"d\": 3}]".to_owned()
+        );
+
+        assert_eq!(
+            parse(nested_list_json, StructParseMode::ListOnly).unwrap(),
+            expected
+        );
+        assert_eq!(
+            parse(nested_object_json, StructParseMode::ListOnly)
+                .unwrap_err()
+                .to_string(),
+            "Json error: expected [ got {\"a\": {\"b\": [1, 2]\"c\": {\"d\": 3}}}".to_owned()
+        );
+        assert_eq!(
+            parse(nested_mixed_json, StructParseMode::ListOnly)
+                .unwrap_err()
+                .to_string(),
+            "Json error: expected [ got {\"a\": [[1, 2], {\"d\": 3}]}".to_owned()
+        );
+    }
+
+    // Test cases:
+    // [] -> RecordBatch row with no entries. Schema = [('a', Int32)] -> Error
+    // [] -> RecordBatch row with no entries. Schema = [('r', [('a', Int32)])] -> Error
+    // [] -> StructArray row with no entries. Fields [('a', Int32)] -> Error
+    // [[]] -> RecordBatch row with empty struct entry. Schema = [('r', [('a', Int32)])] -> Error
+    #[test]
+    fn test_struct_decoding_empty_list() {
+        let int_field = Field::new("a", DataType::Int32, true);
+        let struct_field = Field::new(
+            "r",
+            DataType::Struct(Fields::from(vec![int_field.clone()])),
+            true,
+        );
+
+        let parse = |json: &str, as_field: bool, field: Field| {
+            let builder = if as_field {
+                ReaderBuilder::new_with_field(field.clone())
+            } else {
+                ReaderBuilder::new(Arc::new(Schema::new(vec![field].clone())))
+            };
+            builder
+                .with_struct_parse_mode(StructParseMode::ListOnly)
+                .build(Cursor::new(json.as_bytes()))
+                .unwrap()
+                .next()
+                .unwrap()
+        };
+
+        assert_eq!(
+            parse("[]", true, struct_field.clone())
+                .unwrap_err()
+                .to_string(),
+            "Json error: found 0 columns for 1 fields".to_owned()
+        );
+        assert_eq!(
+            parse("[]", false, int_field.clone())
+                .unwrap_err()
+                .to_string(),
+            "Json error: found 0 columns for 1 fields".to_owned()
+        );
+        assert_eq!(
+            parse("[]", false, struct_field.clone())
+                .unwrap_err()
+                .to_string(),
+            "Json error: found 0 columns for 1 fields".to_owned()
+        );
+
+        assert_eq!(
+            parse("[[]]", false, struct_field.clone())
+                .unwrap_err()
+                .to_string(),
+            "Json error: whilst decoding field 'r': found 0 columns for 1 fields".to_owned()
+        );
+    }
 }
