@@ -170,12 +170,30 @@ mod struct_array;
170170mod tape;
171171mod timestamp_array;
172172
/// Specifies what is considered valid JSON when parsing StructArrays.
///
/// A struct with fields `("a", Int32)` and `("b", Utf8)` could be represented as
/// a JSON object (`{"a": 1, "b": "c"}`) or a JSON list (`[1, "c"]`). This enum controls
/// which of the two forms the Reader will accept.
///
/// For objects, the order of the keys does not matter. This enum only selects between
/// the object and list forms; it has no variant that governs extra keys.
/// For lists, the entries must be the same number and in the same order as the struct fields.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum StructParseMode {
    /// Only parse objects (e.g., `{"a": 1, "b": "c"}`). This is the default mode.
    #[default]
    ObjectOnly,
    /// Only parse lists (e.g., `[1, "c"]`)
    ListOnly,
}
189+
173190/// A builder for [`Reader`] and [`Decoder`]
174191pub struct ReaderBuilder {
// NOTE(review): presumably the maximum number of rows per decoded batch — not shown in this view; confirm.
175192 batch_size : usize ,
// Starts as `false` in both visible constructor bodies; passed unchanged to `make_decoder`.
176193 coerce_primitive : bool ,
// Starts as `false` in both visible constructor bodies; passed unchanged to `make_decoder`.
177194 strict_mode : bool ,
// `false` when the builder is given a whole schema, `true` when it is built from a
// single field that gets wrapped in a one-field schema.
178195 is_field : bool ,
// Which JSON shape is accepted for structs. Both visible constructor bodies start at
// `StructParseMode::ObjectOnly`; the value is passed to `make_decoder`.
196+ struct_parse_mode : StructParseMode ,
179197
// Schema the reader/decoder is built for.
180198 schema : SchemaRef ,
181199}
@@ -195,6 +213,7 @@ impl ReaderBuilder {
195213 coerce_primitive : false ,
196214 strict_mode : false ,
197215 is_field : false ,
216+ struct_parse_mode : StructParseMode :: ObjectOnly ,
198217 schema,
199218 }
200219 }
@@ -235,6 +254,7 @@ impl ReaderBuilder {
235254 coerce_primitive : false ,
236255 strict_mode : false ,
237256 is_field : true ,
257+ struct_parse_mode : StructParseMode :: ObjectOnly ,
238258 schema : Arc :: new ( Schema :: new ( [ field. into ( ) ] ) ) ,
239259 }
240260 }
@@ -269,6 +289,15 @@ impl ReaderBuilder {
269289 }
270290 }
271291
292+ /// Set the [`StructParseMode`] for the reader, which determines whether
293+ /// structs are represented by JSON objects or by JSON lists.
294+ pub fn with_struct_parse_mode ( self , struct_parse_mode : StructParseMode ) -> Self {
// Consume the builder and hand it back with only `struct_parse_mode` replaced;
// `..self` carries every other field over unchanged.
295+ Self {
296+ struct_parse_mode,
297+ ..self
298+ }
299+ }
300+
272301 /// Create a [`Reader`] with the provided [`BufRead`]
273302 pub fn build < R : BufRead > ( self , reader : R ) -> Result < Reader < R > , ArrowError > {
274303 Ok ( Reader {
@@ -287,7 +316,13 @@ impl ReaderBuilder {
287316 }
288317 } ;
289318
290- let decoder = make_decoder ( data_type, self . coerce_primitive , self . strict_mode , nullable) ?;
319+ let decoder = make_decoder (
320+ data_type,
321+ self . coerce_primitive ,
322+ self . strict_mode ,
323+ nullable,
324+ self . struct_parse_mode ,
325+ ) ?;
291326
292327 let num_fields = self . schema . flattened_fields ( ) . len ( ) ;
293328
@@ -650,6 +685,7 @@ fn make_decoder(
650685 coerce_primitive : bool ,
651686 strict_mode : bool ,
652687 is_nullable : bool ,
688+ struct_parse_mode : StructParseMode ,
653689) -> Result < Box < dyn ArrayDecoder > , ArrowError > {
654690 downcast_integer ! {
655691 data_type => ( primitive_decoder, data_type) ,
@@ -696,13 +732,13 @@ fn make_decoder(
696732 DataType :: Boolean => Ok ( Box :: <BooleanArrayDecoder >:: default ( ) ) ,
697733 DataType :: Utf8 => Ok ( Box :: new( StringArrayDecoder :: <i32 >:: new( coerce_primitive) ) ) ,
698734 DataType :: LargeUtf8 => Ok ( Box :: new( StringArrayDecoder :: <i64 >:: new( coerce_primitive) ) ) ,
699- DataType :: List ( _) => Ok ( Box :: new( ListArrayDecoder :: <i32 >:: new( data_type, coerce_primitive, strict_mode, is_nullable) ?) ) ,
700- DataType :: LargeList ( _) => Ok ( Box :: new( ListArrayDecoder :: <i64 >:: new( data_type, coerce_primitive, strict_mode, is_nullable) ?) ) ,
701- DataType :: Struct ( _) => Ok ( Box :: new( StructArrayDecoder :: new( data_type, coerce_primitive, strict_mode, is_nullable) ?) ) ,
735+ DataType :: List ( _) => Ok ( Box :: new( ListArrayDecoder :: <i32 >:: new( data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode ) ?) ) ,
736+ DataType :: LargeList ( _) => Ok ( Box :: new( ListArrayDecoder :: <i64 >:: new( data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode ) ?) ) ,
737+ DataType :: Struct ( _) => Ok ( Box :: new( StructArrayDecoder :: new( data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode ) ?) ) ,
702738 DataType :: Binary | DataType :: LargeBinary | DataType :: FixedSizeBinary ( _) => {
703739 Err ( ArrowError :: JsonError ( format!( "{data_type} is not supported by JSON" ) ) )
704740 }
705- DataType :: Map ( _, _) => Ok ( Box :: new( MapArrayDecoder :: new( data_type, coerce_primitive, strict_mode, is_nullable) ?) ) ,
741+ DataType :: Map ( _, _) => Ok ( Box :: new( MapArrayDecoder :: new( data_type, coerce_primitive, strict_mode, is_nullable, struct_parse_mode ) ?) ) ,
706742 d => Err ( ArrowError :: NotYetImplemented ( format!( "Support for {d} in JSON reader" ) ) )
707743 }
708744}
@@ -718,7 +754,7 @@ mod tests {
718754 use arrow_buffer:: { ArrowNativeType , Buffer } ;
719755 use arrow_cast:: display:: { ArrayFormatter , FormatOptions } ;
720756 use arrow_data:: ArrayDataBuilder ;
721- use arrow_schema:: Field ;
757+ use arrow_schema:: { Field , Fields } ;
722758
723759 use super :: * ;
724760
@@ -2316,4 +2352,220 @@ mod tests {
23162352 . unwrap( )
23172353 ) ;
23182354 }
2355+
// In `StructParseMode::ListOnly`, the JSON row `[1, 2]` must decode only when the
// struct has exactly two fields. Each field count (1, 2, 3) is tried twice: once with
// the struct as a single field "r" (`new_with_field`) and once with the fields as the
// top-level schema (`new`).
2356+ #[ test]
2357+ fn test_struct_decoding_list_length ( ) {
2358+ use arrow_array:: array;
2359+
2360+ let row = "[1, 2]" ;
2361+
// Grow one field vector step by step, snapshotting it at 1, 2 and 3 Int32 fields.
2362+ let mut fields = vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ;
2363+ let too_few_fields = Fields :: from ( fields. clone ( ) ) ;
2364+ fields. push ( Field :: new ( "b" , DataType :: Int32 , true ) ) ;
2365+ let correct_fields = Fields :: from ( fields. clone ( ) ) ;
2366+ fields. push ( Field :: new ( "c" , DataType :: Int32 , true ) ) ;
2367+ let too_many_fields = Fields :: from ( fields. clone ( ) ) ;
2368+
// Reads the first batch of `row` in list mode. Building the reader must succeed
// (`unwrap`), and there must be a first item; that item is the `Result` under test.
2369+ let parse = |fields : Fields , as_field : bool | {
2370+ let builder = if as_field {
2371+ ReaderBuilder :: new_with_field ( Field :: new ( "r" , DataType :: Struct ( fields) , true ) )
2372+ } else {
2373+ ReaderBuilder :: new ( Arc :: new ( Schema :: new ( fields) ) )
2374+ } ;
2375+ builder
2376+ . with_struct_parse_mode ( StructParseMode :: ListOnly )
2377+ . build ( Cursor :: new ( row. as_bytes ( ) ) )
2378+ . unwrap ( )
2379+ . next ( )
2380+ . unwrap ( )
2381+ } ;
2382+
// Expected decoding for the two-field case: a = 1, b = 2, no null buffer.
2383+ let expected_row = StructArray :: new (
2384+ correct_fields. clone ( ) ,
2385+ vec ! [
2386+ Arc :: new( array:: Int32Array :: from( vec![ 1 ] ) ) ,
2387+ Arc :: new( array:: Int32Array :: from( vec![ 2 ] ) ) ,
2388+ ] ,
2389+ None ,
2390+ ) ;
2391+ let row_field = Field :: new ( "r" , DataType :: Struct ( correct_fields. clone ( ) ) , true ) ;
2392+
// One field, two JSON entries: rejected as extra columns, in both builder forms.
2393+ assert_eq ! (
2394+ parse( too_few_fields. clone( ) , true ) . unwrap_err( ) . to_string( ) ,
2395+ "Json error: found extra columns for 1 fields" . to_string( )
2396+ ) ;
2397+ assert_eq ! (
2398+ parse( too_few_fields, false ) . unwrap_err( ) . to_string( ) ,
2399+ "Json error: found extra columns for 1 fields" . to_string( )
2400+ ) ;
// Two fields, two JSON entries: decodes, either as a single struct column "r" or as
// a batch whose columns are the struct's fields.
2401+ assert_eq ! (
2402+ parse( correct_fields. clone( ) , true ) . unwrap( ) ,
2403+ RecordBatch :: try_new(
2404+ Arc :: new( Schema :: new( vec![ row_field] ) ) ,
2405+ vec![ Arc :: new( expected_row. clone( ) ) ]
2406+ )
2407+ . unwrap( )
2408+ ) ;
2409+ assert_eq ! (
2410+ parse( correct_fields, false ) . unwrap( ) ,
2411+ RecordBatch :: from( expected_row)
2412+ ) ;
// Three fields, two JSON entries: rejected with the found/expected counts.
2413+ assert_eq ! (
2414+ parse( too_many_fields. clone( ) , true )
2415+ . unwrap_err( )
2416+ . to_string( ) ,
2417+ "Json error: found 2 columns for 3 fields" . to_string( )
2418+ ) ;
2419+ assert_eq ! (
2420+ parse( too_many_fields, false ) . unwrap_err( ) . to_string( ) ,
2421+ "Json error: found 2 columns for 3 fields" . to_string( )
2422+ ) ;
2423+ }
2424+
// Decodes the same nested value written three ways (all objects, all lists, object
// outside with a list inside) under both parse modes. Each mode must accept only
// the spelling that uses its form at every struct level and reject the other two.
2425+ #[ test]
2426+ fn test_struct_decoding ( ) {
2427+ use arrow_array:: builder;
2428+
// In the list spelling the outer list is the row and the next list is struct "a";
// the map column "c" is still written as a JSON object.
2429+ let nested_object_json = r#"{"a": {"b": [1, 2], "c": {"d": 3}}}"# ;
2430+ let nested_list_json = r#"[[[1, 2], {"d": 3}]]"# ;
2431+ let nested_mixed_json = r#"{"a": [[1, 2], {"d": 3}]}"# ;
2432+
// Fields of struct "a": a list of Int32 ("b") and a Utf8 -> Int32 map ("c").
2433+ let struct_fields = Fields :: from ( vec ! [
2434+ Field :: new( "b" , DataType :: new_list( DataType :: Int32 , true ) , true ) ,
2435+ Field :: new_map(
2436+ "c" ,
2437+ "entries" ,
2438+ Field :: new( "keys" , DataType :: Utf8 , false ) ,
2439+ Field :: new( "values" , DataType :: Int32 , true ) ,
2440+ false ,
2441+ false ,
2442+ ) ,
2443+ ] ) ;
2444+
// Expected column values: b = [1, 2] and c = {"d": 3}.
2445+ let list_array =
2446+ ListArray :: from_iter_primitive :: < Int32Type , _ , _ > ( vec ! [ Some ( vec![ Some ( 1 ) , Some ( 2 ) ] ) ] ) ;
2447+
2448+ let map_array = {
2449+ let mut map_builder = builder:: MapBuilder :: new (
2450+ None ,
2451+ builder:: StringBuilder :: new ( ) ,
2452+ builder:: Int32Builder :: new ( ) ,
2453+ ) ;
2454+ map_builder. keys ( ) . append_value ( "d" ) ;
2455+ map_builder. values ( ) . append_value ( 3 ) ;
2456+ map_builder. append ( true ) . unwrap ( ) ;
2457+ map_builder. finish ( )
2458+ } ;
2459+
2460+ let struct_array = StructArray :: new (
2461+ struct_fields. clone ( ) ,
2462+ vec ! [ Arc :: new( list_array) , Arc :: new( map_array) ] ,
2463+ None ,
2464+ ) ;
2465+
// Single-column batch: the nullable struct column "a".
2466+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
2467+ "a" ,
2468+ DataType :: Struct ( struct_fields) ,
2469+ true ,
2470+ ) ] ) ) ;
2471+ let expected = RecordBatch :: try_new ( schema. clone ( ) , vec ! [ Arc :: new( struct_array) ] ) . unwrap ( ) ;
2472+
// Reads the first batch of `s` under the given mode; the returned `Result` is what
// each assertion inspects.
2473+ let parse = |s : & str , mode : StructParseMode | {
2474+ ReaderBuilder :: new ( schema. clone ( ) )
2475+ . with_struct_parse_mode ( mode)
2476+ . build ( Cursor :: new ( s. as_bytes ( ) ) )
2477+ . unwrap ( )
2478+ . next ( )
2479+ . unwrap ( )
2480+ } ;
2481+
// ObjectOnly: only the all-object spelling decodes. The list spelling fails at the
// row, the mixed spelling fails inside field 'a'.
2482+ assert_eq ! (
2483+ parse( nested_object_json, StructParseMode :: ObjectOnly ) . unwrap( ) ,
2484+ expected
2485+ ) ;
2486+ assert_eq ! (
2487+ parse( nested_list_json, StructParseMode :: ObjectOnly )
2488+ . unwrap_err( )
2489+ . to_string( ) ,
2490+ "Json error: expected { got [[[1, 2], {\" d\" : 3}]]" . to_owned( )
2491+ ) ;
2492+ assert_eq ! (
2493+ parse( nested_mixed_json, StructParseMode :: ObjectOnly )
2494+ . unwrap_err( )
2495+ . to_string( ) ,
2496+ "Json error: whilst decoding field 'a': expected { got [[1, 2], {\" d\" : 3}]" . to_owned( )
2497+ ) ;
2498+
// ListOnly: only the all-list spelling decodes; both object-rooted inputs fail at the row.
2499+ assert_eq ! (
2500+ parse( nested_list_json, StructParseMode :: ListOnly ) . unwrap( ) ,
2501+ expected
2502+ ) ;
// NOTE(review): the expected text below has no comma between `[1, 2]` and "c", unlike
// the input JSON. Presumably it mirrors how the error renders the parsed value — confirm.
2503+ assert_eq ! (
2504+ parse( nested_object_json, StructParseMode :: ListOnly )
2505+ . unwrap_err( )
2506+ . to_string( ) ,
2507+ "Json error: expected [ got {\" a\" : {\" b\" : [1, 2]\" c\" : {\" d\" : 3}}}" . to_owned( )
2508+ ) ;
2509+ assert_eq ! (
2510+ parse( nested_mixed_json, StructParseMode :: ListOnly )
2511+ . unwrap_err( )
2512+ . to_string( ) ,
2513+ "Json error: expected [ got {\" a\" : [[1, 2], {\" d\" : 3}]}" . to_owned( )
2514+ ) ;
2515+ }
2516+
2517+ // Test cases (all in list mode; an empty list never satisfies a one-field struct):
2518+ // [] -> RecordBatch row with no entries. Schema = [('a', Int32)] -> Error
2519+ // [] -> RecordBatch row with no entries. Schema = [('r', [('a', Int32)])] -> Error
2520+ // [] -> StructArray row with no entries. Fields [('a', Int32)] -> Error
2521+ // [[]] -> RecordBatch row with empty struct entry. Schema = [('r', [('a', Int32)])] -> Error
2522+ #[ test]
2523+ fn test_struct_decoding_empty_list ( ) {
2524+ let int_field = Field :: new ( "a" , DataType :: Int32 , true ) ;
2525+ let struct_field = Field :: new (
2526+ "r" ,
2527+ DataType :: Struct ( Fields :: from ( vec ! [ int_field. clone( ) ] ) ) ,
2528+ true ,
2529+ ) ;
2530+
// Reads the first batch of `json` in list mode. `as_field` chooses between a builder
// made from the single `field` and a builder whose schema is `[field]`.
2531+ let parse = |json : & str , as_field : bool , field : Field | {
2532+ let builder = if as_field {
2533+ ReaderBuilder :: new_with_field ( field. clone ( ) )
2534+ } else {
2535+ ReaderBuilder :: new ( Arc :: new ( Schema :: new ( vec ! [ field] . clone ( ) ) ) )
2536+ } ;
2537+ builder
2538+ . with_struct_parse_mode ( StructParseMode :: ListOnly )
2539+ . build ( Cursor :: new ( json. as_bytes ( ) ) )
2540+ . unwrap ( )
2541+ . next ( )
2542+ . unwrap ( )
2543+ } ;
2544+
// Third case in the list above: `[]` as a StructArray row for struct field 'r'.
2545+ assert_eq ! (
2546+ parse( "[]" , true , struct_field. clone( ) )
2547+ . unwrap_err( )
2548+ . to_string( ) ,
2549+ "Json error: found 0 columns for 1 fields" . to_owned( )
2550+ ) ;
// First case: `[]` as a row of the schema [('a', Int32)].
2551+ assert_eq ! (
2552+ parse( "[]" , false , int_field. clone( ) )
2553+ . unwrap_err( )
2554+ . to_string( ) ,
2555+ "Json error: found 0 columns for 1 fields" . to_owned( )
2556+ ) ;
// Second case: `[]` as a row of the schema [('r', struct)].
2557+ assert_eq ! (
2558+ parse( "[]" , false , struct_field. clone( ) )
2559+ . unwrap_err( )
2560+ . to_string( ) ,
2561+ "Json error: found 0 columns for 1 fields" . to_owned( )
2562+ ) ;
2563+
// Fourth case: the row has one entry, but the nested struct list is empty, so the
// error is reported with the field 'r' context.
2564+ assert_eq ! (
2565+ parse( "[[]]" , false , struct_field. clone( ) )
2566+ . unwrap_err( )
2567+ . to_string( ) ,
2568+ "Json error: whilst decoding field 'r': found 0 columns for 1 fields" . to_owned( )
2569+ ) ;
2570+ }
23192571}
0 commit comments