From 0d031cc8aa81296cb1bdfedea7a7cb4ec6aa54ea Mon Sep 17 00:00:00 2001 From: Trevor Hilton Date: Mon, 15 Apr 2024 10:30:04 -0400 Subject: [PATCH] feat: JSON encoding of FixedSizeList (#5646) Added ability to encode the FixedSizeList type in JSON. The implementation of FixedSizeListEncoder is very similar to that of ListEncoder, but is somewhat simpler, because of the constant offset. A test was added to verify behaviour of the JSON encoder with and without explicit nulls. --- arrow-json/src/writer.rs | 85 +++++++++++++++++++++++++++++++- arrow-json/src/writer/encoder.rs | 53 +++++++++++++++++++- 2 files changed, 136 insertions(+), 2 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 85a81d98e1b9..b97065989c55 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -834,7 +834,8 @@ mod tests { use serde_json::json; use arrow_array::builder::{ - FixedSizeBinaryBuilder, Int32Builder, Int64Builder, MapBuilder, StringBuilder, + FixedSizeBinaryBuilder, FixedSizeListBuilder, Int32Builder, Int64Builder, MapBuilder, + StringBuilder, }; use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::ArrayData; @@ -2215,4 +2216,86 @@ mod tests { ); } } + + #[test] + fn test_writer_fixed_size_list() { + let size = 3; + let field = FieldRef::new(Field::new("item", DataType::Int32, true)); + let schema = SchemaRef::new(Schema::new(vec![Field::new( + "list", + DataType::FixedSizeList(field, size), + true, + )])); + + let values_builder = Int32Builder::new(); + let mut list_builder = FixedSizeListBuilder::new(values_builder, size); + let lists = [ + Some([Some(1), Some(2), None]), + Some([Some(3), None, Some(4)]), + Some([None, Some(5), Some(6)]), + None, + ]; + for list in lists { + match list { + Some(l) => { + for value in l { + match value { + Some(v) => list_builder.values().append_value(v), + None => list_builder.values().append_null(), + } + } + list_builder.append(true); + } + None => { + for _ in 0..size 
{ + list_builder.values().append_null(); + } + list_builder.append(false); + } + } + } + let array = Arc::new(list_builder.finish()) as ArrayRef; + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + + //encode and check JSON with explicit nulls: + { + let json_value: Value = { + let mut buf = Vec::new(); + let mut writer = WriterBuilder::new() + .with_explicit_nulls(true) + .build::<_, JsonArray>(&mut buf); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + serde_json::from_slice(&buf).unwrap() + }; + assert_eq!( + json!([ + {"list": [1, 2, null]}, + {"list": [3, null, 4]}, + {"list": [null, 5, 6]}, + {"list": null}, + ]), + json_value + ); + } + // encode and check JSON with no explicit nulls: + { + let json_value: Value = { + let mut buf = Vec::new(); + let mut writer = ArrayWriter::new(&mut buf); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + serde_json::from_slice(&buf).unwrap() + }; + assert_eq!( + json!([ + {"list": [1, 2, null]}, + {"list": [3, null, 4]}, + {"list": [null, 5, 6]}, + {}, // empty because nulls are omitted + ]), + json_value + ); + } + } } diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs index 113dc5dfc758..810e65b2268f 100644 --- a/arrow-json/src/writer/encoder.rs +++ b/arrow-json/src/writer/encoder.rs @@ -88,6 +88,10 @@ fn make_encoder_impl<'a>( let array = array.as_list::(); (Box::new(ListEncoder::try_new(array, options)?) as _, array.nulls().cloned()) } + DataType::FixedSizeList(_, _) => { + let array = array.as_fixed_size_list(); + (Box::new(FixedSizeListEncoder::try_new(array, options)?) as _, array.nulls().cloned()) + } DataType::Dictionary(_, _) => downcast_dictionary_array! { array => (Box::new(DictionaryEncoder::try_new(array, options)?) 
as _, array.logical_nulls()), @@ -100,7 +104,7 @@ fn make_encoder_impl<'a>( } DataType::FixedSizeBinary(_) => { - let array = array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap(); + let array = array.as_fixed_size_binary(); (Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned()) } @@ -329,6 +333,53 @@ impl<'a, O: OffsetSizeTrait> Encoder for ListEncoder<'a, O> { } } +struct FixedSizeListEncoder<'a> { + value_length: usize, + nulls: Option<NullBuffer>, + encoder: Box<dyn Encoder + 'a>, +} + +impl<'a> FixedSizeListEncoder<'a> { + fn try_new( + array: &'a FixedSizeListArray, + options: &EncoderOptions, + ) -> Result<Self, ArrowError> { + let (encoder, nulls) = make_encoder_impl(array.values().as_ref(), options)?; + Ok(Self { + encoder, + nulls, + value_length: array.value_length().as_usize(), + }) + } +} + +impl<'a> Encoder for FixedSizeListEncoder<'a> { + fn encode(&mut self, idx: usize, out: &mut Vec<u8>) { + let start = idx * self.value_length; + let end = start + self.value_length; + out.push(b'['); + match self.nulls.as_ref() { + Some(n) => (start..end).for_each(|idx| { + if idx != start { + out.push(b','); + } + if n.is_null(idx) { + out.extend_from_slice(b"null"); + } else { + self.encoder.encode(idx, out); + } + }), + None => (start..end).for_each(|idx| { + if idx != start { + out.push(b','); + } + self.encoder.encode(idx, out); + }), + } + out.push(b']'); + } +} + struct DictionaryEncoder<'a, K: ArrowDictionaryKeyType> { keys: ScalarBuffer<K::Native>, encoder: Box<dyn Encoder + 'a>,