1717
1818//! Options related to how parquet files should be written
1919
20+ use base64:: Engine ;
21+ use std:: sync:: Arc ;
22+
2023use crate :: {
2124 config:: { ParquetOptions , TableParquetOptions } ,
22- DataFusionError , Result ,
25+ DataFusionError , Result , _internal_datafusion_err ,
2326} ;
2427
28+ use arrow_schema:: Schema ;
2529use parquet:: {
30+ arrow:: ARROW_SCHEMA_META_KEY ,
2631 basic:: { BrotliLevel , GzipLevel , ZstdLevel } ,
27- file:: properties:: {
28- EnabledStatistics , WriterProperties , WriterPropertiesBuilder , WriterVersion ,
29- DEFAULT_MAX_STATISTICS_SIZE , DEFAULT_STATISTICS_ENABLED ,
32+ file:: {
33+ metadata:: KeyValue ,
34+ properties:: {
35+ EnabledStatistics , WriterProperties , WriterPropertiesBuilder , WriterVersion ,
36+ DEFAULT_MAX_STATISTICS_SIZE , DEFAULT_STATISTICS_ENABLED ,
37+ } ,
3038 } ,
31- format:: KeyValue ,
3239 schema:: types:: ColumnPath ,
3340} ;
3441
@@ -51,6 +58,17 @@ impl ParquetWriterOptions {
5158 }
5259}
5360
61+ impl TableParquetOptions {
62+ /// Add the arrow schema to the parquet kv_metadata.
63+ /// If already exists, then overwrites.
64+ pub fn arrow_schema ( & mut self , schema : & Arc < Schema > ) {
65+ self . key_value_metadata . insert (
66+ ARROW_SCHEMA_META_KEY . into ( ) ,
67+ Some ( encode_arrow_schema ( schema) ) ,
68+ ) ;
69+ }
70+ }
71+
5472impl TryFrom < & TableParquetOptions > for ParquetWriterOptions {
5573 type Error = DataFusionError ;
5674
@@ -79,6 +97,14 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
7997
8098 let mut builder = global. into_writer_properties_builder ( ) ?;
8199
100+ // check that the arrow schema is present in the kv_metadata, if configured to do so
101+ if !global. skip_arrow_metadata
102+ && !key_value_metadata. contains_key ( ARROW_SCHEMA_META_KEY )
103+ {
104+ return Err ( _internal_datafusion_err ! ( "arrow schema was not added to the kv_metadata, even though it is required by configuration settings" ) ) ;
105+ }
106+
107+ // add kv_meta, if any
82108 if !key_value_metadata. is_empty ( ) {
83109 builder = builder. set_key_value_metadata ( Some (
84110 key_value_metadata
@@ -140,11 +166,38 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
140166 }
141167}
142168
169+ /// Encodes the Arrow schema into the IPC format, and base64 encodes it
170+ ///
171+ /// TODO: use extern parquet's private method, once publicly available.
172+ /// Refer to <https://github.com/apache/arrow-rs/pull/6916>
173+ fn encode_arrow_schema ( schema : & Arc < Schema > ) -> String {
174+ let options = arrow_ipc:: writer:: IpcWriteOptions :: default ( ) ;
175+ let mut dictionary_tracker = arrow_ipc:: writer:: DictionaryTracker :: new ( true ) ;
176+ let data_gen = arrow_ipc:: writer:: IpcDataGenerator :: default ( ) ;
177+ let mut serialized_schema = data_gen. schema_to_bytes_with_dictionary_tracker (
178+ schema,
179+ & mut dictionary_tracker,
180+ & options,
181+ ) ;
182+
183+ // manually prepending the length to the schema as arrow uses the legacy IPC format
184+ // TODO: change after addressing ARROW-9777
185+ let schema_len = serialized_schema. ipc_message . len ( ) ;
186+ let mut len_prefix_schema = Vec :: with_capacity ( schema_len + 8 ) ;
187+ len_prefix_schema. append ( & mut vec ! [ 255u8 , 255 , 255 , 255 ] ) ;
188+ len_prefix_schema. append ( ( schema_len as u32 ) . to_le_bytes ( ) . to_vec ( ) . as_mut ( ) ) ;
189+ len_prefix_schema. append ( & mut serialized_schema. ipc_message ) ;
190+
191+ base64:: prelude:: BASE64_STANDARD . encode ( & len_prefix_schema)
192+ }
193+
143194impl ParquetOptions {
144195 /// Convert the global session options, [`ParquetOptions`], into a single write action's [`WriterPropertiesBuilder`].
145196 ///
146197 /// The returned [`WriterPropertiesBuilder`] can then be further modified with additional options
147198 /// applied per column; a customization which is not applicable for [`ParquetOptions`].
199+ ///
200+ /// Note that this method does not include the key_value_metadata from [`TableParquetOptions`].
148201 pub fn into_writer_properties_builder ( & self ) -> Result < WriterPropertiesBuilder > {
149202 let ParquetOptions {
150203 data_pagesize_limit,
@@ -177,6 +230,7 @@ impl ParquetOptions {
177230 bloom_filter_on_read : _, // reads not used for writer props
178231 schema_force_view_types : _,
179232 binary_as_string : _, // not used for writer props
233+ skip_arrow_metadata : _,
180234 } = self ;
181235
182236 let mut builder = WriterProperties :: builder ( )
@@ -444,6 +498,7 @@ mod tests {
444498 bloom_filter_on_read : defaults. bloom_filter_on_read ,
445499 schema_force_view_types : defaults. schema_force_view_types ,
446500 binary_as_string : defaults. binary_as_string ,
501+ skip_arrow_metadata : defaults. skip_arrow_metadata ,
447502 }
448503 }
449504
@@ -546,19 +601,55 @@ mod tests {
546601 bloom_filter_on_read : global_options_defaults. bloom_filter_on_read ,
547602 schema_force_view_types : global_options_defaults. schema_force_view_types ,
548603 binary_as_string : global_options_defaults. binary_as_string ,
604+ skip_arrow_metadata : global_options_defaults. skip_arrow_metadata ,
549605 } ,
550606 column_specific_options,
551607 key_value_metadata,
552608 }
553609 }
554610
#[test]
fn table_parquet_opts_to_writer_props_skip_arrow_metadata() {
    // Start from the defaults, where the arrow schema is required.
    let opts = TableParquetOptions::default();
    assert!(
        !opts.global.skip_arrow_metadata,
        "default false, to not skip the arrow schema requirement"
    );

    // Schema required but never added: the conversion must fail.
    assert!(
        WriterPropertiesBuilder::try_from(&opts).is_err(),
        "should error without the required arrow schema in kv_metadata",
    );

    // Opting out of the arrow schema lets the conversion through.
    let opts = opts.with_skip_arrow_metadata(true);
    assert!(
        WriterPropertiesBuilder::try_from(&opts).is_ok(),
        "should work with the arrow schema skipped by config",
    );

    // Require the schema again, this time supplying it in the kv_metadata.
    let mut opts = opts.with_skip_arrow_metadata(false);
    opts.arrow_schema(&Arc::new(Schema::empty()));
    assert!(
        WriterPropertiesBuilder::try_from(&opts).is_ok(),
        "should work with the arrow schema included in TableParquetOptions",
    );
}
645+
555646 #[ test]
556647 fn table_parquet_opts_to_writer_props ( ) {
557648 // ParquetOptions, all props set to non-default
558649 let parquet_options = parquet_options_with_non_defaults ( ) ;
559650
560651 // TableParquetOptions, using ParquetOptions for global settings
561- let key = "foo" . to_string ( ) ;
652+ let key = ARROW_SCHEMA_META_KEY . to_string ( ) ;
562653 let value = Some ( "bar" . into ( ) ) ;
563654 let table_parquet_opts = TableParquetOptions {
564655 global : parquet_options. clone ( ) ,
@@ -585,14 +676,18 @@ mod tests {
585676 #[ test]
586677 fn test_defaults_match ( ) {
587678 // ensure the global settings are the same
588- let default_table_writer_opts = TableParquetOptions :: default ( ) ;
679+ let mut default_table_writer_opts = TableParquetOptions :: default ( ) ;
589680 let default_parquet_opts = ParquetOptions :: default ( ) ;
590681 assert_eq ! (
591682 default_table_writer_opts. global,
592683 default_parquet_opts,
593684 "should have matching defaults for TableParquetOptions.global and ParquetOptions" ,
594685 ) ;
595686
687+ // selectively skip the arrow_schema metadata, since the WriterProperties default has an empty kv_meta (no arrow schema)
688+ default_table_writer_opts =
689+ default_table_writer_opts. with_skip_arrow_metadata ( true ) ;
690+
596691 // WriterProperties::default, a.k.a. using extern parquet's defaults
597692 let default_writer_props = WriterProperties :: new ( ) ;
598693
@@ -640,6 +735,7 @@ mod tests {
640735 session_config_from_writer_props ( & default_writer_props) ;
641736 from_extern_parquet. global . created_by = same_created_by;
642737 from_extern_parquet. global . compression = Some ( "zstd(3)" . into ( ) ) ;
738+ from_extern_parquet. global . skip_arrow_metadata = true ;
643739
644740 assert_eq ! (
645741 default_table_writer_opts,
@@ -653,6 +749,7 @@ mod tests {
653749 // the TableParquetOptions::default, with only the bloom filter turned on
654750 let mut default_table_writer_opts = TableParquetOptions :: default ( ) ;
655751 default_table_writer_opts. global . bloom_filter_on_write = true ;
752+ default_table_writer_opts. arrow_schema ( & Arc :: new ( Schema :: empty ( ) ) ) ; // add the required arrow schema
656753 let from_datafusion_defaults =
657754 WriterPropertiesBuilder :: try_from ( & default_table_writer_opts)
658755 . unwrap ( )
@@ -681,6 +778,7 @@ mod tests {
681778 let mut default_table_writer_opts = TableParquetOptions :: default ( ) ;
682779 default_table_writer_opts. global . bloom_filter_on_write = true ;
683780 default_table_writer_opts. global . bloom_filter_fpp = Some ( 0.42 ) ;
781+ default_table_writer_opts. arrow_schema ( & Arc :: new ( Schema :: empty ( ) ) ) ; // add the required arrow schema
684782 let from_datafusion_defaults =
685783 WriterPropertiesBuilder :: try_from ( & default_table_writer_opts)
686784 . unwrap ( )
@@ -713,6 +811,7 @@ mod tests {
713811 let mut default_table_writer_opts = TableParquetOptions :: default ( ) ;
714812 default_table_writer_opts. global . bloom_filter_on_write = true ;
715813 default_table_writer_opts. global . bloom_filter_ndv = Some ( 42 ) ;
814+ default_table_writer_opts. arrow_schema ( & Arc :: new ( Schema :: empty ( ) ) ) ; // add the required arrow schema
716815 let from_datafusion_defaults =
717816 WriterPropertiesBuilder :: try_from ( & default_table_writer_opts)
718817 . unwrap ( )
0 commit comments