@@ -35,11 +35,15 @@ pub use avro::AvroExec;
3535pub use csv:: CsvExec ;
3636pub use json:: NdJsonExec ;
3737
38+ use crate :: error:: DataFusionError ;
3839use crate :: {
3940 datasource:: { object_store:: ObjectStore , PartitionedFile } ,
41+ error:: Result ,
4042 scalar:: ScalarValue ,
4143} ;
44+ use arrow:: array:: new_null_array;
4245use lazy_static:: lazy_static;
46+ use log:: info;
4347use std:: {
4448 collections:: HashMap ,
4549 fmt:: { Display , Formatter , Result as FmtResult } ,
@@ -165,6 +169,87 @@ impl<'a> Display for FileGroupsDisplay<'a> {
165169 }
166170}
167171
172+ /// A utility which can adapt file-level record batches to a table schema which may have a schema
173+ /// obtained from merging multiple file-level schemas.
174+ ///
175+ /// This is useful for enabling schema evolution in partitioned datasets.
176+ ///
177+ /// This has to be done in two stages.
178+ ///
179+ /// 1. Before reading the file, we have to map projected column indexes from the table schema to
180+ /// the file schema.
181+ ///
182+ /// 2. After reading a record batch we need to map the read columns back to the expected columns
183+ /// indexes and insert null-valued columns wherever the file schema was missing a colum present
184+ /// in the table schema.
185+ #[ derive( Clone , Debug ) ]
186+ pub ( crate ) struct SchemaAdapter {
187+ /// Schema for the table
188+ table_schema : SchemaRef ,
189+ }
190+
191+ impl SchemaAdapter {
192+ pub ( crate ) fn new ( table_schema : SchemaRef ) -> SchemaAdapter {
193+ Self { table_schema }
194+ }
195+
196+ /// Map projected column indexes to the file schema. This will fail if the table schema
197+ /// and the file schema contain a field with the same name and different types.
198+ pub fn map_projections (
199+ & self ,
200+ file_schema : & Schema ,
201+ projections : & [ usize ] ,
202+ ) -> Result < Vec < usize > > {
203+ let mut mapped: Vec < usize > = vec ! [ ] ;
204+ for idx in projections {
205+ let field = self . table_schema . field ( * idx) ;
206+ if let Ok ( mapped_idx) = file_schema. index_of ( field. name ( ) . as_str ( ) ) {
207+ if file_schema. field ( mapped_idx) . data_type ( ) == field. data_type ( ) {
208+ mapped. push ( mapped_idx)
209+ } else {
210+ let msg = format ! ( "Failed to map column projection for field {}. Incompatible data types {:?} and {:?}" , field. name( ) , file_schema. field( mapped_idx) . data_type( ) , field. data_type( ) ) ;
211+ info ! ( "{}" , msg) ;
212+ return Err ( DataFusionError :: Execution ( msg) ) ;
213+ }
214+ }
215+ }
216+ Ok ( mapped)
217+ }
218+
219+ /// Re-order projected columns by index in record batch to match table schema column ordering. If the record
220+ /// batch does not contain a column for an expected field, insert a null-valued column at the
221+ /// required column index.
222+ pub fn adapt_batch (
223+ & self ,
224+ batch : RecordBatch ,
225+ projections : & [ usize ] ,
226+ ) -> Result < RecordBatch > {
227+ let batch_rows = batch. num_rows ( ) ;
228+
229+ let batch_schema = batch. schema ( ) ;
230+
231+ let mut cols: Vec < ArrayRef > = Vec :: with_capacity ( batch. columns ( ) . len ( ) ) ;
232+ let batch_cols = batch. columns ( ) . to_vec ( ) ;
233+
234+ for field_idx in projections {
235+ let table_field = & self . table_schema . fields ( ) [ * field_idx] ;
236+ if let Some ( ( batch_idx, _name) ) =
237+ batch_schema. column_with_name ( table_field. name ( ) . as_str ( ) )
238+ {
239+ cols. push ( batch_cols[ batch_idx] . clone ( ) ) ;
240+ } else {
241+ cols. push ( new_null_array ( table_field. data_type ( ) , batch_rows) )
242+ }
243+ }
244+
245+ let projected_schema = Arc :: new ( self . table_schema . clone ( ) . project ( projections) ?) ;
246+
247+ let merged_batch = RecordBatch :: try_new ( projected_schema, cols) ?;
248+
249+ Ok ( merged_batch)
250+ }
251+ }
252+
168253/// A helper that projects partition columns into the file record batches.
169254///
170255/// One interesting trick is the usage of a cache for the key buffers of the partition column
@@ -467,6 +552,61 @@ mod tests {
467552 crate :: assert_batches_eq!( expected, & [ projected_batch] ) ;
468553 }
469554
#[test]
fn schema_adapter_adapt_projections() {
    // Merged table schema the adapter must map file schemas onto.
    let table_schema = Arc::new(Schema::new(vec![
        Field::new("c1", DataType::Utf8, true),
        Field::new("c2", DataType::Int64, true),
        Field::new("c3", DataType::Int8, true),
    ]));

    let adapter = SchemaAdapter::new(table_schema);

    // File missing the last table column.
    let file_schema = Schema::new(vec![
        Field::new("c1", DataType::Utf8, true),
        Field::new("c2", DataType::Int64, true),
    ]);

    // File with the columns present but in a different order.
    let file_schema_2 = Arc::new(Schema::new(vec![
        Field::new("c3", DataType::Int8, true),
        Field::new("c2", DataType::Int64, true),
    ]));

    // File whose column type conflicts with the table schema.
    let file_schema_3 =
        Arc::new(Schema::new(vec![Field::new("c3", DataType::Float32, true)]));

    let full_projection: Vec<usize> = vec![0, 1, 2];
    let last_col_projection: Vec<usize> = vec![2];

    // Full projection against a file missing c3: only c1 and c2 map.
    let result = adapter
        .map_projections(&file_schema, full_projection.as_slice())
        .expect("mapping projections");
    assert_eq!(result, vec![0, 1]);

    // Projecting only c3 against a file without it: nothing maps.
    let result = adapter
        .map_projections(&file_schema, last_col_projection.as_slice())
        .expect("mapping projections");
    assert!(result.is_empty());

    // Reordered file columns map back to their file-local indexes.
    let result = adapter
        .map_projections(&file_schema_2, full_projection.as_slice())
        .expect("mapping projections");
    assert_eq!(result, vec![1, 0]);

    let result = adapter
        .map_projections(&file_schema_2, last_col_projection.as_slice())
        .expect("mapping projections");
    assert_eq!(result, vec![0]);

    // Incompatible data type for c3 must be rejected.
    assert!(adapter
        .map_projections(&file_schema_3, full_projection.as_slice())
        .is_err());
}
609+
470610 // sets default for configs that play no role in projections
471611 fn config_for_projection (
472612 file_schema : SchemaRef ,
0 commit comments