@@ -35,11 +35,15 @@ pub use avro::AvroExec;
 pub use csv::CsvExec;
 pub use json::NdJsonExec;

+use crate::error::DataFusionError;
 use crate::{
     datasource::{object_store::ObjectStore, PartitionedFile},
+    error::Result,
     scalar::ScalarValue,
 };
+use arrow::array::new_null_array;
 use lazy_static::lazy_static;
+use log::info;
 use std::{
     collections::HashMap,
     fmt::{Display, Formatter, Result as FmtResult},
@@ -165,6 +169,87 @@ impl<'a> Display for FileGroupsDisplay<'a> {
     }
 }

+/// A utility which can adapt file-level record batches to a table schema which may have been
+/// obtained by merging multiple file-level schemas.
+///
+/// This is useful for enabling schema evolution in partitioned datasets.
+///
+/// This has to be done in two stages.
+///
+/// 1. Before reading the file, we have to map projected column indexes from the table schema to
+///    the file schema.
+///
+/// 2. After reading a record batch we need to map the read columns back to the expected column
+///    indexes and insert null-valued columns wherever the file schema was missing a column present
+///    in the table schema.
+#[derive(Clone, Debug)]
+pub(crate) struct SchemaAdapter {
+    /// Schema for the table
+    table_schema: SchemaRef,
+}
+
+impl SchemaAdapter {
+    pub(crate) fn new(table_schema: SchemaRef) -> SchemaAdapter {
+        Self { table_schema }
+    }
+
+    /// Map projected column indexes to the file schema. This will fail if the table schema
+    /// and the file schema contain a field with the same name and different types.
+    pub fn map_projections(
+        &self,
+        file_schema: &Schema,
+        projections: &[usize],
+    ) -> Result<Vec<usize>> {
+        let mut mapped: Vec<usize> = vec![];
+        for idx in projections {
+            let field = self.table_schema.field(*idx);
+            if let Ok(mapped_idx) = file_schema.index_of(field.name().as_str()) {
+                if file_schema.field(mapped_idx).data_type() == field.data_type() {
+                    mapped.push(mapped_idx)
+                } else {
+                    let msg = format!("Failed to map column projection for field {}. Incompatible data types {:?} and {:?}", field.name(), file_schema.field(mapped_idx).data_type(), field.data_type());
+                    info!("{}", msg);
+                    return Err(DataFusionError::Execution(msg));
+                }
+            }
+        }
+        Ok(mapped)
+    }
+
+    /// Re-order the projected columns of a record batch to match the table schema's column
+    /// ordering. If the record batch does not contain a column for an expected field, insert a
+    /// null-valued column at the required column index.
+    pub fn adapt_batch(
+        &self,
+        batch: RecordBatch,
+        projections: &[usize],
+    ) -> Result<RecordBatch> {
+        let batch_rows = batch.num_rows();
+
+        let batch_schema = batch.schema();
+
+        let mut cols: Vec<ArrayRef> = Vec::with_capacity(batch.columns().len());
+        let batch_cols = batch.columns().to_vec();
+
+        for field_idx in projections {
+            let table_field = &self.table_schema.fields()[*field_idx];
+            if let Some((batch_idx, _name)) =
+                batch_schema.column_with_name(table_field.name().as_str())
+            {
+                cols.push(batch_cols[batch_idx].clone());
+            } else {
+                cols.push(new_null_array(table_field.data_type(), batch_rows))
+            }
+        }
+
+        let projected_schema = Arc::new(self.table_schema.clone().project(projections)?);
+
+        let merged_batch = RecordBatch::try_new(projected_schema, cols)?;
+
+        Ok(merged_batch)
+    }
+}
+
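For context, here is how the two stages compose when reading a file whose schema is missing a column. This is a minimal, hypothetical sketch, not part of this commit: it assumes crate-internal access (since `SchemaAdapter` is `pub(crate)`), and the schemas and `adapt_example` function are illustrative only.

```rust
// Minimal sketch; assumes this runs inside the datafusion crate so that
// `SchemaAdapter` is in scope.
use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array, StringArray};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::record_batch::RecordBatch;

fn adapt_example() -> crate::error::Result<()> {
    // Table schema obtained by merging several file-level schemas.
    let table_schema: SchemaRef = Arc::new(Schema::new(vec![
        Field::new("c1", DataType::Utf8, true),
        Field::new("c2", DataType::Int64, true),
        Field::new("c3", DataType::Int8, true),
    ]));

    // One particular file predates the addition of column c3.
    let file_schema = Schema::new(vec![
        Field::new("c1", DataType::Utf8, true),
        Field::new("c2", DataType::Int64, true),
    ]);

    let adapter = SchemaAdapter::new(table_schema);

    // Stage 1: map the table-level projection [0, 1, 2] onto the file
    // schema. c3 does not exist in the file, so only c1 and c2 are read.
    let file_projection = adapter.map_projections(&file_schema, &[0, 1, 2])?;
    assert_eq!(file_projection, vec![0, 1]);

    // Stage 2: pad the batch read from the file back out to the table
    // projection; c3 comes back as an all-null Int8 column.
    let c1: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));
    let c2: ArrayRef = Arc::new(Int64Array::from(vec![1, 2]));
    let batch = RecordBatch::try_new(Arc::new(file_schema), vec![c1, c2])?;

    let adapted = adapter.adapt_batch(batch, &[0, 1, 2])?;
    assert_eq!(adapted.num_columns(), 3);
    assert_eq!(adapted.column(2).null_count(), 2);
    Ok(())
}
```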
 /// A helper that projects partition columns into the file record batches.
 ///
 /// One interesting trick is the usage of a cache for the key buffers of the partition column
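The cached key buffers mentioned above are not shown in this hunk. As a rough illustration of the idea only (the `ZeroKeyBufferCache` name and shape are assumptions, not the helper in this file): every row of a partition column carries the same dictionary key, so one zeroed key buffer can be grown on demand and cheaply re-shared across batches, because cloning an arrow `Buffer` only bumps a reference count.

```rust
// Hypothetical sketch of a key-buffer cache; names are illustrative.
use arrow::buffer::Buffer;

struct ZeroKeyBufferCache {
    cached: Buffer,
}

impl ZeroKeyBufferCache {
    fn new() -> Self {
        Self {
            cached: Buffer::from(Vec::<u8>::new()),
        }
    }

    /// Return a buffer holding at least `num_rows` zeroed `u16` dictionary
    /// keys, reallocating only when a batch larger than any seen so far
    /// arrives. Clones of `Buffer` share the same underlying allocation.
    fn get(&mut self, num_rows: usize) -> Buffer {
        let needed = num_rows * std::mem::size_of::<u16>();
        if self.cached.len() < needed {
            self.cached = Buffer::from(vec![0u8; needed]);
        }
        self.cached.clone()
    }
}
```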
@@ -467,6 +552,61 @@ mod tests {
         crate::assert_batches_eq!(expected, &[projected_batch]);
     }

+    #[test]
+    fn schema_adapter_adapt_projections() {
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Utf8, true),
+            Field::new("c2", DataType::Int64, true),
+            Field::new("c3", DataType::Int8, true),
+        ]));
+
+        let file_schema = Schema::new(vec![
+            Field::new("c1", DataType::Utf8, true),
+            Field::new("c2", DataType::Int64, true),
+        ]);
+
+        let file_schema_2 = Arc::new(Schema::new(vec![
+            Field::new("c3", DataType::Int8, true),
+            Field::new("c2", DataType::Int64, true),
+        ]));
+
+        let file_schema_3 =
+            Arc::new(Schema::new(vec![Field::new("c3", DataType::Float32, true)]));
+
+        let adapter = SchemaAdapter::new(table_schema);
+
+        let projections1: Vec<usize> = vec![0, 1, 2];
+        let projections2: Vec<usize> = vec![2];
+
+        let mapped = adapter
+            .map_projections(&file_schema, projections1.as_slice())
+            .expect("mapping projections");
+
+        assert_eq!(mapped, vec![0, 1]);
+
+        let mapped = adapter
+            .map_projections(&file_schema, projections2.as_slice())
+            .expect("mapping projections");
+
+        assert!(mapped.is_empty());
+
+        let mapped = adapter
+            .map_projections(&file_schema_2, projections1.as_slice())
+            .expect("mapping projections");
+
+        assert_eq!(mapped, vec![1, 0]);
+
+        let mapped = adapter
+            .map_projections(&file_schema_2, projections2.as_slice())
+            .expect("mapping projections");
+
+        assert_eq!(mapped, vec![0]);
+
+        let mapped = adapter.map_projections(&file_schema_3, projections1.as_slice());
+
+        assert!(mapped.is_err());
+    }
+
     // sets default for configs that play no role in projections
     fn config_for_projection(
         file_schema: SchemaRef,