@@ -121,7 +121,6 @@ use datafusion_physical_expr::LexOrdering;
121121use  itertools:: Itertools ; 
122122use  jni:: objects:: GlobalRef ; 
123123use  num:: { BigInt ,  ToPrimitive } ; 
124- use  parquet:: schema:: parser:: parse_message_type; 
125124use  std:: cmp:: max; 
126125use  std:: { collections:: HashMap ,  sync:: Arc } ; 
127126use  url:: Url ; 
@@ -950,50 +949,28 @@ impl PhysicalPlanner {
950949                ) ) 
951950            } 
952951            OpStruct :: NativeScan ( scan)  => { 
953-                 let  data_schema = parse_message_type ( & scan. data_schema ) . unwrap ( ) ; 
954-                 let  required_schema = parse_message_type ( & scan. required_schema ) . unwrap ( ) ; 
955- 
956-                 let  data_schema_descriptor =
957-                     parquet:: schema:: types:: SchemaDescriptor :: new ( Arc :: new ( data_schema) ) ; 
958-                 let  data_schema_arrow = Arc :: new ( 
959-                     parquet:: arrow:: schema:: parquet_to_arrow_schema ( & data_schema_descriptor,  None ) 
960-                         . unwrap ( ) , 
961-                 ) ; 
962- 
963-                 let  required_schema_descriptor =
964-                     parquet:: schema:: types:: SchemaDescriptor :: new ( Arc :: new ( required_schema) ) ; 
965-                 let  required_schema_arrow = Arc :: new ( 
966-                     parquet:: arrow:: schema:: parquet_to_arrow_schema ( 
967-                         & required_schema_descriptor, 
968-                         None , 
969-                     ) 
970-                     . unwrap ( ) , 
971-                 ) ; 
972- 
973-                 let  partition_schema_arrow = scan
974-                     . partition_schema 
952+                 let  data_schema = convert_spark_types_to_arrow_schema ( scan. data_schema . as_slice ( ) ) ; 
953+                 let  required_schema:  SchemaRef  =
954+                     convert_spark_types_to_arrow_schema ( scan. required_schema . as_slice ( ) ) ; 
955+                 let  partition_schema:  SchemaRef  =
956+                     convert_spark_types_to_arrow_schema ( scan. partition_schema . as_slice ( ) ) ; 
957+                 let  projection_vector:  Vec < usize >  = scan
958+                     . projection_vector 
975959                    . iter ( ) 
976-                     . map ( to_arrow_datatype) 
977-                     . collect_vec ( ) ; 
978-                 let  partition_fields:  Vec < _ >  = partition_schema_arrow
979-                     . iter ( ) 
980-                     . enumerate ( ) 
981-                     . map ( |( idx,  data_type) | { 
982-                         Field :: new ( format ! ( "part_{}" ,  idx) ,  data_type. clone ( ) ,  true ) 
983-                     } ) 
960+                     . map ( |offset| * offset as  usize ) 
984961                    . collect ( ) ; 
985962
986963                // Convert the Spark expressions to Physical expressions 
987964                let  data_filters:  Result < Vec < Arc < dyn  PhysicalExpr > > ,  ExecutionError >  = scan
988965                    . data_filters 
989966                    . iter ( ) 
990-                     . map ( |expr| self . create_expr ( expr,  Arc :: clone ( & required_schema_arrow ) ) ) 
967+                     . map ( |expr| self . create_expr ( expr,  Arc :: clone ( & required_schema ) ) ) 
991968                    . collect ( ) ; 
992969
993970                // Create a conjunctive form of the vector because ParquetExecBuilder takes 
994971                // a single expression 
995972                let  data_filters = data_filters?; 
996-                 let  test_data_filters  = data_filters. clone ( ) . into_iter ( ) . reduce ( |left,  right| { 
973+                 let  cnf_data_filters  = data_filters. clone ( ) . into_iter ( ) . reduce ( |left,  right| { 
997974                    Arc :: new ( BinaryExpr :: new ( 
998975                        left, 
999976                        datafusion:: logical_expr:: Operator :: And , 
@@ -1064,29 +1041,21 @@ impl PhysicalPlanner {
10641041                assert_eq ! ( file_groups. len( ) ,  partition_count) ; 
10651042
10661043                let  object_store_url = ObjectStoreUrl :: local_filesystem ( ) ; 
1044+                 let  partition_fields:  Vec < Field >  = partition_schema
1045+                     . fields ( ) 
1046+                     . iter ( ) 
1047+                     . map ( |field| { 
1048+                         Field :: new ( field. name ( ) ,  field. data_type ( ) . clone ( ) ,  field. is_nullable ( ) ) 
1049+                     } ) 
1050+                     . collect_vec ( ) ; 
10671051                let  mut  file_scan_config =
1068-                     FileScanConfig :: new ( object_store_url,  Arc :: clone ( & data_schema_arrow ) ) 
1052+                     FileScanConfig :: new ( object_store_url,  Arc :: clone ( & data_schema ) ) 
10691053                        . with_file_groups ( file_groups) 
10701054                        . with_table_partition_cols ( partition_fields) ; 
10711055
1072-                 // Check for projection, if so generate the vector and add to FileScanConfig. 
1073-                 let  mut  projection_vector:  Vec < usize >  =
1074-                     Vec :: with_capacity ( required_schema_arrow. fields . len ( ) ) ; 
1075-                 // TODO: could be faster with a hashmap rather than iterating over data_schema_arrow with index_of. 
1076-                 required_schema_arrow. fields . iter ( ) . for_each ( |field| { 
1077-                     projection_vector. push ( data_schema_arrow. index_of ( field. name ( ) ) . unwrap ( ) ) ; 
1078-                 } ) ; 
1079- 
1080-                 partition_schema_arrow
1081-                     . iter ( ) 
1082-                     . enumerate ( ) 
1083-                     . for_each ( |( idx,  _) | { 
1084-                         projection_vector. push ( idx + data_schema_arrow. fields . len ( ) ) ; 
1085-                     } ) ; 
1086- 
10871056                assert_eq ! ( 
10881057                    projection_vector. len( ) , 
1089-                     required_schema_arrow . fields. len( )  + partition_schema_arrow . len( ) 
1058+                     required_schema . fields. len( )  + partition_schema . fields . len( ) 
10901059                ) ; 
10911060                file_scan_config = file_scan_config. with_projection ( Some ( projection_vector) ) ; 
10921061
@@ -1095,13 +1064,11 @@ impl PhysicalPlanner {
10951064                table_parquet_options. global . pushdown_filters  = true ; 
10961065                table_parquet_options. global . reorder_filters  = true ; 
10971066
1098-                     let  mut  builder = ParquetExecBuilder :: new ( file_scan_config) 
1099-                         . with_table_parquet_options ( table_parquet_options) 
1100-                         . with_schema_adapter_factory ( 
1101-                             Arc :: new ( CometSchemaAdapterFactory :: default ( ) ) , 
1102-                         ) ; 
1067+                 let  mut  builder = ParquetExecBuilder :: new ( file_scan_config) 
1068+                     . with_table_parquet_options ( table_parquet_options) 
1069+                     . with_schema_adapter_factory ( Arc :: new ( CometSchemaAdapterFactory :: default ( ) ) ) ; 
11031070
1104-                 if  let  Some ( filter)  = test_data_filters  { 
1071+                 if  let  Some ( filter)  = cnf_data_filters  { 
11051072                    builder = builder. with_predicate ( filter) ; 
11061073                } 
11071074
@@ -2309,6 +2276,23 @@ fn from_protobuf_eval_mode(value: i32) -> Result<EvalMode, prost::DecodeError> {
23092276    } 
23102277} 
23112278
2279+ fn  convert_spark_types_to_arrow_schema ( 
2280+     spark_types :  & [ spark_operator:: SparkStructField ] , 
2281+ )  -> SchemaRef  { 
2282+     let  arrow_fields = spark_types
2283+         . iter ( ) 
2284+         . map ( |spark_type| { 
2285+             Field :: new ( 
2286+                 String :: clone ( & spark_type. name ) , 
2287+                 to_arrow_datatype ( spark_type. data_type . as_ref ( ) . unwrap ( ) ) , 
2288+                 spark_type. nullable , 
2289+             ) 
2290+         } ) 
2291+         . collect_vec ( ) ; 
2292+     let  arrow_schema:  SchemaRef  = Arc :: new ( Schema :: new ( arrow_fields) ) ; 
2293+     arrow_schema
2294+ } 
2295+ 
23122296#[ cfg( test) ]  
23132297mod  tests { 
23142298    use  std:: { sync:: Arc ,  task:: Poll } ; 
0 commit comments