@@ -46,7 +46,7 @@ impl ParquetTable {
4646 pub fn try_new ( filename : & str ) -> Result < Self > {
4747 let file = File :: open ( filename) ?;
4848 let parquet_file = ParquetFile :: open ( file, None ) ?;
49- let schema = parquet_file. schema . clone ( ) ;
49+ let schema = parquet_file. projection_schema . clone ( ) ;
5050 Ok ( Self {
5151 filename : filename. to_string ( ) ,
5252 schema,
@@ -72,11 +72,12 @@ impl Table for ParquetTable {
7272
7373pub struct ParquetFile {
7474 reader : SerializedFileReader < File > ,
75- row_group_index : usize ,
76- /// The schema of the underlying file
77- schema : Arc < Schema > ,
75+ /// Projection expressed as column indices into underlying parquet reader
7876 projection : Vec < usize > ,
77+ /// The schema of the projection
78+ projection_schema : Arc < Schema > ,
7979 batch_size : usize ,
80+ row_group_index : usize ,
8081 current_row_group : Option < Box < RowGroupReader > > ,
8182 column_readers : Vec < ColumnReader > ,
8283}
@@ -97,7 +98,7 @@ macro_rules! read_binary_column {
9798 for _ in 0 ..$SELF. batch_size {
9899 read_buffer. push( ByteArray :: default ( ) ) ;
99100 }
100- if $SELF. schema . field( $INDEX) . is_nullable( ) {
101+ if $SELF. projection_schema . field( $INDEX) . is_nullable( ) {
101102
102103 let mut def_levels: Vec <i16 > = Vec :: with_capacity( $SELF. batch_size) ;
103104 for _ in 0 ..$SELF. batch_size {
@@ -247,7 +248,7 @@ impl ParquetFile {
247248 Ok ( ParquetFile {
248249 reader : reader,
249250 row_group_index : 0 ,
250- schema : projected_schema,
251+ projection_schema : projected_schema,
251252 projection,
252253 batch_size : 64 * 1024 ,
253254 current_row_group : None ,
@@ -285,34 +286,25 @@ impl ParquetFile {
285286 let is_nullable = self . schema ( ) . field ( i) . is_nullable ( ) ;
286287 let array: Arc < Array > = match self . column_readers [ i] {
287288 ColumnReader :: BoolColumnReader ( ref mut r) => {
288- match ArrowReader :: < BooleanType > :: read (
289+ ArrowReader :: < BooleanType > :: read (
289290 r,
290291 self . batch_size ,
291292 is_nullable,
292- ) {
293- Ok ( array) => array,
294- Err ( e) => return Err ( e) ,
295- }
293+ ) ?
296294 }
297295 ColumnReader :: Int32ColumnReader ( ref mut r) => {
298- match ArrowReader :: < Int32Type > :: read (
296+ ArrowReader :: < Int32Type > :: read (
299297 r,
300298 self . batch_size ,
301299 is_nullable,
302- ) {
303- Ok ( array) => array,
304- Err ( e) => return Err ( e) ,
305- }
300+ ) ?
306301 }
307302 ColumnReader :: Int64ColumnReader ( ref mut r) => {
308- match ArrowReader :: < Int64Type > :: read (
303+ ArrowReader :: < Int64Type > :: read (
309304 r,
310305 self . batch_size ,
311306 is_nullable,
312- ) {
313- Ok ( array) => array,
314- Err ( e) => return Err ( e) ,
315- }
307+ ) ?
316308 }
317309 ColumnReader :: Int96ColumnReader ( ref mut r) => {
318310 let mut read_buffer: Vec < Int96 > =
@@ -322,7 +314,7 @@ impl ParquetFile {
322314 read_buffer. push ( Int96 :: new ( ) ) ;
323315 }
324316
325- if self . schema . field ( i) . is_nullable ( ) {
317+ if self . projection_schema . field ( i) . is_nullable ( ) {
326318 let mut def_levels: Vec < i16 > =
327319 Vec :: with_capacity ( self . batch_size ) ;
328320 for _ in 0 ..self . batch_size {
@@ -337,21 +329,28 @@ impl ParquetFile {
337329
338330 if values_read == levels_read {
339331 let mut builder = Int64Builder :: new ( values_read) ;
340-
341332 for i in 0 ..values_read {
342- let v = read_buffer[ i] . data ( ) ;
343- let value: u128 = ( v[ 0 ] as u128 ) << 64
344- | ( v[ 1 ] as u128 ) << 32
345- | ( v[ 2 ] as u128 ) ;
346- let ms: i64 = ( value / 1000000 ) as i64 ;
347- builder. append_value ( ms) ?;
333+ builder. append_value ( convert_int96_timestamp (
334+ read_buffer[ i] . data ( ) ,
335+ ) ) ?;
348336 }
349337 Arc :: new ( builder. finish ( ) )
350338 } else {
351- return Err ( ExecutionError :: NotImplemented (
352- "Parquet datasource does not support null values"
353- . to_string ( ) ,
354- ) ) ;
339+ let mut builder = Int64Builder :: new ( levels_read) ;
340+ let mut value_index = 0 ;
341+ for i in 0 ..levels_read {
342+ if def_levels[ i] > 0 {
343+ builder. append_value (
344+ convert_int96_timestamp (
345+ read_buffer[ value_index] . data ( ) ,
346+ ) ,
347+ ) ?;
348+ value_index += 1 ;
349+ } else {
350+ builder. append_null ( ) ?;
351+ }
352+ }
353+ Arc :: new ( builder. finish ( ) )
355354 }
356355 } else {
357356 let ( values_read, _) = r. read_batch (
@@ -364,35 +363,26 @@ impl ParquetFile {
364363 let mut builder = Int64Builder :: new ( values_read) ;
365364
366365 for i in 0 ..values_read {
367- let v = read_buffer[ i] . data ( ) ;
368- let value: u128 = ( v[ 0 ] as u128 ) << 64
369- | ( v[ 1 ] as u128 ) << 32
370- | ( v[ 2 ] as u128 ) ;
371- let ms: i64 = ( value / 1000000 ) as i64 ;
372- builder. append_value ( ms) ?;
366+ builder. append_value ( convert_int96_timestamp (
367+ read_buffer[ i] . data ( ) ,
368+ ) ) ?;
373369 }
374370 Arc :: new ( builder. finish ( ) )
375371 }
376372 }
377373 ColumnReader :: FloatColumnReader ( ref mut r) => {
378- match ArrowReader :: < Float32Type > :: read (
374+ ArrowReader :: < Float32Type > :: read (
379375 r,
380376 self . batch_size ,
381377 is_nullable,
382- ) {
383- Ok ( array) => array,
384- Err ( e) => return Err ( e) ,
385- }
378+ ) ?
386379 }
387380 ColumnReader :: DoubleColumnReader ( ref mut r) => {
388- match ArrowReader :: < Float64Type > :: read (
381+ ArrowReader :: < Float64Type > :: read (
389382 r,
390383 self . batch_size ,
391384 is_nullable,
392- ) {
393- Ok ( array) => array,
394- Err ( e) => return Err ( e) ,
395- }
385+ ) ?
396386 }
397387 ColumnReader :: FixedLenByteArrayColumnReader ( ref mut r) => {
398388 read_binary_column ! ( self , r, i)
@@ -408,17 +398,26 @@ impl ParquetFile {
408398 if batch. len ( ) == 0 || batch[ 0 ] . data ( ) . len ( ) == 0 {
409399 Ok ( None )
410400 } else {
411- Ok ( Some ( RecordBatch :: try_new ( self . schema . clone ( ) , batch) ?) )
401+ Ok ( Some ( RecordBatch :: try_new (
402+ self . projection_schema . clone ( ) ,
403+ batch,
404+ ) ?) )
412405 }
413406 }
414407 _ => Ok ( None ) ,
415408 }
416409 }
417410}
418411
/// Convert a Parquet INT96 timestamp into milliseconds since the Unix epoch.
///
/// `v` holds the three 32-bit words of the INT96 value in storage order:
/// `v[0]` and `v[1]` are the low and high halves of the nanoseconds elapsed
/// within the day, and `v[2]` is the Julian day number.
///
/// Sub-millisecond precision is truncated.
///
/// # Panics
///
/// Panics if `v` contains fewer than three words.
fn convert_int96_timestamp(v: &[u32]) -> i64 {
    // Julian day number of 1970-01-01
    const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588;
    const MILLIS_PER_DAY: i64 = 86_400_000;
    const NANOS_PER_MILLI: u64 = 1_000_000;

    let nanos_of_day = (u64::from(v[1]) << 32) | u64::from(v[0]);
    let days_since_epoch = i64::from(v[2]) - JULIAN_DAY_OF_EPOCH;

    // Both terms are bounded well below i64::MAX: the day term by
    // u32::MAX * 86_400_000 (~3.7e17) and the time-of-day term by
    // u64::MAX / 1_000_000 (~1.8e13), so neither the cast nor the sum can
    // overflow.
    days_since_epoch * MILLIS_PER_DAY + (nanos_of_day / NANOS_PER_MILLI) as i64
}
417+
419418impl RecordBatchIterator for ParquetFile {
420419 fn schema ( & self ) -> & Arc < Schema > {
421- & self . schema
420+ & self . projection_schema
422421 }
423422
424423 fn next ( & mut self ) -> Result < Option < RecordBatch > > {
0 commit comments