Skip to content

Commit 639e13e

Browse files
committed
null handling for int96
1 parent 1503855 commit 639e13e

File tree

1 file changed

+50
-51
lines changed

1 file changed

+50
-51
lines changed

rust/datafusion/src/datasource/parquet.rs

Lines changed: 50 additions & 51 deletions
Original file line number · Diff line number · Diff line change
@@ -46,7 +46,7 @@ impl ParquetTable {
4646
pub fn try_new(filename: &str) -> Result<Self> {
4747
let file = File::open(filename)?;
4848
let parquet_file = ParquetFile::open(file, None)?;
49-
let schema = parquet_file.schema.clone();
49+
let schema = parquet_file.projection_schema.clone();
5050
Ok(Self {
5151
filename: filename.to_string(),
5252
schema,
@@ -72,11 +72,12 @@ impl Table for ParquetTable {
7272

7373
pub struct ParquetFile {
7474
reader: SerializedFileReader<File>,
75-
row_group_index: usize,
76-
/// The schema of the underlying file
77-
schema: Arc<Schema>,
75+
/// Projection expressed as column indices into underlying parquet reader
7876
projection: Vec<usize>,
77+
/// The schema of the projection
78+
projection_schema: Arc<Schema>,
7979
batch_size: usize,
80+
row_group_index: usize,
8081
current_row_group: Option<Box<RowGroupReader>>,
8182
column_readers: Vec<ColumnReader>,
8283
}
@@ -97,7 +98,7 @@ macro_rules! read_binary_column {
9798
for _ in 0..$SELF.batch_size {
9899
read_buffer.push(ByteArray::default());
99100
}
100-
if $SELF.schema.field($INDEX).is_nullable() {
101+
if $SELF.projection_schema.field($INDEX).is_nullable() {
101102

102103
let mut def_levels: Vec<i16> = Vec::with_capacity($SELF.batch_size);
103104
for _ in 0..$SELF.batch_size {
@@ -247,7 +248,7 @@ impl ParquetFile {
247248
Ok(ParquetFile {
248249
reader: reader,
249250
row_group_index: 0,
250-
schema: projected_schema,
251+
projection_schema: projected_schema,
251252
projection,
252253
batch_size: 64 * 1024,
253254
current_row_group: None,
@@ -285,34 +286,25 @@ impl ParquetFile {
285286
let is_nullable = self.schema().field(i).is_nullable();
286287
let array: Arc<Array> = match self.column_readers[i] {
287288
ColumnReader::BoolColumnReader(ref mut r) => {
288-
match ArrowReader::<BooleanType>::read(
289+
ArrowReader::<BooleanType>::read(
289290
r,
290291
self.batch_size,
291292
is_nullable,
292-
) {
293-
Ok(array) => array,
294-
Err(e) => return Err(e),
295-
}
293+
)?
296294
}
297295
ColumnReader::Int32ColumnReader(ref mut r) => {
298-
match ArrowReader::<Int32Type>::read(
296+
ArrowReader::<Int32Type>::read(
299297
r,
300298
self.batch_size,
301299
is_nullable,
302-
) {
303-
Ok(array) => array,
304-
Err(e) => return Err(e),
305-
}
300+
)?
306301
}
307302
ColumnReader::Int64ColumnReader(ref mut r) => {
308-
match ArrowReader::<Int64Type>::read(
303+
ArrowReader::<Int64Type>::read(
309304
r,
310305
self.batch_size,
311306
is_nullable,
312-
) {
313-
Ok(array) => array,
314-
Err(e) => return Err(e),
315-
}
307+
)?
316308
}
317309
ColumnReader::Int96ColumnReader(ref mut r) => {
318310
let mut read_buffer: Vec<Int96> =
@@ -322,7 +314,7 @@ impl ParquetFile {
322314
read_buffer.push(Int96::new());
323315
}
324316

325-
if self.schema.field(i).is_nullable() {
317+
if self.projection_schema.field(i).is_nullable() {
326318
let mut def_levels: Vec<i16> =
327319
Vec::with_capacity(self.batch_size);
328320
for _ in 0..self.batch_size {
@@ -337,21 +329,28 @@ impl ParquetFile {
337329

338330
if values_read == levels_read {
339331
let mut builder = Int64Builder::new(values_read);
340-
341332
for i in 0..values_read {
342-
let v = read_buffer[i].data();
343-
let value: u128 = (v[0] as u128) << 64
344-
| (v[1] as u128) << 32
345-
| (v[2] as u128);
346-
let ms: i64 = (value / 1000000) as i64;
347-
builder.append_value(ms)?;
333+
builder.append_value(convert_int96_timestamp(
334+
read_buffer[i].data(),
335+
))?;
348336
}
349337
Arc::new(builder.finish())
350338
} else {
351-
return Err(ExecutionError::NotImplemented(
352-
"Parquet datasource does not support null values"
353-
.to_string(),
354-
));
339+
let mut builder = Int64Builder::new(levels_read);
340+
let mut value_index = 0;
341+
for i in 0..levels_read {
342+
if def_levels[i] > 0 {
343+
builder.append_value(
344+
convert_int96_timestamp(
345+
read_buffer[value_index].data(),
346+
),
347+
)?;
348+
value_index += 1;
349+
} else {
350+
builder.append_null()?;
351+
}
352+
}
353+
Arc::new(builder.finish())
355354
}
356355
} else {
357356
let (values_read, _) = r.read_batch(
@@ -364,35 +363,26 @@ impl ParquetFile {
364363
let mut builder = Int64Builder::new(values_read);
365364

366365
for i in 0..values_read {
367-
let v = read_buffer[i].data();
368-
let value: u128 = (v[0] as u128) << 64
369-
| (v[1] as u128) << 32
370-
| (v[2] as u128);
371-
let ms: i64 = (value / 1000000) as i64;
372-
builder.append_value(ms)?;
366+
builder.append_value(convert_int96_timestamp(
367+
read_buffer[i].data(),
368+
))?;
373369
}
374370
Arc::new(builder.finish())
375371
}
376372
}
377373
ColumnReader::FloatColumnReader(ref mut r) => {
378-
match ArrowReader::<Float32Type>::read(
374+
ArrowReader::<Float32Type>::read(
379375
r,
380376
self.batch_size,
381377
is_nullable,
382-
) {
383-
Ok(array) => array,
384-
Err(e) => return Err(e),
385-
}
378+
)?
386379
}
387380
ColumnReader::DoubleColumnReader(ref mut r) => {
388-
match ArrowReader::<Float64Type>::read(
381+
ArrowReader::<Float64Type>::read(
389382
r,
390383
self.batch_size,
391384
is_nullable,
392-
) {
393-
Ok(array) => array,
394-
Err(e) => return Err(e),
395-
}
385+
)?
396386
}
397387
ColumnReader::FixedLenByteArrayColumnReader(ref mut r) => {
398388
read_binary_column!(self, r, i)
@@ -408,17 +398,26 @@ impl ParquetFile {
408398
if batch.len() == 0 || batch[0].data().len() == 0 {
409399
Ok(None)
410400
} else {
411-
Ok(Some(RecordBatch::try_new(self.schema.clone(), batch)?))
401+
Ok(Some(RecordBatch::try_new(
402+
self.projection_schema.clone(),
403+
batch,
404+
)?))
412405
}
413406
}
414407
_ => Ok(None),
415408
}
416409
}
417410
}
418411

412+
/// Convert a Parquet INT96 timestamp value (nanoseconds) to a timestamp in milliseconds
413+
fn convert_int96_timestamp(v: &[u32]) -> i64 {
414+
let value: u128 = (v[0] as u128) << 64 | (v[1] as u128) << 32 | (v[2] as u128);
415+
(value / 1000000) as i64
416+
}
417+
419418
impl RecordBatchIterator for ParquetFile {
420419
fn schema(&self) -> &Arc<Schema> {
421-
&self.schema
420+
&self.projection_schema
422421
}
423422

424423
fn next(&mut self) -> Result<Option<RecordBatch>> {

0 commit comments

Comments (0)