Skip to content

Commit 8b0adeb

Browse files
committed
Preserve dictionary encoding from parquet (#171)
1 parent 0cc0c05 commit 8b0adeb

File tree

11 files changed

+927
-164
lines changed

11 files changed

+927
-164
lines changed

parquet/src/arrow/array_reader.rs

Lines changed: 18 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -56,11 +56,10 @@ use arrow::datatypes::{
5656
use arrow::util::bit_util;
5757

5858
use crate::arrow::converter::{
59-
BinaryArrayConverter, BinaryConverter, Converter, DecimalArrayConverter,
60-
DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter,
61-
Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter,
62-
IntervalDayTimeConverter, IntervalYearMonthArrayConverter,
63-
IntervalYearMonthConverter, Utf8ArrayConverter, Utf8Converter,
59+
Converter, DecimalArrayConverter, DecimalConverter, FixedLenBinaryConverter,
60+
FixedSizeArrayConverter, Int96ArrayConverter, Int96Converter,
61+
IntervalDayTimeArrayConverter, IntervalDayTimeConverter,
62+
IntervalYearMonthArrayConverter, IntervalYearMonthConverter,
6463
};
6564
use crate::arrow::record_reader::buffer::{ScalarValue, ValuesBuffer};
6665
use crate::arrow::record_reader::{GenericRecordReader, RecordReader};
@@ -70,8 +69,8 @@ use crate::column::page::PageIterator;
7069
use crate::column::reader::decoder::ColumnValueDecoder;
7170
use crate::column::reader::ColumnReaderImpl;
7271
use crate::data_type::{
73-
BoolType, ByteArrayType, DataType, DoubleType, FixedLenByteArrayType, FloatType,
74-
Int32Type, Int64Type, Int96Type,
72+
BoolType, DataType, DoubleType, FixedLenByteArrayType, FloatType, Int32Type,
73+
Int64Type, Int96Type,
7574
};
7675
use crate::errors::{ParquetError, ParquetError::ArrowError, Result};
7776
use crate::file::reader::{FilePageIterator, FileReader};
@@ -81,9 +80,12 @@ use crate::schema::types::{
8180
use crate::schema::visitor::TypeVisitor;
8281

8382
mod byte_array;
83+
mod byte_array_dictionary;
84+
mod dictionary_buffer;
8485
mod offset_buffer;
8586

8687
pub use byte_array::make_byte_array_reader;
88+
pub use byte_array_dictionary::make_byte_array_dictionary_reader;
8789

8890
/// Array reader reads parquet data into arrow array.
8991
pub trait ArrayReader {
@@ -271,7 +273,8 @@ where
271273
.clone(),
272274
};
273275

274-
let record_reader = RecordReader::<T>::new_with_options(column_desc.clone(), null_mask_only);
276+
let record_reader =
277+
RecordReader::<T>::new_with_options(column_desc.clone(), null_mask_only);
275278

276279
Ok(Self {
277280
data_type,
@@ -1783,35 +1786,12 @@ impl<'a> ArrayReaderBuilder {
17831786
)?,
17841787
)),
17851788
PhysicalType::BYTE_ARRAY => match arrow_type {
1786-
// TODO: Replace with optimised dictionary reader (#171)
1787-
Some(ArrowType::Dictionary(_, _)) => {
1788-
match cur_type.get_basic_info().converted_type() {
1789-
ConvertedType::UTF8 => {
1790-
let converter = Utf8Converter::new(Utf8ArrayConverter {});
1791-
Ok(Box::new(ComplexObjectArrayReader::<
1792-
ByteArrayType,
1793-
Utf8Converter,
1794-
>::new(
1795-
page_iterator,
1796-
column_desc,
1797-
converter,
1798-
arrow_type,
1799-
)?))
1800-
}
1801-
_ => {
1802-
let converter = BinaryConverter::new(BinaryArrayConverter {});
1803-
Ok(Box::new(ComplexObjectArrayReader::<
1804-
ByteArrayType,
1805-
BinaryConverter,
1806-
>::new(
1807-
page_iterator,
1808-
column_desc,
1809-
converter,
1810-
arrow_type,
1811-
)?))
1812-
}
1813-
}
1814-
}
1789+
Some(ArrowType::Dictionary(_, _)) => make_byte_array_dictionary_reader(
1790+
page_iterator,
1791+
column_desc,
1792+
arrow_type,
1793+
null_mask_only,
1794+
),
18151795
_ => make_byte_array_reader(
18161796
page_iterator,
18171797
column_desc,
@@ -2025,7 +2005,7 @@ mod tests {
20252005
use crate::arrow::schema::parquet_to_arrow_schema;
20262006
use crate::basic::{Encoding, Type as PhysicalType};
20272007
use crate::column::page::{Page, PageReader};
2028-
use crate::data_type::{ByteArray, DataType, Int32Type, Int64Type};
2008+
use crate::data_type::{ByteArray, ByteArrayType, DataType, Int32Type, Int64Type};
20292009
use crate::errors::Result;
20302010
use crate::file::reader::{FileReader, SerializedFileReader};
20312011
use crate::schema::parser::parse_message_type;

parquet/src/arrow/array_reader/byte_array.rs

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -668,15 +668,9 @@ mod tests {
668668
assert_eq!(decoder.read(&mut output, 4..8).unwrap(), 0);
669669

670670
let valid = vec![false, false, true, true, false, true, true, false, false];
671-
let rev_position_iter = valid
672-
.iter()
673-
.enumerate()
674-
.rev()
675-
.filter_map(|(i, valid)| valid.then(|| i));
676-
677671
let valid_buffer = Buffer::from_iter(valid.iter().cloned());
678672

679-
output.pad_nulls(0, 4, valid.len(), rev_position_iter);
673+
output.pad_nulls(0, 4, valid.len(), valid_buffer.as_slice());
680674
let array = output.into_array(Some(valid_buffer), ArrowType::Utf8);
681675
let strings = array.as_any().downcast_ref::<StringArray>().unwrap();
682676

0 commit comments

Comments
 (0)