Skip to content

Commit 40300ca

Browse files
authored
perf: Speed up Parquet file writing (10%, back to speed of 56) (#8786)
# Which issue does this PR close? - Partial fix for #8783. # Rationale for this change An appreciable slowdown in Parquet writing was noticed. At least part of the slowdown seems to stem from changes to `basic::LogicalType` which caused cloning the enum to take make longer than previously. # What changes are included in this PR? This adds a new `logical_type_ref` call to `BasicTypeInfo` and `ColumnDescriptor`. Unlike the existing `logical_type` which returns a cloned `Option<LogicalType>`, the new methods return `Option<&LogicalType>`. This new function is used in place of `logical_type` internally. # Are these changes tested? Should be covered by existing tests. # Are there any user-facing changes? No. A further optimization would be to change `ColumnOrder::get_sort_order` to take an `Option<&LogicalType>`, but that is a breaking API change.
1 parent 0430664 commit 40300ca

File tree

5 files changed

+26
-10
lines changed

5 files changed

+26
-10
lines changed

parquet/src/column/writer/encoder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ fn replace_zero<T: ParquetValueType>(val: &T, descr: &ColumnDescriptor, replace:
375375
T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap()
376376
}
377377
Type::FIXED_LEN_BYTE_ARRAY
378-
if descr.logical_type() == Some(LogicalType::Float16)
378+
if descr.logical_type_ref() == Some(LogicalType::Float16).as_ref()
379379
&& f16::from_le_bytes(val.as_bytes().try_into().unwrap()) == f16::NEG_ZERO =>
380380
{
381381
T::try_from_le_slice(&f16::to_le_bytes(f16::from_f32(replace))).unwrap()

parquet/src/column/writer/mod.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -868,8 +868,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
868868
// So truncation of those types could lead to inaccurate min/max statistics
869869
Type::FIXED_LEN_BYTE_ARRAY
870870
if !matches!(
871-
self.descr.logical_type(),
872-
Some(LogicalType::Decimal { .. }) | Some(LogicalType::Float16)
871+
self.descr.logical_type_ref(),
872+
Some(&LogicalType::Decimal { .. }) | Some(&LogicalType::Float16)
873873
) =>
874874
{
875875
true
@@ -882,7 +882,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
882882

883883
/// Returns `true` if this column's logical type is a UTF-8 string.
884884
fn is_utf8(&self) -> bool {
885-
self.get_descriptor().logical_type() == Some(LogicalType::String)
885+
self.get_descriptor().logical_type_ref() == Some(&LogicalType::String)
886886
|| self.get_descriptor().converted_type() == ConvertedType::UTF8
887887
}
888888

@@ -1385,7 +1385,7 @@ fn update_max<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T, max: &mut
13851385
fn is_nan<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T) -> bool {
13861386
match T::PHYSICAL_TYPE {
13871387
Type::FLOAT | Type::DOUBLE => val != val,
1388-
Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type() == Some(LogicalType::Float16) => {
1388+
Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type_ref() == Some(&LogicalType::Float16) => {
13891389
let val = val.as_bytes();
13901390
let val = f16::from_le_bytes([val[0], val[1]]);
13911391
val.is_nan()
@@ -1421,7 +1421,7 @@ fn compare_greater<T: ParquetValueType>(descr: &ColumnDescriptor, a: &T, b: &T)
14211421
Type::INT32 | Type::INT64 => {
14221422
if let Some(LogicalType::Integer {
14231423
is_signed: false, ..
1424-
}) = descr.logical_type()
1424+
}) = descr.logical_type_ref()
14251425
{
14261426
// need to compare unsigned
14271427
return compare_greater_unsigned_int(a, b);
@@ -1438,13 +1438,13 @@ fn compare_greater<T: ParquetValueType>(descr: &ColumnDescriptor, a: &T, b: &T)
14381438
};
14391439
}
14401440
Type::FIXED_LEN_BYTE_ARRAY | Type::BYTE_ARRAY => {
1441-
if let Some(LogicalType::Decimal { .. }) = descr.logical_type() {
1441+
if let Some(LogicalType::Decimal { .. }) = descr.logical_type_ref() {
14421442
return compare_greater_byte_array_decimals(a.as_bytes(), b.as_bytes());
14431443
}
14441444
if let ConvertedType::DECIMAL = descr.converted_type() {
14451445
return compare_greater_byte_array_decimals(a.as_bytes(), b.as_bytes());
14461446
}
1447-
if let Some(LogicalType::Float16) = descr.logical_type() {
1447+
if let Some(LogicalType::Float16) = descr.logical_type_ref() {
14481448
return compare_greater_f16(a.as_bytes(), b.as_bytes());
14491449
}
14501450
}

parquet/src/geospatial/accumulator.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ pub fn try_new_geo_stats_accumulator(
3333
descr: &ColumnDescPtr,
3434
) -> Option<Box<dyn GeoStatsAccumulator>> {
3535
if !matches!(
36-
descr.logical_type(),
36+
descr.logical_type_ref(),
3737
Some(LogicalType::Geometry { .. }) | Some(LogicalType::Geography { .. })
3838
) {
3939
return None;

parquet/src/record/api.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,7 @@ impl Field {
756756
descr.type_precision(),
757757
descr.type_scale(),
758758
)),
759-
ConvertedType::NONE if descr.logical_type() == Some(LogicalType::Float16) => {
759+
ConvertedType::NONE if descr.logical_type_ref() == Some(&LogicalType::Float16) => {
760760
if value.len() != 2 {
761761
return Err(general_err!(
762762
"Error reading FIXED_LEN_BYTE_ARRAY as FLOAT16. Length must be 2, got {}",

parquet/src/schema/types.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,11 +707,19 @@ impl BasicTypeInfo {
707707
}
708708

709709
/// Returns [`LogicalType`] value for the type.
710+
///
711+
/// Note that this function will clone the `LogicalType`. If performance is a concern,
712+
/// use [`Self::logical_type_ref`] instead.
710713
pub fn logical_type(&self) -> Option<LogicalType> {
711714
// Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
712715
self.logical_type.clone()
713716
}
714717

718+
/// Return a reference to the [`LogicalType`] value for the type.
719+
pub fn logical_type_ref(&self) -> Option<&LogicalType> {
720+
self.logical_type.as_ref()
721+
}
722+
715723
/// Returns `true` if id is set, `false` otherwise.
716724
pub fn has_id(&self) -> bool {
717725
self.id.is_some()
@@ -908,10 +916,18 @@ impl ColumnDescriptor {
908916
}
909917

910918
/// Returns [`LogicalType`] for this column.
919+
///
920+
/// Note that this function will clone the `LogicalType`. If performance is a concern,
921+
/// use [`Self::logical_type_ref`] instead.
911922
pub fn logical_type(&self) -> Option<LogicalType> {
912923
self.primitive_type.get_basic_info().logical_type()
913924
}
914925

926+
/// Returns a reference to the [`LogicalType`] for this column.
927+
pub fn logical_type_ref(&self) -> Option<&LogicalType> {
928+
self.primitive_type.get_basic_info().logical_type_ref()
929+
}
930+
915931
/// Returns physical type for this column.
916932
/// Note that it will panic if called on a non-primitive type.
917933
pub fn physical_type(&self) -> PhysicalType {

0 commit comments

Comments
 (0)