Skip to content

Commit eaca232

Browse files
authored
[Parquet] Allow reading of files with unknown logical types (#8777)
# Which issue does this PR close?

- Closes #8776.

# Rationale for this change

See issue.

# What changes are included in this PR?

Modifies a few conversion functions to account for unknown logical types.

# Are these changes tested?

Yes, tests are added.

# Are there any user-facing changes?

No.
1 parent 6be6cba commit eaca232

File tree

4 files changed

+60
-1
lines changed

4 files changed

+60
-1
lines changed

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1195,7 +1195,7 @@ mod tests {
11951195
};
11961196
use crate::arrow::schema::add_encoded_arrow_schema_to_metadata;
11971197
use crate::arrow::{ArrowWriter, ProjectionMask};
1198-
use crate::basic::{ConvertedType, Encoding, Repetition, Type as PhysicalType};
1198+
use crate::basic::{ConvertedType, Encoding, LogicalType, Repetition, Type as PhysicalType};
11991199
use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE;
12001200
use crate::data_type::{
12011201
BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType,
@@ -5019,4 +5019,27 @@ mod tests {
50195019
assert!(sbbf.check(&"Hello"));
50205020
assert!(!sbbf.check(&"Hello_Not_Exists"));
50215021
}
5022+
5023+
// Reads `unknown-logical-type.parquet` through `ParquetRecordBatchReaderBuilder`
// and checks that a column annotated with an unknown logical type can still be
// opened and decoded into a record batch.
#[test]
5024+
fn test_read_unknown_logical_type() {
5025+
let testdata = arrow::util::test_util::parquet_test_data();
5026+
let path = format!("{testdata}/unknown-logical-type.parquet");
5027+
let test_file = File::open(path).unwrap();
5028+
5029+
let builder = ParquetRecordBatchReaderBuilder::try_new(test_file)
5030+
.expect("Error creating reader builder");
5031+
5032+
// Column 0 has a known logical type (String); column 1 is reported as
// `LogicalType::_Unknown` while its physical type is still BYTE_ARRAY.
let schema = builder.metadata().file_metadata().schema_descr();
5033+
assert_eq!(schema.column(0).logical_type(), Some(LogicalType::String));
5034+
assert_eq!(
5035+
schema.column(1).logical_type(),
5036+
Some(LogicalType::_Unknown { field_id: 2555 })
5037+
);
5038+
assert_eq!(schema.column(1).physical_type(), PhysicalType::BYTE_ARRAY);
5039+
5040+
// Only the first batch is pulled; it is expected to hold 3 rows and both columns.
let mut reader = builder.build().unwrap();
5041+
let out = reader.next().unwrap().unwrap();
5042+
assert_eq!(out.num_rows(), 3);
5043+
assert_eq!(out.num_columns(), 2);
5044+
}
50225045
}

parquet/src/arrow/schema/primitive.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result<D
278278
(Some(LogicalType::Enum), _) => Ok(DataType::Binary),
279279
(Some(LogicalType::Geometry { .. }), _) => Ok(DataType::Binary),
280280
(Some(LogicalType::Geography { .. }), _) => Ok(DataType::Binary),
281+
(Some(LogicalType::_Unknown { .. }), _) => Ok(DataType::Binary),
281282
(None, ConvertedType::NONE) => Ok(DataType::Binary),
282283
(None, ConvertedType::JSON) => Ok(DataType::Utf8),
283284
(None, ConvertedType::BSON) => Ok(DataType::Binary),

parquet/src/file/serialized_reader.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2696,4 +2696,31 @@ mod tests {
26962696
);
26972697
}
26982698
}
2699+
2700+
// Opens `unknown-logical-type.parquet` with `SerializedFileReader` and checks
// that a column annotated with an unknown logical type does not prevent the
// schema from being read or the rows from being iterated.
#[test]
2701+
fn test_read_unknown_logical_type() {
2702+
let file = get_test_file("unknown-logical-type.parquet");
2703+
let reader = SerializedFileReader::new(file).expect("Error opening file");
2704+
2705+
// Column 0 has a known logical type (String); column 1 is reported as
// `LogicalType::_Unknown` while its physical type is still BYTE_ARRAY.
let schema = reader.metadata().file_metadata().schema_descr();
2706+
assert_eq!(
2707+
schema.column(0).logical_type(),
2708+
Some(basic::LogicalType::String)
2709+
);
2710+
assert_eq!(
2711+
schema.column(1).logical_type(),
2712+
Some(basic::LogicalType::_Unknown { field_id: 2555 })
2713+
);
2714+
assert_eq!(schema.column(1).physical_type(), Type::BYTE_ARRAY);
2715+
2716+
// `None` is passed as the projection argument to `get_row_iter`.
let mut iter = reader
2717+
.get_row_iter(None)
2718+
.expect("Failed to create row iterator");
2719+
2720+
// Count every item the iterator yields and compare the total against the
// row count recorded in the file metadata.
let mut num_rows = 0;
2721+
while iter.next().is_some() {
2722+
num_rows += 1;
2723+
}
2724+
assert_eq!(num_rows, reader.metadata().file_metadata().num_rows());
2725+
}
26992726
}

parquet/src/schema/types.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,8 @@ impl<'a> PrimitiveTypeBuilder<'a> {
418418
self.name
419419
));
420420
}
421+
// unknown logical type means just use physical type
422+
(LogicalType::_Unknown { .. }, _) => {}
421423
(a, b) => {
422424
return Err(general_err!(
423425
"Cannot annotate {:?} from {} for field '{}'",
@@ -1714,6 +1716,12 @@ mod tests {
17141716
"Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
17151717
);
17161718
}
1719+
1720+
// test unknown logical types are ok
1721+
result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1722+
.with_logical_type(Some(LogicalType::_Unknown { field_id: 100 }))
1723+
.build();
1724+
assert!(result.is_ok());
17171725
}
17181726

17191727
#[test]

0 commit comments

Comments
 (0)