Skip to content

Commit

Permalink
Add DataType::Utf8View and DataType::BinaryView (#5470)
Browse files Browse the repository at this point in the history
* Add BinaryView type

* Add Utf8View

* Apply suggestions from code review

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

* mention what is unimplemented in binaryview/utf8view

* Update arrow-data/src/data.rs

Co-authored-by: Daniël Heres <danielheres@gmail.com>

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>
Co-authored-by: Daniël Heres <danielheres@gmail.com>
  • Loading branch information
4 people authored Mar 5, 2024
1 parent d350ac5 commit 7eb866d
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 2 deletions.
7 changes: 7 additions & 0 deletions arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
buffer.push(0i64);
[buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
}
DataType::BinaryView | DataType::Utf8View => [
MutableBuffer::new(capacity * mem::size_of::<u128>()),
empty_buffer,
],
DataType::List(_) | DataType::Map(_, _) => {
// offset buffer always starts with a zero
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
Expand Down Expand Up @@ -1541,6 +1545,9 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
DataType::BinaryView | DataType::Utf8View => {
unimplemented!("BinaryView/Utf8View not implemented")
}
DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data
DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
Expand Down
3 changes: 3 additions & 0 deletions arrow-data/src/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ fn equal_values(
variable_sized_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len)
}
DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len),
DataType::BinaryView | DataType::Utf8View => {
unimplemented!("BinaryView/Utf8View not yet implemented")
}
DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len),
Expand Down
9 changes: 9 additions & 0 deletions arrow-data/src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ fn build_extend(array: &ArrayData) -> Extend {
DataType::Decimal256(_, _) => primitive::build_extend::<i256>(array),
DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array),
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
DataType::BinaryView | DataType::Utf8View => {
unimplemented!("BinaryView/Utf8View not implemented")
}
DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
DataType::LargeList(_) => list::build_extend::<i64>(array),
DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
Expand Down Expand Up @@ -266,6 +269,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
DataType::Decimal256(_, _) => primitive::extend_nulls::<i256>,
DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>,
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
DataType::BinaryView | DataType::Utf8View => {
unimplemented!("BinaryView/Utf8View not implemented")
}
DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
DataType::LargeList(_) => list::extend_nulls::<i64>,
DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
Expand Down Expand Up @@ -419,6 +425,9 @@ impl<'a> MutableArrayData<'a> {
| DataType::LargeBinary
| DataType::Interval(_)
| DataType::FixedSizeBinary(_) => vec![],
DataType::BinaryView | DataType::Utf8View => {
unimplemented!("BinaryView/Utf8View not implemented")
}
DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
let children = arrays
.iter()
Expand Down
3 changes: 3 additions & 0 deletions arrow-integration-test/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value {
DataType::LargeUtf8 => json!({"name": "largeutf8"}),
DataType::Binary => json!({"name": "binary"}),
DataType::LargeBinary => json!({"name": "largebinary"}),
DataType::BinaryView | DataType::Utf8View => {
unimplemented!("BinaryView/Utf8View not implemented")
}
DataType::FixedSizeBinary(byte_width) => {
json!({"name": "fixedsizebinary", "byteWidth": byte_width})
}
Expand Down
1 change: 1 addition & 0 deletions arrow-ipc/src/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,7 @@ pub(crate) fn get_fb_field_type<'a>(
.as_union_value(),
children: Some(fbb.create_vector(&empty_fields[..])),
},
BinaryView | Utf8View => unimplemented!("BinaryView/Utf8View not implemented"),
Utf8 => FBFieldType {
type_type: crate::Type::Utf8,
type_: crate::Utf8Builder::new(fbb).finish().as_union_value(),
Expand Down
24 changes: 22 additions & 2 deletions arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,15 @@ pub enum DataType {
/// A single LargeBinary array can store up to [`i64::MAX`] bytes
/// of binary data in total.
LargeBinary,
/// (NOT YET FULLY SUPPORTED) Opaque binary data of variable length.
///
/// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
///
/// Logically the same as [`Self::Binary`], but the internal representation uses a view
/// struct that contains the value's length and either the value's entire data
/// inline (for small values) or an inlined prefix, an index of another buffer,
/// and an offset pointing to a slice in that buffer (for larger values).
BinaryView,
/// A variable-length string in Unicode with UTF-8 encoding.
///
/// A single Utf8 array can store up to [`i32::MAX`] bytes
Expand All @@ -206,6 +215,15 @@ pub enum DataType {
/// A single LargeUtf8 array can store up to [`i64::MAX`] bytes
/// of string data in total.
LargeUtf8,
/// (NOT YET FULLY SUPPORTED) A variable-length string in Unicode with UTF-8 encoding.
///
/// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
///
/// Logically the same as [`Self::Utf8`], but the internal representation uses a view
/// struct that contains the string length and either the string's entire data
/// inline (for small strings) or an inlined prefix, an index of another buffer,
/// and an offset pointing to a slice in that buffer (for non-small strings).
Utf8View,
/// A list of some logical data type with variable length.
///
/// A single List array can store up to [`i32::MAX`] elements in total.
Expand Down Expand Up @@ -515,8 +533,8 @@ impl DataType {
DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
DataType::Decimal128(_, _) => Some(16),
DataType::Decimal256(_, _) => Some(32),
DataType::Utf8 | DataType::LargeUtf8 => None,
DataType::Binary | DataType::LargeBinary => None,
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None,
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
DataType::FixedSizeBinary(_) => None,
DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => None,
DataType::FixedSizeList(_, _) => None,
Expand Down Expand Up @@ -555,8 +573,10 @@ impl DataType {
| DataType::Binary
| DataType::FixedSizeBinary(_)
| DataType::LargeBinary
| DataType::BinaryView
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Utf8View
| DataType::Decimal128(_, _)
| DataType::Decimal256(_, _) => 0,
DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(),
Expand Down
2 changes: 2 additions & 0 deletions arrow-schema/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,7 @@ impl Field {
| DataType::Duration(_)
| DataType::Binary
| DataType::LargeBinary
| DataType::BinaryView
| DataType::Interval(_)
| DataType::LargeList(_)
| DataType::List(_)
Expand All @@ -517,6 +518,7 @@ impl Field {
| DataType::FixedSizeBinary(_)
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Utf8View
| DataType::Decimal128(_, _)
| DataType::Decimal256(_, _) => {
if from.data_type == DataType::Null {
Expand Down
1 change: 1 addition & 0 deletions parquet/src/arrow/schema/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
.with_length(*length)
.build()
}
DataType::BinaryView | DataType::Utf8View => unimplemented!("BinaryView/Utf8View not implemented"),
DataType::Decimal128(precision, scale)
| DataType::Decimal256(precision, scale) => {
// Decimal precision determines the Parquet physical type to use.
Expand Down

0 comments on commit 7eb866d

Please sign in to comment.