Skip to content

Commit 2a389a3

Browse files
committed
Create BinaryArray directly from byte slices to avoid the String -> &str -> &[u8] conversion round-trip
1 parent b20ea6d commit 2a389a3

File tree

2 files changed

+56
-11
lines changed

2 files changed

+56
-11
lines changed

rust/arrow/src/array.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,26 @@ impl<'a> From<Vec<&'a str>> for BinaryArray {
627627
}
628628
}
629629

630+
impl<'a> From<Vec<&[u8]>> for BinaryArray {
631+
fn from(v: Vec<&[u8]>) -> Self {
632+
let mut offsets = vec![];
633+
let mut values = vec![];
634+
let mut length_so_far = 0;
635+
offsets.push(length_so_far);
636+
for s in &v {
637+
length_so_far += s.len() as i32;
638+
offsets.push(length_so_far as i32);
639+
values.extend_from_slice(s);
640+
}
641+
let array_data = ArrayData::builder(DataType::Utf8)
642+
.len(v.len())
643+
.add_buffer(Buffer::from(offsets.to_byte_slice()))
644+
.add_buffer(Buffer::from(&values[..]))
645+
.build();
646+
BinaryArray::from(array_data)
647+
}
648+
}
649+
630650
/// Creates a `BinaryArray` from `List<u8>` array
631651
impl From<ListArray> for BinaryArray {
632652
fn from(v: ListArray) -> Self {
@@ -1155,6 +1175,36 @@ mod tests {
11551175
}
11561176
}
11571177

1178+
#[test]
fn test_binary_array_from_u8_slice() {
    // Expected contents, as raw bytes and as text: ["hello", "", "parquet"]
    let hello: &[u8] = b"hello";
    let empty: &[u8] = b"";
    let parquet: &[u8] = b"parquet";

    let binary_array = BinaryArray::from(vec![hello, empty, parquet]);

    // Shape of the array.
    assert_eq!(3, binary_array.len());
    assert_eq!(0, binary_array.null_count());

    // Each element is readable both as bytes and as a string.
    assert_eq!(hello, binary_array.value(0));
    assert_eq!("hello", binary_array.get_string(0));
    assert_eq!(empty, binary_array.value(1));
    assert_eq!("", binary_array.get_string(1));
    assert_eq!(parquet, binary_array.value(2));
    assert_eq!("parquet", binary_array.get_string(2));

    // The last element starts right after "hello" (the empty element adds nothing).
    assert_eq!(5, binary_array.value_offset(2));
    assert_eq!(7, binary_array.value_length(2));

    // No element is null.
    for idx in 0..3 {
        assert!(binary_array.is_valid(idx));
        assert!(!binary_array.is_null(idx));
    }
}
1207+
11581208
#[test]
11591209
#[should_panic(
11601210
expected = "BinaryArray can only be created from List<u8> arrays, mismatched \

rust/arrow/src/compute/array_ops.rs

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -236,17 +236,14 @@ pub fn filter(array: &Array, filter: &BooleanArray) -> Result<ArrayRef> {
236236
DataType::Float64 => filter_array!(array, filter, Float64Array),
237237
DataType::Boolean => filter_array!(array, filter, BooleanArray),
238238
DataType::Utf8 => {
239-
//TODO: this is inefficient and we should improve the Arrow impl to help make
240-
// this more concise
241239
let b = array.as_any().downcast_ref::<BinaryArray>().unwrap();
242-
let mut values: Vec<String> = Vec::with_capacity(b.len());
240+
let mut values: Vec<&[u8]> = Vec::with_capacity(b.len());
243241
for i in 0..b.len() {
244242
if filter.value(i) {
245-
values.push(b.get_string(i));
243+
values.push(b.value(i));
246244
}
247245
}
248-
let tmp: Vec<&str> = values.iter().map(|s| s.as_str()).collect();
249-
Ok(Arc::new(BinaryArray::from(tmp)))
246+
Ok(Arc::new(BinaryArray::from(values)))
250247
}
251248
other => Err(ArrowError::ComputeError(format!(
252249
"filter not supported for {:?}",
@@ -288,14 +285,12 @@ pub fn limit(array: &Array, num_rows_to_read: usize) -> Result<ArrayRef> {
288285
DataType::Float64 => limit_array!(array, num_rows_to_read, Float64Array),
289286
DataType::Boolean => limit_array!(array, num_rows_to_read, BooleanArray),
290287
DataType::Utf8 => {
291-
//TODO: this is inefficient and we should improve the Arrow impl to help make this more concise
292288
let b = array.as_any().downcast_ref::<BinaryArray>().unwrap();
293-
let mut values: Vec<String> = Vec::with_capacity(num_rows_to_read as usize);
289+
let mut values: Vec<&[u8]> = Vec::with_capacity(num_rows_to_read as usize);
294290
for i in 0..num_rows_to_read {
295-
values.push(b.get_string(i));
291+
values.push(b.value(i));
296292
}
297-
let tmp: Vec<&str> = values.iter().map(|s| s.as_str()).collect();
298-
Ok(Arc::new(BinaryArray::from(tmp)))
293+
Ok(Arc::new(BinaryArray::from(values)))
299294
}
300295
other => Err(ArrowError::ComputeError(format!(
301296
"limit not supported for {:?}",

0 commit comments

Comments
 (0)