Skip to content

Commit 9707916

Browse files
joroKr21avantgardnerio
authored andcommitted
Fix panics in array_union (#287) (apache#15149) v48
* Drop rust-toolchain * Fix panics in array_union * Fix the chrono
1 parent 2717874 commit 9707916

File tree

4 files changed

+100
-110
lines changed

4 files changed

+100
-110
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ arrow-schema = { version = "53.3.0", default-features = false }
9595
async-trait = "0.1.73"
9696
bigdecimal = "0.4.7"
9797
bytes = "1.4"
98-
chrono = { version = "0.4.38", default-features = false }
98+
chrono = { version = ">=0.4.34,<0.4.40", default-features = false }
9999
ctor = "0.2.0"
100100
dashmap = "6.0.1"
101101
datafusion = { path = "datafusion/core", version = "44.0.0", default-features = false }

benchmarks/requirements.txt

Lines changed: 0 additions & 18 deletions
This file was deleted.

datafusion/functions-nested/src/set_ops.rs

Lines changed: 89 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,16 @@
1717

1818
//! [`ScalarUDFImpl`] definitions for array_union, array_intersect and array_distinct functions.
1919
20-
use crate::make_array::{empty_array_type, make_array_inner};
2120
use crate::utils::make_scalar_function;
22-
use arrow::array::{new_empty_array, Array, ArrayRef, GenericListArray, OffsetSizeTrait};
21+
use arrow::array::{Array, ArrayRef, GenericListArray, OffsetSizeTrait};
2322
use arrow::buffer::OffsetBuffer;
2423
use arrow::compute;
2524
use arrow::datatypes::{DataType, Field, FieldRef};
2625
use arrow::row::{RowConverter, SortField};
26+
use arrow_array::{new_null_array, LargeListArray, ListArray};
2727
use arrow_schema::DataType::{FixedSizeList, LargeList, List, Null};
2828
use datafusion_common::cast::{as_large_list_array, as_list_array};
29-
use datafusion_common::{exec_err, internal_err, Result};
29+
use datafusion_common::{exec_err, internal_err, plan_err, Result};
3030
use datafusion_expr::scalar_doc_sections::DOC_SECTION_ARRAY;
3131
use datafusion_expr::{
3232
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
@@ -92,7 +92,8 @@ impl ScalarUDFImpl for ArrayUnion {
9292

9393
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
9494
match (&arg_types[0], &arg_types[1]) {
95-
(&Null, dt) => Ok(dt.clone()),
95+
(Null, Null) => Ok(DataType::new_list(Null, true)),
96+
(Null, dt) => Ok(dt.clone()),
9697
(dt, Null) => Ok(dt.clone()),
9798
(dt, _) => Ok(dt.clone()),
9899
}
@@ -182,9 +183,10 @@ impl ScalarUDFImpl for ArrayIntersect {
182183

183184
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
184185
match (arg_types[0].clone(), arg_types[1].clone()) {
185-
(Null, Null) | (Null, _) => Ok(Null),
186-
(_, Null) => Ok(empty_array_type()),
187-
(dt, _) => Ok(dt),
186+
(Null, Null) => Ok(DataType::new_list(Null, true)),
187+
(Null, dt) => Ok(dt.clone()),
188+
(dt, Null) => Ok(dt.clone()),
189+
(dt, _) => Ok(dt.clone()),
188190
}
189191
}
190192

@@ -332,22 +334,18 @@ fn array_distinct_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
332334
return exec_err!("array_distinct needs one argument");
333335
}
334336

335-
// handle null
336-
if args[0].data_type() == &Null {
337-
return Ok(Arc::clone(&args[0]));
338-
}
339-
340-
// handle for list & largelist
341-
match args[0].data_type() {
337+
let array = &args[0];
338+
match array.data_type() {
339+
Null => Ok(Arc::clone(array)),
342340
List(field) => {
343-
let array = as_list_array(&args[0])?;
341+
let array = as_list_array(array)?;
344342
general_array_distinct(array, field)
345343
}
346344
LargeList(field) => {
347-
let array = as_large_list_array(&args[0])?;
345+
let array = as_large_list_array(array)?;
348346
general_array_distinct(array, field)
349347
}
350-
array_type => exec_err!("array_distinct does not support type '{array_type:?}'"),
348+
arg_type => exec_err!("array_distinct does not support type '{arg_type:?}'"),
351349
}
352350
}
353351

@@ -372,80 +370,69 @@ fn generic_set_lists<OffsetSize: OffsetSizeTrait>(
372370
field: Arc<Field>,
373371
set_op: SetOp,
374372
) -> Result<ArrayRef> {
375-
if matches!(l.value_type(), Null) {
373+
if l.is_empty() || l.value_type().is_null() {
376374
let field = Arc::new(Field::new_list_field(r.value_type(), true));
377375
return general_array_distinct::<OffsetSize>(r, &field);
378-
} else if matches!(r.value_type(), Null) {
376+
} else if r.is_empty() || r.value_type().is_null() {
379377
let field = Arc::new(Field::new_list_field(l.value_type(), true));
380378
return general_array_distinct::<OffsetSize>(l, &field);
381379
}
382380

383-
// Handle empty array at rhs case
384-
// array_union(arr, []) -> arr;
385-
// array_intersect(arr, []) -> [];
386-
if r.value_length(0).is_zero() {
387-
if set_op == SetOp::Union {
388-
return Ok(Arc::new(l.clone()) as ArrayRef);
389-
} else {
390-
return Ok(Arc::new(r.clone()) as ArrayRef);
391-
}
392-
}
393-
394381
if l.value_type() != r.value_type() {
395-
return internal_err!("{set_op:?} is not implemented for '{l:?}' and '{r:?}'");
382+
return internal_err!(
383+
"{set_op} is not implemented for {} and {}",
384+
l.data_type(),
385+
r.data_type()
386+
);
396387
}
397388

398-
let dt = l.value_type();
399-
400389
let mut offsets = vec![OffsetSize::usize_as(0)];
401390
let mut new_arrays = vec![];
402-
403-
let converter = RowConverter::new(vec![SortField::new(dt)])?;
391+
let converter = RowConverter::new(vec![SortField::new(l.value_type())])?;
404392
for (first_arr, second_arr) in l.iter().zip(r.iter()) {
405-
if let (Some(first_arr), Some(second_arr)) = (first_arr, second_arr) {
406-
let l_values = converter.convert_columns(&[first_arr])?;
407-
let r_values = converter.convert_columns(&[second_arr])?;
408-
409-
let l_iter = l_values.iter().sorted().dedup();
410-
let values_set: HashSet<_> = l_iter.clone().collect();
411-
let mut rows = if set_op == SetOp::Union {
412-
l_iter.collect::<Vec<_>>()
413-
} else {
414-
vec![]
415-
};
416-
for r_val in r_values.iter().sorted().dedup() {
417-
match set_op {
418-
SetOp::Union => {
419-
if !values_set.contains(&r_val) {
420-
rows.push(r_val);
421-
}
422-
}
423-
SetOp::Intersect => {
424-
if values_set.contains(&r_val) {
425-
rows.push(r_val);
426-
}
427-
}
428-
}
429-
}
393+
let l_values = if let Some(first_arr) = first_arr {
394+
converter.convert_columns(&[first_arr])?
395+
} else {
396+
converter.convert_columns(&[])?
397+
};
398+
399+
let r_values = if let Some(second_arr) = second_arr {
400+
converter.convert_columns(&[second_arr])?
401+
} else {
402+
converter.convert_columns(&[])?
403+
};
430404

431-
let last_offset = match offsets.last().copied() {
432-
Some(offset) => offset,
433-
None => return internal_err!("offsets should not be empty"),
434-
};
435-
offsets.push(last_offset + OffsetSize::usize_as(rows.len()));
436-
let arrays = converter.convert_rows(rows)?;
437-
let array = match arrays.first() {
438-
Some(array) => Arc::clone(array),
439-
None => {
440-
return internal_err!("{set_op}: failed to get array from rows");
441-
}
442-
};
443-
new_arrays.push(array);
405+
let l_iter = l_values.iter().sorted().dedup();
406+
let values_set: HashSet<_> = l_iter.clone().collect();
407+
let mut rows = if set_op == SetOp::Union {
408+
l_iter.collect()
409+
} else {
410+
vec![]
411+
};
412+
413+
for r_val in r_values.iter().sorted().dedup() {
414+
match set_op {
415+
SetOp::Union if !values_set.contains(&r_val) => rows.push(r_val),
416+
SetOp::Intersect if values_set.contains(&r_val) => rows.push(r_val),
417+
_ => (),
418+
}
444419
}
420+
421+
let last_offset = match offsets.last() {
422+
Some(offset) => *offset,
423+
None => return internal_err!("offsets should not be empty"),
424+
};
425+
426+
offsets.push(last_offset + OffsetSize::usize_as(rows.len()));
427+
let arrays = converter.convert_rows(rows)?;
428+
new_arrays.push(match arrays.first() {
429+
Some(array) => Arc::clone(array),
430+
None => return internal_err!("{set_op}: failed to get array from rows"),
431+
});
445432
}
446433

447434
let offsets = OffsetBuffer::new(offsets.into());
448-
let new_arrays_ref = new_arrays.iter().map(|v| v.as_ref()).collect::<Vec<_>>();
435+
let new_arrays_ref: Vec<_> = new_arrays.iter().map(|v| v.as_ref()).collect();
449436
let values = compute::concat(&new_arrays_ref)?;
450437
let arr = GenericListArray::<OffsetSize>::try_new(field, offsets, values, None)?;
451438
Ok(Arc::new(arr))
@@ -456,38 +443,60 @@ fn general_set_op(
456443
array2: &ArrayRef,
457444
set_op: SetOp,
458445
) -> Result<ArrayRef> {
446+
fn empty_array(data_type: &DataType, len: usize, large: bool) -> Result<ArrayRef> {
447+
let field = Arc::new(Field::new_list_field(data_type.clone(), true));
448+
let values = new_null_array(data_type, len);
449+
if large {
450+
Ok(Arc::new(LargeListArray::try_new(
451+
field,
452+
OffsetBuffer::new_zeroed(len),
453+
values,
454+
None,
455+
)?))
456+
} else {
457+
Ok(Arc::new(ListArray::try_new(
458+
field,
459+
OffsetBuffer::new_zeroed(len),
460+
values,
461+
None,
462+
)?))
463+
}
464+
}
465+
459466
match (array1.data_type(), array2.data_type()) {
467+
(Null, Null) => Ok(Arc::new(ListArray::new_null(
468+
Arc::new(Field::new_list_field(Null, true)),
469+
array1.len(),
470+
))),
460471
(Null, List(field)) => {
461472
if set_op == SetOp::Intersect {
462-
return Ok(new_empty_array(&Null));
473+
return empty_array(field.data_type(), array1.len(), false);
463474
}
464475
let array = as_list_array(&array2)?;
465476
general_array_distinct::<i32>(array, field)
466477
}
467478

468479
(List(field), Null) => {
469480
if set_op == SetOp::Intersect {
470-
return make_array_inner(&[]);
481+
return empty_array(field.data_type(), array1.len(), false);
471482
}
472483
let array = as_list_array(&array1)?;
473484
general_array_distinct::<i32>(array, field)
474485
}
475486
(Null, LargeList(field)) => {
476487
if set_op == SetOp::Intersect {
477-
return Ok(new_empty_array(&Null));
488+
return empty_array(field.data_type(), array1.len(), true);
478489
}
479490
let array = as_large_list_array(&array2)?;
480491
general_array_distinct::<i64>(array, field)
481492
}
482493
(LargeList(field), Null) => {
483494
if set_op == SetOp::Intersect {
484-
return make_array_inner(&[]);
495+
return empty_array(field.data_type(), array1.len(), true);
485496
}
486497
let array = as_large_list_array(&array1)?;
487498
general_array_distinct::<i64>(array, field)
488499
}
489-
(Null, Null) => Ok(new_empty_array(&Null)),
490-
491500
(List(field), List(_)) => {
492501
let array1 = as_list_array(&array1)?;
493502
let array2 = as_list_array(&array2)?;

datafusion/sqllogictest/test_files/array.slt

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4063,21 +4063,24 @@ select array_union(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5,
40634063
statement ok
40644064
CREATE TABLE arrays_with_repeating_elements_for_union
40654065
AS VALUES
4066-
([1], [2]),
4066+
([0, 1, 1], []),
4067+
([1, 1], [2]),
40674068
([2, 3], [3]),
40684069
([3], [3, 4])
40694070
;
40704071

40714072
query ?
40724073
select array_union(column1, column2) from arrays_with_repeating_elements_for_union;
40734074
----
4075+
[0, 1]
40744076
[1, 2]
40754077
[2, 3]
40764078
[3, 4]
40774079

40784080
query ?
40794081
select array_union(arrow_cast(column1, 'LargeList(Int64)'), arrow_cast(column2, 'LargeList(Int64)')) from arrays_with_repeating_elements_for_union;
40804082
----
4083+
[0, 1]
40814084
[1, 2]
40824085
[2, 3]
40834086
[3, 4]
@@ -4097,15 +4100,11 @@ select array_union(arrow_cast([], 'LargeList(Int64)'), arrow_cast([], 'LargeList
40974100
[]
40984101

40994102
# array_union scalar function #7
4100-
query ?
4103+
query error DataFusion error: Internal error: array_union is not implemented for
41014104
select array_union([[null]], []);
4102-
----
4103-
[[]]
41044105

4105-
query ?
4106+
query error DataFusion error: Internal error: array_union is not implemented for
41064107
select array_union(arrow_cast([[null]], 'LargeList(List(Int64))'), arrow_cast([], 'LargeList(Int64)'));
4107-
----
4108-
[[]]
41094108

41104109
# array_union scalar function #8
41114110
query ?
@@ -5893,12 +5892,12 @@ select array_intersect(arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'), null)
58935892
query ?
58945893
select array_intersect(null, [1, 1, 2, 2, 3, 3]);
58955894
----
5896-
NULL
5895+
[]
58975896

58985897
query ?
58995898
select array_intersect(null, arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'));
59005899
----
5901-
NULL
5900+
[]
59025901

59035902
query ?
59045903
select array_intersect([], null);
@@ -5923,12 +5922,12 @@ select array_intersect(arrow_cast([], 'LargeList(Int64)'), null);
59235922
query ?
59245923
select array_intersect(null, []);
59255924
----
5926-
NULL
5925+
[]
59275926

59285927
query ?
59295928
select array_intersect(null, arrow_cast([], 'LargeList(Int64)'));
59305929
----
5931-
NULL
5930+
[]
59325931

59335932
query ?
59345933
select array_intersect(null, null);

0 commit comments

Comments
 (0)