Skip to content

Commit 04f56bd

Browse files
rluvaton and alamb authored
fix(datafusion-functions-nested): `array_distinct` now works with null rows (#13966)
* added failing test
* fix(datafusion-functions-nested): `array_distinct` now works with null rows
* Update datafusion/functions-nested/src/set_ops.rs (Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>)
* Update set_ops.rs

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 38ccb00 commit 04f56bd

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

datafusion/functions-nested/src/set_ops.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -516,11 +516,16 @@ fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
516516
let mut new_arrays = Vec::with_capacity(array.len());
517517
let converter = RowConverter::new(vec![SortField::new(dt)])?;
518518
// distinct for each list in ListArray
519-
for arr in array.iter().flatten() {
519+
for arr in array.iter() {
520+
let last_offset: OffsetSize = offsets.last().copied().unwrap();
521+
let Some(arr) = arr else {
522+
// Add same offset for null
523+
offsets.push(last_offset);
524+
continue;
525+
};
520526
let values = converter.convert_columns(&[arr])?;
521527
// sort elements in list and remove duplicates
522528
let rows = values.iter().sorted().dedup().collect::<Vec<_>>();
523-
let last_offset: OffsetSize = offsets.last().copied().unwrap();
524529
offsets.push(last_offset + OffsetSize::usize_as(rows.len()));
525530
let arrays = converter.convert_rows(rows)?;
526531
let array = match arrays.first() {
@@ -538,6 +543,7 @@ fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
538543
Arc::clone(field),
539544
offsets,
540545
values,
541-
None,
546+
// Keep the list nulls
547+
array.nulls().cloned(),
542548
)?))
543549
}

datafusion/sqllogictest/test_files/array.slt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5674,6 +5674,13 @@ select array_distinct([sum(a)]) from t1 where a > 100 group by b;
56745674
statement ok
56755675
drop table t1;
56765676

5677+
query ?
5678+
select array_distinct(a) from values ([1, 2, 3]), (null), ([1, 3, 1]) as X(a);
5679+
----
5680+
[1, 2, 3]
5681+
NULL
5682+
[1, 3]
5683+
56775684
query ?
56785685
select array_distinct([]);
56795686
----

0 commit comments

Comments
 (0)