-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Description
Describe the bug
Certain kernels when operating on RunArrays don't account for them potentially being sliced, meaning they can return incorrect results.
To Reproduce
See following tests:
arrow-rs/arrow-cast/src/cast/mod.rs
Lines 11748 to 11763 in addf74d
| #[test] | |
| #[should_panic = "assertion `left == right` failed\n left: ScalarBuffer([1, 1, 2])\n right: [2, 2, 3]"] | |
| // TODO: fix cast of RunArrays to account for sliced RunArray's | |
| // https://github.com/apache/arrow-rs/issues/9018 | |
| fn test_sliced_run_end_encoded_to_primitive() { | |
| let run_ends = Int32Array::from(vec![2, 5, 6]); | |
| let values = Int32Array::from(vec![1, 2, 3]); | |
| // [1, 1, 2, 2, 2, 3] | |
| let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap(); | |
| let run_array = run_array.slice(3, 3); // [2, 2, 3] | |
| let array_ref = Arc::new(run_array) as ArrayRef; | |
| let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); | |
| let result_run_array = cast_result.as_primitive::<Int64Type>(); | |
| assert_eq!(result_run_array.values(), &[2, 2, 3]); | |
| } |
arrow-rs/arrow-select/src/concat.rs
Lines 1718 to 1741 in addf74d
| #[test] | |
| #[should_panic = "assertion `left == right` failed\n left: [20, 20, 40, 40, 40]\n right: [10, 10, 20, 20, 30, 40, 40, 40]"] | |
| // TODO: fix concat of RunArrays to account for sliced RunArray's | |
| // https://github.com/apache/arrow-rs/issues/9018 | |
| fn test_concat_sliced_run_array() { | |
| // Slicing away first run in both arrays | |
| let run_ends1 = Int32Array::from(vec![2, 4]); | |
| let values1 = Int32Array::from(vec![10, 20]); | |
| let array1 = RunArray::try_new(&run_ends1, &values1).unwrap(); // [10, 10, 20, 20] | |
| let array1 = array1.slice(2, 2); // [20, 20] | |
| let run_ends2 = Int32Array::from(vec![1, 4]); | |
| let values2 = Int32Array::from(vec![30, 40]); | |
| let array2 = RunArray::try_new(&run_ends2, &values2).unwrap(); // [30, 40, 40, 40] | |
| let array2 = array2.slice(1, 3); // [40, 40, 40] | |
| let result = concat(&[&array1, &array2]).unwrap(); | |
| let result = result.as_run::<Int32Type>(); | |
| let result = result.downcast::<Int32Array>().unwrap(); | |
| let expected = vec![20, 20, 40, 40, 40]; | |
| let actual = result.into_iter().flatten().collect::<Vec<_>>(); | |
| assert_eq!(expected, actual); | |
| } |
arrow-rs/arrow-select/src/filter.rs
Lines 1357 to 1375 in addf74d
| #[test] | |
| #[should_panic = "assertion `left == right` failed\n left: [-2, 9]\n right: [7, -2]"] | |
| // TODO: fix filter of RunArrays to account for sliced RunArray's | |
| // https://github.com/apache/arrow-rs/issues/9018 | |
| fn test_filter_run_end_encoding_array_sliced() { | |
| let run_ends = Int64Array::from(vec![2, 3, 8]); | |
| let values = Int64Array::from(vec![7, -2, 9]); | |
| let a = RunArray::try_new(&run_ends, &values).unwrap(); // [7, 7, -2, 9, 9, 9, 9, 9] | |
| let a = a.slice(2, 3); // [-2, 9, 9] | |
| let b = BooleanArray::from(vec![true, false, true]); | |
| let result = filter(&a, &b).unwrap(); | |
| let result = result.as_run::<Int64Type>(); | |
| let result = result.downcast::<Int64Array>().unwrap(); | |
| let expected = vec![-2, 9]; | |
| let actual = result.into_iter().flatten().collect::<Vec<_>>(); | |
| assert_eq!(expected, actual); | |
| } |
arrow-rs/arrow-select/src/interleave.rs
Lines 1174 to 1198 in addf74d
| #[test] | |
| #[should_panic = "assertion `left == right` failed\n left: [1, 4, 2, 5, 6]\n right: [2, 5, 2, 5, 6]"] | |
| // TODO: fix interleave of RunArrays to account for sliced RunArray's | |
| // https://github.com/apache/arrow-rs/issues/9018 | |
| fn test_interleave_run_end_encoded_sliced() { | |
| let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); | |
| builder.extend([1, 1, 2, 2, 2, 3].into_iter().map(Some)); | |
| let a = builder.finish(); | |
| let a = a.slice(2, 3); // [2, 2, 2] | |
| let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); | |
| builder.extend([4, 5, 5, 6, 6, 6].into_iter().map(Some)); | |
| let b = builder.finish(); | |
| let b = b.slice(1, 3); // [5, 5, 6] | |
| let indices = &[(0, 1), (1, 0), (0, 3), (1, 2), (1, 3)]; | |
| let result = interleave(&[&a, &b], indices).unwrap(); | |
| let result = result.as_run::<Int32Type>(); | |
| let result = result.downcast::<Int32Array>().unwrap(); | |
| let expected = vec![2, 5, 2, 5, 6]; | |
| let actual = result.into_iter().flatten().collect::<Vec<_>>(); | |
| assert_eq!(actual, expected); | |
| } |
These are not necessarily exhaustive; they are just the ones I found. The only other kernel I checked was take, which seemed fine:
arrow-rs/arrow-select/src/take.rs
Lines 2441 to 2460 in addf74d
| #[test] | |
| fn test_take_runs_sliced() { | |
| let logical_array: Vec<i32> = vec![1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]; | |
| let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); | |
| builder.extend(logical_array.into_iter().map(Some)); | |
| let run_array = builder.finish(); | |
| let run_array = run_array.slice(4, 6); // [3, 3, 3, 4, 4, 5] | |
| let take_indices: PrimitiveArray<Int32Type> = vec![0, 5, 5, 1, 4].into_iter().collect(); | |
| let result = take_run(&run_array, &take_indices).unwrap(); | |
| let result = result.downcast::<Int32Array>().unwrap(); | |
| let expected = vec![3, 5, 5, 3, 4]; | |
| let actual = result.into_iter().flatten().collect::<Vec<_>>(); | |
| assert_eq!(expected, actual); | |
| } |
Expected behavior
Kernels should account for slicing when operating on RunArrays.
Additional context
In fixing these kernels we should see if we can enhance the RunArray/RunEndBuffer APIs to make it easier to handle sliced arrays; for example in apache/datafusion#18981 I found it a bit confusing to try understand what's going on, since there was logic to manually slice the values ourselves.