Skip to content

Commit

Permalink
Fix string view LIKE checks with NULL values (apache#6662)
Browse files Browse the repository at this point in the history
* Fix string view LIKE checks with NULL values

* Add TODO comments to operations yet to be fixed

* Use from_unary for contains with string view

from_unary takes care of nulls

* Fix performance

* fixup! Fix performance
  • Loading branch information
findepi committed Nov 14, 2024
1 parent e568675 commit 5cd0aff
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 20 deletions.
119 changes: 118 additions & 1 deletion arrow-string/src/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1709,7 +1709,93 @@ mod tests {
}

#[test]
fn like_scalar_null() {
fn string_null_like_pattern() {
// Different patterns have different execution code paths
for pattern in &[
"", // can execute as equality check
"_", // can execute as length check
"%", // can execute as starts_with("") or non-null check
"a%", // can execute as starts_with("a")
"%a", // can execute as ends_with("")
"a%b", // can execute as starts_with("a") && ends_with("b")
"%a%", // can_execute as contains("a")
"%a%b_c_d%e", // can_execute as regular expression
] {
let a = Scalar::new(StringArray::new_null(1));
let b = StringArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = Scalar::new(StringArray::new_null(1));
let b = StringArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringArray::new_null(1);
let b = StringArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringArray::new_null(1);
let b = StringArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");
}
}

#[test]
fn string_view_null_like_pattern() {
// Different patterns have different execution code paths
for pattern in &[
"", // can execute as equality check
"_", // can execute as length check
"%", // can execute as starts_with("") or non-null check
"a%", // can execute as starts_with("a")
"%a", // can execute as ends_with("")
"a%b", // can execute as starts_with("a") && ends_with("b")
"%a%", // can_execute as contains("a")
"%a%b_c_d%e", // can_execute as regular expression
] {
let a = Scalar::new(StringViewArray::new_null(1));
let b = StringViewArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = Scalar::new(StringViewArray::new_null(1));
let b = StringViewArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringViewArray::new_null(1);
let b = StringViewArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringViewArray::new_null(1);
let b = StringViewArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");
}
}

#[test]
fn string_like_scalar_null() {
let a = StringArray::new_scalar("a");
let b = Scalar::new(StringArray::new_null(1));
let r = like(&a, &b).unwrap();
Expand Down Expand Up @@ -1739,6 +1825,37 @@ mod tests {
assert!(r.is_null(0));
}

#[test]
fn string_view_like_scalar_null() {
let a = StringViewArray::new_scalar("a");
let b = Scalar::new(StringViewArray::new_null(1));
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));

let a = StringViewArray::from_iter_values(["a"]);
let b = Scalar::new(StringViewArray::new_null(1));
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));

let a = StringViewArray::from_iter_values(["a"]);
let b = StringViewArray::new_null(1);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));

let a = StringViewArray::new_scalar("a");
let b = StringViewArray::new_null(1);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));
}

#[test]
fn like_escape() {
// (value, pattern, expected)
Expand Down
34 changes: 15 additions & 19 deletions arrow-string/src/predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray};
use arrow_buffer::BooleanBuffer;
use arrow_schema::ArrowError;
use memchr::memchr3;
use memchr::memmem::Finder;
Expand Down Expand Up @@ -110,30 +111,21 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
Predicate::Contains(finder) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.bytes_iter()
.map(|haystack| finder.find(haystack).is_some() != negate)
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
finder.find(haystack.as_bytes()).is_some() != negate
})
}
}
Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
finder.find(haystack.as_bytes()).is_some() != negate
}),
Predicate::StartsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
let nulls = string_view_array.logical_nulls();
let values = BooleanBuffer::from(
string_view_array
.prefix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
})
.collect::<Vec<_>>(),
)
);
BooleanArray::new(values, nulls)
} else {
BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_kernel) != negate
Expand All @@ -142,6 +134,7 @@ impl<'a> Predicate<'a> {
}
Predicate::IStartsWithAscii(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
// TODO respect null buffer
BooleanArray::from(
string_view_array
.prefix_bytes_iter(v.len())
Expand All @@ -162,14 +155,16 @@ impl<'a> Predicate<'a> {
}
Predicate::EndsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
let nulls = string_view_array.logical_nulls();
let values = BooleanBuffer::from(
string_view_array
.suffix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
})
.collect::<Vec<_>>(),
)
);
BooleanArray::new(values, nulls)
} else {
BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_kernel) != negate
Expand All @@ -178,6 +173,7 @@ impl<'a> Predicate<'a> {
}
Predicate::IEndsWithAscii(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
// TODO respect null buffer
BooleanArray::from(
string_view_array
.suffix_bytes_iter(v.len())
Expand Down

0 comments on commit 5cd0aff

Please sign in to comment.