-
Notifications
You must be signed in to change notification settings - Fork 796
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement native support StringViewArray for regexp_is_match
and regexp_is_match_scalar
function, deprecate regexp_is_match_utf8
and regexp_is_match_utf8_scalar
#6376
Changes from 1 commit
5088c2c
595d64c
e80deea
4fed56b
e10b474
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
//! Defines kernel to extract substrings based on a regular | ||
//! expression of a \[Large\]StringArray | ||
|
||
use crate::like::StringArrayType; | ||
use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder}; | ||
use arrow_array::cast::AsArray; | ||
use arrow_array::*; | ||
|
@@ -28,23 +29,31 @@ use regex::Regex; | |
use std::collections::HashMap; | ||
use std::sync::Arc; | ||
|
||
/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. | ||
/// Perform SQL `array ~ regex_array` operation on | ||
/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. | ||
/// | ||
/// If `regex_array` element has an empty value, the corresponding result value is always true. | ||
/// | ||
/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow | ||
/// special search modes, such as case insensitive and multi-line mode. | ||
/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, | ||
/// which allow special search modes, such as case-insensitive and multi-line mode. | ||
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) | ||
/// for more information. | ||
pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>( | ||
array: &GenericStringArray<OffsetSize>, | ||
regex_array: &GenericStringArray<OffsetSize>, | ||
flags_array: Option<&GenericStringArray<OffsetSize>>, | ||
) -> Result<BooleanArray, ArrowError> { | ||
pub fn regexp_is_match_utf8<'a, S1, S2, S3>( | ||
array: &'a S1, | ||
regex_array: &'a S2, | ||
flags_array: Option<&'a S3>, | ||
) -> Result<BooleanArray, ArrowError> | ||
where | ||
&'a S1: StringArrayType<'a>, | ||
&'a S2: StringArrayType<'a>, | ||
&'a S3: StringArrayType<'a>, | ||
{ | ||
if array.len() != regex_array.len() { | ||
return Err(ArrowError::ComputeError( | ||
"Cannot perform comparison operation on arrays of different length".to_string(), | ||
)); | ||
} | ||
|
||
let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); | ||
|
||
let mut patterns: HashMap<String, Regex> = HashMap::new(); | ||
|
@@ -107,18 +116,22 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>( | |
.nulls(nulls) | ||
.build_unchecked() | ||
}; | ||
|
||
Ok(BooleanArray::from(data)) | ||
} | ||
|
||
/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / | ||
/// [`LargeStringArray`] and a scalar. | ||
/// Perform SQL `array ~ regex_array` operation on | ||
/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] and a scalar. | ||
/// | ||
/// See the documentation on [`regexp_is_match_utf8`] for more details. | ||
pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>( | ||
array: &GenericStringArray<OffsetSize>, | ||
pub fn regexp_is_match_utf8_scalar<'a, S>( | ||
array: &'a S, | ||
regex: &str, | ||
flag: Option<&str>, | ||
) -> Result<BooleanArray, ArrowError> { | ||
) -> Result<BooleanArray, ArrowError> | ||
where | ||
&'a S: StringArrayType<'a>, | ||
{ | ||
let null_bit_buffer = array.nulls().map(|x| x.inner().sliced()); | ||
let mut result = BooleanBufferBuilder::new(array.len()); | ||
|
||
|
@@ -517,8 +530,8 @@ mod tests { | |
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { | ||
#[test] | ||
fn $test_name() { | ||
let left = StringArray::from($left); | ||
let right = StringArray::from($right); | ||
let left = $left; | ||
let right = $right; | ||
let res = $op(&left, &right, None).unwrap(); | ||
let expected = $expected; | ||
assert_eq!(expected.len(), res.len()); | ||
|
@@ -531,9 +544,9 @@ mod tests { | |
($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => { | ||
#[test] | ||
fn $test_name() { | ||
let left = StringArray::from($left); | ||
let right = StringArray::from($right); | ||
let flag = Some(StringArray::from($flag)); | ||
let left = $left; | ||
let right = $right; | ||
let flag = Some($flag); | ||
let res = $op(&left, &right, flag.as_ref()).unwrap(); | ||
let expected = $expected; | ||
assert_eq!(expected.len(), res.len()); | ||
|
@@ -549,7 +562,7 @@ mod tests { | |
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { | ||
#[test] | ||
fn $test_name() { | ||
let left = StringArray::from($left); | ||
let left = $left; | ||
let res = $op(&left, $right, None).unwrap(); | ||
let expected = $expected; | ||
assert_eq!(expected.len(), res.len()); | ||
|
@@ -569,7 +582,7 @@ mod tests { | |
($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => { | ||
#[test] | ||
fn $test_name() { | ||
let left = StringArray::from($left); | ||
let left = $left; | ||
let flag = Some($flag); | ||
let res = $op(&left, $right, flag).unwrap(); | ||
let expected = $expected; | ||
|
@@ -591,40 +604,103 @@ mod tests { | |
|
||
test_flag_utf8!( | ||
test_utf8_array_regexp_is_match, | ||
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], | ||
vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], | ||
regexp_is_match_utf8, | ||
StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]), | ||
StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]), | ||
regexp_is_match_utf8::< | ||
GenericStringArray<i32>, | ||
GenericStringArray<i32>, | ||
GenericStringArray<i32>, | ||
>, | ||
[true, false, true, false, false, true] | ||
); | ||
test_flag_utf8!( | ||
test_utf8_array_regexp_is_match_2, | ||
StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]), | ||
StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]), | ||
regexp_is_match_utf8::<StringViewArray, StringViewArray, StringViewArray>, | ||
[true, false, true, false, false, true] | ||
); | ||
test_flag_utf8!( | ||
test_utf8_array_regexp_is_match_3, | ||
StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]), | ||
StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]), | ||
regexp_is_match_utf8::<StringViewArray, GenericStringArray<i32>, GenericStringArray<i32>>, | ||
[true, false, true, false, false, true] | ||
); | ||
|
||
test_flag_utf8!( | ||
test_utf8_array_regexp_is_match_insensitive, | ||
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], | ||
vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], | ||
vec!["i"; 6], | ||
regexp_is_match_utf8, | ||
StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]), | ||
StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]), | ||
StringArray::from(vec!["i"; 6]), | ||
regexp_is_match_utf8::< | ||
GenericStringArray<i32>, | ||
GenericStringArray<i32>, | ||
GenericStringArray<i32>, | ||
>, | ||
[true, true, true, true, false, true] | ||
); | ||
test_flag_utf8!( | ||
test_utf8_array_regexp_is_match_insensitive_2, | ||
StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. StringViewArray has special case handling for strings that are more than 12 bytes long (the string data is stored out of band in those cases) Can you please add tests that have some strings that are longer than 12 bytes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, noted. I will review and update test cases for this scenario. |
||
StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]), | ||
StringArray::from(vec!["i"; 6]), | ||
regexp_is_match_utf8::<StringViewArray, StringViewArray, GenericStringArray<i32>>, | ||
[true, true, true, true, false, true] | ||
); | ||
test_flag_utf8!( | ||
test_utf8_array_regexp_is_match_insensitive_3, | ||
LargeStringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]), | ||
StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]), | ||
StringArray::from(vec!["i"; 6]), | ||
regexp_is_match_utf8::<GenericStringArray<i64>, StringViewArray, GenericStringArray<i32>>, | ||
[true, true, true, true, false, true] | ||
); | ||
|
||
test_flag_utf8_scalar!( | ||
test_utf8_array_regexp_is_match_scalar, | ||
vec!["arrow", "ARROW", "parquet", "PARQUET"], | ||
StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]), | ||
"^ar", | ||
regexp_is_match_utf8_scalar, | ||
regexp_is_match_utf8_scalar::<GenericStringArray<i32>>, | ||
[true, false, false, false] | ||
); | ||
test_flag_utf8_scalar!( | ||
test_utf8_array_regexp_is_match_scalar_2, | ||
StringViewArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]), | ||
"^ar", | ||
regexp_is_match_utf8_scalar::<StringViewArray>, | ||
[true, false, false, false] | ||
); | ||
|
||
test_flag_utf8_scalar!( | ||
test_utf8_array_regexp_is_match_empty_scalar, | ||
vec!["arrow", "ARROW", "parquet", "PARQUET"], | ||
StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]), | ||
"", | ||
regexp_is_match_utf8_scalar::<GenericStringArray<i32>>, | ||
[true, true, true, true] | ||
); | ||
test_flag_utf8_scalar!( | ||
test_utf8_array_regexp_is_match_empty_scalar_2, | ||
StringViewArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]), | ||
"", | ||
regexp_is_match_utf8_scalar, | ||
regexp_is_match_utf8_scalar::<StringViewArray>, | ||
[true, true, true, true] | ||
); | ||
|
||
test_flag_utf8_scalar!( | ||
test_utf8_array_regexp_is_match_insensitive_scalar, | ||
vec!["arrow", "ARROW", "parquet", "PARQUET"], | ||
StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]), | ||
"^ar", | ||
"i", | ||
regexp_is_match_utf8_scalar::<GenericStringArray<i32>>, | ||
[true, true, false, false] | ||
); | ||
test_flag_utf8_scalar!( | ||
test_utf8_array_regexp_is_match_insensitive_scalar_2, | ||
StringViewArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]), | ||
"^ar", | ||
"i", | ||
regexp_is_match_utf8_scalar, | ||
regexp_is_match_utf8_scalar::<StringViewArray>, | ||
[true, true, false, false] | ||
); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately, I think this is a API change (as is the above)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have an idea of how to update this PR to avoid an API change -- the reason this is important is that a breaking API change would need to wait until the next major release (Dec 2024) per the release schedule: https://github.com/apache/arrow-rs?tab=readme-ov-file#release-versioning-and-schedule
TLDR is I think if we introduced a new function like the following:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alamb Sounds good 👍 But why do we use
&dyn Array
for the newregex_is_match
function instead of keeping the current implementation?Or am I misunderstanding you? I understand that we will provide a new
regex_is_match
function, and mark the currentregex_is_match_utf8
function as:Is that right? 🤔