Skip to content

Commit 3118b81

Browse files
nirnayroycipherstakesalambblaginin
authored
Implementation for regex_instr (#15928)
* Implementation for regex_instr * linting and typo addressed in bench * prettier formatting * scalar_functions_formatting * linting format macros * formatting * address comments to PR * formatting * clippy * fmt * address docs typo * remove unnecessary struct and comment * delete redundant lines add tests for subexp correct function signature for benches * refactor get_index * comments addressed * update doc * clippy upgrade --------- Co-authored-by: Nirnay Roy <nirnayroy1012@gmail.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> Co-authored-by: Dmitrii Blaginin <dmitrii@blaginin.me>
1 parent 1cc67ab commit 3118b81

File tree

6 files changed

+1189
-39
lines changed

6 files changed

+1189
-39
lines changed

datafusion/functions/benches/regx.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use arrow::compute::cast;
2323
use arrow::datatypes::DataType;
2424
use criterion::{black_box, criterion_group, criterion_main, Criterion};
2525
use datafusion_functions::regex::regexpcount::regexp_count_func;
26+
use datafusion_functions::regex::regexpinstr::regexp_instr_func;
2627
use datafusion_functions::regex::regexplike::regexp_like;
2728
use datafusion_functions::regex::regexpmatch::regexp_match;
2829
use datafusion_functions::regex::regexpreplace::regexp_replace;
@@ -71,6 +72,15 @@ fn start(rng: &mut ThreadRng) -> Int64Array {
7172
Int64Array::from(data)
7273
}
7374

75+
fn n(rng: &mut ThreadRng) -> Int64Array {
76+
let mut data: Vec<i64> = vec![];
77+
for _ in 0..1000 {
78+
data.push(rng.random_range(1..5));
79+
}
80+
81+
Int64Array::from(data)
82+
}
83+
7484
fn flags(rng: &mut ThreadRng) -> StringArray {
7585
let samples = [Some("i".to_string()), Some("im".to_string()), None];
7686
let mut sb = StringBuilder::new();
@@ -86,6 +96,15 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
8696
sb.finish()
8797
}
8898

99+
fn subexp(rng: &mut ThreadRng) -> Int64Array {
100+
let mut data: Vec<i64> = vec![];
101+
for _ in 0..1000 {
102+
data.push(rng.random_range(1..5));
103+
}
104+
105+
Int64Array::from(data)
106+
}
107+
89108
fn criterion_benchmark(c: &mut Criterion) {
90109
c.bench_function("regexp_count_1000 string", |b| {
91110
let mut rng = rand::rng();
@@ -127,6 +146,50 @@ fn criterion_benchmark(c: &mut Criterion) {
127146
})
128147
});
129148

149+
// Benchmark `regexp_instr_func` over 1000 rows of plain string input, supplying
// all six positional arguments: values, regex, start, n, flags, subexp.
c.bench_function("regexp_instr_1000 string", |b| {
    let mut rng = rand::rng();
    // Inputs are generated once, outside the timed closure, in argument order.
    let values: ArrayRef = Arc::new(data(&mut rng));
    let patterns: ArrayRef = Arc::new(regex(&mut rng));
    let start_positions: ArrayRef = Arc::new(start(&mut rng));
    let occurrences: ArrayRef = Arc::new(n(&mut rng));
    let regex_flags: ArrayRef = Arc::new(flags(&mut rng));
    let group_indexes: ArrayRef = Arc::new(subexp(&mut rng));

    b.iter(|| {
        // The reference-count bumps stay inside the timed section, as before.
        let args = [
            Arc::clone(&values),
            Arc::clone(&patterns),
            Arc::clone(&start_positions),
            Arc::clone(&occurrences),
            Arc::clone(&regex_flags),
            Arc::clone(&group_indexes),
        ];
        black_box(regexp_instr_func(&args).expect("regexp_instr should work on utf8"))
    })
});
172+
173+
// Benchmark `regexp_instr_func` over 1000 rows with the string-typed inputs
// (values, regex, flags) cast to `Utf8View`.
//
// Arguments are positional. The plain-string `regexp_instr` benchmark in this
// file passes (values, regex, start, n, flags, subexp); this one previously
// passed only (values, regex, start, flags), which put the Utf8View `flags`
// array in the slot where that benchmark supplies the Int64 `n` array. The
// full six-argument list is supplied here so both benchmarks exercise the
// same call shape.
c.bench_function("regexp_instr_1000 utf8view", |b| {
    let mut rng = rand::rng();
    // Inputs are generated once, outside the timed closure, in argument order.
    let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
    let regex = cast(&regex(&mut rng), &DataType::Utf8View).unwrap();
    let start = Arc::new(start(&mut rng)) as ArrayRef;
    let n = Arc::new(n(&mut rng)) as ArrayRef;
    let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();
    let subexp = Arc::new(subexp(&mut rng)) as ArrayRef;

    b.iter(|| {
        black_box(
            regexp_instr_func(&[
                Arc::clone(&data),
                Arc::clone(&regex),
                Arc::clone(&start),
                Arc::clone(&n),
                Arc::clone(&flags),
                Arc::clone(&subexp),
            ])
            .expect("regexp_instr should work on utf8view"),
        )
    })
});
192+
130193
c.bench_function("regexp_like_1000", |b| {
131194
let mut rng = rand::rng();
132195
let data = Arc::new(data(&mut rng)) as ArrayRef;

datafusion/functions/src/regex/mod.rs

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,20 @@
1717

1818
//! "regex" DataFusion functions
1919
20+
use arrow::error::ArrowError;
21+
use regex::Regex;
22+
use std::collections::hash_map::Entry;
23+
use std::collections::HashMap;
2024
use std::sync::Arc;
21-
2225
pub mod regexpcount;
26+
pub mod regexpinstr;
2327
pub mod regexplike;
2428
pub mod regexpmatch;
2529
pub mod regexpreplace;
2630

2731
// create UDFs
2832
make_udf_function!(regexpcount::RegexpCountFunc, regexp_count);
33+
make_udf_function!(regexpinstr::RegexpInstrFunc, regexp_instr);
2934
make_udf_function!(regexpmatch::RegexpMatchFunc, regexp_match);
3035
make_udf_function!(regexplike::RegexpLikeFunc, regexp_like);
3136
make_udf_function!(regexpreplace::RegexpReplaceFunc, regexp_replace);
@@ -60,7 +65,35 @@ pub mod expr_fn {
6065
super::regexp_match().call(args)
6166
}
6267

63-
/// Returns true if a has at least one match in a string, false otherwise.
68+
/// Returns index of regular expression matches in a string.
69+
pub fn regexp_instr(
70+
values: Expr,
71+
regex: Expr,
72+
start: Option<Expr>,
73+
n: Option<Expr>,
74+
endoption: Option<Expr>,
75+
flags: Option<Expr>,
76+
subexpr: Option<Expr>,
77+
) -> Expr {
78+
let mut args = vec![values, regex];
79+
if let Some(start) = start {
80+
args.push(start);
81+
};
82+
if let Some(n) = n {
83+
args.push(n);
84+
};
85+
if let Some(endoption) = endoption {
86+
args.push(endoption);
87+
};
88+
if let Some(flags) = flags {
89+
args.push(flags);
90+
};
91+
if let Some(subexpr) = subexpr {
92+
args.push(subexpr);
93+
};
94+
super::regexp_instr().call(args)
95+
}
96+
/// Returns true if a regex has at least one match in a string, false otherwise.
6497
pub fn regexp_like(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
6598
let mut args = vec![values, regex];
6699
if let Some(flags) = flags {
@@ -89,7 +122,45 @@ pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
89122
vec![
90123
regexp_count(),
91124
regexp_match(),
125+
regexp_instr(),
92126
regexp_like(),
93127
regexp_replace(),
94128
]
95129
}
130+
131+
pub fn compile_and_cache_regex<'strings, 'cache>(
132+
regex: &'strings str,
133+
flags: Option<&'strings str>,
134+
regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
135+
) -> Result<&'cache Regex, ArrowError>
136+
where
137+
'strings: 'cache,
138+
{
139+
let result = match regex_cache.entry((regex, flags)) {
140+
Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
141+
Entry::Vacant(vacant_entry) => {
142+
let compiled = compile_regex(regex, flags)?;
143+
vacant_entry.insert(compiled)
144+
}
145+
};
146+
Ok(result)
147+
}
148+
149+
pub fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {
150+
let pattern = match flags {
151+
None | Some("") => regex.to_string(),
152+
Some(flags) => {
153+
if flags.contains("g") {
154+
return Err(ArrowError::ComputeError(
155+
"regexp_count()/regexp_instr() does not support the global flag"
156+
.to_string(),
157+
));
158+
}
159+
format!("(?{flags}){regex}")
160+
}
161+
};
162+
163+
Regex::new(&pattern).map_err(|_| {
164+
ArrowError::ComputeError(format!("Regular expression did not compile: {pattern}"))
165+
})
166+
}

datafusion/functions/src/regex/regexpcount.rs

Lines changed: 1 addition & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use crate::regex::{compile_and_cache_regex, compile_regex};
1819
use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, StringArrayType};
1920
use arrow::datatypes::{DataType, Int64Type};
2021
use arrow::datatypes::{
@@ -29,7 +30,6 @@ use datafusion_expr::{
2930
use datafusion_macros::user_doc;
3031
use itertools::izip;
3132
use regex::Regex;
32-
use std::collections::hash_map::Entry;
3333
use std::collections::HashMap;
3434
use std::sync::Arc;
3535

@@ -550,42 +550,6 @@ where
550550
}
551551
}
552552

553-
fn compile_and_cache_regex<'strings, 'cache>(
554-
regex: &'strings str,
555-
flags: Option<&'strings str>,
556-
regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
557-
) -> Result<&'cache Regex, ArrowError>
558-
where
559-
'strings: 'cache,
560-
{
561-
let result = match regex_cache.entry((regex, flags)) {
562-
Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
563-
Entry::Vacant(vacant_entry) => {
564-
let compiled = compile_regex(regex, flags)?;
565-
vacant_entry.insert(compiled)
566-
}
567-
};
568-
Ok(result)
569-
}
570-
571-
fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {
572-
let pattern = match flags {
573-
None | Some("") => regex.to_string(),
574-
Some(flags) => {
575-
if flags.contains("g") {
576-
return Err(ArrowError::ComputeError(
577-
"regexp_count() does not support global flag".to_string(),
578-
));
579-
}
580-
format!("(?{flags}){regex}")
581-
}
582-
};
583-
584-
Regex::new(&pattern).map_err(|_| {
585-
ArrowError::ComputeError(format!("Regular expression did not compile: {pattern}"))
586-
})
587-
}
588-
589553
fn count_matches(
590554
value: Option<&str>,
591555
pattern: &Regex,

0 commit comments

Comments
 (0)